In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from dateutil.parser import parse
import numpy as np

import warnings
import datetime as dt
warnings.filterwarnings('ignore')

In [2]:
events = pd.read_csv('data/events_up_to_01062018.csv')
events.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,,


In [15]:
events.count()

timestamp                   2341681
event                       2341681
person                      2341681
url                          191131
sku                         1320530
model                       1321513
condition                   1320530
storage                     1320530
color                       1320530
skus                         505949
search_term                  113763
staticpage                    11201
campaign_source              191286
search_engine                106406
channel                      204069
new_vs_returning             204069
city                         204069
region                       204069
country                      204069
device_type                  204069
screen_resolution            204066
operating_system_version     204069
browser_version              204069
dtype: int64

In [3]:
label_training = pd.read_csv('data/labels_training_set.csv')
label_training.head()

Unnamed: 0,person,label
0,0566e9c1,0
1,6ec7ee77,0
2,abe7a2fb,0
3,34728364,0
4,87ed62de,0


In [4]:
label_training.describe()

Unnamed: 0,label
count,19414.0
mean,0.050479
std,0.218937
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [4]:
training_set = label_training.merge(events, how= 'inner', on = 'person')
training_set.count()

person                      1171886
label                       1171886
timestamp                   1171886
event                       1171886
url                           94875
sku                          665336
model                        665767
condition                    665336
storage                      665336
color                        665336
skus                         249587
search_term                   55774
staticpage                     5660
campaign_source               94940
search_engine                 52829
channel                      102299
new_vs_returning             102299
city                         102299
region                       102299
country                      102299
device_type                  102299
screen_resolution            102297
operating_system_version     102299
browser_version              102299
dtype: int64

In [5]:
prediction_set = events.merge(label_training, how= 'left', on = 'person')
prediction_set = prediction_set[prediction_set.label.isnull()]
prediction_set.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version,label
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,,
5,2018-05-18 00:44:27,searched products,4c8a8b93,,,,,,,"10240,9987,10322,10085,9944,9931,13404,10154,1...",...,,,,,,,,,,


In [7]:
prediction_set.count()

timestamp                   1169795
event                       1169795
person                      1169795
url                           96256
sku                          655194
model                        655746
condition                    655194
storage                      655194
color                        655194
skus                         256362
search_term                   57989
staticpage                     5541
campaign_source               96346
search_engine                 53577
channel                      101770
new_vs_returning             101770
city                         101770
region                       101770
country                      101770
device_type                  101770
screen_resolution            101769
operating_system_version     101770
browser_version              101770
label                             0
dtype: int64

In [8]:
prediction_set.to_csv('data/set_prediccion.csv', encoding='utf-8', index=False)

In [6]:
del prediction_set

In [7]:
del events
del label_training

# Analisis TP1

In [18]:
training_set.columns

Index(['person', 'label', 'timestamp', 'event', 'url', 'sku', 'model',
       'condition', 'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version'],
      dtype='object')

In [8]:
training_set.drop(['url','sku','skus','city','search_engine','campaign_source','channel'], inplace=True, axis=1)

In [9]:
#Cambiamos el tipo de dato de fecha de timestamp a datetime para un mejor analisis y renombramos a "fecha"
training_set[['timestamp']] = training_set[['timestamp']].apply(pd.to_datetime)
training_set.rename({
    'timestamp' : 'fecha',
}, axis=1, inplace=True)
training_set.head()

Unnamed: 0,person,label,fecha,event,model,condition,storage,color,search_term,staticpage,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,0566e9c1,0,2018-05-24 16:42:28,staticpage,,,,,,CustomerService,,,,,,,
1,0566e9c1,0,2018-05-24 16:42:20,ad campaign hit,,,,,,,,,,,,,
2,0566e9c1,0,2018-05-24 11:29:04,ad campaign hit,,,,,,,,,,,,,
3,0566e9c1,0,2018-05-24 11:29:53,ad campaign hit,,,,,,,,,,,,,
4,0566e9c1,0,2018-05-24 00:15:29,ad campaign hit,,,,,,,,,,,,,


In [11]:
#Detalles de Productos que fueron comprados
productos = training_set.loc[:, ['event','fecha','person','sku', 'model', 'condition', 'storage', 'color']]
productos_comprados = productos.loc[(productos.event == 'checkout') | (productos.event == 'conversion')]
productos_comprados.drop(['event'], inplace=True, axis=1)
productos_comprados.head()

Unnamed: 0,fecha,person,sku,model,condition,storage,color
5,2018-05-23 18:44:26,0566e9c1,,iPhone 4S,Bom,8GB,Branco
7,2018-05-23 18:39:08,0566e9c1,,iPhone 4S,Bom,8GB,Branco
85,2018-05-29 23:39:14,abe7a2fb,,iPhone 6,Bom,64GB,Prateado
208,2018-05-18 23:44:19,87ed62de,,Samsung Galaxy J7,Excelente,16GB,Dourado
215,2018-05-18 23:41:45,87ed62de,,Samsung Galaxy J7,Excelente,16GB,Dourado


In [12]:
#Detalles de Productos que fueron visitados
productos_visitados = productos.loc[(productos.event == 'viewed product')]
productos_visitados.drop(['event'], inplace=True, axis=1)
productos_visitados.head()

Unnamed: 0,fecha,person,sku,model,condition,storage,color
8,2018-05-23 18:36:41,0566e9c1,,iPhone 4G,Muito Bom,16GB,Preto
9,2018-05-23 18:38:05,0566e9c1,,iPhone 4S,Bom,8GB,Preto
10,2018-05-23 18:38:32,0566e9c1,,iPhone 4S,Bom,8GB,Branco
12,2018-05-22 17:55:05,0566e9c1,,iPhone 5s,Bom,16GB,Prateado
13,2018-05-22 17:54:54,0566e9c1,,iPhone 5s,Bom - Sem Touch ID,16GB,Prateado


In [13]:
#Caracterisitcas del dispositivos con el que el usuario visito el sitio
visitas = training_set.loc[:,['event','fecha','person','new_vs_returning','region','country','device_type','screen_resolution','operating_system_version','browser_version']]
features_usuario = visitas.loc[visitas.event == 'visited site']
features_usuario.drop(['event'], inplace=True, axis=1)
features_usuario.head()

Unnamed: 0,fecha,person,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version
51,2018-05-23 19:52:31,0566e9c1,Returning,Rio de Janeiro,Brazil,Smartphone,320x568,iOS 11.3,Mobile Safari 11
52,2018-05-23 21:25:56,0566e9c1,Returning,Parana,Brazil,Smartphone,320x568,iOS 11.3,Mobile Safari 11
53,2018-05-24 11:29:03,0566e9c1,Returning,Parana,Brazil,Smartphone,320x568,iOS 11.3,Mobile Safari 11
54,2018-05-24 16:42:20,0566e9c1,Returning,Parana,Brazil,Smartphone,320x568,iOS 11.3,Mobile Safari 11
55,2018-05-25 20:59:09,0566e9c1,Returning,Parana,Brazil,Smartphone,320x568,iOS 11.3,Mobile Safari 11


In [14]:
productos_comprados.count()

fecha        36103
person       36103
sku              0
model        36103
condition    36103
storage      36103
color        36103
dtype: int64

In [15]:
productos_visitados.count()

fecha        629233
person       629233
sku               0
model        629233
condition    629233
storage      629233
color        629233
dtype: int64

In [16]:
features_usuario.count()

fecha                       102299
person                      102299
new_vs_returning            102299
region                      102299
country                     102299
device_type                 102299
screen_resolution           102297
operating_system_version    102299
browser_version             102299
dtype: int64

In [17]:
del productos
del visitas

# Set de Datos de Entrenamiento

In [18]:
user_prod_comprados = productos_comprados.merge(features_usuario, on="person", how="inner")
user_prod_comprados.head()

Unnamed: 0,fecha_x,person,sku,model,condition,storage,color,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-23 18:44:26,0566e9c1,,iPhone 4S,Bom,8GB,Branco,2018-05-23 19:52:31,Returning,Rio de Janeiro,Brazil,Smartphone,320x568,iOS 11.3,Mobile Safari 11
1,2018-05-23 18:44:26,0566e9c1,,iPhone 4S,Bom,8GB,Branco,2018-05-23 21:25:56,Returning,Parana,Brazil,Smartphone,320x568,iOS 11.3,Mobile Safari 11
2,2018-05-23 18:44:26,0566e9c1,,iPhone 4S,Bom,8GB,Branco,2018-05-24 11:29:03,Returning,Parana,Brazil,Smartphone,320x568,iOS 11.3,Mobile Safari 11
3,2018-05-23 18:44:26,0566e9c1,,iPhone 4S,Bom,8GB,Branco,2018-05-24 16:42:20,Returning,Parana,Brazil,Smartphone,320x568,iOS 11.3,Mobile Safari 11
4,2018-05-23 18:44:26,0566e9c1,,iPhone 4S,Bom,8GB,Branco,2018-05-25 20:59:09,Returning,Parana,Brazil,Smartphone,320x568,iOS 11.3,Mobile Safari 11


In [19]:
user_prod_vistas = productos_visitados.merge(features_usuario, on="person", how="inner")
user_prod_vistas.head()

Unnamed: 0,fecha_x,person,sku,model,condition,storage,color,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-23 18:36:41,0566e9c1,,iPhone 4G,Muito Bom,16GB,Preto,2018-05-23 19:52:31,Returning,Rio de Janeiro,Brazil,Smartphone,320x568,iOS 11.3,Mobile Safari 11
1,2018-05-23 18:36:41,0566e9c1,,iPhone 4G,Muito Bom,16GB,Preto,2018-05-23 21:25:56,Returning,Parana,Brazil,Smartphone,320x568,iOS 11.3,Mobile Safari 11
2,2018-05-23 18:36:41,0566e9c1,,iPhone 4G,Muito Bom,16GB,Preto,2018-05-24 11:29:03,Returning,Parana,Brazil,Smartphone,320x568,iOS 11.3,Mobile Safari 11
3,2018-05-23 18:36:41,0566e9c1,,iPhone 4G,Muito Bom,16GB,Preto,2018-05-24 16:42:20,Returning,Parana,Brazil,Smartphone,320x568,iOS 11.3,Mobile Safari 11
4,2018-05-23 18:36:41,0566e9c1,,iPhone 4G,Muito Bom,16GB,Preto,2018-05-25 20:59:09,Returning,Parana,Brazil,Smartphone,320x568,iOS 11.3,Mobile Safari 11


In [31]:
user_prod_comprados['person'].count()

487040

In [32]:
user_prod_vistas['person'].count()

15144663

In [20]:
del features_usuario
del productos_comprados
del productos_visitados

In [21]:
#concateno
# user_prod_comprados['label'] = 1
# user_prod_vistas['label'] = 0
frames = [user_prod_comprados, user_prod_vistas]
features = pd.concat(frames)
features.head()

Unnamed: 0,fecha_x,person,sku,model,condition,storage,color,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-23 18:44:26,0566e9c1,,iPhone 4S,Bom,8GB,Branco,2018-05-23 19:52:31,Returning,Rio de Janeiro,Brazil,Smartphone,320x568,iOS 11.3,Mobile Safari 11
1,2018-05-23 18:44:26,0566e9c1,,iPhone 4S,Bom,8GB,Branco,2018-05-23 21:25:56,Returning,Parana,Brazil,Smartphone,320x568,iOS 11.3,Mobile Safari 11
2,2018-05-23 18:44:26,0566e9c1,,iPhone 4S,Bom,8GB,Branco,2018-05-24 11:29:03,Returning,Parana,Brazil,Smartphone,320x568,iOS 11.3,Mobile Safari 11
3,2018-05-23 18:44:26,0566e9c1,,iPhone 4S,Bom,8GB,Branco,2018-05-24 16:42:20,Returning,Parana,Brazil,Smartphone,320x568,iOS 11.3,Mobile Safari 11
4,2018-05-23 18:44:26,0566e9c1,,iPhone 4S,Bom,8GB,Branco,2018-05-25 20:59:09,Returning,Parana,Brazil,Smartphone,320x568,iOS 11.3,Mobile Safari 11


In [22]:
del frames
del user_prod_comprados
del user_prod_vistas


In [23]:
features.count()

fecha_x                     15631703
person                      15631703
sku                                0
model                       15631703
condition                   15631703
storage                     15631703
color                       15631703
fecha_y                     15631703
new_vs_returning            15631703
region                      15631703
country                     15631703
device_type                 15631703
screen_resolution           15631559
operating_system_version    15631703
browser_version             15631703
dtype: int64

In [24]:
features = features.fillna(value='')

MemoryError: 

## Categorizacion de los features a entrenar

In [21]:
training_set = features 
del features

In [None]:
#Categorizamos los colores de 1 a n
training_set_list = training_set[['color']].values.tolist()
i=1
diccionary={}

for lista in training_set_list:
    for element in lista:
        if type(element) != float:
            if element not in diccionary:
                diccionary[element]=i
                i+=1

In [23]:
training_set[['color']] = training_set[['color']].applymap(lambda x: diccionary[x])

In [25]:
#Categorizamos el estado del usuario 0 para new y 1 para returning
training_set[['new_vs_returning']] = training_set[['new_vs_returning']].applymap(lambda x: 0 if (x == 'New') else 1)

In [26]:
#Categorizamos los modelos de 1 a n
training_set_list = training_set[['model']].values.tolist()
i=1
diccionary={}

for lista in training_set_list:
    for element in lista:
        if type(element) != float:
            if element not in diccionary:
                diccionary[element]=i
                i+=1

In [27]:
training_set[['model']] = training_set[['model']].applymap(lambda x: diccionary[x])

In [28]:
#Categorizamos las condiciones del producto de 1 a n
training_set_list = training_set[['condition']].values.tolist()
i=1
diccionary={}

for lista in training_set_list:
    for element in lista:
        if type(element) != float:
            if element not in diccionary:
                diccionary[element]=i
                i+=1

In [29]:
training_set[['condition']] = training_set[['condition']].applymap(lambda x: diccionary[x])

In [30]:
#Categorizamos la capacidad de almacenamiento del producto de 1 a n
training_set_list = training_set[['storage']].values.tolist()
i=1
diccionary={}

for lista in training_set_list:
    for element in lista:
        if type(element) != float:
            if element not in diccionary:
                diccionary[element]=i
                i+=1

In [31]:
training_set[['storage']] = training_set[['storage']].applymap(lambda x: diccionary[x])

In [37]:
#Categorizamos las regiones, paises y tipo de caracteristicas del usuario en un conjunto de 1 a n
training_set_list = training_set[['region','country','device_type','screen_resolution','operating_system_version','browser_version']].values.tolist()
i=1
diccionary={}

for lista in training_set_list:
    for element in lista:
        if type(element) != float:
            if element not in diccionary:
                diccionary[element]=i
                i+=1

In [38]:
training_set[['region','country','device_type','screen_resolution','operating_system_version','browser_version']] = training_set[['region','country','device_type','screen_resolution','operating_system_version','browser_version']].applymap(lambda x: diccionary[x])

In [39]:
del training_set_list
del diccionary
training_set.head()

Unnamed: 0,fecha_x,person,sku,model,condition,storage,color,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label
0,2018-05-18 00:29:24,15ea8012,12802.0,1,1,1,1,2018-05-17 16:13:21,0,1,2,3,4,5,6,1
1,2018-05-18 00:29:24,15ea8012,12802.0,1,1,1,1,2018-05-18 00:18:40,1,1,2,3,4,5,6,1
2,2018-05-18 00:29:24,15ea8012,12802.0,1,1,1,1,2018-05-18 02:55:44,1,1,2,3,4,5,6,1
3,2018-05-18 00:31:24,15ea8012,12788.0,1,1,1,2,2018-05-17 16:13:21,0,1,2,3,4,5,6,1
4,2018-05-18 00:31:24,15ea8012,12788.0,1,1,1,2,2018-05-18 00:18:40,1,1,2,3,4,5,6,1


In [40]:
#Categorizamos la fecha del visto del producto en dia mes hora y minuto (ya que el anio es 2018)
training_set['dia_x'] = training_set['fecha_x'].apply(lambda x: x.day)
training_set['mes_x'] = training_set['fecha_x'].apply(lambda x: x.month)
training_set['hora_x'] = training_set['fecha_x'].apply(lambda x: x.hour)
training_set['minuto_x'] = training_set['fecha_x'].apply(lambda x: x.minute)
training_set = training_set.drop('fecha_x',1)

In [41]:
#Categorizamos la fecha de visita del usuario en dia mes hora y minuto (ya que el anio es 2018)
training_set['dia_y'] = training_set['fecha_y'].apply(lambda x: x.day)
training_set['mes_y'] = training_set['fecha_y'].apply(lambda x: x.month)
training_set['hora_y'] = training_set['fecha_y'].apply(lambda x: x.hour)
training_set['minuto_y'] = training_set['fecha_y'].apply(lambda x: x.minute)
training_set = training_set.drop('fecha_y',1)

In [42]:
training_set.head()

Unnamed: 0,person,sku,model,condition,storage,color,new_vs_returning,region,country,device_type,...,browser_version,label,dia_x,mes_x,hora_x,minuto_x,dia_y,mes_y,hora_y,minuto_y
0,15ea8012,12802.0,1,1,1,1,0,1,2,3,...,6,1,18,5,0,29,17,5,16,13
1,15ea8012,12802.0,1,1,1,1,1,1,2,3,...,6,1,18,5,0,29,18,5,0,18
2,15ea8012,12802.0,1,1,1,1,1,1,2,3,...,6,1,18,5,0,29,18,5,2,55
3,15ea8012,12788.0,1,1,1,2,0,1,2,3,...,6,1,18,5,0,31,17,5,16,13
4,15ea8012,12788.0,1,1,1,2,1,1,2,3,...,6,1,18,5,0,31,18,5,0,18


In [43]:
training_set.to_csv('data/set_entrenamiento.csv', encoding='utf-8', index=False)

In [44]:
training_set.tail()

Unnamed: 0,person,sku,model,condition,storage,color,new_vs_returning,region,country,device_type,...,browser_version,label,dia_x,mes_x,hora_x,minuto_x,dia_y,mes_y,hora_y,minuto_y
29196626,300b0e1e,1302.0,10,3,4,11,0,21,2,3,...,35,0,15,3,21,27,15,3,21,16
29196627,300b0e1e,1302.0,10,3,4,11,1,21,2,3,...,35,0,15,3,21,27,16,3,1,32
29196628,300b0e1e,10882.0,44,2,3,3,0,21,2,3,...,35,0,15,3,21,23,15,3,21,16
29196629,300b0e1e,10882.0,44,2,3,3,1,21,2,3,...,35,0,15,3,21,23,16,3,1,32
29196630,9ce4b2a0,2725.0,54,2,4,5,0,1,2,3,...,48,0,20,2,12,33,20,2,12,32
