In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from dateutil.parser import parse
import numpy as np

import warnings
import datetime as dt
warnings.filterwarnings('ignore')

In [2]:
events = pd.read_csv('data/events_up_to_01062018.csv')
events.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,,


In [3]:
events.count()

timestamp                   2341681
event                       2341681
person                      2341681
url                          191131
sku                         1320530
model                       1321513
condition                   1320530
storage                     1320530
color                       1320530
skus                         505949
search_term                  113763
staticpage                    11201
campaign_source              191286
search_engine                106406
channel                      204069
new_vs_returning             204069
city                         204069
region                       204069
country                      204069
device_type                  204069
screen_resolution            204066
operating_system_version     204069
browser_version              204069
dtype: int64

In [4]:
label_training = pd.read_csv('data/labels_training_set.csv')
label_training.head()

Unnamed: 0,person,label
0,0566e9c1,0
1,6ec7ee77,0
2,abe7a2fb,0
3,34728364,0
4,87ed62de,0


In [5]:
label_training.describe()

Unnamed: 0,label
count,19414.0
mean,0.050479
std,0.218937
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [6]:
training_set = events.merge(label_training, how= 'inner', on = 'person')
training_set.count()

timestamp                   1171886
event                       1171886
person                      1171886
url                           94875
sku                          665336
model                        665767
condition                    665336
storage                      665336
color                        665336
skus                         249587
search_term                   55774
staticpage                     5660
campaign_source               94940
search_engine                 52829
channel                      102299
new_vs_returning             102299
city                         102299
region                       102299
country                      102299
device_type                  102299
screen_resolution            102297
operating_system_version     102299
browser_version              102299
label                       1171886
dtype: int64

In [7]:
prediction_set = events.merge(label_training, how= 'left', on = 'person')
prediction_set = prediction_set[prediction_set.label.isnull()]
prediction_set.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version,label
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,,
5,2018-05-18 00:44:27,searched products,4c8a8b93,,,,,,,"10240,9987,10322,10085,9944,9931,13404,10154,1...",...,,,,,,,,,,


In [8]:
prediction_set.count()

timestamp                   1169795
event                       1169795
person                      1169795
url                           96256
sku                          655194
model                        655746
condition                    655194
storage                      655194
color                        655194
skus                         256362
search_term                   57989
staticpage                     5541
campaign_source               96346
search_engine                 53577
channel                      101770
new_vs_returning             101770
city                         101770
region                       101770
country                      101770
device_type                  101770
screen_resolution            101769
operating_system_version     101770
browser_version              101770
label                             0
dtype: int64

In [9]:
prediction_set.to_csv('data/set_prediccion.csv', encoding='utf-8', index=False)

In [10]:
del prediction_set
del events
del label_training

# Analisis TP1

In [11]:
training_set.columns

Index(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version', 'label'],
      dtype='object')

In [12]:
training_set.drop(['url','sku','skus','city','search_engine','campaign_source','channel'], inplace=True, axis=1)

In [13]:
#Cambiamos el tipo de dato de fecha de timestamp a datetime para un mejor analisis y renombramos a "fecha"
training_set[['timestamp']] = training_set[['timestamp']].apply(pd.to_datetime)
training_set.rename({
    'timestamp' : 'fecha',
}, axis=1, inplace=True)
training_set.head()

Unnamed: 0,fecha,event,person,model,condition,storage,color,search_term,staticpage,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label
0,2018-05-18 00:11:27,viewed product,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,,,0
1,2018-05-18 00:23:33,viewed product,ad93850f,iPhone 5s,Muito Bom,64GB,Prateado,,,,,,,,,,0
2,2018-05-18 00:16:10,viewed product,ad93850f,iPhone 5s,Bom - Sem Touch ID,16GB,Cinza espacial,,,,,,,,,,0
3,2018-05-18 00:14:55,viewed product,ad93850f,iPhone 5s,Bom - Sem Touch ID,16GB,Dourado,,,,,,,,,,0
4,2018-05-18 00:11:26,ad campaign hit,ad93850f,,,,,,,,,,,,,,0


In [14]:
#Detalles de Productos que fueron comprados
productos = training_set.loc[:, ['event','fecha','person', 'model', 'condition', 'storage', 'color']]
productos_comprados = productos.loc[(productos.event == 'checkout') | (productos.event == 'conversion')]
productos_comprados.drop(['event'], inplace=True, axis=1)
productos_comprados.head()

Unnamed: 0,fecha,person,model,condition,storage,color
36,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial
72,2018-05-18 00:44:49,1b9f7cf6,iPhone 6,Bom,64GB,Dourado
125,2018-05-18 01:00:16,de8fe91b,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado
152,2018-05-18 00:48:20,45baf068,Samsung Galaxy S6 Flat,Bom,32GB,Dourado
211,2018-05-09 00:13:59,99abca5a,Motorola Moto G4 Plus,Excelente,32GB,Bambu


In [15]:
#Detalles de Productos que fueron visitados
productos_visitados = productos.loc[(productos.event == 'viewed product')]
productos_visitados.drop(['event'], inplace=True, axis=1)
productos_visitados.head()

Unnamed: 0,fecha,person,model,condition,storage,color
0,2018-05-18 00:11:27,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial
1,2018-05-18 00:23:33,ad93850f,iPhone 5s,Muito Bom,64GB,Prateado
2,2018-05-18 00:16:10,ad93850f,iPhone 5s,Bom - Sem Touch ID,16GB,Cinza espacial
3,2018-05-18 00:14:55,ad93850f,iPhone 5s,Bom - Sem Touch ID,16GB,Dourado
5,2018-05-16 02:48:16,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial


In [16]:
#Caracterisitcas del dispositivos con el que el usuario visito el sitio
visitas = training_set.loc[:,['event','fecha','person','new_vs_returning','region','country','device_type','screen_resolution','operating_system_version','browser_version','label']]
features_usuario = visitas.loc[visitas.event == 'visited site']
features_usuario.drop(['event'], inplace=True, axis=1)
features_usuario.head()

Unnamed: 0,fecha,person,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label
60,2018-05-14 23:50:22,ad93850f,New,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0
61,2018-05-16 02:48:13,ad93850f,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0
62,2018-05-18 00:11:26,ad93850f,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0
63,2018-05-18 22:11:46,ad93850f,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0
64,2018-05-22 22:41:31,ad93850f,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0


In [17]:
productos_comprados.count()

fecha        36103
person       36103
model        36103
condition    36103
storage      36103
color        36103
dtype: int64

In [18]:
productos_visitados.count()

fecha        629233
person       629233
model        629233
condition    629233
storage      629233
color        629233
dtype: int64

In [19]:
features_usuario.count()

fecha                       102299
person                      102299
new_vs_returning            102299
region                      102299
country                     102299
device_type                 102299
screen_resolution           102297
operating_system_version    102299
browser_version             102299
label                       102299
dtype: int64

In [20]:
del productos
del visitas

# Set de Datos de Entrenamiento

In [21]:
user_prod_comprados = productos_comprados.merge(features_usuario, on="person", how="inner")
user_prod_comprados.head()

Unnamed: 0,fecha_x,person,model,condition,storage,color,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label
0,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,2018-05-14 23:50:22,New,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0
1,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,2018-05-16 02:48:13,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0
2,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,2018-05-18 00:11:26,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0
3,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,2018-05-18 22:11:46,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0
4,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,2018-05-22 22:41:31,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0


In [22]:
user_prod_vistas = productos_visitados.merge(features_usuario, on="person", how="inner")
user_prod_vistas.head()

Unnamed: 0,fecha_x,person,model,condition,storage,color,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label
0,2018-05-18 00:11:27,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,2018-05-14 23:50:22,New,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0
1,2018-05-18 00:11:27,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,2018-05-16 02:48:13,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0
2,2018-05-18 00:11:27,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,2018-05-18 00:11:26,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0
3,2018-05-18 00:11:27,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,2018-05-18 22:11:46,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0
4,2018-05-18 00:11:27,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,2018-05-22 22:41:31,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0


In [23]:
user_prod_comprados['person'].count()

487040

In [24]:
user_prod_vistas['person'].count()

15144663

In [25]:
del features_usuario
del productos_comprados
del productos_visitados

In [26]:
#concateno
# user_prod_comprados['label'] = 1
# user_prod_vistas['label'] = 0
frames = [user_prod_comprados, user_prod_vistas]
training_set = pd.concat(frames)
training_set.head()

Unnamed: 0,fecha_x,person,model,condition,storage,color,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label
0,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,2018-05-14 23:50:22,New,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0
1,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,2018-05-16 02:48:13,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0
2,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,2018-05-18 00:11:26,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0
3,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,2018-05-18 22:11:46,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0
4,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,2018-05-22 22:41:31,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0


In [27]:
del frames
del user_prod_comprados
del user_prod_vistas


In [28]:
training_set.count()

fecha_x                     15631703
person                      15631703
model                       15631703
condition                   15631703
storage                     15631703
color                       15631703
fecha_y                     15631703
new_vs_returning            15631703
region                      15631703
country                     15631703
device_type                 15631703
screen_resolution           15631559
operating_system_version    15631703
browser_version             15631703
label                       15631703
dtype: int64

In [29]:
training_set = training_set.fillna(value='')

## Categorizacion de los features a entrenar

### opcion 1: numeros para categorizar por feature (los numeros se repiten entre features)

In [30]:
#Categorizamos el estado del usuario 0 para new y 1 para returning
training_set[['new_vs_returning']] = training_set[['new_vs_returning']].applymap(lambda x: 0 if (x == 'New') else 1)

In [31]:
#Categorizamos los colores de 1 a n
training_set_list = training_set[['color']].values.tolist()
i=1
diccionary={}

for lista in training_set_list:
    for element in lista:
        if type(element) != float:
            if element not in diccionary:
                diccionary[element]=i
                i+=1

In [32]:
training_set[['color']] = training_set[['color']].applymap(lambda x: diccionary[x])

In [33]:
#Categorizamos los modelos de 1 a n
training_set_list = training_set[['model']].values.tolist()
i=1
diccionary={}

for lista in training_set_list:
    for element in lista:
        if type(element) != float:
            if element not in diccionary:
                diccionary[element]=i
                i+=1

In [34]:
training_set[['model']] = training_set[['model']].applymap(lambda x: diccionary[x])

In [35]:
#Categorizamos las condiciones del producto de 1 a n
training_set_list = training_set[['condition']].values.tolist()
i=1
diccionary={}

for lista in training_set_list:
    for element in lista:
        if type(element) != float:
            if element not in diccionary:
                diccionary[element]=i
                i+=1

In [36]:
training_set[['condition']] = training_set[['condition']].applymap(lambda x: diccionary[x])

In [37]:
#Categorizamos la capacidad de almacenamiento del producto de 1 a n
training_set_list = training_set[['storage']].values.tolist()
i=1
diccionary={}

for lista in training_set_list:
    for element in lista:
        if type(element) != float:
            if element not in diccionary:
                diccionary[element]=i
                i+=1

In [38]:
training_set[['storage']] = training_set[['storage']].applymap(lambda x: diccionary[x])

In [39]:
#Categorizamos las regiones, paises y tipo de caracteristicas del usuario en un conjunto de 1 a n
training_set_list = training_set[['region','country','device_type','screen_resolution','operating_system_version','browser_version']].values.tolist()
i=1
diccionary={}

for lista in training_set_list:
    for element in lista:
        if type(element) != float:
            if element not in diccionary:
                diccionary[element]=i
                i+=1

In [40]:
training_set[['region','country','device_type','screen_resolution','operating_system_version','browser_version']] = training_set[['region','country','device_type','screen_resolution','operating_system_version','browser_version']].applymap(lambda x: diccionary[x])

In [41]:
del training_set_list
del diccionary
training_set.head()

Unnamed: 0,fecha_x,person,model,condition,storage,color,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label
0,2018-05-14 23:54:19,ad93850f,1,1,1,1,2018-05-14 23:50:22,0,1,2,3,4,5,6,0
1,2018-05-14 23:54:19,ad93850f,1,1,1,1,2018-05-16 02:48:13,1,1,2,3,4,5,6,0
2,2018-05-14 23:54:19,ad93850f,1,1,1,1,2018-05-18 00:11:26,1,1,2,3,4,5,6,0
3,2018-05-14 23:54:19,ad93850f,1,1,1,1,2018-05-18 22:11:46,1,1,2,3,4,5,6,0
4,2018-05-14 23:54:19,ad93850f,1,1,1,1,2018-05-22 22:41:31,1,1,2,3,4,5,6,0


In [42]:
#Categorizamos la fecha del visto del producto en dia hora y minuto (ya que el anio es 2018 y la pre)
training_set['dia_x'] = training_set['fecha_x'].apply(lambda x: x.day)
training_set['mes_x'] = training_set['fecha_x'].apply(lambda x: x.month)
training_set['hora_x'] = training_set['fecha_x'].apply(lambda x: x.hour)
training_set['minuto_x'] = training_set['fecha_x'].apply(lambda x: x.minute)
training_set = training_set.drop('fecha_x',1)

In [43]:
#Categorizamos la fecha de visita del usuario en dia mes hora y minuto (ya que el anio es 2018)
training_set['dia_y'] = training_set['fecha_y'].apply(lambda x: x.day)
training_set['mes_y'] = training_set['fecha_y'].apply(lambda x: x.month)
training_set['hora_y'] = training_set['fecha_y'].apply(lambda x: x.hour)
training_set['minuto_y'] = training_set['fecha_y'].apply(lambda x: x.minute)
training_set = training_set.drop('fecha_y',1)

In [44]:
training_set.head()

Unnamed: 0,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,...,browser_version,label,dia_x,mes_x,hora_x,minuto_x,dia_y,mes_y,hora_y,minuto_y
0,ad93850f,1,1,1,1,0,1,2,3,4,...,6,0,14,5,23,54,14,5,23,50
1,ad93850f,1,1,1,1,1,1,2,3,4,...,6,0,14,5,23,54,16,5,2,48
2,ad93850f,1,1,1,1,1,1,2,3,4,...,6,0,14,5,23,54,18,5,0,11
3,ad93850f,1,1,1,1,1,1,2,3,4,...,6,0,14,5,23,54,18,5,22,11
4,ad93850f,1,1,1,1,1,1,2,3,4,...,6,0,14,5,23,54,22,5,22,41


In [45]:
training_set.tail()

Unnamed: 0,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,...,browser_version,label,dia_x,mes_x,hora_x,minuto_x,dia_y,mes_y,hora_y,minuto_y
15144658,aeee08e4,20,2,4,42,0,16,2,9,17,...,35,0,15,2,3,51,15,2,3,51
15144659,aeee08e4,20,2,4,13,0,16,2,9,17,...,35,0,15,2,3,52,15,2,3,51
15144660,aeee08e4,20,2,6,18,0,16,2,9,17,...,35,0,15,2,3,51,15,2,3,51
15144661,aeee08e4,20,2,6,16,0,16,2,9,17,...,35,0,15,2,3,51,15,2,3,51
15144662,9ce4b2a0,27,2,4,13,0,1,2,3,33,...,22,0,20,2,12,33,20,2,12,32


In [46]:
training_set[training_set['label'] == 1].shape

(1639578, 21)

In [47]:
training_set[training_set['label'] == 0].shape

(13992125, 21)

In [48]:
training_set.to_csv('data/set_entrenamiento.csv', encoding='utf-8', index=False)