In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from dateutil.parser import parse
import numpy as np

import warnings
import datetime as dt
warnings.filterwarnings('ignore')

In [2]:
events = pd.read_csv('data/events_up_to_01062018.csv')
events.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,,


In [3]:
events.count()

timestamp                   2341681
event                       2341681
person                      2341681
url                          191131
sku                         1320530
model                       1321513
condition                   1320530
storage                     1320530
color                       1320530
skus                         505949
search_term                  113763
staticpage                    11201
campaign_source              191286
search_engine                106406
channel                      204069
new_vs_returning             204069
city                         204069
region                       204069
country                      204069
device_type                  204069
screen_resolution            204066
operating_system_version     204069
browser_version              204069
dtype: int64

# Analisis TP1

In [4]:
events.columns

Index(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version'],
      dtype='object')

In [5]:
events.drop(['url','sku','skus','city','search_engine','campaign_source','channel'], inplace=True, axis=1)

In [6]:
#Cambiamos el tipo de dato de fecha de timestamp a datetime para un mejor analisis y renombramos a "fecha"
events[['timestamp']] = events[['timestamp']].apply(pd.to_datetime)
events.rename({
    'timestamp' : 'fecha',
}, axis=1, inplace=True)
events.head()

Unnamed: 0,fecha,event,person,model,condition,storage,color,search_term,staticpage,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,iPhone 6S,Muito Bom,64GB,Prateado,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,iPhone 7,Bom,128GB,Vermelho,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,LG G4 H818P,Excelente,32GB,Branco,,,,,,,,,


In [7]:
#Detalles de Productos que fueron comprados
productos = events.loc[:, ['event','fecha','person', 'model', 'condition', 'storage', 'color']]
productos_comprados = productos.loc[(productos.event == 'checkout') | (productos.event == 'conversion')]
productos_comprados.drop(['event'], inplace=True, axis=1)
productos_comprados.head()

Unnamed: 0,fecha,person,model,condition,storage,color
33,2018-05-18 00:29:24,15ea8012,Samsung Galaxy S8 Plus,Excelente,64GB,Prata
60,2018-05-18 00:22:58,43790d8f,Motorola Moto G4 Plus,Bom,32GB,Bambu
76,2018-05-18 00:38:51,d614c608,Samsung Galaxy J5,Bom,16GB,Branco
133,2018-05-18 00:57:32,55d1e0ee,Samsung Galaxy Core Plus Duos TV,Excelente,4GB,Preto
147,2018-05-18 01:12:43,bb78c182,Motorola Moto G3 4G,Bom,16GB,Preto


In [8]:
#Detalles de Productos que fueron visitados
productos_visitados = productos.loc[(productos.event == 'viewed product')]
productos_visitados.drop(['event'], inplace=True, axis=1)
productos_visitados.head()

Unnamed: 0,fecha,person,model,condition,storage,color
0,2018-05-18 00:11:59,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado
1,2018-05-18 00:11:27,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial
2,2018-05-18 00:11:16,0297fc1e,iPhone 6S,Muito Bom,64GB,Prateado
3,2018-05-18 00:11:14,2d681dd8,iPhone 7,Bom,128GB,Vermelho
4,2018-05-18 00:11:09,cccea85e,LG G4 H818P,Excelente,32GB,Branco


In [9]:
#Caracterisitcas del dispositivos con el que el usuario visito el sitio
visitas = events.loc[:,['event','fecha','person','new_vs_returning','region','country','device_type','screen_resolution','operating_system_version','browser_version']]
features_usuario = visitas.loc[visitas.event == 'visited site']
features_usuario.drop(['event'], inplace=True, axis=1)
features_usuario.head()

Unnamed: 0,fecha,person,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version
2136629,2018-05-10 22:34:50,4640420b,Returning,Parana,Brazil,Smartphone,320x570,Android 5.0.2,Chrome Mobile 66.0
2136630,2018-05-15 02:39:45,4640420b,Returning,Parana,Brazil,Smartphone,320x570,Android 5.0.2,Chrome Mobile 66.0
2136631,2018-05-18 01:15:26,4640420b,Returning,Unknown,Brazil,Computer,1280x1024,Ubuntu,Firefox 57
2136632,2018-05-18 19:03:37,4640420b,Returning,Parana,Brazil,Tablet,800x1280,Android 7,Chrome 66.0
2136633,2018-05-18 19:35:12,4640420b,Returning,Parana,Brazil,Tablet,800x1280,Android 7,Chrome 66.0


In [10]:
productos_comprados.count()

fecha        72406
person       72406
model        72406
condition    72406
storage      72406
color        72406
dtype: int64

In [11]:
productos_visitados.count()

fecha        1248124
person       1248124
model        1248124
condition    1248124
storage      1248124
color        1248124
dtype: int64

In [12]:
features_usuario.count()

fecha                       204069
person                      204069
new_vs_returning            204069
region                      204069
country                     204069
device_type                 204069
screen_resolution           204066
operating_system_version    204069
browser_version             204069
dtype: int64

In [13]:
del events
del productos
del visitas

# Set de Datos de Entrenamiento

In [14]:
user_prod_comprados = productos_comprados.merge(features_usuario, on="person", how="inner")
user_prod_comprados.head()

Unnamed: 0,fecha_x,person,model,condition,storage,color,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:29:24,15ea8012,Samsung Galaxy S8 Plus,Excelente,64GB,Prata,2018-05-17 16:13:21,New,Sao Paulo,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0
1,2018-05-18 00:29:24,15ea8012,Samsung Galaxy S8 Plus,Excelente,64GB,Prata,2018-05-18 00:18:40,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0
2,2018-05-18 00:29:24,15ea8012,Samsung Galaxy S8 Plus,Excelente,64GB,Prata,2018-05-18 02:55:44,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0
3,2018-05-18 00:31:24,15ea8012,Samsung Galaxy S8 Plus,Excelente,64GB,Ametista,2018-05-17 16:13:21,New,Sao Paulo,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0
4,2018-05-18 00:31:24,15ea8012,Samsung Galaxy S8 Plus,Excelente,64GB,Ametista,2018-05-18 00:18:40,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0


In [15]:
user_prod_vistas = productos_visitados.merge(features_usuario, on="person", how="inner")
user_prod_vistas.head()

Unnamed: 0,fecha_x,person,model,condition,storage,color,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,2018-05-18 00:07:22,New,Rio de Janeiro,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 64.0
1,2018-05-18 00:30:30,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,2018-05-18 00:07:22,New,Rio de Janeiro,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 64.0
2,2018-05-18 00:11:35,4886f805,Samsung Galaxy J7 Prime,Muito Bom,32GB,Dourado,2018-05-18 00:07:22,New,Rio de Janeiro,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 64.0
3,2018-05-18 00:11:53,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,2018-05-18 00:07:22,New,Rio de Janeiro,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 64.0
4,2018-05-18 00:11:27,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,2018-05-14 23:50:22,New,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0


In [16]:
del features_usuario
del productos_comprados
del productos_visitados

In [17]:
user_prod_comprados['person'].count()

1050846

In [18]:
user_prod_comprados = user_prod_comprados.drop_duplicates()
user_prod_comprados['person'].count()

1045740

In [19]:
user_prod_vistas['person'].count()

29196631

In [20]:
user_prod_vistas = user_prod_vistas.drop_duplicates()
user_prod_vistas['person'].count()

29122832

In [21]:
#concateno
# user_prod_comprados['label'] = 1
# user_prod_vistas['label'] = 0
frames = [user_prod_comprados, user_prod_vistas]
sets = pd.concat(frames)
sets.head()

Unnamed: 0,fecha_x,person,model,condition,storage,color,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:29:24,15ea8012,Samsung Galaxy S8 Plus,Excelente,64GB,Prata,2018-05-17 16:13:21,New,Sao Paulo,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0
1,2018-05-18 00:29:24,15ea8012,Samsung Galaxy S8 Plus,Excelente,64GB,Prata,2018-05-18 00:18:40,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0
2,2018-05-18 00:29:24,15ea8012,Samsung Galaxy S8 Plus,Excelente,64GB,Prata,2018-05-18 02:55:44,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0
3,2018-05-18 00:31:24,15ea8012,Samsung Galaxy S8 Plus,Excelente,64GB,Ametista,2018-05-17 16:13:21,New,Sao Paulo,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0
4,2018-05-18 00:31:24,15ea8012,Samsung Galaxy S8 Plus,Excelente,64GB,Ametista,2018-05-18 00:18:40,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0


In [22]:
del frames
del user_prod_comprados
del user_prod_vistas

In [23]:
sets.count()

fecha_x                     30168572
person                      30168572
model                       30168572
condition                   30168572
storage                     30168572
color                       30168572
fecha_y                     30168572
new_vs_returning            30168572
region                      30168572
country                     30168572
device_type                 30168572
screen_resolution           30168417
operating_system_version    30168572
browser_version             30168572
dtype: int64

In [24]:
sets = sets.fillna(value='')

## Categorizacion de los features a entrenar

### opcion 1: numeros para categorizar por feature (los numeros se repiten entre features)

In [28]:
#Categorizamos el estado del usuario 0 para new y 1 para returning
sets[['new_vs_returning']] = sets[['new_vs_returning']].applymap(lambda x: 0 if (x == 'New') else 1)

In [29]:
#Categorizamos los colores de 1 a n
training_set_list = sets[['color']].values.tolist()
i=1
diccionary={}

for lista in training_set_list:
    for element in lista:
        if type(element) != float:
            if element not in diccionary:
                diccionary[element]=i
                i+=1

In [30]:
sets[['color']] = sets[['color']].applymap(lambda x: diccionary[x])

In [31]:
#Categorizamos los modelos de 1 a n
training_set_list = sets[['model']].values.tolist()
i=1
diccionary={}

for lista in training_set_list:
    for element in lista:
        if type(element) != float:
            if element not in diccionary:
                diccionary[element]=i
                i+=1

In [32]:
sets[['model']] = sets[['model']].applymap(lambda x: diccionary[x])

In [33]:
#Categorizamos las condiciones del producto de 1 a n
training_set_list = sets[['condition']].values.tolist()
i=1
diccionary={}

for lista in training_set_list:
    for element in lista:
        if type(element) != float:
            if element not in diccionary:
                diccionary[element]=i
                i+=1

In [34]:
sets[['condition']] = sets[['condition']].applymap(lambda x: diccionary[x])

In [35]:
#Categorizamos la capacidad de almacenamiento del producto de 1 a n
training_set_list = sets[['storage']].values.tolist()
i=1
diccionary={}

for lista in training_set_list:
    for element in lista:
        if type(element) != float:
            if element not in diccionary:
                diccionary[element]=i
                i+=1

In [36]:
sets[['storage']] = sets[['storage']].applymap(lambda x: diccionary[x])

In [37]:
#Categorizamos las regiones, paises y tipo de caracteristicas del usuario en un conjunto de 1 a n
training_set_list = sets[['new_vs_returning','region','country','device_type','screen_resolution','operating_system_version','browser_version']].values.tolist()
i=1
diccionary={}

for lista in training_set_list:
    for element in lista:
        if type(element) != float:
            if element not in diccionary:
                diccionary[element]=i
                i+=1

In [38]:
sets[['region','country','device_type','screen_resolution','operating_system_version','browser_version']] = sets[['region','country','device_type','screen_resolution','operating_system_version','browser_version']].applymap(lambda x: diccionary[x])

### opcion 2: numeros para categorizar distintos por feature

In [25]:
#Categorizamos los features distintos de 61 (teniendo en cuenta el feature fecha-minuto) a n
training_set_list = sets[['model', 'condition', 'storage', 'color','new_vs_returning','region','country','device_type','screen_resolution','operating_system_version','browser_version']].values.tolist()
i=61
diccionary={}

for lista in training_set_list:
    for element in lista:
        if type(element) != float:
            if element not in diccionary:
                diccionary[element]=i
                i+=1

In [26]:
sets[['model', 'condition', 'storage', 'color','new_vs_returning','region','country','device_type','screen_resolution','operating_system_version','browser_version']] = sets[['model', 'condition', 'storage', 'color','new_vs_returning','region','country','device_type','screen_resolution','operating_system_version','browser_version']].applymap(lambda x: diccionary[x])

In [27]:
del training_set_list
del diccionary
sets.head()

Unnamed: 0,fecha_x,person,model,condition,storage,color,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:29:24,15ea8012,61,62,63,64,2018-05-17 16:13:21,65,66,67,68,69,70,71
1,2018-05-18 00:29:24,15ea8012,61,62,63,64,2018-05-18 00:18:40,72,66,67,68,69,70,71
2,2018-05-18 00:29:24,15ea8012,61,62,63,64,2018-05-18 02:55:44,72,66,67,68,69,70,71
3,2018-05-18 00:31:24,15ea8012,61,62,63,73,2018-05-17 16:13:21,65,66,67,68,69,70,71
4,2018-05-18 00:31:24,15ea8012,61,62,63,73,2018-05-18 00:18:40,72,66,67,68,69,70,71


In [28]:
#Categorizamos la fecha del visto del producto en dia hora y minuto (ya que el anio es 2018)
sets['dia_view_prod'] = sets['fecha_x'].apply(lambda x: x.day)
sets['mes_view_prod'] = sets['fecha_x'].apply(lambda x: x.month)
sets['hora_view_prod'] = sets['fecha_x'].apply(lambda x: x.hour)
sets['minuto_view_prod'] = sets['fecha_x'].apply(lambda x: x.minute)
sets = sets.drop('fecha_x',1)

In [29]:
#Categorizamos la fecha de visita del usuario en dia mes hora y minuto (ya que el anio es 2018)
sets['dia_visit_user'] = sets['fecha_y'].apply(lambda x: x.day)
sets['mes_visit_user'] = sets['fecha_y'].apply(lambda x: x.month)
sets['hora_visit_user'] = sets['fecha_y'].apply(lambda x: x.hour)
sets['minuto_visit_user'] = sets['fecha_y'].apply(lambda x: x.minute)
sets = sets.drop('fecha_y',1)

In [30]:
sets.head()

Unnamed: 0,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,dia_view_prod,mes_view_prod,hora_view_prod,minuto_view_prod,dia_visit_user,mes_visit_user,hora_visit_user,minuto_visit_user
0,15ea8012,61,62,63,64,65,66,67,68,69,70,71,18,5,0,29,17,5,16,13
1,15ea8012,61,62,63,64,72,66,67,68,69,70,71,18,5,0,29,18,5,0,18
2,15ea8012,61,62,63,64,72,66,67,68,69,70,71,18,5,0,29,18,5,2,55
3,15ea8012,61,62,63,73,65,66,67,68,69,70,71,18,5,0,31,17,5,16,13
4,15ea8012,61,62,63,73,72,66,67,68,69,70,71,18,5,0,31,18,5,0,18


In [31]:
sets.tail()

Unnamed: 0,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,dia_view_prod,mes_view_prod,hora_view_prod,minuto_view_prod,dia_visit_user,mes_visit_user,hora_visit_user,minuto_visit_user
29196626,300b0e1e,111,97,83,122,65,106,67,68,69,140,134,15,3,21,27,15,3,21,16
29196627,300b0e1e,111,97,83,122,72,106,67,68,69,140,134,15,3,21,27,16,3,1,32
29196628,300b0e1e,200,75,79,77,65,106,67,68,69,140,134,15,3,21,23,15,3,21,16
29196629,300b0e1e,200,75,79,77,72,106,67,68,69,140,134,15,3,21,23,16,3,1,32
29196630,9ce4b2a0,236,75,83,84,65,66,67,68,190,81,150,20,2,12,33,20,2,12,32


# Separamos los set de entrenamiento y prediccion

In [32]:
label_training = pd.read_csv('data/labels_training_set.csv')
label_training.head()

Unnamed: 0,person,label
0,0566e9c1,0
1,6ec7ee77,0
2,abe7a2fb,0
3,34728364,0
4,87ed62de,0


In [33]:
# Set prediccion son los que NO estan en labels_training y pero SI en events_up_to_01062018
prediction_set = sets.merge(label_training, how= 'left', on = 'person')
prediction_set = prediction_set[prediction_set.label.isnull()]
prediction_set.drop('label', inplace=True, axis=1)
prediction_set.count()

person                      14571225
model                       14571225
condition                   14571225
storage                     14571225
color                       14571225
new_vs_returning            14571225
region                      14571225
country                     14571225
device_type                 14571225
screen_resolution           14571225
operating_system_version    14571225
browser_version             14571225
dia_view_prod               14571225
mes_view_prod               14571225
hora_view_prod              14571225
minuto_view_prod            14571225
dia_visit_user              14571225
mes_visit_user              14571225
hora_visit_user             14571225
minuto_visit_user           14571225
dtype: int64

In [34]:
prediction_set.to_csv('data/set_prediccion_2.csv', encoding='utf-8', index=False)

In [35]:
# Set entrenamiento son los que estan en labels_training y events_up_to_01062018
training_set = sets.merge(label_training, how= 'inner', on = 'person')
training_set.count()

person                      15597347
model                       15597347
condition                   15597347
storage                     15597347
color                       15597347
new_vs_returning            15597347
region                      15597347
country                     15597347
device_type                 15597347
screen_resolution           15597347
operating_system_version    15597347
browser_version             15597347
dia_view_prod               15597347
mes_view_prod               15597347
hora_view_prod              15597347
minuto_view_prod            15597347
dia_visit_user              15597347
mes_visit_user              15597347
hora_visit_user             15597347
minuto_visit_user           15597347
label                       15597347
dtype: int64

In [36]:
training_set[training_set['label'] == 1].shape

(1630594, 21)

In [37]:
training_set[training_set['label'] == 0].shape

(13966753, 21)

In [38]:
training_set.to_csv('data/set_entrenamiento_2.csv', encoding='utf-8', index=False)