In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from dateutil.parser import parse
import numpy as np
import seaborn as sns
import warnings
import datetime as dt
# Suppress every warning raised from this point on.
# NOTE(review): a blanket 'ignore' also hides pandas warnings (e.g. SettingWithCopyWarning);
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the raw event log (path is relative to the notebook's working directory)
# and preview its first rows.
events = pd.read_csv(
    filepath_or_buffer='data/events_up_to_01062018.csv',
)
events.head(n=5)

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,,


In [3]:
# Load the per-person training labels (path is relative to the notebook's
# working directory) and preview the first rows.
training = pd.read_csv(
    filepath_or_buffer='data/labels_training_set.csv',
)
training.head(n=5)

Unnamed: 0,person,label
0,0566e9c1,0
1,6ec7ee77,0
2,abe7a2fb,0
3,34728364,0
4,87ed62de,0


In [4]:
# Summary statistics of the training labels (count, mean, std, quartiles).
training.describe()

Unnamed: 0,label
count,19414.0
mean,0.050479
std,0.218937
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


# Análisis TP1

In [5]:
# List the column labels available in the raw events frame.
events.columns

Index(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version'],
      dtype='object')

In [6]:
# Drop the columns that are not used in the rest of this notebook.
# Reassigning with `errors='ignore'` (instead of an in-place drop) keeps the cell
# idempotent: re-running it after the columns are gone no longer raises KeyError.
events = events.drop(
    columns=['url', 'skus', 'city', 'search_engine', 'campaign_source', 'channel'],
    errors='ignore',
)

In [7]:
# Convert the `timestamp` strings to datetime for easier analysis and rename the
# column to "fecha".
# The guard makes the cell safe to re-run: once the column has been renamed there
# is no `timestamp` left, and the unguarded version would fail with a KeyError.
if 'timestamp' in events.columns:
    events['timestamp'] = pd.to_datetime(events['timestamp'])
    events = events.rename(columns={'timestamp': 'fecha'})
events.head()

Unnamed: 0,fecha,event,person,sku,model,condition,storage,color,search_term,staticpage,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,11890.0,iPhone 7,Bom,128GB,Vermelho,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,7517.0,LG G4 H818P,Excelente,32GB,Branco,,,,,,,,,


In [8]:
# Details of the products that were bought.
# NOTE(review): 'checkout' rows are kept alongside 'conversion' rows here — confirm
# that a checkout should count as a purchase.
productos = events.loc[:, ['event', 'fecha', 'person', 'sku', 'model', 'condition', 'storage', 'color']]
# Filter and drop in one chain so a new frame is produced; dropping in place on the
# filtered slice triggers pandas' SettingWithCopyWarning.
productos_comprados = (
    productos
    .loc[productos['event'].isin(['checkout', 'conversion'])]
    .drop(columns='event')
)
productos_comprados.head()

Unnamed: 0,fecha,person,sku,model,condition,storage,color
33,2018-05-18 00:29:24,15ea8012,12802.0,Samsung Galaxy S8 Plus,Excelente,64GB,Prata
60,2018-05-18 00:22:58,43790d8f,10378.0,Motorola Moto G4 Plus,Bom,32GB,Bambu
76,2018-05-18 00:38:51,d614c608,6343.0,Samsung Galaxy J5,Bom,16GB,Branco
133,2018-05-18 00:57:32,55d1e0ee,6663.0,Samsung Galaxy Core Plus Duos TV,Excelente,4GB,Preto
147,2018-05-18 01:12:43,bb78c182,7631.0,Motorola Moto G3 4G,Bom,16GB,Preto


In [9]:
# Details of the products that were viewed.
# Filter and drop in one chain so a new frame is produced; dropping in place on the
# filtered slice triggers pandas' SettingWithCopyWarning.
productos_visitados = (
    productos
    .loc[productos['event'] == 'viewed product']
    .drop(columns='event')
)
productos_visitados.head()

Unnamed: 0,fecha,person,sku,model,condition,storage,color
0,2018-05-18 00:11:59,4886f805,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado
1,2018-05-18 00:11:27,ad93850f,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial
2,2018-05-18 00:11:16,0297fc1e,6888.0,iPhone 6S,Muito Bom,64GB,Prateado
3,2018-05-18 00:11:14,2d681dd8,11890.0,iPhone 7,Bom,128GB,Vermelho
4,2018-05-18 00:11:09,cccea85e,7517.0,LG G4 H818P,Excelente,32GB,Branco


In [10]:
# Characteristics of the device each user visited the site with
# (taken from the 'visited site' events).
visitas = events.loc[:, [
    'event', 'fecha', 'person', 'new_vs_returning', 'region', 'country',
    'device_type', 'screen_resolution', 'operating_system_version', 'browser_version',
]]
# Filter and drop in one chain so a new frame is produced; dropping in place on the
# filtered slice triggers pandas' SettingWithCopyWarning.
features_usuario = (
    visitas
    .loc[visitas['event'] == 'visited site']
    .drop(columns='event')
)
features_usuario.head()

Unnamed: 0,fecha,person,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version
2136629,2018-05-10 22:34:50,4640420b,Returning,Parana,Brazil,Smartphone,320x570,Android 5.0.2,Chrome Mobile 66.0
2136630,2018-05-15 02:39:45,4640420b,Returning,Parana,Brazil,Smartphone,320x570,Android 5.0.2,Chrome Mobile 66.0
2136631,2018-05-18 01:15:26,4640420b,Returning,Unknown,Brazil,Computer,1280x1024,Ubuntu,Firefox 57
2136632,2018-05-18 19:03:37,4640420b,Returning,Parana,Brazil,Tablet,800x1280,Android 7,Chrome 66.0
2136633,2018-05-18 19:35:12,4640420b,Returning,Parana,Brazil,Tablet,800x1280,Android 7,Chrome 66.0


In [11]:
# Non-null count per column of the purchased-products frame.
productos_comprados.count()

fecha        72406
person       72406
sku          72406
model        72406
condition    72406
storage      72406
color        72406
dtype: int64

In [12]:
# Non-null count per column of the viewed-products frame.
productos_visitados.count()

fecha        1248124
person       1248124
sku          1248124
model        1248124
condition    1248124
storage      1248124
color        1248124
dtype: int64

In [13]:
# Non-null count per column of the visit-features frame; a lower count in a column
# means that column has missing values.
features_usuario.count()

fecha                       204069
person                      204069
new_vs_returning            204069
region                      204069
country                     204069
device_type                 204069
screen_resolution           204066
operating_system_version    204069
browser_version             204069
dtype: int64

In [14]:
# Release the large intermediate frames; only the filtered subsets are used from here on.
del events, productos, visitas

# Set de Datos a Predecir

In [15]:
# Attach the visit features to every purchase row of the same person.
# NOTE(review): both frames can hold several rows per person, so this inner join
# emits one row per (purchase, visit) pair — confirm that fan-out is intended.
user_prod_comprados = pd.merge(
    productos_comprados,
    features_usuario,
    how="inner",
    on="person",
)
user_prod_comprados.head()

Unnamed: 0,fecha_x,person,sku,model,condition,storage,color,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:29:24,15ea8012,12802.0,Samsung Galaxy S8 Plus,Excelente,64GB,Prata,2018-05-17 16:13:21,New,Sao Paulo,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0
1,2018-05-18 00:29:24,15ea8012,12802.0,Samsung Galaxy S8 Plus,Excelente,64GB,Prata,2018-05-18 00:18:40,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0
2,2018-05-18 00:29:24,15ea8012,12802.0,Samsung Galaxy S8 Plus,Excelente,64GB,Prata,2018-05-18 02:55:44,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0
3,2018-05-18 00:31:24,15ea8012,12788.0,Samsung Galaxy S8 Plus,Excelente,64GB,Ametista,2018-05-17 16:13:21,New,Sao Paulo,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0
4,2018-05-18 00:31:24,15ea8012,12788.0,Samsung Galaxy S8 Plus,Excelente,64GB,Ametista,2018-05-18 00:18:40,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0


In [16]:
# Attach the visit features to every product-view row of the same person.
# NOTE(review): both frames can hold several rows per person, so this inner join
# emits one row per (view, visit) pair — confirm that fan-out is intended.
user_prod_vistas = pd.merge(
    productos_visitados,
    features_usuario,
    how="inner",
    on="person",
)
user_prod_vistas.head()

Unnamed: 0,fecha_x,person,sku,model,condition,storage,color,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,4886f805,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,2018-05-18 00:07:22,New,Rio de Janeiro,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 64.0
1,2018-05-18 00:30:30,4886f805,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,2018-05-18 00:07:22,New,Rio de Janeiro,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 64.0
2,2018-05-18 00:11:35,4886f805,9287.0,Samsung Galaxy J7 Prime,Muito Bom,32GB,Dourado,2018-05-18 00:07:22,New,Rio de Janeiro,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 64.0
3,2018-05-18 00:11:53,4886f805,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,2018-05-18 00:07:22,New,Rio de Janeiro,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 64.0
4,2018-05-18 00:11:27,ad93850f,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,2018-05-14 23:50:22,New,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0


In [17]:
# Number of rows (non-null `person` values) after joining purchases with visits.
user_prod_comprados['person'].count()

1050846

In [18]:
# Number of rows (non-null `person` values) after joining product views with visits.
user_prod_vistas['person'].count()

29196631

In [19]:
# Purchases get label 1 and product views get label 0; both are then stacked
# into a single frame.
user_prod_comprados['label'] = 1
user_prod_vistas['label'] = 0
frames = [user_prod_comprados, user_prod_vistas]
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each input
# keeps its own 0-based index and the combined frame carries duplicate labels,
# which makes label-based lookups on `user_prod` ambiguous.
user_prod = pd.concat(frames, ignore_index=True)
user_prod.head()

Unnamed: 0,fecha_x,person,sku,model,condition,storage,color,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label
0,2018-05-18 00:29:24,15ea8012,12802.0,Samsung Galaxy S8 Plus,Excelente,64GB,Prata,2018-05-17 16:13:21,New,Sao Paulo,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0,1
1,2018-05-18 00:29:24,15ea8012,12802.0,Samsung Galaxy S8 Plus,Excelente,64GB,Prata,2018-05-18 00:18:40,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0,1
2,2018-05-18 00:29:24,15ea8012,12802.0,Samsung Galaxy S8 Plus,Excelente,64GB,Prata,2018-05-18 02:55:44,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0,1
3,2018-05-18 00:31:24,15ea8012,12788.0,Samsung Galaxy S8 Plus,Excelente,64GB,Ametista,2018-05-17 16:13:21,New,Sao Paulo,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0,1
4,2018-05-18 00:31:24,15ea8012,12788.0,Samsung Galaxy S8 Plus,Excelente,64GB,Ametista,2018-05-18 00:18:40,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0,1


In [20]:
# Non-null count per column of the combined labelled frame.
user_prod.count()

fecha_x                     30247477
person                      30247477
sku                         30247477
model                       30247477
condition                   30247477
storage                     30247477
color                       30247477
fecha_y                     30247477
new_vs_returning            30247477
region                      30247477
country                     30247477
device_type                 30247477
screen_resolution           30247322
operating_system_version    30247477
browser_version             30247477
label                       30247477
dtype: int64

In [17]:
# Persist the labelled dataset as a UTF-8 CSV, leaving the row index out of the file.
user_prod.to_csv(
    path_or_buf='data/set_datos_entrenamiento.csv',
    index=False,
    encoding='utf-8',
)