In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from dateutil.parser import parse
import numpy as np

import warnings
import datetime as dt
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

In [2]:
events = pd.read_csv('data/events_up_to_01062018.csv', low_memory = False)
events.person.describe()

count      2341681
unique       38829
top       c76b8417
freq          4438
Name: person, dtype: object

In [3]:
prediction_set = pd.read_csv('data/trocafone_kaggle_test.csv', low_memory = False)
training_set = pd.read_csv('data/labels_training_set.csv', low_memory=False)

print('PREDICCION ' , prediction_set.person.describe())
print('ENTRENAMIENTO ', training_set.person.describe())
print('ENTRENAMIENTO 1s ', training_set[training_set['label'] == 1].shape)
print('ENTRENAMIENTO 0s ', training_set[training_set['label'] == 0].shape)

prediction_set = prediction_set.merge(events, on = 'person', how = 'inner')
prediction_set = prediction_set[(prediction_set['event'] == 'visited site') | (prediction_set['event'] == 'viewed product') | (prediction_set['event'] == 'conversion') | (prediction_set['event'] == 'checkout') | (prediction_set['event'] == 'brand listing') | (prediction_set['event'] == 'generic listing') | (prediction_set['event'] == 'lead') | (prediction_set['event'] == 'staticpage') ]                                                                       

training_set = events.merge(training_set, on = 'person', how = 'inner')

# info = training_set[['person']]
# info['frecuencia'] = 1
# data = pd.DataFrame({'frecuencia': info.groupby('person').aggregate(sum)['frecuencia']}).reset_index()

# training_set = training_set.merge(data, on = 'person', how = 'left')
# training_set= training_set[(training_set['label'] == 1) | ((training_set['frecuencia'] <= 200) & (training_set['label'] == 0))]


PREDICCION  count        19415
unique       19415
top       c6ab1bc3
freq             1
Name: person, dtype: object
ENTRENAMIENTO  count        19414
unique       19414
top       1836c63e
freq             1
Name: person, dtype: object
ENTRENAMIENTO 1s  (980, 2)
ENTRENAMIENTO 0s  (18434, 2)


In [5]:
# del info
# del data
del events

In [6]:
prediction_set.person.describe()

count       953099
unique       19415
top       5059f7fd
freq          2932
Name: person, dtype: object

In [7]:
training_set.person.describe()

count      1171886
unique       19414
top       ffee0f18
freq          3458
Name: person, dtype: object

# Set Datos Prediccion

## Analisis TP1

In [7]:
prediction_set.columns

Index(['person', 'timestamp', 'event', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version'],
      dtype='object')

In [8]:
prediction_set.drop(['url','skus', 'sku', 'city','search_engine','campaign_source','channel'], inplace=True, axis=1)

In [9]:
#Cambiamos el tipo de dato de fecha de timestamp a datetime para un mejor analisis y renombramos a "fecha"
prediction_set[['timestamp']] = prediction_set[['timestamp']].apply(pd.to_datetime)
prediction_set.rename({
    'timestamp' : 'fecha',
}, axis=1, inplace=True)
prediction_set.head()

Unnamed: 0,person,fecha,event,model,condition,storage,color,search_term,staticpage,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,4886f805,2018-05-18 00:11:59,viewed product,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,
1,4886f805,2018-05-18 00:30:30,viewed product,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,
3,4886f805,2018-05-18 00:11:56,checkout,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,
4,4886f805,2018-05-18 00:11:35,viewed product,Samsung Galaxy J7 Prime,Muito Bom,32GB,Dourado,,,,,,,,,
6,4886f805,2018-05-18 00:11:53,viewed product,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,


vamos a crear las columnas compras_prod, vistas_prod, otros para productos y visitas_sitio para persona, de la forma de One Hot Encoder

In [10]:
#Detalles de Productos que fueron comprados
productos = prediction_set.loc[:, ['event','fecha','person', 'model', 'condition', 'storage', 'color']]
productos_comprados = productos[(productos['event'] == 'conversion') | (productos['event'] == 'checkout')]
productos_comprados.drop(['event'], inplace=True, axis=1)
productos_comprados['compras_prod'] = 1
productos_comprados['vistas_prod'] = 0
productos_comprados['otros'] = 0
productos_comprados.head()

Unnamed: 0,fecha,person,model,condition,storage,color,compras_prod,vistas_prod,otros
3,2018-05-18 00:11:56,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,1,0,0
56,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1,0,0
89,2018-05-10 00:39:51,0297fc1e,iPhone 6S Plus,Bom,16GB,Cinza espacial,1,0,0
313,2018-02-25 18:38:00,0297fc1e,iPhone 6,Muito Bom,64GB,Dourado,1,0,0
339,2018-01-31 16:12:48,0297fc1e,iPhone 6,Bom,64GB,Dourado,1,0,0


In [11]:
productos_comprados.count()

fecha           36303
person          36303
model           36303
condition       36303
storage         36303
color           36303
compras_prod    36303
vistas_prod     36303
otros           36303
dtype: int64

In [12]:
# productos_comprados = productos_comprados.drop_duplicates()
# productos_comprados.count()

In [13]:
#Detalles de Productos que fueron visitados
productos_visitados = productos.loc[productos.event == 'viewed product']
productos_visitados.drop(['event'], inplace=True, axis=1)
productos_visitados['compras_prod'] = 0
productos_visitados['vistas_prod'] = 1
productos_visitados['otros'] = 0
productos_visitados.head()

Unnamed: 0,fecha,person,model,condition,storage,color,compras_prod,vistas_prod,otros
0,2018-05-18 00:11:59,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,0,1,0
1,2018-05-18 00:30:30,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,0,1,0
4,2018-05-18 00:11:35,4886f805,Samsung Galaxy J7 Prime,Muito Bom,32GB,Dourado,0,1,0
6,2018-05-18 00:11:53,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,0,1,0
9,2018-05-18 00:11:16,0297fc1e,iPhone 6S,Muito Bom,64GB,Prateado,0,1,0


In [14]:
productos_visitados.count()

fecha           618891
person          618891
model           618891
condition       618891
storage         618891
color           618891
compras_prod    618891
vistas_prod     618891
otros           618891
dtype: int64

In [15]:
# productos_visitados = productos_visitados.drop_duplicates()
# productos_visitados.count()

In [16]:
# Eliminamos de productos visitados las personas que estan en productos comprados
productos_visitados = productos_visitados.merge(productos_comprados, on='person', how='left')
productos_visitados = productos_visitados[productos_visitados['fecha_y'].isnull()]
productos_visitados = productos_visitados.loc[:,['fecha_x','person', 'model_x', 'condition_x', 'storage_x', 'color_x','compras_prod_x','vistas_prod_x', 'otros_x']]
productos_visitados.rename(columns={'fecha_x': 'fecha', 'model_x': 'model', 'condition_x': 'condition', 'storage_x': 'storage', 'color_x': 'color', 'compras_prod_x': 'compras_prod', 'vistas_prod_x': 'vistas_prod' ,'otros_x': 'otros'}, inplace=True)
productos_visitados.head()

Unnamed: 0,fecha,person,model,condition,storage,color,compras_prod,vistas_prod,otros
6574,2018-05-18 00:25:42,686c49c9,iPhone 7,Excelente,128GB,Ouro Rosa,0,1,0
6575,2018-05-18 00:25:27,686c49c9,iPhone 7,Excelente,128GB,Preto Matte,0,1,0
6576,2018-05-18 01:05:31,686c49c9,iPhone 7,Excelente,128GB,Ouro Rosa,0,1,0
6577,2018-05-15 20:11:48,686c49c9,iPhone 7,Excelente,128GB,Prateado,0,1,0
6578,2018-05-15 20:10:28,686c49c9,iPhone 6S,Excelente,16GB,Dourado,0,1,0


In [17]:
productos_visitados.count()

fecha           58106
person          58106
model           58106
condition       58106
storage         58106
color           58106
compras_prod    58106
vistas_prod     58106
otros           58106
dtype: int64

In [18]:
otros = prediction_set.loc[:, ['event','fecha','person', 'model', 'condition', 'storage', 'color']]
otros = otros[(otros['event'] == 'brand listing') | (otros['event'] == 'generic listing') | (otros['event'] == 'lead') | (otros['event'] == 'staticpage')]
otros.drop(['event'], inplace=True, axis=1)
otros['compras_prod'] = 0
otros['vistas_prod'] = 0
otros['otros'] = 1
otros.head()

Unnamed: 0,fecha,person,model,condition,storage,color,compras_prod,vistas_prod,otros
7,2018-05-18 00:07:22,4886f805,,,,,0,0,1
455,2018-05-17 19:59:37,0297fc1e,,,,,0,0,1
456,2018-05-21 23:08:54,0297fc1e,,,,,0,0,1
457,2018-03-20 16:39:59,0297fc1e,,,,,0,0,1
458,2018-04-29 18:41:44,0297fc1e,,,,,0,0,1


In [19]:
otros.count()

fecha           196135
person          196135
model              552
condition            0
storage              0
color                0
compras_prod    196135
vistas_prod     196135
otros           196135
dtype: int64

In [20]:
# otros = otros.drop_duplicates()
# otros.count()

In [21]:
# Eliminamos de otros las personas que estan en productos comprados
otros = otros.merge(productos_comprados, on='person', how='left')
otros = otros[otros['fecha_y'].isnull()]
otros = otros.loc[:,['fecha_x','person', 'model_x', 'condition_x', 'storage_x', 'color_x','compras_prod_x','vistas_prod_x', 'otros_x']]
otros.rename(columns={'fecha_x': 'fecha', 'model_x': 'model', 'condition_x': 'condition', 'storage_x': 'storage', 'color_x': 'color', 'compras_prod_x': 'compras_prod', 'vistas_prod_x': 'vistas_prod' ,'otros_x': 'otros'}, inplace=True)
otros.head()

Unnamed: 0,fecha,person,model,condition,storage,color,compras_prod,vistas_prod,otros
523,2018-05-07 12:45:15,686c49c9,,,,,0,0,1
524,2018-05-07 12:39:53,686c49c9,,,,,0,0,1
525,2018-05-07 12:41:38,686c49c9,,,,,0,0,1
526,2018-05-07 12:43:42,686c49c9,,,,,0,0,1
527,2018-05-07 12:43:43,686c49c9,,,,,0,0,1


In [22]:
otros.count()

fecha           18472
person          18472
model              58
condition           0
storage             0
color               0
compras_prod    18472
vistas_prod     18472
otros           18472
dtype: int64

In [23]:
# Eliminamos de otros las personas que estan en productos visitados
otros = otros.merge(productos_visitados, on='person', how='left')
otros = otros[otros['fecha_y'].isnull()]
otros = otros.loc[:,['fecha_x','person', 'model_x', 'condition_x', 'storage_x', 'color_x','compras_prod_x','vistas_prod_x', 'otros_x']]
otros.rename(columns={'fecha_x': 'fecha', 'model_x': 'model', 'condition_x': 'condition', 'storage_x': 'storage', 'color_x': 'color', 'compras_prod_x': 'compras_prod', 'vistas_prod_x': 'vistas_prod' ,'otros_x': 'otros'}, inplace=True)
otros.head()

Unnamed: 0,fecha,person,model,condition,storage,color,compras_prod,vistas_prod,otros
60413,2018-05-28 12:36:14,df80a783,,,,,0,0,1
60414,2018-05-28 22:28:24,df80a783,,,,,0,0,1
60415,2018-05-29 14:47:28,df80a783,,,,,0,0,1
63146,2018-05-19 16:30:03,5437de8d,,,,,0,0,1
69477,2018-02-07 13:52:08,626d2cd6,,,,,0,0,1


In [24]:
otros.count()

fecha           590
person          590
model             5
condition         0
storage           0
color             0
compras_prod    590
vistas_prod     590
otros           590
dtype: int64

In [25]:
#Caracterisitcas del dispositivos con el que el usuario visito el sitio
visitas = prediction_set.loc[:,['event','fecha','person','new_vs_returning','region','country','device_type','screen_resolution','operating_system_version','browser_version']]
features_usuario = visitas.loc[visitas.event == 'visited site']
features_usuario.drop(['event'], inplace=True, axis=1)
features_usuario['visitas_sitio'] = 1
features_usuario.head()

Unnamed: 0,fecha,person,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,visitas_sitio
8,2018-05-18 00:07:22,4886f805,New,Rio de Janeiro,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 64.0,1
480,2018-05-02 01:28:27,0297fc1e,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1
481,2018-05-02 14:28:20,0297fc1e,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1
482,2018-05-03 01:55:52,0297fc1e,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1
483,2018-05-10 00:38:17,0297fc1e,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1


In [26]:
features_usuario.count()

fecha                       101770
person                      101770
new_vs_returning            101770
region                      101770
country                     101770
device_type                 101770
screen_resolution           101769
operating_system_version    101770
browser_version             101770
visitas_sitio               101770
dtype: int64

In [27]:
features_usuario = features_usuario.drop_duplicates()
features_usuario.count()

fecha                       101770
person                      101770
new_vs_returning            101770
region                      101770
country                     101770
device_type                 101770
screen_resolution           101769
operating_system_version    101770
browser_version             101770
visitas_sitio               101770
dtype: int64

In [28]:
del productos
del visitas

In [29]:
user_prod_comprados = productos_comprados.merge(features_usuario, on="person", how="outer")
user_prod_comprados.head()

Unnamed: 0,fecha_x,person,model,condition,storage,color,compras_prod,vistas_prod,otros,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,visitas_sitio
0,2018-05-18 00:11:56,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,1.0,0.0,0.0,2018-05-18 00:07:22,New,Rio de Janeiro,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 64.0,1.0
1,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1.0,0.0,0.0,2018-05-02 01:28:27,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0
2,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1.0,0.0,0.0,2018-05-02 14:28:20,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0
3,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1.0,0.0,0.0,2018-05-03 01:55:52,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0
4,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1.0,0.0,0.0,2018-05-10 00:38:17,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0


In [30]:
user_prod_comprados.count()

fecha_x                     564204
person                      575085
model                       564204
condition                   564204
storage                     564204
color                       564204
compras_prod                564204
vistas_prod                 564204
otros                       564204
fecha_y                     574687
new_vs_returning            574687
region                      574687
country                     574687
device_type                 574687
screen_resolution           574686
operating_system_version    574687
browser_version             574687
visitas_sitio               574687
dtype: int64

In [31]:
user_prod_vistas = productos_visitados.merge(features_usuario, on="person", how="outer")
user_prod_vistas.head()

Unnamed: 0,fecha_x,person,model,condition,storage,color,compras_prod,vistas_prod,otros,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,visitas_sitio
0,2018-05-18 00:25:42,686c49c9,iPhone 7,Excelente,128GB,Ouro Rosa,0.0,1.0,0.0,2018-05-07 12:39:52,New,Sao Paulo,Brazil,Smartphone,320x568,iOS 10.3.3,Mobile Safari 10,1.0
1,2018-05-18 00:25:42,686c49c9,iPhone 7,Excelente,128GB,Ouro Rosa,0.0,1.0,0.0,2018-05-07 12:43:42,Returning,Sao Paulo,Brazil,Smartphone,320x568,iOS 10.3.3,Mobile Safari 10,1.0
2,2018-05-18 00:25:42,686c49c9,iPhone 7,Excelente,128GB,Ouro Rosa,0.0,1.0,0.0,2018-05-15 20:07:26,Returning,Sao Paulo,Brazil,Smartphone,320x568,iOS 10.3.3,Mobile Safari 10,1.0
3,2018-05-18 00:25:42,686c49c9,iPhone 7,Excelente,128GB,Ouro Rosa,0.0,1.0,0.0,2018-05-18 00:21:05,Returning,Sao Paulo,Brazil,Smartphone,320x568,iOS 10.3.3,Mobile Safari 10,1.0
4,2018-05-18 00:25:42,686c49c9,iPhone 7,Excelente,128GB,Ouro Rosa,0.0,1.0,0.0,2018-05-18 01:05:32,Returning,Sao Paulo,Brazil,Smartphone,320x568,iOS 10.3.3,Mobile Safari 10,1.0


In [32]:
user_prod_vistas.count()

fecha_x                     627953
person                      719287
model                       627953
condition                   627953
storage                     627953
color                       627953
compras_prod                627953
vistas_prod                 627953
otros                       627953
fecha_y                     719260
new_vs_returning            719260
region                      719260
country                     719260
device_type                 719260
screen_resolution           719259
operating_system_version    719260
browser_version             719260
visitas_sitio               719260
dtype: int64

In [33]:
otros = otros.merge(features_usuario, on="person", how="outer")
otros.head()

Unnamed: 0,fecha_x,person,model,condition,storage,color,compras_prod,vistas_prod,otros,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,visitas_sitio
0,2018-05-28 12:36:14,df80a783,,,,,0.0,0.0,1.0,2018-05-28 12:36:14,New,Maranhao,Brazil,Smartphone,360x640,Android 5.1.1,Samsung Internet 6.4,1.0
1,2018-05-28 12:36:14,df80a783,,,,,0.0,0.0,1.0,2018-05-28 22:28:25,Returning,Minas Gerais,Brazil,Smartphone,360x640,Android 5.1.1,Samsung Internet 6.4,1.0
2,2018-05-28 12:36:14,df80a783,,,,,0.0,0.0,1.0,2018-05-29 14:47:28,Returning,Piaui,Brazil,Smartphone,360x640,Android 5.1.1,Samsung Internet 6.4,1.0
3,2018-05-28 12:36:14,df80a783,,,,,0.0,0.0,1.0,2018-05-29 17:00:15,Returning,Piaui,Brazil,Smartphone,360x640,Android 5.1.1,Samsung Internet 6.4,1.0
4,2018-05-28 22:28:24,df80a783,,,,,0.0,0.0,1.0,2018-05-28 12:36:14,New,Maranhao,Brazil,Smartphone,360x640,Android 5.1.1,Samsung Internet 6.4,1.0


In [34]:
otros.count()

fecha_x                       3877
person                      105273
model                            5
condition                        0
storage                          0
color                            0
compras_prod                  3877
vistas_prod                   3877
otros                         3877
fecha_y                     105260
new_vs_returning            105260
region                      105260
country                     105260
device_type                 105260
screen_resolution           105259
operating_system_version    105260
browser_version             105260
visitas_sitio               105260
dtype: int64

In [35]:
del features_usuario
del productos_comprados
del productos_visitados

In [36]:
#concateno
frames = [user_prod_comprados, user_prod_vistas, otros]
sets = pd.concat(frames)
sets.head()

Unnamed: 0,fecha_x,person,model,condition,storage,color,compras_prod,vistas_prod,otros,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,visitas_sitio
0,2018-05-18 00:11:56,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,1.0,0.0,0.0,2018-05-18 00:07:22,New,Rio de Janeiro,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 64.0,1.0
1,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1.0,0.0,0.0,2018-05-02 01:28:27,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0
2,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1.0,0.0,0.0,2018-05-02 14:28:20,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0
3,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1.0,0.0,0.0,2018-05-03 01:55:52,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0
4,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1.0,0.0,0.0,2018-05-10 00:38:17,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0


In [37]:
del frames
del user_prod_comprados
del user_prod_vistas
del otros

In [38]:
sets = sets.drop_duplicates()

In [39]:
sets.count()

fecha_x                     1192107
person                      1293877
model                       1188257
condition                   1188252
storage                     1188252
color                       1188252
compras_prod                1192107
vistas_prod                 1192107
otros                       1192107
fecha_y                     1293439
new_vs_returning            1293439
region                      1293439
country                     1293439
device_type                 1293439
screen_resolution           1293437
operating_system_version    1293439
browser_version             1293439
visitas_sitio               1293439
dtype: int64

In [40]:
# Rellenamos compras_prod  vistas_prod  otros y visitas_sitio con 0
sets['compras_prod'] = sets['compras_prod'].fillna(value=0) 
sets['vistas_prod'] = sets['vistas_prod'].fillna(value=0) 
sets['otros'] = sets['otros'].fillna(value=0) 
sets['visitas_sitio'] = sets['visitas_sitio'].fillna(value=0) 

In [41]:
sets.count()

fecha_x                     1192107
person                      1293877
model                       1188257
condition                   1188252
storage                     1188252
color                       1188252
compras_prod                1293877
vistas_prod                 1293877
otros                       1293877
fecha_y                     1293439
new_vs_returning            1293439
region                      1293439
country                     1293439
device_type                 1293439
screen_resolution           1293437
operating_system_version    1293439
browser_version             1293439
visitas_sitio               1293877
dtype: int64

In [42]:
# Rellenamos los demas con ''
sets = sets.fillna(value='') 

In [43]:
sets.count()

fecha_x                     1293877
person                      1293877
model                       1293877
condition                   1293877
storage                     1293877
color                       1293877
compras_prod                1293877
vistas_prod                 1293877
otros                       1293877
fecha_y                     1293877
new_vs_returning            1293877
region                      1293877
country                     1293877
device_type                 1293877
screen_resolution           1293877
operating_system_version    1293877
browser_version             1293877
visitas_sitio               1293877
dtype: int64

In [44]:
sets.person.describe()

count      1293877
unique       19415
top       5059f7fd
freq         96465
Name: person, dtype: object

## Categorizacion de los features

In [45]:
# creo una nueva columna person_int para entrenar transformando el valor alfanumerico de person a numerico
sets['person_int'] = labelencoder.fit_transform(sets['person'])
sets.head()

Unnamed: 0,fecha_x,person,model,condition,storage,color,compras_prod,vistas_prod,otros,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,visitas_sitio,person_int
0,2018-05-18 00:11:56,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,1.0,0.0,0.0,2018-05-18 00:07:22,New,Rio de Janeiro,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 64.0,1.0,5476
1,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1.0,0.0,0.0,2018-05-02 01:28:27,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0,189
2,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1.0,0.0,0.0,2018-05-02 14:28:20,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0,189
3,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1.0,0.0,0.0,2018-05-03 01:55:52,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0,189
4,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1.0,0.0,0.0,2018-05-10 00:38:17,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0,189


In [46]:
# convertimos a integer el valor flotante de compras_prod
sets['compras_prod'] = sets['compras_prod'].astype('int')
sets.head()

Unnamed: 0,fecha_x,person,model,condition,storage,color,compras_prod,vistas_prod,otros,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,visitas_sitio,person_int
0,2018-05-18 00:11:56,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,1,0.0,0.0,2018-05-18 00:07:22,New,Rio de Janeiro,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 64.0,1.0,5476
1,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1,0.0,0.0,2018-05-02 01:28:27,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0,189
2,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1,0.0,0.0,2018-05-02 14:28:20,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0,189
3,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1,0.0,0.0,2018-05-03 01:55:52,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0,189
4,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1,0.0,0.0,2018-05-10 00:38:17,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0,189


In [47]:
# convertimos a integer el valor flotante de vistas_prod
sets['vistas_prod'] = sets['vistas_prod'].astype('int')
sets.head()

Unnamed: 0,fecha_x,person,model,condition,storage,color,compras_prod,vistas_prod,otros,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,visitas_sitio,person_int
0,2018-05-18 00:11:56,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,1,0,0.0,2018-05-18 00:07:22,New,Rio de Janeiro,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 64.0,1.0,5476
1,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1,0,0.0,2018-05-02 01:28:27,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0,189
2,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1,0,0.0,2018-05-02 14:28:20,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0,189
3,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1,0,0.0,2018-05-03 01:55:52,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0,189
4,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1,0,0.0,2018-05-10 00:38:17,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0,189


In [48]:
# convertimos a integer el valor flotante de otros
sets['otros'] = sets['otros'].astype('int')
sets.head()

Unnamed: 0,fecha_x,person,model,condition,storage,color,compras_prod,vistas_prod,otros,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,visitas_sitio,person_int
0,2018-05-18 00:11:56,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,1,0,0,2018-05-18 00:07:22,New,Rio de Janeiro,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 64.0,1.0,5476
1,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1,0,0,2018-05-02 01:28:27,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0,189
2,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1,0,0,2018-05-02 14:28:20,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0,189
3,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1,0,0,2018-05-03 01:55:52,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0,189
4,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1,0,0,2018-05-10 00:38:17,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1.0,189


In [49]:
# convertimos a integer el valor flotante de visitas_sitio
sets['visitas_sitio'] = sets['visitas_sitio'].astype('int')
sets.head()

Unnamed: 0,fecha_x,person,model,condition,storage,color,compras_prod,vistas_prod,otros,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,visitas_sitio,person_int
0,2018-05-18 00:11:56,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,1,0,0,2018-05-18 00:07:22,New,Rio de Janeiro,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 64.0,1,5476
1,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1,0,0,2018-05-02 01:28:27,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1,189
2,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1,0,0,2018-05-02 14:28:20,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1,189
3,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1,0,0,2018-05-03 01:55:52,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1,189
4,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,1,0,0,2018-05-10 00:38:17,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,1,189


In [50]:
#Categorizamos los valores de los features con label encoder
sets['model'] = labelencoder.fit_transform(sets['model'])
sets['condition'] = labelencoder.fit_transform(sets['condition'])
sets['storage'] = labelencoder.fit_transform(sets['storage'])
sets['color'] = labelencoder.fit_transform(sets['color'])
sets['new_vs_returning'] = labelencoder.fit_transform(sets['new_vs_returning'])
sets['region'] = labelencoder.fit_transform(sets['region'])
sets['country'] = labelencoder.fit_transform(sets['country'])
sets['device_type'] = labelencoder.fit_transform(sets['device_type'])
sets['screen_resolution'] = labelencoder.fit_transform(sets['screen_resolution'])
sets['operating_system_version'] = labelencoder.fit_transform(sets['operating_system_version'])
sets['browser_version'] = labelencoder.fit_transform(sets['browser_version'])
sets.head()

Unnamed: 0,fecha_x,person,model,condition,storage,color,compras_prod,vistas_prod,otros,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,visitas_sitio,person_int
0,2018-05-18 00:11:56,4886f805,97,3,4,28,1,0,0,2018-05-18 00:07:22,1,78,7,2,170,24,110,1,5476
1,2018-05-22 20:29:35,0297fc1e,180,1,4,34,1,0,0,2018-05-02 01:28:27,2,78,7,2,170,22,111,1,189
2,2018-05-22 20:29:35,0297fc1e,180,1,4,34,1,0,0,2018-05-02 14:28:20,2,78,7,2,170,22,111,1,189
3,2018-05-22 20:29:35,0297fc1e,180,1,4,34,1,0,0,2018-05-03 01:55:52,2,78,7,2,170,22,111,1,189
4,2018-05-22 20:29:35,0297fc1e,180,1,4,34,1,0,0,2018-05-10 00:38:17,2,78,7,2,170,22,111,1,189


In [51]:
#Categorizamos la fecha del visto del producto en dia hora y minuto (ya que el anio es 2018)
sets['dia_view_prod'] = sets['fecha_x'].apply(lambda x: x.day if type(x) != str else 0).astype('int')
sets['mes_view_prod'] = sets['fecha_x'].apply(lambda x: x.month if type(x) != str else 0).astype('int')
sets['dia_sem_view_prod'] = sets['fecha_x'].apply(lambda x: x.dayofweek if type(x) != str else 0).astype('int')
sets = sets.drop('fecha_x',1)

In [52]:
#Categorizamos la fecha de visita del usuario en dia mes hora y minuto (ya que el anio es 2018)
sets['dia_visit_user'] = sets['fecha_y'].apply(lambda x: x.day if type(x) != str else 0).astype('int')
sets['mes_visit_user'] = sets['fecha_y'].apply(lambda x: x.month if type(x) != str else 0).astype('int')
sets['dia_sem_visit_user'] = sets['fecha_y'].apply(lambda x: x.dayofweek if type(x) != str else 0).astype('int')
sets = sets.drop('fecha_y',1)

In [53]:
sets.head()

Unnamed: 0,person,model,condition,storage,color,compras_prod,vistas_prod,otros,new_vs_returning,region,...,operating_system_version,browser_version,visitas_sitio,person_int,dia_view_prod,mes_view_prod,dia_sem_view_prod,dia_visit_user,mes_visit_user,dia_sem_visit_user
0,4886f805,97,3,4,28,1,0,0,1,78,...,24,110,1,5476,18,5,4,18,5,4
1,0297fc1e,180,1,4,34,1,0,0,2,78,...,22,111,1,189,22,5,1,2,5,2
2,0297fc1e,180,1,4,34,1,0,0,2,78,...,22,111,1,189,22,5,1,2,5,2
3,0297fc1e,180,1,4,34,1,0,0,2,78,...,22,111,1,189,22,5,1,3,5,3
4,0297fc1e,180,1,4,34,1,0,0,2,78,...,22,111,1,189,22,5,1,10,5,3


In [54]:
sets.tail()

Unnamed: 0,person,model,condition,storage,color,compras_prod,vistas_prod,otros,new_vs_returning,region,...,operating_system_version,browser_version,visitas_sitio,person_int,dia_view_prod,mes_view_prod,dia_sem_view_prod,dia_visit_user,mes_visit_user,dia_sem_visit_user
3872,85e0f62a,0,0,0,0,0,0,1,1,79,...,20,112,1,10087,31,5,3,31,5,3
3873,85e0f62a,0,0,0,0,0,0,1,2,79,...,20,112,1,10087,31,5,3,31,5,3
3874,92f2d94b,67,0,0,0,0,0,1,0,0,...,0,0,0,11118,23,3,4,0,0,0
3875,40bf23ab,180,0,0,0,0,0,1,0,0,...,0,0,0,4896,30,5,2,0,0,0
3876,80aea0a0,151,0,0,0,0,0,1,0,0,...,0,0,0,9700,30,5,2,0,0,0


In [55]:
sets.person.describe()

count      1293877
unique       19415
top       5059f7fd
freq         96465
Name: person, dtype: object

In [56]:
sets.to_csv('data/set_prediccion_14.csv', encoding='utf-8', index=False)

# Set Datos Entrenamiento

# #Analisis TP1

In [8]:
training_set.columns

Index(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version', 'label'],
      dtype='object')

In [9]:
training_set.drop(['url','skus', 'sku', 'city','search_engine','campaign_source','channel'], inplace=True, axis=1)

In [10]:
#Cambiamos el tipo de dato de fecha de timestamp a datetime para un mejor analisis y renombramos a "fecha"
training_set[['timestamp']] = training_set[['timestamp']].apply(pd.to_datetime)
training_set.rename({
    'timestamp' : 'fecha',
}, axis=1, inplace=True)
training_set.head()

Unnamed: 0,fecha,event,person,model,condition,storage,color,search_term,staticpage,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label
0,2018-05-18 00:11:27,viewed product,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,,,0
1,2018-05-18 00:23:33,viewed product,ad93850f,iPhone 5s,Muito Bom,64GB,Prateado,,,,,,,,,,0
2,2018-05-18 00:16:10,viewed product,ad93850f,iPhone 5s,Bom - Sem Touch ID,16GB,Cinza espacial,,,,,,,,,,0
3,2018-05-18 00:14:55,viewed product,ad93850f,iPhone 5s,Bom - Sem Touch ID,16GB,Dourado,,,,,,,,,,0
4,2018-05-18 00:11:26,ad campaign hit,ad93850f,,,,,,,,,,,,,,0


vamos a crear las columnas compras_prod, vistas_prod, otros para productos y visitas_sitio para persona, de la forma de One Hot Encoder

In [11]:
#Detalles de Productos que fueron comprados
productos = training_set.loc[:, ['event','fecha','person', 'model', 'condition', 'storage', 'color', 'label']]
productos_comprados = productos[(productos['event'] == 'conversion') | (productos['event'] == 'checkout')]
productos_comprados.drop(['event'], inplace=True, axis=1)
productos_comprados['compras_prod'] = 1
productos_comprados['vistas_prod'] = 0
productos_comprados['otros'] = 0
productos_comprados.head()

Unnamed: 0,fecha,person,model,condition,storage,color,label,compras_prod,vistas_prod,otros
36,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,0,1,0,0
72,2018-05-18 00:44:49,1b9f7cf6,iPhone 6,Bom,64GB,Dourado,0,1,0,0
125,2018-05-18 01:00:16,de8fe91b,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,0,1,0,0
152,2018-05-18 00:48:20,45baf068,Samsung Galaxy S6 Flat,Bom,32GB,Dourado,0,1,0,0
211,2018-05-09 00:13:59,99abca5a,Motorola Moto G4 Plus,Excelente,32GB,Bambu,0,1,0,0


In [12]:
productos_comprados.count()

fecha           36103
person          36103
model           36103
condition       36103
storage         36103
color           36103
label           36103
compras_prod    36103
vistas_prod     36103
otros           36103
dtype: int64

In [13]:
# productos_comprados = productos_comprados.drop_duplicates()
# productos_comprados.count()

In [14]:
#Detalles de Productos que fueron visitados
productos_visitados = productos.loc[productos.event == 'viewed product']
productos_visitados.drop(['event'], inplace=True, axis=1)
productos_visitados['compras_prod'] = 0
productos_visitados['vistas_prod'] = 1
productos_visitados['otros'] = 0
productos_visitados.head()

Unnamed: 0,fecha,person,model,condition,storage,color,label,compras_prod,vistas_prod,otros
0,2018-05-18 00:11:27,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,0,0,1,0
1,2018-05-18 00:23:33,ad93850f,iPhone 5s,Muito Bom,64GB,Prateado,0,0,1,0
2,2018-05-18 00:16:10,ad93850f,iPhone 5s,Bom - Sem Touch ID,16GB,Cinza espacial,0,0,1,0
3,2018-05-18 00:14:55,ad93850f,iPhone 5s,Bom - Sem Touch ID,16GB,Dourado,0,0,1,0
5,2018-05-16 02:48:16,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,0,0,1,0


In [15]:
productos_visitados.count()

fecha           629233
person          629233
model           629233
condition       629233
storage         629233
color           629233
label           629233
compras_prod    629233
vistas_prod     629233
otros           629233
dtype: int64

In [16]:
# productos_visitados = productos_comprados.drop_duplicates()
# productos_visitados.count()

In [17]:
# Eliminamos de productos visitados las personas que estan en productos comprados
productos_visitados = productos_visitados.merge(productos_comprados, on='person', how='left')
productos_visitados = productos_visitados[productos_visitados['fecha_y'].isnull()]
productos_visitados = productos_visitados.loc[:,['fecha_x','person', 'model_x', 'condition_x', 'storage_x', 'color_x','compras_prod_x','vistas_prod_x', 'otros_x', 'label_x']]
productos_visitados.rename(columns={'fecha_x': 'fecha', 'model_x': 'model', 'condition_x': 'condition', 'storage_x': 'storage', 'color_x': 'color', 'compras_prod_x': 'compras_prod', 'vistas_prod_x': 'vistas_prod' ,'otros_x': 'otros', 'label_x': 'label'}, inplace=True)
productos_visitados.head()

Unnamed: 0,fecha,person,model,condition,storage,color,compras_prod,vistas_prod,otros,label
25576,2018-05-18 00:18:24,9bb3af27,Samsung Galaxy S6 Flat,Bom,32GB,Dourado,0,1,0,1
25577,2018-05-18 00:15:33,9bb3af27,Samsung Galaxy S5,Bom,16GB,Preto,0,1,0,1
25578,2018-05-18 00:20:10,9bb3af27,Samsung Galaxy S5,Bom,16GB,Branco,0,1,0,1
25579,2018-05-18 00:21:58,9bb3af27,Samsung Galaxy S6 Flat,Bom,32GB,Dourado,0,1,0,1
25580,2018-05-18 00:14:35,9bb3af27,Samsung Galaxy S5,Bom,16GB,Dourado,0,1,0,1


In [18]:
productos_visitados.count()

fecha           52146
person          52146
model           52146
condition       52146
storage         52146
color           52146
compras_prod    52146
vistas_prod     52146
otros           52146
label           52146
dtype: int64

In [19]:
otros = training_set.loc[:, ['event','fecha','person', 'model', 'condition', 'storage', 'color','label']]
otros = otros[(otros['event'] == 'generic listing') | (otros['event'] == 'lead')]
otros.drop(['event'], inplace=True, axis=1)
otros['compras_prod'] = 0
otros['vistas_prod'] = 0
otros['otros'] = 1
otros.head()

Unnamed: 0,fecha,person,model,condition,storage,color,label,compras_prod,vistas_prod,otros
38,2018-05-14 23:50:23,ad93850f,,,,,0,0,0,1
46,2018-05-18 00:12:55,ad93850f,,,,,0,0,0,1
53,2018-05-18 00:23:21,ad93850f,,,,,0,0,0,1
54,2018-05-18 22:12:18,ad93850f,,,,,0,0,0,1
57,2018-05-18 22:11:47,ad93850f,,,,,0,0,0,1


In [20]:
otros.count()

fecha           80884
person          80884
model             431
condition           0
storage             0
color               0
label           80884
compras_prod    80884
vistas_prod     80884
otros           80884
dtype: int64

In [21]:
# otros = otros.drop_duplicates()
# otros.count()

In [22]:
# Eliminamos de otros las personas que estan en productos comprados
otros = otros.merge(productos_comprados, on='person', how='left')
otros = otros[otros['fecha_y'].isnull()]
otros = otros.loc[:,['fecha_x','person', 'model_x', 'condition_x', 'storage_x', 'color_x','compras_prod_x','vistas_prod_x', 'otros_x', 'label_x']]
otros.rename(columns={'fecha_x': 'fecha', 'model_x': 'model', 'condition_x': 'condition', 'storage_x': 'storage', 'color_x': 'color', 'compras_prod_x': 'compras_prod', 'vistas_prod_x': 'vistas_prod' ,'otros_x': 'otros', 'label_x': 'label'}, inplace=True)
otros.head()

Unnamed: 0,fecha,person,model,condition,storage,color,compras_prod,vistas_prod,otros,label
1923,2018-05-18 00:11:31,9bb3af27,,,,,0,0,1,1
1924,2018-05-18 00:20:28,9bb3af27,,,,,0,0,1,1
1925,2018-05-18 00:22:23,9bb3af27,,,,,0,0,1,1
1926,2018-05-18 19:28:08,9bb3af27,,,,,0,0,1,1
1927,2018-05-18 19:26:43,9bb3af27,,,,,0,0,1,1


In [23]:
otros.count()

fecha           7488
person          7488
model             32
condition          0
storage            0
color              0
compras_prod    7488
vistas_prod     7488
otros           7488
label           7488
dtype: int64

In [24]:
# Eliminamos de otros las personas que estan en productos visitados
otros = otros.merge(productos_visitados, on='person', how='left')
otros = otros[otros['fecha_y'].isnull()]
otros = otros.loc[:,['fecha_x','person', 'model_x', 'condition_x', 'storage_x', 'color_x','compras_prod_x','vistas_prod_x', 'otros_x', 'label_x']]
otros.rename(columns={'fecha_x': 'fecha', 'model_x': 'model', 'condition_x': 'condition', 'storage_x': 'storage', 'color_x': 'color', 'compras_prod_x': 'compras_prod', 'vistas_prod_x': 'vistas_prod' ,'otros_x': 'otros', 'label_x': 'label'}, inplace=True)
otros.head()

Unnamed: 0,fecha,person,model,condition,storage,color,compras_prod,vistas_prod,otros,label
32460,2018-05-29 19:45:16,f33e2cc5,,,,,0,0,1,0
32845,2018-05-19 10:31:43,fb3c6e61,,,,,0,0,1,0
56829,2018-05-29 21:21:02,4042a213,,,,,0,0,1,0
56830,2018-05-18 14:20:44,9a84d509,,,,,0,0,1,0
56831,2018-05-18 14:24:27,9a84d509,,,,,0,0,1,0


In [25]:
otros.count()

fecha           350
person          350
model             2
condition         0
storage           0
color             0
compras_prod    350
vistas_prod     350
otros           350
label           350
dtype: int64

In [26]:
#Caracterisitcas del dispositivos con el que el usuario visito el sitio
visitas = training_set.loc[:,['event','fecha','person','new_vs_returning','region','country','device_type','screen_resolution','operating_system_version','browser_version', 'label']]
features_usuario = visitas.loc[visitas.event == 'visited site']
features_usuario.drop(['event'], inplace=True, axis=1)
features_usuario['visitas_sitio'] = 1
features_usuario.head()

Unnamed: 0,fecha,person,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label,visitas_sitio
60,2018-05-14 23:50:22,ad93850f,New,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0,1
61,2018-05-16 02:48:13,ad93850f,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0,1
62,2018-05-18 00:11:26,ad93850f,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0,1
63,2018-05-18 22:11:46,ad93850f,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0,1
64,2018-05-22 22:41:31,ad93850f,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0,1


In [27]:
features_usuario.count()

fecha                       102299
person                      102299
new_vs_returning            102299
region                      102299
country                     102299
device_type                 102299
screen_resolution           102297
operating_system_version    102299
browser_version             102299
label                       102299
visitas_sitio               102299
dtype: int64

In [28]:
features_usuario = features_usuario.drop_duplicates()
features_usuario.count()

fecha                       102298
person                      102298
new_vs_returning            102298
region                      102298
country                     102298
device_type                 102298
screen_resolution           102296
operating_system_version    102298
browser_version             102298
label                       102298
visitas_sitio               102298
dtype: int64

In [29]:
del productos
del visitas

In [30]:
user_prod_comprados = productos_comprados.merge(features_usuario, on="person", how="outer")
user_prod_comprados.head()

Unnamed: 0,fecha_x,person,model,condition,storage,color,label_x,compras_prod,vistas_prod,otros,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label_y,visitas_sitio
0,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,0.0,1.0,0.0,0.0,2018-05-14 23:50:22,New,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0.0,1.0
1,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,0.0,1.0,0.0,0.0,2018-05-16 02:48:13,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0.0,1.0
2,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,0.0,1.0,0.0,0.0,2018-05-18 00:11:26,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0.0,1.0
3,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,0.0,1.0,0.0,0.0,2018-05-18 22:11:46,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0.0,1.0
4,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,0.0,1.0,0.0,0.0,2018-05-22 22:41:31,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0.0,1.0


In [31]:
user_prod_comprados.count()

fecha_x                     487502
person                      497917
model                       487502
condition                   487502
storage                     487502
color                       487502
label_x                     487502
compras_prod                487502
vistas_prod                 487502
otros                       487502
fecha_y                     497450
new_vs_returning            497450
region                      497450
country                     497450
device_type                 497450
screen_resolution           497437
operating_system_version    497450
browser_version             497450
label_y                     497450
visitas_sitio               497450
dtype: int64

In [32]:
user_prod_vistas = productos_visitados.merge(features_usuario, on="person", how="outer")
user_prod_vistas.head()

Unnamed: 0,fecha_x,person,model,condition,storage,color,compras_prod,vistas_prod,otros,label_x,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label_y,visitas_sitio
0,2018-05-18 00:18:24,9bb3af27,Samsung Galaxy S6 Flat,Bom,32GB,Dourado,0.0,1.0,0.0,1.0,2018-05-17 17:16:36,New,Minas Gerais,Brazil,Smartphone,320x534,Android 6,Chrome Mobile 66.0,1.0,1.0
1,2018-05-18 00:18:24,9bb3af27,Samsung Galaxy S6 Flat,Bom,32GB,Dourado,0.0,1.0,0.0,1.0,2018-05-17 21:45:43,Returning,Minas Gerais,Brazil,Smartphone,320x534,Android 6,Chrome Mobile 66.0,1.0,1.0
2,2018-05-18 00:18:24,9bb3af27,Samsung Galaxy S6 Flat,Bom,32GB,Dourado,0.0,1.0,0.0,1.0,2018-05-18 00:11:31,Returning,Minas Gerais,Brazil,Smartphone,320x534,Android 6,Chrome Mobile 66.0,1.0,1.0
3,2018-05-18 00:18:24,9bb3af27,Samsung Galaxy S6 Flat,Bom,32GB,Dourado,0.0,1.0,0.0,1.0,2018-05-18 19:26:42,Returning,Minas Gerais,Brazil,Smartphone,320x534,Android 6,Chrome Mobile 66.0,1.0,1.0
4,2018-05-18 00:15:33,9bb3af27,Samsung Galaxy S5,Bom,16GB,Preto,0.0,1.0,0.0,1.0,2018-05-17 17:16:36,New,Minas Gerais,Brazil,Smartphone,320x534,Android 6,Chrome Mobile 66.0,1.0,1.0


In [33]:
user_prod_vistas.count()

fecha_x                     582196
person                      674544
model                       582196
condition                   582196
storage                     582196
color                       582196
compras_prod                582196
vistas_prod                 582196
otros                       582196
label_x                     582196
fecha_y                     674421
new_vs_returning            674421
region                      674421
country                     674421
device_type                 674421
screen_resolution           674419
operating_system_version    674421
browser_version             674421
label_y                     674421
visitas_sitio               674421
dtype: int64

In [34]:
otros = otros.merge(features_usuario, on="person", how="outer")
otros.head()

Unnamed: 0,fecha_x,person,model,condition,storage,color,compras_prod,vistas_prod,otros,label_x,fecha_y,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label_y,visitas_sitio
0,2018-05-29 19:45:16,f33e2cc5,,,,,0.0,0.0,1.0,0.0,2018-05-29 19:45:16,New,Sao Paulo,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0,0.0,1.0
1,2018-05-19 10:31:43,fb3c6e61,,,,,0.0,0.0,1.0,0.0,2018-05-19 10:31:42,New,Pernambuco,Brazil,Smartphone,360x640,Android 6,Chrome Mobile 66.0,0.0,1.0
2,2018-05-29 21:21:02,4042a213,,,,,0.0,0.0,1.0,0.0,2018-05-29 21:21:02,New,Rio de Janeiro,Brazil,Smartphone,320x570,Android 6,Chrome Mobile 64.0,0.0,1.0
3,2018-05-18 14:20:44,9a84d509,,,,,0.0,0.0,1.0,0.0,2018-05-29 22:59:02,Returning,Ceara,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0,0.0,1.0
4,2018-05-18 14:20:44,9a84d509,,,,,0.0,0.0,1.0,0.0,2018-05-18 14:20:10,New,Ceara,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 66.0,0.0,1.0


In [35]:
otros.count()

fecha_x                       1505
person                      103462
model                            2
condition                        0
storage                          0
color                            0
compras_prod                  1505
vistas_prod                   1505
otros                         1505
label_x                       1505
fecha_y                     103454
new_vs_returning            103454
region                      103454
country                     103454
device_type                 103454
screen_resolution           103452
operating_system_version    103454
browser_version             103454
label_y                     103454
visitas_sitio               103454
dtype: int64

In [36]:
del features_usuario
del productos_comprados
del productos_visitados

In [37]:
#concateno
frames = [user_prod_comprados, user_prod_vistas, otros]
sets = pd.concat(frames)
sets.head()

Unnamed: 0,browser_version,color,compras_prod,condition,country,device_type,fecha_x,fecha_y,label_x,label_y,model,new_vs_returning,operating_system_version,otros,person,region,screen_resolution,storage,visitas_sitio,vistas_prod
0,Chrome Mobile 66.0,Cinza espacial,1.0,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-14 23:50:22,0.0,0.0,iPhone 5s,New,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0.0
1,Chrome Mobile 66.0,Cinza espacial,1.0,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-16 02:48:13,0.0,0.0,iPhone 5s,Returning,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0.0
2,Chrome Mobile 66.0,Cinza espacial,1.0,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-18 00:11:26,0.0,0.0,iPhone 5s,Returning,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0.0
3,Chrome Mobile 66.0,Cinza espacial,1.0,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-18 22:11:46,0.0,0.0,iPhone 5s,Returning,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0.0
4,Chrome Mobile 66.0,Cinza espacial,1.0,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-22 22:41:31,0.0,0.0,iPhone 5s,Returning,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0.0


In [38]:
del frames
del user_prod_comprados
del user_prod_vistas
del otros

In [39]:
sets.count()

browser_version             1275325
color                       1069698
compras_prod                1071203
condition                   1069698
country                     1275325
device_type                 1275325
fecha_x                     1071203
fecha_y                     1275325
label_x                     1071203
label_y                     1275325
model                       1069700
new_vs_returning            1275325
operating_system_version    1275325
otros                       1071203
person                      1275923
region                      1275325
screen_resolution           1275308
storage                     1069698
visitas_sitio               1275325
vistas_prod                 1071203
dtype: int64

In [40]:
sets = sets.drop_duplicates()

In [41]:
sets.count()

browser_version             1171058
color                       1067853
compras_prod                1069358
condition                   1067853
country                     1171058
device_type                 1171058
fecha_x                     1069358
fecha_y                     1171058
label_x                     1069358
label_y                     1171058
model                       1067855
new_vs_returning            1171058
operating_system_version    1171058
otros                       1069358
person                      1171656
region                      1171058
screen_resolution           1171043
storage                     1067853
visitas_sitio               1171058
vistas_prod                 1069358
dtype: int64

In [42]:
# Rellenamos compras_prod  vistas_prod  otros y visitas_sitio con 0
sets['compras_prod'] = sets['compras_prod'].fillna(value=0) 
sets['vistas_prod'] = sets['vistas_prod'].fillna(value=0) 
sets['otros'] = sets['otros'].fillna(value=0) 
sets['visitas_sitio'] = sets['visitas_sitio'].fillna(value=0) 

In [43]:
sets.count()

browser_version             1171058
color                       1067853
compras_prod                1171656
condition                   1067853
country                     1171058
device_type                 1171058
fecha_x                     1069358
fecha_y                     1171058
label_x                     1069358
label_y                     1171058
model                       1067855
new_vs_returning            1171058
operating_system_version    1171058
otros                       1171656
person                      1171656
region                      1171058
screen_resolution           1171043
storage                     1067853
visitas_sitio               1171656
vistas_prod                 1171656
dtype: int64

In [44]:
# Rellenamos label_x y label_y con 0 y nos quedamos con label
sets['label_x'] = sets['label_x'].fillna(value=0) 
sets['label_y'] = sets['label_y'].fillna(value=0) 
sets.loc[(sets['label_x'] == 1) | (sets['label_y'] == 1), 'label'] = 1
sets['label'] = sets['label'].fillna(value=0) 
sets.head()

Unnamed: 0,browser_version,color,compras_prod,condition,country,device_type,fecha_x,fecha_y,label_x,label_y,...,new_vs_returning,operating_system_version,otros,person,region,screen_resolution,storage,visitas_sitio,vistas_prod,label
0,Chrome Mobile 66.0,Cinza espacial,1.0,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-14 23:50:22,0.0,0.0,...,New,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0.0,0.0
1,Chrome Mobile 66.0,Cinza espacial,1.0,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-16 02:48:13,0.0,0.0,...,Returning,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0.0,0.0
2,Chrome Mobile 66.0,Cinza espacial,1.0,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-18 00:11:26,0.0,0.0,...,Returning,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0.0,0.0
3,Chrome Mobile 66.0,Cinza espacial,1.0,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-18 22:11:46,0.0,0.0,...,Returning,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0.0,0.0
4,Chrome Mobile 66.0,Cinza espacial,1.0,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-22 22:41:31,0.0,0.0,...,Returning,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0.0,0.0


In [45]:
sets[sets['label'] == 1].shape

(276643, 21)

In [46]:
sets[sets['label'] == 0].shape

(895013, 21)

In [47]:
sets = sets.drop('label_x',1)
sets = sets.drop('label_y',1)
sets.count()

browser_version             1171058
color                       1067853
compras_prod                1171656
condition                   1067853
country                     1171058
device_type                 1171058
fecha_x                     1069358
fecha_y                     1171058
model                       1067855
new_vs_returning            1171058
operating_system_version    1171058
otros                       1171656
person                      1171656
region                      1171058
screen_resolution           1171043
storage                     1067853
visitas_sitio               1171656
vistas_prod                 1171656
label                       1171656
dtype: int64

In [48]:
# Rellenamos los demas con ''
sets = sets.fillna(value='') 

In [49]:
sets.count()

browser_version             1171656
color                       1171656
compras_prod                1171656
condition                   1171656
country                     1171656
device_type                 1171656
fecha_x                     1171656
fecha_y                     1171656
model                       1171656
new_vs_returning            1171656
operating_system_version    1171656
otros                       1171656
person                      1171656
region                      1171656
screen_resolution           1171656
storage                     1171656
visitas_sitio               1171656
vistas_prod                 1171656
label                       1171656
dtype: int64

In [50]:
sets.person.describe()

count      1171656
unique       19414
top       ecd79d0a
freq         33950
Name: person, dtype: object

## Categorizacion de los features

In [51]:
# creo una nueva columna person_int para entrenar transformando el valor alfanumerico de person a numerico
sets['person_int'] = labelencoder.fit_transform(sets['person'])
sets.head()

Unnamed: 0,browser_version,color,compras_prod,condition,country,device_type,fecha_x,fecha_y,model,new_vs_returning,operating_system_version,otros,person,region,screen_resolution,storage,visitas_sitio,vistas_prod,label,person_int
0,Chrome Mobile 66.0,Cinza espacial,1.0,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-14 23:50:22,iPhone 5s,New,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0.0,0.0,13068
1,Chrome Mobile 66.0,Cinza espacial,1.0,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-16 02:48:13,iPhone 5s,Returning,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0.0,0.0,13068
2,Chrome Mobile 66.0,Cinza espacial,1.0,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-18 00:11:26,iPhone 5s,Returning,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0.0,0.0,13068
3,Chrome Mobile 66.0,Cinza espacial,1.0,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-18 22:11:46,iPhone 5s,Returning,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0.0,0.0,13068
4,Chrome Mobile 66.0,Cinza espacial,1.0,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-22 22:41:31,iPhone 5s,Returning,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0.0,0.0,13068


In [52]:
# convertimos a integer el valor flotante de compras_prod
sets['compras_prod'] = sets['compras_prod'].astype('int')
sets.head()

Unnamed: 0,browser_version,color,compras_prod,condition,country,device_type,fecha_x,fecha_y,model,new_vs_returning,operating_system_version,otros,person,region,screen_resolution,storage,visitas_sitio,vistas_prod,label,person_int
0,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-14 23:50:22,iPhone 5s,New,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0.0,0.0,13068
1,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-16 02:48:13,iPhone 5s,Returning,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0.0,0.0,13068
2,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-18 00:11:26,iPhone 5s,Returning,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0.0,0.0,13068
3,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-18 22:11:46,iPhone 5s,Returning,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0.0,0.0,13068
4,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-22 22:41:31,iPhone 5s,Returning,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0.0,0.0,13068


In [53]:
# convertimos a integer el valor flotante de vistas_prod
sets['vistas_prod'] = sets['vistas_prod'].astype('int')
sets.head()

Unnamed: 0,browser_version,color,compras_prod,condition,country,device_type,fecha_x,fecha_y,model,new_vs_returning,operating_system_version,otros,person,region,screen_resolution,storage,visitas_sitio,vistas_prod,label,person_int
0,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-14 23:50:22,iPhone 5s,New,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0,0.0,13068
1,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-16 02:48:13,iPhone 5s,Returning,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0,0.0,13068
2,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-18 00:11:26,iPhone 5s,Returning,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0,0.0,13068
3,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-18 22:11:46,iPhone 5s,Returning,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0,0.0,13068
4,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-22 22:41:31,iPhone 5s,Returning,Android 5.1.1,0.0,ad93850f,Sao Paulo,360x640,32GB,1.0,0,0.0,13068


In [54]:
# convertimos a integer el valor flotante de otros
sets['otros'] = sets['otros'].astype('int')
sets.head()

Unnamed: 0,browser_version,color,compras_prod,condition,country,device_type,fecha_x,fecha_y,model,new_vs_returning,operating_system_version,otros,person,region,screen_resolution,storage,visitas_sitio,vistas_prod,label,person_int
0,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-14 23:50:22,iPhone 5s,New,Android 5.1.1,0,ad93850f,Sao Paulo,360x640,32GB,1.0,0,0.0,13068
1,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-16 02:48:13,iPhone 5s,Returning,Android 5.1.1,0,ad93850f,Sao Paulo,360x640,32GB,1.0,0,0.0,13068
2,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-18 00:11:26,iPhone 5s,Returning,Android 5.1.1,0,ad93850f,Sao Paulo,360x640,32GB,1.0,0,0.0,13068
3,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-18 22:11:46,iPhone 5s,Returning,Android 5.1.1,0,ad93850f,Sao Paulo,360x640,32GB,1.0,0,0.0,13068
4,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-22 22:41:31,iPhone 5s,Returning,Android 5.1.1,0,ad93850f,Sao Paulo,360x640,32GB,1.0,0,0.0,13068


In [55]:
# convertimos a integer el valor flotante de visitas_sitio
sets['visitas_sitio'] = sets['visitas_sitio'].astype('int')
sets.head()

Unnamed: 0,browser_version,color,compras_prod,condition,country,device_type,fecha_x,fecha_y,model,new_vs_returning,operating_system_version,otros,person,region,screen_resolution,storage,visitas_sitio,vistas_prod,label,person_int
0,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-14 23:50:22,iPhone 5s,New,Android 5.1.1,0,ad93850f,Sao Paulo,360x640,32GB,1,0,0.0,13068
1,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-16 02:48:13,iPhone 5s,Returning,Android 5.1.1,0,ad93850f,Sao Paulo,360x640,32GB,1,0,0.0,13068
2,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-18 00:11:26,iPhone 5s,Returning,Android 5.1.1,0,ad93850f,Sao Paulo,360x640,32GB,1,0,0.0,13068
3,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-18 22:11:46,iPhone 5s,Returning,Android 5.1.1,0,ad93850f,Sao Paulo,360x640,32GB,1,0,0.0,13068
4,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-22 22:41:31,iPhone 5s,Returning,Android 5.1.1,0,ad93850f,Sao Paulo,360x640,32GB,1,0,0.0,13068


In [56]:
# convertimos a integer el valor flotante de label
sets['label'] = sets['label'].astype('int')
sets.head()

Unnamed: 0,browser_version,color,compras_prod,condition,country,device_type,fecha_x,fecha_y,model,new_vs_returning,operating_system_version,otros,person,region,screen_resolution,storage,visitas_sitio,vistas_prod,label,person_int
0,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-14 23:50:22,iPhone 5s,New,Android 5.1.1,0,ad93850f,Sao Paulo,360x640,32GB,1,0,0,13068
1,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-16 02:48:13,iPhone 5s,Returning,Android 5.1.1,0,ad93850f,Sao Paulo,360x640,32GB,1,0,0,13068
2,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-18 00:11:26,iPhone 5s,Returning,Android 5.1.1,0,ad93850f,Sao Paulo,360x640,32GB,1,0,0,13068
3,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-18 22:11:46,iPhone 5s,Returning,Android 5.1.1,0,ad93850f,Sao Paulo,360x640,32GB,1,0,0,13068
4,Chrome Mobile 66.0,Cinza espacial,1,Muito Bom,Brazil,Smartphone,2018-05-14 23:54:19,2018-05-22 22:41:31,iPhone 5s,Returning,Android 5.1.1,0,ad93850f,Sao Paulo,360x640,32GB,1,0,0,13068


In [57]:
#Categorizamos los valores de los features con label encoder
sets['model'] = labelencoder.fit_transform(sets['model'])
sets['condition'] = labelencoder.fit_transform(sets['condition'])
sets['storage'] = labelencoder.fit_transform(sets['storage'])
sets['color'] = labelencoder.fit_transform(sets['color'])
sets['new_vs_returning'] = labelencoder.fit_transform(sets['new_vs_returning'])
sets['region'] = labelencoder.fit_transform(sets['region'])
sets['country'] = labelencoder.fit_transform(sets['country'])
sets['device_type'] = labelencoder.fit_transform(sets['device_type'])
sets['screen_resolution'] = labelencoder.fit_transform(sets['screen_resolution'])
sets['operating_system_version'] = labelencoder.fit_transform(sets['operating_system_version'])
sets['browser_version'] = labelencoder.fit_transform(sets['browser_version'])
sets.head()

Unnamed: 0,browser_version,color,compras_prod,condition,country,device_type,fecha_x,fecha_y,model,new_vs_returning,operating_system_version,otros,person,region,screen_resolution,storage,visitas_sitio,vistas_prod,label,person_int
0,103,21,1,4,5,2,2018-05-14 23:54:19,2018-05-14 23:50:22,176,1,19,0,ad93850f,64,173,4,1,0,0,13068
1,103,21,1,4,5,2,2018-05-14 23:54:19,2018-05-16 02:48:13,176,2,19,0,ad93850f,64,173,4,1,0,0,13068
2,103,21,1,4,5,2,2018-05-14 23:54:19,2018-05-18 00:11:26,176,2,19,0,ad93850f,64,173,4,1,0,0,13068
3,103,21,1,4,5,2,2018-05-14 23:54:19,2018-05-18 22:11:46,176,2,19,0,ad93850f,64,173,4,1,0,0,13068
4,103,21,1,4,5,2,2018-05-14 23:54:19,2018-05-22 22:41:31,176,2,19,0,ad93850f,64,173,4,1,0,0,13068


In [58]:
#Categorizamos la fecha del visto del producto en dia hora y minuto (ya que el anio es 2018)
sets['dia_view_prod'] = sets['fecha_x'].apply(lambda x: x.day if type(x) != str else 0).astype('int')
sets['mes_view_prod'] = sets['fecha_x'].apply(lambda x: x.month if type(x) != str else 0).astype('int')
sets['dia_sem_view_prod'] = sets['fecha_x'].apply(lambda x: x.dayofweek if type(x) != str else 0).astype('int')
sets = sets.drop('fecha_x',1)

In [59]:
#Categorizamos la fecha de visita del usuario en dia mes hora y minuto (ya que el anio es 2018)
sets['dia_visit_user'] = sets['fecha_y'].apply(lambda x: x.day if type(x) != str else 0).astype('int')
sets['mes_visit_user'] = sets['fecha_y'].apply(lambda x: x.month if type(x) != str else 0).astype('int')
sets['dia_sem_visit_user'] = sets['fecha_y'].apply(lambda x: x.dayofweek if type(x) != str else 0).astype('int')
sets = sets.drop('fecha_y',1)

In [60]:
sets.head()

Unnamed: 0,browser_version,color,compras_prod,condition,country,device_type,model,new_vs_returning,operating_system_version,otros,...,visitas_sitio,vistas_prod,label,person_int,dia_view_prod,mes_view_prod,dia_sem_view_prod,dia_visit_user,mes_visit_user,dia_sem_visit_user
0,103,21,1,4,5,2,176,1,19,0,...,1,0,0,13068,14,5,0,14,5,0
1,103,21,1,4,5,2,176,2,19,0,...,1,0,0,13068,14,5,0,16,5,2
2,103,21,1,4,5,2,176,2,19,0,...,1,0,0,13068,14,5,0,18,5,4
3,103,21,1,4,5,2,176,2,19,0,...,1,0,0,13068,14,5,0,18,5,4
4,103,21,1,4,5,2,176,2,19,0,...,1,0,0,13068,14,5,0,22,5,1


In [61]:
sets.tail()

Unnamed: 0,browser_version,color,compras_prod,condition,country,device_type,model,new_vs_returning,operating_system_version,otros,...,visitas_sitio,vistas_prod,label,person_int,dia_view_prod,mes_view_prod,dia_sem_view_prod,dia_visit_user,mes_visit_user,dia_sem_visit_user
1500,225,0,0,0,5,2,0,2,88,1,...,1,0,0,15181,27,5,6,27,5,6
1501,103,0,0,0,5,2,0,1,19,1,...,1,0,0,1225,27,5,6,28,5,0
1502,0,0,0,0,0,0,0,0,0,1,...,0,0,0,10848,24,5,3,0,0,0
1503,0,0,0,0,0,0,0,0,0,1,...,0,0,0,10848,24,5,3,0,0,0
1504,0,0,0,0,0,0,3,0,0,1,...,0,0,0,16372,24,5,3,0,0,0


In [62]:
sets.count()

browser_version             1171656
color                       1171656
compras_prod                1171656
condition                   1171656
country                     1171656
device_type                 1171656
model                       1171656
new_vs_returning            1171656
operating_system_version    1171656
otros                       1171656
person                      1171656
region                      1171656
screen_resolution           1171656
storage                     1171656
visitas_sitio               1171656
vistas_prod                 1171656
label                       1171656
person_int                  1171656
dia_view_prod               1171656
mes_view_prod               1171656
dia_sem_view_prod           1171656
dia_visit_user              1171656
mes_visit_user              1171656
dia_sem_visit_user          1171656
dtype: int64

In [63]:
sets.person.describe()

count      1171656
unique       19414
top       ecd79d0a
freq         33950
Name: person, dtype: object

In [64]:
sets[sets['label'] == 1].shape

(276643, 24)

In [65]:
sets[sets['label'] == 0].shape

(895013, 24)

In [66]:
sets.to_csv('data/set_entrenamiento_14.csv', encoding='utf-8', index=False)