In [7]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from dateutil.parser import parse
import numpy as np

import warnings
import datetime as dt
# Silence every warning for the rest of the session; this includes any pandas
# warnings emitted by the cells below.
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
# One shared encoder instance. The cells below call fit_transform on it once
# per column, so after each call it only holds the classes of that column.
labelencoder = LabelEncoder()

In [8]:
# Load the events CSV and summarise its `person` column.
events = pd.read_csv('data/events_up_to_01062018.csv', low_memory=False)
events['person'].describe()

count      2341681
unique       38829
top       c76b8417
freq          4438
Name: person, dtype: object

In [9]:
# Load the persons to predict and the labelled training persons.
prediction_set = pd.read_csv('data/trocafone_kaggle_test.csv', low_memory=False)
training_set = pd.read_csv('data/labels_training_set.csv', low_memory=False)

print('PREDICCION ', prediction_set.person.describe())
print('ENTRENAMIENTO ', training_set.person.describe())
print('ENTRENAMIENTO 1s ', training_set[training_set['label'] == 1].shape)
print('ENTRENAMIENTO 0s ', training_set[training_set['label'] == 0].shape)

# Attach every event of each person to predict, then keep only the event
# types that the feature cells below work with.
RELEVANT_EVENTS = [
    'visited site', 'viewed product', 'conversion', 'checkout',
    'brand listing', 'generic listing', 'lead', 'staticpage',
]
prediction_set = prediction_set.merge(events, on='person', how='inner')
prediction_set = prediction_set[prediction_set['event'].isin(RELEVANT_EVENTS)]

training_set = events.merge(training_set, on='person', how='inner')

# Number of event rows per person. `info` and `data` are kept as names because
# a later cell deletes them explicitly.
info = training_set[['person']].assign(frecuencia=1)
data = info.groupby('person', as_index=False)['frecuencia'].sum()

training_set = training_set.merge(data, on='person', how='left')

# Keep every row with label == 1, and rows with label == 0 only when the
# person has at most MAX_NEGATIVE_EVENTS event rows.
MAX_NEGATIVE_EVENTS = 100
is_positive = training_set['label'] == 1
is_small_negative = (training_set['label'] == 0) & (training_set['frecuencia'] <= MAX_NEGATIVE_EVENTS)
training_set = training_set[is_positive | is_small_negative]


PREDICCION  count        19415
unique       19415
top       37648442
freq             1
Name: person, dtype: object
ENTRENAMIENTO  count        19414
unique       19414
top       bef685b9
freq             1
Name: person, dtype: object
ENTRENAMIENTO 1s  (980, 2)
ENTRENAMIENTO 0s  (18434, 2)


In [10]:
# Free the intermediate frames built while loading the data.
del info, data, events

In [11]:
prediction_set.person.describe()

count       953099
unique       19415
top       5059f7fd
freq          2932
Name: person, dtype: object

In [12]:
training_set.person.describe()

count       525152
unique       16779
top       c7cc2c23
freq          1561
Name: person, dtype: object

# Set Datos Prediccion

## Analisis TP1

In [13]:
prediction_set.columns

Index(['person', 'timestamp', 'event', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version'],
      dtype='object')

In [14]:
# Drop columns that the feature cells below never select.
unused_columns = ['url', 'skus', 'sku', 'city', 'search_engine', 'campaign_source', 'channel']
prediction_set = prediction_set.drop(columns=unused_columns)

In [15]:
# Parse `timestamp` as datetime for easier analysis and rename it to `fecha`.
prediction_set['timestamp'] = pd.to_datetime(prediction_set['timestamp'])
prediction_set = prediction_set.rename(columns={'timestamp': 'fecha'})
prediction_set.head()

Unnamed: 0,person,fecha,event,model,condition,storage,color,search_term,staticpage,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,4886f805,2018-05-18 00:11:59,viewed product,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,
1,4886f805,2018-05-18 00:30:30,viewed product,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,
3,4886f805,2018-05-18 00:11:56,checkout,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,
4,4886f805,2018-05-18 00:11:35,viewed product,Samsung Galaxy J7 Prime,Muito Bom,32GB,Dourado,,,,,,,,,
6,4886f805,2018-05-18 00:11:53,viewed product,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,


Vamos a crear las columnas indicadoras compras_prod, vistas_prod y otros (para productos) y visitas_sitio (para persona), al estilo de un One-Hot Encoding: cada fila lleva un 1 en la columna de su tipo de evento y 0 en las demás.

In [16]:
# Product details of the purchase-type events (conversion or checkout).
product_columns = [
    'event', 'fecha', 'person', 'model', 'condition', 'storage', 'color',
    'new_vs_returning', 'region', 'country', 'device_type',
    'screen_resolution', 'operating_system_version', 'browser_version',
]
productos = prediction_set.loc[:, product_columns]
productos_comprados = (
    productos[productos['event'].isin(['conversion', 'checkout'])]
    .drop(columns=['event'])
    # Indicator columns: only `compras_prod` is set for these rows.
    .assign(compras_prod=1, vistas_prod=0, otros=0, visitas_sitio=0)
)
productos_comprados.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,compras_prod,vistas_prod,otros,visitas_sitio
3,2018-05-18 00:11:56,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,1,0,0,0
56,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,,,,,,,,1,0,0,0
89,2018-05-10 00:39:51,0297fc1e,iPhone 6S Plus,Bom,16GB,Cinza espacial,,,,,,,,1,0,0,0
313,2018-02-25 18:38:00,0297fc1e,iPhone 6,Muito Bom,64GB,Dourado,,,,,,,,1,0,0,0
339,2018-01-31 16:12:48,0297fc1e,iPhone 6,Bom,64GB,Dourado,,,,,,,,1,0,0,0


In [17]:
productos_comprados.count()

fecha                       36303
person                      36303
model                       36303
condition                   36303
storage                     36303
color                       36303
new_vs_returning                0
region                          0
country                         0
device_type                     0
screen_resolution               0
operating_system_version        0
browser_version                 0
compras_prod                36303
vistas_prod                 36303
otros                       36303
visitas_sitio               36303
dtype: int64

In [18]:
# productos_comprados = productos_comprados.drop_duplicates()
# productos_comprados.count()

In [19]:
# Product details of the 'viewed product' events.
productos_visitados = (
    productos[productos['event'] == 'viewed product']
    .drop(columns=['event'])
    # Indicator columns: only `vistas_prod` is set for these rows.
    .assign(compras_prod=0, vistas_prod=1, otros=0, visitas_sitio=0)
)
productos_visitados.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,compras_prod,vistas_prod,otros,visitas_sitio
0,2018-05-18 00:11:59,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,0,1,0,0
1,2018-05-18 00:30:30,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,0,1,0,0
4,2018-05-18 00:11:35,4886f805,Samsung Galaxy J7 Prime,Muito Bom,32GB,Dourado,,,,,,,,0,1,0,0
6,2018-05-18 00:11:53,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,0,1,0,0
9,2018-05-18 00:11:16,0297fc1e,iPhone 6S,Muito Bom,64GB,Prateado,,,,,,,,0,1,0,0


In [20]:
productos_visitados.count()

fecha                       618891
person                      618891
model                       618891
condition                   618891
storage                     618891
color                       618891
new_vs_returning                 0
region                           0
country                          0
device_type                      0
screen_resolution                0
operating_system_version         0
browser_version                  0
compras_prod                618891
vistas_prod                 618891
otros                       618891
visitas_sitio               618891
dtype: int64

In [21]:
# productos_visitados = productos_visitados.drop_duplicates()
# productos_visitados.count()

In [22]:
# Remove from the viewed products every person who also appears in the
# purchased products.
# A membership filter keeps the rows of persons with no purchase row, in their
# existing order, without building one row per (view, purchase) pair of the
# same person as a person-level merge would. Column names are left untouched,
# so no suffix clean-up is needed. Row labels stay those of the source frame.
personas_con_compra = productos_comprados['person']
productos_visitados = productos_visitados[~productos_visitados['person'].isin(personas_con_compra)]
productos_visitados.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,compras_prod,vistas_prod,otros,visitas_sitio
6574,2018-05-18 00:25:42,686c49c9,iPhone 7,Excelente,128GB,Ouro Rosa,,,,,,,,0,1,0,0
6575,2018-05-18 00:25:27,686c49c9,iPhone 7,Excelente,128GB,Preto Matte,,,,,,,,0,1,0,0
6576,2018-05-18 01:05:31,686c49c9,iPhone 7,Excelente,128GB,Ouro Rosa,,,,,,,,0,1,0,0
6577,2018-05-15 20:11:48,686c49c9,iPhone 7,Excelente,128GB,Prateado,,,,,,,,0,1,0,0
6578,2018-05-15 20:10:28,686c49c9,iPhone 6S,Excelente,16GB,Dourado,,,,,,,,0,1,0,0


In [23]:
productos_visitados.count()

fecha                       58106
person                      58106
model                       58106
condition                   58106
storage                     58106
color                       58106
new_vs_returning                0
region                          0
country                         0
device_type                     0
screen_resolution               0
operating_system_version        0
browser_version                 0
compras_prod                58106
vistas_prod                 58106
otros                       58106
visitas_sitio               58106
dtype: int64

In [24]:
# Rows for the remaining event types: listings, leads and static pages.
other_columns = [
    'event', 'fecha', 'person', 'model', 'condition', 'storage', 'color',
    'new_vs_returning', 'region', 'country', 'device_type',
    'screen_resolution', 'operating_system_version', 'browser_version',
]
other_events = ['brand listing', 'generic listing', 'lead', 'staticpage']
otros = prediction_set.loc[:, other_columns]
otros = (
    otros[otros['event'].isin(other_events)]
    .drop(columns=['event'])
    # Indicator columns: only `otros` is set for these rows.
    .assign(compras_prod=0, vistas_prod=0, otros=1, visitas_sitio=0)
)
otros.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,compras_prod,vistas_prod,otros,visitas_sitio
7,2018-05-18 00:07:22,4886f805,,,,,,,,,,,,0,0,1,0
455,2018-05-17 19:59:37,0297fc1e,,,,,,,,,,,,0,0,1,0
456,2018-05-21 23:08:54,0297fc1e,,,,,,,,,,,,0,0,1,0
457,2018-03-20 16:39:59,0297fc1e,,,,,,,,,,,,0,0,1,0
458,2018-04-29 18:41:44,0297fc1e,,,,,,,,,,,,0,0,1,0


In [25]:
otros.count()

fecha                       196135
person                      196135
model                          552
condition                        0
storage                          0
color                            0
new_vs_returning                 0
region                           0
country                          0
device_type                      0
screen_resolution                0
operating_system_version         0
browser_version                  0
compras_prod                196135
vistas_prod                 196135
otros                       196135
visitas_sitio               196135
dtype: int64

In [26]:
# otros = otros.drop_duplicates()
# otros.count()

In [27]:
# Remove from `otros` every person who appears in the purchased products.
# A membership filter keeps the rows of persons with no purchase row, in their
# existing order, without building one row per (event, purchase) pair of the
# same person as a person-level merge would. Column names are left untouched,
# so no suffix clean-up is needed. Row labels stay those of the source frame.
personas_con_compra = productos_comprados['person']
otros = otros[~otros['person'].isin(personas_con_compra)]
otros.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,compras_prod,vistas_prod,otros,visitas_sitio
523,2018-05-07 12:45:15,686c49c9,,,,,,,,,,,,0,0,1,0
524,2018-05-07 12:39:53,686c49c9,,,,,,,,,,,,0,0,1,0
525,2018-05-07 12:41:38,686c49c9,,,,,,,,,,,,0,0,1,0
526,2018-05-07 12:43:42,686c49c9,,,,,,,,,,,,0,0,1,0
527,2018-05-07 12:43:43,686c49c9,,,,,,,,,,,,0,0,1,0


In [28]:
otros.count()

fecha                       18472
person                      18472
model                          58
condition                       0
storage                         0
color                           0
new_vs_returning                0
region                          0
country                         0
device_type                     0
screen_resolution               0
operating_system_version        0
browser_version                 0
compras_prod                18472
vistas_prod                 18472
otros                       18472
visitas_sitio               18472
dtype: int64

In [29]:
# Remove from `otros` every person who appears in the viewed products.
# A membership filter keeps the rows of persons with no viewed-product row, in
# their existing order, without building one row per (event, view) pair of the
# same person as a person-level merge would. Column names are left untouched,
# so no suffix clean-up is needed. Row labels stay those of the source frame.
personas_con_vista = productos_visitados['person']
otros = otros[~otros['person'].isin(personas_con_vista)]
otros.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,compras_prod,vistas_prod,otros,visitas_sitio
60413,2018-05-28 12:36:14,df80a783,,,,,,,,,,,,0,0,1,0
60414,2018-05-28 22:28:24,df80a783,,,,,,,,,,,,0,0,1,0
60415,2018-05-29 14:47:28,df80a783,,,,,,,,,,,,0,0,1,0
63146,2018-05-19 16:30:03,5437de8d,,,,,,,,,,,,0,0,1,0
69477,2018-02-07 13:52:08,626d2cd6,,,,,,,,,,,,0,0,1,0


In [30]:
otros.count()

fecha                       590
person                      590
model                         5
condition                     0
storage                       0
color                         0
new_vs_returning              0
region                        0
country                       0
device_type                   0
screen_resolution             0
operating_system_version      0
browser_version               0
compras_prod                590
vistas_prod                 590
otros                       590
visitas_sitio               590
dtype: int64

In [31]:
# Characteristics of the device with which the user visited the site
# (rows of the 'visited site' event).
visit_columns = [
    'event', 'fecha', 'person', 'model', 'condition', 'storage', 'color',
    'new_vs_returning', 'region', 'country', 'device_type',
    'screen_resolution', 'operating_system_version', 'browser_version',
]
visitas = prediction_set.loc[:, visit_columns]
features_usuario = (
    visitas[visitas['event'] == 'visited site']
    .drop(columns=['event'])
    # Indicator columns: only `visitas_sitio` is set for these rows.
    .assign(compras_prod=0, vistas_prod=0, otros=0, visitas_sitio=1)
)
features_usuario.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,compras_prod,vistas_prod,otros,visitas_sitio
8,2018-05-18 00:07:22,4886f805,,,,,New,Rio de Janeiro,Brazil,Smartphone,360x640,Android 7,Chrome Mobile 64.0,0,0,0,1
480,2018-05-02 01:28:27,0297fc1e,,,,,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,0,0,0,1
481,2018-05-02 14:28:20,0297fc1e,,,,,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,0,0,0,1
482,2018-05-03 01:55:52,0297fc1e,,,,,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,0,0,0,1
483,2018-05-10 00:38:17,0297fc1e,,,,,Returning,Rio de Janeiro,Brazil,Smartphone,360x640,Android 6.0.1,Chrome Mobile 65.0,0,0,0,1


In [32]:
features_usuario.count()

fecha                       101770
person                      101770
model                            0
condition                        0
storage                          0
color                            0
new_vs_returning            101770
region                      101770
country                     101770
device_type                 101770
screen_resolution           101769
operating_system_version    101770
browser_version             101770
compras_prod                101770
vistas_prod                 101770
otros                       101770
visitas_sitio               101770
dtype: int64

In [33]:
# Free the column-subset copies now that the per-event frames are built.
del productos, visitas

In [34]:
# Stack the four per-event-type frames row-wise into a single frame.
frames = [
    productos_comprados,
    productos_visitados,
    otros,
    features_usuario,
]
sets = pd.concat(frames, axis=0)
sets.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,compras_prod,vistas_prod,otros,visitas_sitio
3,2018-05-18 00:11:56,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,1,0,0,0
56,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,,,,,,,,1,0,0,0
89,2018-05-10 00:39:51,0297fc1e,iPhone 6S Plus,Bom,16GB,Cinza espacial,,,,,,,,1,0,0,0
313,2018-02-25 18:38:00,0297fc1e,iPhone 6,Muito Bom,64GB,Dourado,,,,,,,,1,0,0,0
339,2018-01-31 16:12:48,0297fc1e,iPhone 6,Bom,64GB,Dourado,,,,,,,,1,0,0,0


In [35]:
# Free the per-event-type frames; `sets` now holds all of their rows.
del frames, productos_comprados, productos_visitados, otros

In [36]:
sets.count()

fecha                       196769
person                      196769
model                        94414
condition                    94409
storage                      94409
color                        94409
new_vs_returning            101770
region                      101770
country                     101770
device_type                 101770
screen_resolution           101769
operating_system_version    101770
browser_version             101770
compras_prod                196769
vistas_prod                 196769
otros                       196769
visitas_sitio               196769
dtype: int64

In [37]:
# Fill missing values in the four indicator columns with 0.
for flag in ('compras_prod', 'vistas_prod', 'otros', 'visitas_sitio'):
    sets[flag] = sets[flag].fillna(0)

In [38]:
sets.count()

fecha                       196769
person                      196769
model                        94414
condition                    94409
storage                      94409
color                        94409
new_vs_returning            101770
region                      101770
country                     101770
device_type                 101770
screen_resolution           101769
operating_system_version    101770
browser_version             101770
compras_prod                196769
vistas_prod                 196769
otros                       196769
visitas_sitio               196769
dtype: int64

In [39]:
# Fill every remaining missing value with the empty string. The call covers
# the whole frame; columns without missing values are unaffected.
sets = sets.fillna('')

In [40]:
sets.count()

fecha                       196769
person                      196769
model                       196769
condition                   196769
storage                     196769
color                       196769
new_vs_returning            196769
region                      196769
country                     196769
device_type                 196769
screen_resolution           196769
operating_system_version    196769
browser_version             196769
compras_prod                196769
vistas_prod                 196769
otros                       196769
visitas_sitio               196769
dtype: int64

In [41]:
sets.person.describe()

count       196769
unique       19415
top       a0b57323
freq           636
Name: person, dtype: object

## Categorizacion de los features

In [42]:
# Add `person_int`: an integer code for the alphanumeric `person` id, to be
# used for training.
# NOTE(review): the encoder is fitted on this frame only, so the codes are
# specific to the persons present here. Confirm nothing downstream expects
# them to match codes produced from another frame.
persona = sets['person']
sets['person_int'] = labelencoder.fit(persona).transform(persona)
sets.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,compras_prod,vistas_prod,otros,visitas_sitio,person_int
3,2018-05-18 00:11:56,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,1,0,0,0,5476
56,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,,,,,,,,1,0,0,0,189
89,2018-05-10 00:39:51,0297fc1e,iPhone 6S Plus,Bom,16GB,Cinza espacial,,,,,,,,1,0,0,0,189
313,2018-02-25 18:38:00,0297fc1e,iPhone 6,Muito Bom,64GB,Dourado,,,,,,,,1,0,0,0,189
339,2018-01-31 16:12:48,0297fc1e,iPhone 6,Bom,64GB,Dourado,,,,,,,,1,0,0,0,189


In [43]:
# Cast compras_prod to integer.
sets = sets.astype({'compras_prod': 'int'})
sets.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,compras_prod,vistas_prod,otros,visitas_sitio,person_int
3,2018-05-18 00:11:56,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,1,0,0,0,5476
56,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,,,,,,,,1,0,0,0,189
89,2018-05-10 00:39:51,0297fc1e,iPhone 6S Plus,Bom,16GB,Cinza espacial,,,,,,,,1,0,0,0,189
313,2018-02-25 18:38:00,0297fc1e,iPhone 6,Muito Bom,64GB,Dourado,,,,,,,,1,0,0,0,189
339,2018-01-31 16:12:48,0297fc1e,iPhone 6,Bom,64GB,Dourado,,,,,,,,1,0,0,0,189


In [44]:
# Cast vistas_prod to integer.
sets = sets.astype({'vistas_prod': 'int'})
sets.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,compras_prod,vistas_prod,otros,visitas_sitio,person_int
3,2018-05-18 00:11:56,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,1,0,0,0,5476
56,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,,,,,,,,1,0,0,0,189
89,2018-05-10 00:39:51,0297fc1e,iPhone 6S Plus,Bom,16GB,Cinza espacial,,,,,,,,1,0,0,0,189
313,2018-02-25 18:38:00,0297fc1e,iPhone 6,Muito Bom,64GB,Dourado,,,,,,,,1,0,0,0,189
339,2018-01-31 16:12:48,0297fc1e,iPhone 6,Bom,64GB,Dourado,,,,,,,,1,0,0,0,189


In [45]:
# Cast otros to integer.
sets = sets.astype({'otros': 'int'})
sets.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,compras_prod,vistas_prod,otros,visitas_sitio,person_int
3,2018-05-18 00:11:56,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,1,0,0,0,5476
56,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,,,,,,,,1,0,0,0,189
89,2018-05-10 00:39:51,0297fc1e,iPhone 6S Plus,Bom,16GB,Cinza espacial,,,,,,,,1,0,0,0,189
313,2018-02-25 18:38:00,0297fc1e,iPhone 6,Muito Bom,64GB,Dourado,,,,,,,,1,0,0,0,189
339,2018-01-31 16:12:48,0297fc1e,iPhone 6,Bom,64GB,Dourado,,,,,,,,1,0,0,0,189


In [46]:
# Cast visitas_sitio to integer.
sets = sets.astype({'visitas_sitio': 'int'})
sets.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,compras_prod,vistas_prod,otros,visitas_sitio,person_int
3,2018-05-18 00:11:56,4886f805,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,1,0,0,0,5476
56,2018-05-22 20:29:35,0297fc1e,iPhone 6S,Bom,32GB,Ouro Rosa,,,,,,,,1,0,0,0,189
89,2018-05-10 00:39:51,0297fc1e,iPhone 6S Plus,Bom,16GB,Cinza espacial,,,,,,,,1,0,0,0,189
313,2018-02-25 18:38:00,0297fc1e,iPhone 6,Muito Bom,64GB,Dourado,,,,,,,,1,0,0,0,189
339,2018-01-31 16:12:48,0297fc1e,iPhone 6,Bom,64GB,Dourado,,,,,,,,1,0,0,0,189


In [47]:
# Replace each categorical feature with integer codes from the label encoder.
# The shared encoder is refitted for every column, in this order.
categorical_columns = [
    'model', 'condition', 'storage', 'color', 'new_vs_returning', 'region',
    'country', 'device_type', 'screen_resolution',
    'operating_system_version', 'browser_version',
]
for columna in categorical_columns:
    sets[columna] = labelencoder.fit_transform(sets[columna])
sets.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,compras_prod,vistas_prod,otros,visitas_sitio,person_int
3,2018-05-18 00:11:56,4886f805,97,3,4,28,0,0,0,0,0,0,0,1,0,0,0,5476
56,2018-05-22 20:29:35,0297fc1e,180,1,4,34,0,0,0,0,0,0,0,1,0,0,0,189
89,2018-05-10 00:39:51,0297fc1e,181,1,2,21,0,0,0,0,0,0,0,1,0,0,0,189
313,2018-02-25 18:38:00,0297fc1e,178,4,7,28,0,0,0,0,0,0,0,1,0,0,0,189
339,2018-01-31 16:12:48,0297fc1e,178,1,7,28,0,0,0,0,0,0,0,1,0,0,0,189


In [48]:
# Split the event date into day-of-month, month and day-of-week features
# (the year is not kept; the original note says it is always 2018).
# Values that cannot be read as a date become NaT and are mapped to 0, which
# keeps the fallback-to-0 the row-wise version applied to string cells.
fecha_dt = pd.to_datetime(sets['fecha'], errors='coerce')
sets['dia'] = fecha_dt.dt.day.fillna(0).astype('int')
sets['mes'] = fecha_dt.dt.month.fillna(0).astype('int')
sets['dia_semana'] = fecha_dt.dt.dayofweek.fillna(0).astype('int')
# `columns=` is passed by keyword: pandas 2.0 no longer accepts the axis as a
# positional argument to DataFrame.drop.
sets = sets.drop(columns='fecha')

In [49]:
sets.head()

Unnamed: 0,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,compras_prod,vistas_prod,otros,visitas_sitio,person_int,dia,mes,dia_semana
3,4886f805,97,3,4,28,0,0,0,0,0,0,0,1,0,0,0,5476,18,5,4
56,0297fc1e,180,1,4,34,0,0,0,0,0,0,0,1,0,0,0,189,22,5,1
89,0297fc1e,181,1,2,21,0,0,0,0,0,0,0,1,0,0,0,189,10,5,3
313,0297fc1e,178,4,7,28,0,0,0,0,0,0,0,1,0,0,0,189,25,2,6
339,0297fc1e,178,1,7,28,0,0,0,0,0,0,0,1,0,0,0,189,31,1,2


In [50]:
sets.tail()

Unnamed: 0,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,compras_prod,vistas_prod,otros,visitas_sitio,person_int,dia,mes,dia_semana
1169787,fb88a7ea,0,0,0,0,1,15,7,1,60,68,66,0,0,0,1,19100,16,2,4
1169788,9707cd0e,0,0,0,0,1,82,7,1,127,49,275,0,0,0,1,11464,30,5,2
1169789,6f7632db,0,0,0,0,1,90,7,2,155,10,111,0,0,0,1,8449,21,5,0
1169790,a1c2a901,0,0,0,0,1,84,7,2,170,20,163,0,0,0,1,12257,17,5,3
1169791,ed3f80d7,0,0,0,0,1,82,7,2,170,24,111,0,0,0,1,18012,18,4,2


In [51]:
sets.person.describe()

count       196769
unique       19415
top       a0b57323
freq           636
Name: person, dtype: object

In [52]:
# Write the prediction feature frame to CSV as UTF-8, without the row index.
sets.to_csv('data/set_prediccion_8.csv', encoding='utf-8', index=False)

# Set Datos Entrenamiento

## Analisis TP1

In [53]:
training_set.columns

Index(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version', 'label', 'frecuencia'],
      dtype='object')

In [54]:
# Drop columns that the feature cells below never select.
unused_columns = ['url', 'skus', 'sku', 'city', 'search_engine', 'campaign_source', 'channel']
training_set = training_set.drop(columns=unused_columns)

In [55]:
# Parse `timestamp` as datetime for easier analysis and rename it to `fecha`.
training_set['timestamp'] = pd.to_datetime(training_set['timestamp'])
training_set = training_set.rename(columns={'timestamp': 'fecha'})
training_set.head()

Unnamed: 0,fecha,event,person,model,condition,storage,color,search_term,staticpage,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label,frecuencia
0,2018-05-18 00:11:27,viewed product,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,,,0,65
1,2018-05-18 00:23:33,viewed product,ad93850f,iPhone 5s,Muito Bom,64GB,Prateado,,,,,,,,,,0,65
2,2018-05-18 00:16:10,viewed product,ad93850f,iPhone 5s,Bom - Sem Touch ID,16GB,Cinza espacial,,,,,,,,,,0,65
3,2018-05-18 00:14:55,viewed product,ad93850f,iPhone 5s,Bom - Sem Touch ID,16GB,Dourado,,,,,,,,,,0,65
4,2018-05-18 00:11:26,ad campaign hit,ad93850f,,,,,,,,,,,,,,0,65


Vamos a crear las columnas indicadoras compras_prod, vistas_prod y otros (para productos) y visitas_sitio (para persona), al estilo de un One-Hot Encoding: cada fila lleva un 1 en la columna de su tipo de evento y 0 en las demás.

In [56]:
# Product details of the purchase-type events (conversion or checkout),
# carrying the training `label` along.
product_columns = [
    'event', 'fecha', 'person', 'model', 'condition', 'storage', 'color',
    'new_vs_returning', 'region', 'country', 'device_type',
    'screen_resolution', 'operating_system_version', 'browser_version',
    'label',
]
productos = training_set.loc[:, product_columns]
productos_comprados = (
    productos[productos['event'].isin(['conversion', 'checkout'])]
    .drop(columns=['event'])
    # Indicator columns: only `compras_prod` is set for these rows.
    .assign(compras_prod=1, vistas_prod=0, otros=0, visitas_sitio=0)
)
productos_comprados.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label,compras_prod,vistas_prod,otros,visitas_sitio
36,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,0,1,0,0,0
72,2018-05-18 00:44:49,1b9f7cf6,iPhone 6,Bom,64GB,Dourado,,,,,,,,0,1,0,0,0
125,2018-05-18 01:00:16,de8fe91b,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,0,1,0,0,0
152,2018-05-18 00:48:20,45baf068,Samsung Galaxy S6 Flat,Bom,32GB,Dourado,,,,,,,,0,1,0,0,0
1009,2018-05-17 20:49:37,64f45e8d,Samsung Galaxy S8 Plus,Excelente,64GB,Preto,,,,,,,,0,1,0,0,0


In [57]:
productos_comprados.count()

fecha                       24084
person                      24084
model                       24084
condition                   24084
storage                     24084
color                       24084
new_vs_returning                0
region                          0
country                         0
device_type                     0
screen_resolution               0
operating_system_version        0
browser_version                 0
label                       24084
compras_prod                24084
vistas_prod                 24084
otros                       24084
visitas_sitio               24084
dtype: int64

In [58]:
# productos_comprados = productos_comprados.drop_duplicates()
# productos_comprados.count()

In [59]:
# Details of the products that were viewed ('viewed product' events), without the event column.
# An explicit copy is taken so the flag columns are added to an independent frame
# rather than to a filtered slice of `productos` (chained-assignment hazard).
productos_visitados = productos.loc[productos['event'] == 'viewed product'].drop('event', axis=1).copy()

# Event-type indicator columns: only the product-view flag is set for these rows.
productos_visitados['compras_prod'] = 0
productos_visitados['vistas_prod'] = 1
productos_visitados['otros'] = 0
productos_visitados['visitas_sitio'] = 0
productos_visitados.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label,compras_prod,vistas_prod,otros,visitas_sitio
0,2018-05-18 00:11:27,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,0,0,1,0,0
1,2018-05-18 00:23:33,ad93850f,iPhone 5s,Muito Bom,64GB,Prateado,,,,,,,,0,0,1,0,0
2,2018-05-18 00:16:10,ad93850f,iPhone 5s,Bom - Sem Touch ID,16GB,Cinza espacial,,,,,,,,0,0,1,0,0
3,2018-05-18 00:14:55,ad93850f,iPhone 5s,Bom - Sem Touch ID,16GB,Dourado,,,,,,,,0,0,1,0,0
5,2018-05-16 02:48:16,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,0,0,1,0,0


In [60]:
# Non-null value count for each column of the viewed-product frame.
productos_visitados.count()

fecha                       247927
person                      247927
model                       247927
condition                   247927
storage                     247927
color                       247927
new_vs_returning                 0
region                           0
country                          0
device_type                      0
screen_resolution                0
operating_system_version         0
browser_version                  0
label                       247927
compras_prod                247927
vistas_prod                 247927
otros                       247927
visitas_sitio               247927
dtype: int64

In [61]:
# productos_visitados = productos_visitados.drop_duplicates()
# productos_visitados.count()

In [62]:
# Remove from the viewed-product rows every person who also appears in productos_comprados.
# This is an anti-join on `person`: a membership test keeps exactly the rows whose person has
# no purchase/checkout row, without the many-to-many merge (which multiplies every view by the
# person's purchase count before discarding it) and without the _x/_y column bookkeeping.
# Rows keep their original index labels and column order.
productos_visitados = productos_visitados[~productos_visitados['person'].isin(productos_comprados['person'])]
productos_visitados.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label,compras_prod,vistas_prod,otros,visitas_sitio
559,2018-05-18 00:18:24,9bb3af27,Samsung Galaxy S6 Flat,Bom,32GB,Dourado,,,,,,,,1,0,1,0,0
560,2018-05-18 00:15:33,9bb3af27,Samsung Galaxy S5,Bom,16GB,Preto,,,,,,,,1,0,1,0,0
561,2018-05-18 00:20:10,9bb3af27,Samsung Galaxy S5,Bom,16GB,Branco,,,,,,,,1,0,1,0,0
562,2018-05-18 00:21:58,9bb3af27,Samsung Galaxy S6 Flat,Bom,32GB,Dourado,,,,,,,,1,0,1,0,0
563,2018-05-18 00:14:35,9bb3af27,Samsung Galaxy S5,Bom,16GB,Dourado,,,,,,,,1,0,1,0,0


In [63]:
# Non-null value count per column after removing people who appear in productos_comprados.
productos_visitados.count()

fecha                       36472
person                      36472
model                       36472
condition                   36472
storage                     36472
color                       36472
new_vs_returning                0
region                          0
country                         0
device_type                     0
screen_resolution               0
operating_system_version        0
browser_version                 0
label                       36472
compras_prod                36472
vistas_prod                 36472
otros                       36472
visitas_sitio               36472
dtype: int64

In [64]:
# Columns of training_set used for the remaining ("other") event types.
otros = training_set.loc[:, ['event','fecha','person', 'model', 'condition', 'storage', 'color', 'new_vs_returning','region','country','device_type','screen_resolution','operating_system_version','browser_version', 'label']]

# Keep only 'generic listing' and 'lead' events and discard the event column.
# An explicit copy is taken so the flag columns are added to an independent frame
# rather than to a filtered slice (chained-assignment hazard).
otros = otros[otros['event'].isin(['generic listing', 'lead'])].drop('event', axis=1).copy()

# Event-type indicator columns: only the "other events" flag is set for these rows.
otros['compras_prod'] = 0
otros['vistas_prod'] = 0
otros['otros'] = 1
otros['visitas_sitio'] = 0
otros.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label,compras_prod,vistas_prod,otros,visitas_sitio
38,2018-05-14 23:50:23,ad93850f,,,,,,,,,,,,0,0,0,1,0
46,2018-05-18 00:12:55,ad93850f,,,,,,,,,,,,0,0,0,1,0
53,2018-05-18 00:23:21,ad93850f,,,,,,,,,,,,0,0,0,1,0
54,2018-05-18 22:12:18,ad93850f,,,,,,,,,,,,0,0,0,1,0
57,2018-05-18 22:11:47,ad93850f,,,,,,,,,,,,0,0,0,1,0


In [65]:
# Non-null value count for each column of the generic-listing/lead frame.
otros.count()

fecha                       38815
person                      38815
model                         128
condition                       0
storage                         0
color                           0
new_vs_returning                0
region                          0
country                         0
device_type                     0
screen_resolution               0
operating_system_version        0
browser_version                 0
label                       38815
compras_prod                38815
vistas_prod                 38815
otros                       38815
visitas_sitio               38815
dtype: int64

In [66]:
# otros = otros.drop_duplicates()
# otros.count()

In [67]:
# Remove from `otros` every person who also appears in productos_comprados.
# This is an anti-join on `person`: a membership test keeps exactly the rows whose person has
# no purchase/checkout row, without the many-to-many merge (which multiplies rows before
# discarding them) and without the _x/_y column bookkeeping.
# Rows keep their original index labels and column order.
otros = otros[~otros['person'].isin(productos_comprados['person'])]
otros.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label,compras_prod,vistas_prod,otros,visitas_sitio
162,2018-05-18 00:11:31,9bb3af27,,,,,,,,,,,,1,0,0,1,0
163,2018-05-18 00:20:28,9bb3af27,,,,,,,,,,,,1,0,0,1,0
164,2018-05-18 00:22:23,9bb3af27,,,,,,,,,,,,1,0,0,1,0
165,2018-05-18 19:28:08,9bb3af27,,,,,,,,,,,,1,0,0,1,0
166,2018-05-18 19:26:43,9bb3af27,,,,,,,,,,,,1,0,0,1,0


In [68]:
# Non-null value count per column after removing people who appear in productos_comprados.
otros.count()

fecha                       5898
person                      5898
model                         27
condition                      0
storage                        0
color                          0
new_vs_returning               0
region                         0
country                        0
device_type                    0
screen_resolution              0
operating_system_version       0
browser_version                0
label                       5898
compras_prod                5898
vistas_prod                 5898
otros                       5898
visitas_sitio               5898
dtype: int64

In [69]:
# Remove from `otros` every person who also appears in productos_visitados.
# This is an anti-join on `person`: a membership test keeps exactly the rows whose person has
# no viewed-product row, without the many-to-many merge (which multiplies rows before
# discarding them) and without the _x/_y column bookkeeping.
# Rows keep their original index labels and column order.
otros = otros[~otros['person'].isin(productos_visitados['person'])]
otros.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label,compras_prod,vistas_prod,otros,visitas_sitio
15646,2018-05-29 19:45:16,f33e2cc5,,,,,,,,,,,,0,0,0,1,0
15647,2018-05-19 10:31:43,fb3c6e61,,,,,,,,,,,,0,0,0,1,0
30901,2018-05-29 21:21:02,4042a213,,,,,,,,,,,,0,0,0,1,0
30902,2018-05-18 14:20:44,9a84d509,,,,,,,,,,,,0,0,0,1,0
30903,2018-05-18 14:24:27,9a84d509,,,,,,,,,,,,0,0,0,1,0


In [70]:
# Non-null value count per column after also removing people who appear in productos_visitados.
otros.count()

fecha                       350
person                      350
model                         2
condition                     0
storage                       0
color                         0
new_vs_returning              0
region                        0
country                       0
device_type                   0
screen_resolution             0
operating_system_version      0
browser_version               0
label                       350
compras_prod                350
vistas_prod                 350
otros                       350
visitas_sitio               350
dtype: int64

In [71]:
# Characteristics of the device with which the user visited the site ('visited site' events).
visitas = training_set.loc[:, ['event','fecha','person', 'model', 'condition', 'storage', 'color', 'new_vs_returning','region','country','device_type','screen_resolution','operating_system_version','browser_version', 'label']]

# An explicit copy is taken so the flag columns are added to an independent frame
# rather than to a filtered slice of `visitas` (chained-assignment hazard).
features_usuario = visitas.loc[visitas['event'] == 'visited site'].drop('event', axis=1).copy()

# Event-type indicator columns: only the site-visit flag is set for these rows.
features_usuario['compras_prod'] = 0
features_usuario['vistas_prod'] = 0
features_usuario['otros'] = 0
features_usuario['visitas_sitio'] = 1
features_usuario.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label,compras_prod,vistas_prod,otros,visitas_sitio
60,2018-05-14 23:50:22,ad93850f,,,,,New,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0,0,0,0,1
61,2018-05-16 02:48:13,ad93850f,,,,,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0,0,0,0,1
62,2018-05-18 00:11:26,ad93850f,,,,,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0,0,0,0,1
63,2018-05-18 22:11:46,ad93850f,,,,,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0,0,0,0,1
64,2018-05-22 22:41:31,ad93850f,,,,,Returning,Sao Paulo,Brazil,Smartphone,360x640,Android 5.1.1,Chrome Mobile 66.0,0,0,0,0,1


In [72]:
# Non-null value count for each column of the site-visit frame.
features_usuario.count()

fecha                       51457
person                      51457
model                           0
condition                       0
storage                         0
color                           0
new_vs_returning            51457
region                      51457
country                     51457
device_type                 51457
screen_resolution           51456
operating_system_version    51457
browser_version             51457
label                       51457
compras_prod                51457
vistas_prod                 51457
otros                       51457
visitas_sitio               51457
dtype: int64

In [73]:
# Drop the two column-subset frames; the per-event frames built from them are kept.
del productos, visitas

In [75]:
# Stack the four per-event frames row-wise into a single frame.
# The list is kept under the name `frames` because a later cell deletes it by name.
frames = [productos_comprados, productos_visitados, otros, features_usuario]
sets = pd.concat(frames)
sets.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label,compras_prod,vistas_prod,otros,visitas_sitio
36,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,0,1,0,0,0
72,2018-05-18 00:44:49,1b9f7cf6,iPhone 6,Bom,64GB,Dourado,,,,,,,,0,1,0,0,0
125,2018-05-18 01:00:16,de8fe91b,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,0,1,0,0,0
152,2018-05-18 00:48:20,45baf068,Samsung Galaxy S6 Flat,Bom,32GB,Dourado,,,,,,,,0,1,0,0,0
1009,2018-05-17 20:49:37,64f45e8d,Samsung Galaxy S8 Plus,Excelente,64GB,Preto,,,,,,,,0,1,0,0,0


In [76]:
# Drop the names of the per-event frames now that they are combined in `sets`.
del frames, features_usuario, productos_comprados, productos_visitados, otros

In [77]:
# Non-null value count for each column of the combined frame.
sets.count()

fecha                       112363
person                      112363
model                        60558
condition                    60556
storage                      60556
color                        60556
new_vs_returning             51457
region                       51457
country                      51457
device_type                  51457
screen_resolution            51456
operating_system_version     51457
browser_version              51457
label                       112363
compras_prod                112363
vistas_prod                 112363
otros                       112363
visitas_sitio               112363
dtype: int64

In [79]:
# Fill missing values of the four event-flag columns with 0 (other columns are left untouched).
sets = sets.fillna({'compras_prod': 0, 'vistas_prod': 0, 'otros': 0, 'visitas_sitio': 0})

In [80]:
# Non-null value count per column after filling the event-flag columns.
sets.count()

fecha                       112363
person                      112363
model                        60558
condition                    60556
storage                      60556
color                        60556
new_vs_returning             51457
region                       51457
country                      51457
device_type                  51457
screen_resolution            51456
operating_system_version     51457
browser_version              51457
label                       112363
compras_prod                112363
vistas_prod                 112363
otros                       112363
visitas_sitio               112363
dtype: int64

In [82]:
# Fill missing values of the `label` column with 0.
sets = sets.fillna({'label': 0})
sets.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label,compras_prod,vistas_prod,otros,visitas_sitio
36,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,0,1,0,0,0
72,2018-05-18 00:44:49,1b9f7cf6,iPhone 6,Bom,64GB,Dourado,,,,,,,,0,1,0,0,0
125,2018-05-18 01:00:16,de8fe91b,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,0,1,0,0,0
152,2018-05-18 00:48:20,45baf068,Samsung Galaxy S6 Flat,Bom,32GB,Dourado,,,,,,,,0,1,0,0,0
1009,2018-05-17 20:49:37,64f45e8d,Samsung Galaxy S8 Plus,Excelente,64GB,Preto,,,,,,,,0,1,0,0,0


In [83]:
# (rows, columns) of the subset with label == 1.
sets.loc[sets['label'].eq(1)].shape

(24899, 18)

In [84]:
# (rows, columns) of the subset with label == 0.
sets.loc[sets['label'].eq(0)].shape

(87464, 18)

In [86]:
# Fill every remaining missing value with the empty string.
sets = sets.fillna('')

In [87]:
# Non-null value count per column after filling the remaining missing values with ''.
sets.count()

fecha                       112363
person                      112363
model                       112363
condition                   112363
storage                     112363
color                       112363
new_vs_returning            112363
region                      112363
country                     112363
device_type                 112363
screen_resolution           112363
operating_system_version    112363
browser_version             112363
label                       112363
compras_prod                112363
vistas_prod                 112363
otros                       112363
visitas_sitio               112363
dtype: int64

In [88]:
# Summary of the `person` column (count, unique, top, freq).
sets.person.describe()

count       112363
unique       16779
top       ecd79d0a
freq           728
Name: person, dtype: object

## Categorización de los features

In [89]:
# Add a `person_int` column: the alphanumeric `person` value encoded as an integer,
# for use in training. The shared `labelencoder` is refit on this column.
sets = sets.assign(person_int=labelencoder.fit_transform(sets['person']))
sets.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label,compras_prod,vistas_prod,otros,visitas_sitio,person_int
36,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,0,1,0,0,0,11288
72,2018-05-18 00:44:49,1b9f7cf6,iPhone 6,Bom,64GB,Dourado,,,,,,,,0,1,0,0,0,1819
125,2018-05-18 01:00:16,de8fe91b,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,0,1,0,0,0,14531
152,2018-05-18 00:48:20,45baf068,Samsung Galaxy S6 Flat,Bom,32GB,Dourado,,,,,,,,0,1,0,0,0,4572
1009,2018-05-17 20:49:37,64f45e8d,Samsung Galaxy S8 Plus,Excelente,64GB,Preto,,,,,,,,0,1,0,0,0,6606


In [90]:
# Cast `compras_prod` to an integer dtype.
sets = sets.astype({'compras_prod': 'int'})
sets.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label,compras_prod,vistas_prod,otros,visitas_sitio,person_int
36,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,0,1,0,0,0,11288
72,2018-05-18 00:44:49,1b9f7cf6,iPhone 6,Bom,64GB,Dourado,,,,,,,,0,1,0,0,0,1819
125,2018-05-18 01:00:16,de8fe91b,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,0,1,0,0,0,14531
152,2018-05-18 00:48:20,45baf068,Samsung Galaxy S6 Flat,Bom,32GB,Dourado,,,,,,,,0,1,0,0,0,4572
1009,2018-05-17 20:49:37,64f45e8d,Samsung Galaxy S8 Plus,Excelente,64GB,Preto,,,,,,,,0,1,0,0,0,6606


In [91]:
# Cast `vistas_prod` to an integer dtype.
sets = sets.astype({'vistas_prod': 'int'})
sets.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label,compras_prod,vistas_prod,otros,visitas_sitio,person_int
36,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,0,1,0,0,0,11288
72,2018-05-18 00:44:49,1b9f7cf6,iPhone 6,Bom,64GB,Dourado,,,,,,,,0,1,0,0,0,1819
125,2018-05-18 01:00:16,de8fe91b,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,0,1,0,0,0,14531
152,2018-05-18 00:48:20,45baf068,Samsung Galaxy S6 Flat,Bom,32GB,Dourado,,,,,,,,0,1,0,0,0,4572
1009,2018-05-17 20:49:37,64f45e8d,Samsung Galaxy S8 Plus,Excelente,64GB,Preto,,,,,,,,0,1,0,0,0,6606


In [92]:
# Cast `otros` to an integer dtype.
sets = sets.astype({'otros': 'int'})
sets.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label,compras_prod,vistas_prod,otros,visitas_sitio,person_int
36,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,0,1,0,0,0,11288
72,2018-05-18 00:44:49,1b9f7cf6,iPhone 6,Bom,64GB,Dourado,,,,,,,,0,1,0,0,0,1819
125,2018-05-18 01:00:16,de8fe91b,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,0,1,0,0,0,14531
152,2018-05-18 00:48:20,45baf068,Samsung Galaxy S6 Flat,Bom,32GB,Dourado,,,,,,,,0,1,0,0,0,4572
1009,2018-05-17 20:49:37,64f45e8d,Samsung Galaxy S8 Plus,Excelente,64GB,Preto,,,,,,,,0,1,0,0,0,6606


In [93]:
# Cast `visitas_sitio` to an integer dtype.
sets = sets.astype({'visitas_sitio': 'int'})
sets.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label,compras_prod,vistas_prod,otros,visitas_sitio,person_int
36,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,0,1,0,0,0,11288
72,2018-05-18 00:44:49,1b9f7cf6,iPhone 6,Bom,64GB,Dourado,,,,,,,,0,1,0,0,0,1819
125,2018-05-18 01:00:16,de8fe91b,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,0,1,0,0,0,14531
152,2018-05-18 00:48:20,45baf068,Samsung Galaxy S6 Flat,Bom,32GB,Dourado,,,,,,,,0,1,0,0,0,4572
1009,2018-05-17 20:49:37,64f45e8d,Samsung Galaxy S8 Plus,Excelente,64GB,Preto,,,,,,,,0,1,0,0,0,6606


In [94]:
# Cast `label` to an integer dtype.
sets = sets.astype({'label': 'int'})
sets.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label,compras_prod,vistas_prod,otros,visitas_sitio,person_int
36,2018-05-14 23:54:19,ad93850f,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,0,1,0,0,0,11288
72,2018-05-18 00:44:49,1b9f7cf6,iPhone 6,Bom,64GB,Dourado,,,,,,,,0,1,0,0,0,1819
125,2018-05-18 01:00:16,de8fe91b,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,0,1,0,0,0,14531
152,2018-05-18 00:48:20,45baf068,Samsung Galaxy S6 Flat,Bom,32GB,Dourado,,,,,,,,0,1,0,0,0,4572
1009,2018-05-17 20:49:37,64f45e8d,Samsung Galaxy S8 Plus,Excelente,64GB,Preto,,,,,,,,0,1,0,0,0,6606


In [95]:
# Encode each categorical feature as integer codes with the shared label encoder.
# The encoder is refit per column, in this order, so after the loop it holds the
# fit for 'browser_version' only.
for _columna in ['model', 'condition', 'storage', 'color', 'new_vs_returning',
                 'region', 'country', 'device_type', 'screen_resolution',
                 'operating_system_version', 'browser_version']:
    sets[_columna] = labelencoder.fit_transform(sets[_columna])
del _columna
sets.head()

Unnamed: 0,fecha,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,operating_system_version,browser_version,label,compras_prod,vistas_prod,otros,visitas_sitio,person_int
36,2018-05-14 23:54:19,ad93850f,172,4,4,21,0,0,0,0,0,0,0,0,1,0,0,0,11288
72,2018-05-18 00:44:49,1b9f7cf6,173,1,7,28,0,0,0,0,0,0,0,0,1,0,0,0,1819
125,2018-05-18 01:00:16,de8fe91b,93,3,4,28,0,0,0,0,0,0,0,0,1,0,0,0,14531
152,2018-05-18 00:48:20,45baf068,120,1,4,28,0,0,0,0,0,0,0,0,1,0,0,0,4572
1009,2018-05-17 20:49:37,64f45e8d,124,3,7,38,0,0,0,0,0,0,0,0,1,0,0,0,6606


In [97]:
# Split the event timestamp into day of month, month and day of week
# (the original note states the year is always 2018, so it is not kept).
# String cells (such as the '' filler applied to missing values earlier) have no
# date parts and map to 0; isinstance also covers str subclasses.
sets['dia'] = sets['fecha'].apply(lambda x: 0 if isinstance(x, str) else x.day).astype('int')
sets['mes'] = sets['fecha'].apply(lambda x: 0 if isinstance(x, str) else x.month).astype('int')
sets['dia_semana'] = sets['fecha'].apply(lambda x: 0 if isinstance(x, str) else x.dayofweek).astype('int')
# `axis` is passed by keyword: the positional form drop('fecha', 1) is rejected by pandas >= 2.0.
sets = sets.drop('fecha', axis=1)

In [98]:
# First rows of the frame after replacing `fecha` with the dia/mes/dia_semana columns.
sets.head()

Unnamed: 0,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,...,browser_version,label,compras_prod,vistas_prod,otros,visitas_sitio,person_int,dia,mes,dia_semana
36,ad93850f,172,4,4,21,0,0,0,0,0,...,0,0,1,0,0,0,11288,14,5,0
72,1b9f7cf6,173,1,7,28,0,0,0,0,0,...,0,0,1,0,0,0,1819,18,5,4
125,de8fe91b,93,3,4,28,0,0,0,0,0,...,0,0,1,0,0,0,14531,18,5,4
152,45baf068,120,1,4,28,0,0,0,0,0,...,0,0,1,0,0,0,4572,18,5,4
1009,64f45e8d,124,3,7,38,0,0,0,0,0,...,0,0,1,0,0,0,6606,17,5,3


In [99]:
# Last rows of the frame (these come from the last concatenated frame, the site visits).
sets.tail()

Unnamed: 0,person,model,condition,storage,color,new_vs_returning,region,country,device_type,screen_resolution,...,browser_version,label,compras_prod,vistas_prod,otros,visitas_sitio,person_int,dia,mes,dia_semana
1171880,af3374ad,0,0,0,0,2,40,5,2,145,...,99,0,0,0,0,1,11406,15,5,1
1171881,023581f9,0,0,0,0,1,40,5,2,145,...,99,0,0,0,0,1,140,27,5,6
1171882,88b91c5d,0,0,0,0,1,60,5,2,145,...,97,0,0,0,0,1,8871,20,5,6
1171883,2adb3684,0,0,0,0,1,74,5,2,153,...,212,0,0,0,0,1,2760,21,4,5
1171884,39fa45b6,0,0,0,0,1,61,5,2,153,...,212,0,0,0,0,1,3765,22,4,6


In [100]:
# Non-null value count for each column of the final training frame.
sets.count()

person                      112363
model                       112363
condition                   112363
storage                     112363
color                       112363
new_vs_returning            112363
region                      112363
country                     112363
device_type                 112363
screen_resolution           112363
operating_system_version    112363
browser_version             112363
label                       112363
compras_prod                112363
vistas_prod                 112363
otros                       112363
visitas_sitio               112363
person_int                  112363
dia                         112363
mes                         112363
dia_semana                  112363
dtype: int64

In [101]:
# Summary of the `person` column (count, unique, top, freq) in the final training frame.
sets.person.describe()

count       112363
unique       16779
top       ecd79d0a
freq           728
Name: person, dtype: object

In [102]:
# (rows, columns) of the final subset with label == 1.
sets.loc[sets['label'].eq(1)].shape

(24899, 21)

In [103]:
# (rows, columns) of the final subset with label == 0.
sets.loc[sets['label'].eq(0)].shape

(87464, 21)

In [104]:
# Write the training frame to CSV as UTF-8, without the index column.
sets.to_csv(
    'data/set_entrenamiento_13.csv',
    index=False,
    encoding='utf-8',
)