In [1]:
%%javascript

IPython.keyboard_manager.command_shortcuts.add_shortcut('Ctrl-k', {
    help : 'move up selected cells',
    help_index : 'jupyter-notebook:move-selection-up',
    handler : function (event) {
        IPython.notebook.move_selection_up();
        return false;
    }}
);

IPython.keyboard_manager.command_shortcuts.add_shortcut('Ctrl-j', {
    help : 'move down selected cells',
    help_index : 'jupyter-notebook:move-selection-down',
    handler :  function (event) {
        IPython.notebook.move_selection_down();
        return false;
    }}
);

<IPython.core.display.Javascript object>

In [1]:
import sys #access to system parameters https://docs.python.org/3/library/sys.html

import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features

import matplotlib #collection of functions for scientific and publication-ready visualization

import numpy as np #foundational package for scientific computing

import scipy as sp #collection of functions for scientific computing and advance mathematics

import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook

import sklearn #collection of machine learning algorithms

#misc libraries
import random
import time
import datetime as dt

#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)

import featuretools as ft
from sklearn.feature_extraction.text import CountVectorizer

-------------------------


In [2]:
#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
import xgboost as xgb

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn.model_selection import train_test_split

from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.tools.plotting import scatter_matrix

#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8



In [3]:
data_raw = pd.read_csv('events_up_to_01062018.csv')
data_val = pd.read_csv('labels_training_set.csv')

In [4]:
data_raw.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,,


In [5]:
df_labels = data_val.copy(deep=True) 
df = data_raw.copy(deep=True)

In [6]:
pd.set_option('display.max_columns', 23)

In [7]:
df.describe(include= 'all')

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
count,2341681,2341681,2341681,191131,1320530.0,1321513,1320530,1320530,1320530,505949,113763,11201,191286,106406,204069,204069,204069,204069,204069,204069,204066,204069,204069
unique,1490912,11,38829,248,,208,5,8,63,52267,10964,14,23,4,7,2,2206,122,51,4,393,131,366
top,2018-05-30 23:13:56,viewed product,c76b8417,/,,iPhone 6,Bom,16GB,Preto,"2820,6706,6720,2750,6649,7251,6663,12604,7224,...",Iphone,CustomerService,google,Google,Paid,Returning,Unknown,Sao Paulo,Brazil,Smartphone,360x640,Windows 7,Chrome 66.0
freq,14,1248124,4438,64187,,107262,547617,442096,314925,2606,2577,5239,123354,105195,91753,165827,36866,57304,197699,103502,73234,46648,57953
mean,,,,,6899.178,,,,,,,,,,,,,,,,,,
std,,,,,4028.042,,,,,,,,,,,,,,,,,,
min,,,,,71.0,,,,,,,,,,,,,,,,,,
25%,,,,,2929.0,,,,,,,,,,,,,,,,,,
50%,,,,,7057.0,,,,,,,,,,,,,,,,,,
75%,,,,,10014.0,,,,,,,,,,,,,,,,,,


In [8]:
persons = (df.drop_duplicates('person'))['person'].to_frame()
persons_to_train = df_labels['person'].to_frame()
persons_to_predict = persons.loc[~persons['person'].isin(persons_to_train['person'])]


print(persons_to_train.shape)
print(persons_to_predict.shape)


(19414, 1)
(19415, 1)


In [9]:
df.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,,,,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,,,,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,,,,,,,,,,,,,


In [10]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values(['timestamp','event'])
df = df.reset_index(drop=True)



In [11]:
df_dates_per_month = persons
df_months = df
df_months['month'] = df_months['timestamp'].dt.month
df_months['days'] = df_months['timestamp'].dt.day

In [12]:
df_month = df_months.loc[df['month'] == 5]
df_dates = df_month.groupby('person').agg({'timestamp':['max', 'min']}).reset_index()
df_dates.columns = ['person', 'last month 5', 'first month 5']
df_dates['diferencia 5'] = (df_dates['last month 5'] - df_dates['first month 5']).dt.days



In [13]:
df_dates_per_month =df_dates_per_month.merge(df_dates, on='person', how='left')

In [14]:
es = ft.EntitySet(id = 'person')
es = es.entity_from_dataframe(entity_id = 'person_id', dataframe = df_dates_per_month, index = 'person')

In [15]:
%time feat_date, feature_names = ft.dfs(entityset=es, target_entity='person_id', max_depth = 2)

CPU times: user 1.02 s, sys: 0 ns, total: 1.02 s
Wall time: 1.02 s


In [16]:
feat_date.head()

Unnamed: 0_level_0,diferencia 5,DAY(last month 5),DAY(first month 5),YEAR(last month 5),YEAR(first month 5),MONTH(last month 5),MONTH(first month 5),WEEKDAY(last month 5),WEEKDAY(first month 5)
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0008ed71,0.0,17.0,17.0,2018.0,2018.0,5.0,5.0,3.0,3.0
00091926,27.0,31.0,3.0,2018.0,2018.0,5.0,5.0,3.0,3.0
00091a7a,,,,,,,,,
000ba417,9.0,26.0,17.0,2018.0,2018.0,5.0,5.0,5.0,3.0
000c79fe,0.0,29.0,29.0,2018.0,2018.0,5.0,5.0,1.0,1.0


In [17]:
feat_date['person'] = feat_date.index
feat_date = feat_date.reset_index(drop=True)
feat_date.head()

Unnamed: 0,diferencia 5,DAY(last month 5),DAY(first month 5),YEAR(last month 5),YEAR(first month 5),MONTH(last month 5),MONTH(first month 5),WEEKDAY(last month 5),WEEKDAY(first month 5),person
0,0.0,17.0,17.0,2018.0,2018.0,5.0,5.0,3.0,3.0,0008ed71
1,27.0,31.0,3.0,2018.0,2018.0,5.0,5.0,3.0,3.0,00091926
2,,,,,,,,,,00091a7a
3,9.0,26.0,17.0,2018.0,2018.0,5.0,5.0,5.0,3.0,000ba417
4,0.0,29.0,29.0,2018.0,2018.0,5.0,5.0,1.0,1.0,000c79fe


In [18]:
feat_date.describe(include = 'all')



Unnamed: 0,diferencia 5,DAY(last month 5),DAY(first month 5),YEAR(last month 5),YEAR(first month 5),MONTH(last month 5),MONTH(first month 5),WEEKDAY(last month 5),WEEKDAY(first month 5),person
count,37143.0,37143.0,37143.0,37143.0,37143.0,37143.0,37143.0,37143.0,37143.0,38829
unique,,,,,,,,,,38829
top,,,,,,,,,,c6083543
freq,,,,,,,,,,1
mean,4.732763,24.545379,19.564306,2018.0,2018.0,5.0,5.0,2.551813,2.571359,
std,7.61219,5.688888,8.168224,0.0,0.0,0.0,0.0,1.695478,1.79957,
min,0.0,1.0,1.0,2018.0,2018.0,5.0,5.0,0.0,0.0,
25%,0.0,21.0,15.0,2018.0,2018.0,5.0,5.0,1.0,1.0,
50%,0.0,25.0,21.0,2018.0,2018.0,5.0,5.0,2.0,2.0,
75%,7.0,30.0,26.0,2018.0,2018.0,5.0,5.0,3.0,4.0,


In [19]:
feat_date['diferencia 5'].fillna(int(feat_date['diferencia 5'].median()), inplace = True)
feat_date['DAY(last month 5)'].fillna(int(feat_date['DAY(last month 5)'].mean()), inplace = True)
feat_date['DAY(first month 5)'].fillna(int(feat_date['DAY(first month 5)'].mean()), inplace = True)
feat_date['WEEKDAY(last month 5)'].fillna(int(feat_date['WEEKDAY(last month 5)'].mean()), inplace = True)
feat_date['WEEKDAY(first month 5)'].fillna(int(feat_date['WEEKDAY(first month 5)'].mean()), inplace = True)
lista = ['diferencia 5','DAY(last month 5)', 'DAY(first month 5)','WEEKDAY(last month 5)','WEEKDAY(first month 5)','person']

feat_date = feat_date[lista]

 


In [20]:
feat_date = persons.merge(feat_date, on='person', how='left')

# Feature Cantidad de dias que vio un mismo modelo

#### Dias que vio su producto mas visto

In [21]:
df['day'] = df['timestamp'].dt.day

In [22]:
df_model_viewed = df.loc[df['event'] == 'viewed product']
df_model_viewed = df_model_viewed.loc[df_model_viewed['month'] == 5]

df_model_viewed['count']= 1

In [23]:
df_model_viewed = df_model_viewed.groupby(['person','month','day', 'model']).agg({'count':'sum'}).reset_index()
df_model_viewed.head()

Unnamed: 0,person,month,day,model,count
0,91926,5,3,Motorola Moto X Style,2
1,91926,5,3,Samsung Galaxy A7 2017,1
2,91926,5,3,iPhone 6S,1
3,91926,5,3,iPhone 7,2
4,91926,5,5,Motorola Moto X Style,2


In [24]:
df_model_viewed['days'] = 1
df_model_viewed = df_model_viewed.groupby(['person', 'model']).agg({'count':'sum','days':'sum'}).reset_index()


In [25]:
df_model_viewed = df_model_viewed.sort_values(by=['count'],ascending = [False]).drop_duplicates(subset='person',keep='first')

In [26]:
df_model_viewed = df_model_viewed.drop(['model','count'],axis=1)

In [27]:
df_model_viewed = df_model_viewed.merge(persons, on='person', how = 'right')

In [28]:
df_model_viewed.head()

Unnamed: 0,person,days
0,aa297476,16.0
1,b793fc69,11.0
2,069c5926,11.0
3,eb6c498c,22.0
4,455b577c,15.0


## Mean


In [29]:
df_model_viewed_mean = df.loc[df['event'] == 'viewed product']
df_model_viewed_mean = df_model_viewed_mean.loc[df_model_viewed_mean['month'] == 5]

df_model_viewed_mean['count']= 1

In [30]:
df_model_viewed_mean = df_model_viewed_mean.groupby(['person','month','day', 'model']).agg({'count':'sum'}).reset_index()
df_model_viewed_mean.head()

Unnamed: 0,person,month,day,model,count
0,91926,5,3,Motorola Moto X Style,2
1,91926,5,3,Samsung Galaxy A7 2017,1
2,91926,5,3,iPhone 6S,1
3,91926,5,3,iPhone 7,2
4,91926,5,5,Motorola Moto X Style,2


In [31]:
df_model_viewed_mean['days'] = 1
df_model_viewed_mean = df_model_viewed_mean.groupby(['person', 'model']).agg({'count':'sum','days':'sum'}).reset_index()
df_model_viewed_mean.head()

Unnamed: 0,person,model,count,days
0,91926,LG G4 H815P,1,1
1,91926,LG G5 SE,1,1
2,91926,Lenovo Vibe K5,1,1
3,91926,Motorola Moto G4 Play DTV,5,1
4,91926,Motorola Moto G5,2,1


In [32]:
df_model_viewed_mean = df_model_viewed_mean.groupby(['person']).agg({'count':['mean'],'days':['mean','max']}).reset_index()
df_model_viewed_mean.head()

Unnamed: 0_level_0,person,count,days,days
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,max
0,00091926,10.333333,2.5,16
1,000ba417,5.884615,1.346154,3
2,000c79fe,3.0,1.0,1
3,000e4d9e,9.162162,1.837838,9
4,000e619d,1.217391,1.043478,2


In [33]:
df_model_viewed_mean.columns = ['person','viewed_model_mean','models_viewed_days_mean','models_viewed_days_max']


In [34]:
df_model_viewed_mean.head()

Unnamed: 0,person,viewed_model_mean,models_viewed_days_mean,models_viewed_days_max
0,00091926,10.333333,2.5,16
1,000ba417,5.884615,1.346154,3
2,000c79fe,3.0,1.0,1
3,000e4d9e,9.162162,1.837838,9
4,000e619d,1.217391,1.043478,2


In [35]:
feat_day_model_stats = df_model_viewed_mean
feat_day_model_stats = persons.merge(feat_day_model_stats,on='person', how='left')


In [36]:
#feat_day_model_stats = feat_day_model_stats.merge(df_model_viewed_mean,on='person', how='left')
feat_day_model_stats.head()

Unnamed: 0,person,viewed_model_mean,models_viewed_days_mean,models_viewed_days_max
0,4886f805,4.0,1.0,1.0
1,ad93850f,6.666667,1.666667,3.0
2,0297fc1e,12.090909,3.636364,14.0
3,2d681dd8,4.333333,1.0,1.0
4,cccea85e,12.964912,2.578947,10.0


## XGboost entrenamiento para evaluar feature solo

In [37]:
df_train = df_labels.merge(df_model_viewed_mean , left_on='person', right_on='person' , how='inner')

In [38]:
df_train.head()

Unnamed: 0,person,label,viewed_model_mean,models_viewed_days_mean,models_viewed_days_max
0,0566e9c1,0,1.916667,1.083333,2
1,abe7a2fb,0,4.666667,1.333333,2
2,34728364,0,2.181818,1.181818,2
3,87ed62de,0,9.0,2.0,2
4,cde431db,0,4.0,1.5,2


In [39]:
X, y = df_train.iloc[:,2:],df_train.iloc[:,1]
X.head()

Unnamed: 0,viewed_model_mean,models_viewed_days_mean,models_viewed_days_max
0,1.916667,1.083333,2
1,4.666667,1.333333,2
2,2.181818,1.181818,2
3,9.0,2.0,2
4,4.0,1.5,2


In [40]:
import xgboost as xgb
model = xgb.XGBClassifier(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 7,
                subsample = 0.8,
                gamma = 1,
                n_estimators = 65)



Este es el arbol con sus hiperparametros

In [41]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [42]:
model.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=1, learning_rate=0.1, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=65,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

In [43]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,  model.predict_proba(X_test)[:,1])

0.5902427439955842

# Visited ppl


In [44]:
df['event'].value_counts()

viewed product       1248124
brand listing         216312
visited site          204069
ad campaign hit       191388
generic listing       160176
searched products     130616
search engine hit     106406
checkout               65315
staticpage             11201
conversion              7091
lead                     983
Name: event, dtype: int64

In [45]:
print((df.loc[df['event'] == 'viewed product']).drop_duplicates('person').shape)

print((df.loc[df['event'] == 'visited site']).drop_duplicates('person').shape)

print((df.loc[df['event'] == 'ad campaign hit']).drop_duplicates('person').shape)

print((df.loc[df['event'] == 'checkout']).drop_duplicates('person').shape)



(37130, 26)
(38242, 26)
(31900, 26)
(32833, 26)


In [46]:
(df.loc[df['event'] == 'visited site']).drop_duplicates('person').shape

(38242, 26)

In [47]:
df_visited = (df.loc[df['event'] == 'visited site'])
df_visited = df_visited.loc[df_visited['month']==5]

df_visited['count'] = 1

In [48]:
df_visited = df_visited.groupby(['person','month', 'day']).agg({'count':'sum'}).reset_index()


In [49]:
df_visited = df_visited.groupby('person').agg({'count':['sum' , 'std','mean']}).reset_index()

In [50]:
df_visited.head()

Unnamed: 0_level_0,person,count,count,count
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,std,mean
0,0008ed71,2,,2.0
1,00091926,34,1.100964,1.545455
2,000ba417,6,1.0,2.0
3,000c79fe,1,,1.0
4,000e4d9e,13,0.726483,1.444444


In [51]:
df_visited.columns = ['person' , 'visited_sum', 'visited_std', 'visited_mean']

# Checkout ppl

In [52]:
(df.loc[df['event'] == 'checkout']).drop_duplicates('person').shape

(32833, 26)

In [53]:
df_checkout = (df.loc[df['event'] == 'checkout'])
df_checkout = df_checkout.loc[df_checkout['month']==5]
df_checkout['count'] = 1

In [54]:
df_checkout = df_checkout.groupby(['person','month', 'day']).agg({'count':'sum'}).reset_index()


In [55]:
df_checkout = df_checkout.groupby('person').agg({'count':['sum' ,'mean']}).reset_index()

In [56]:
df_checkout.head()

Unnamed: 0_level_0,person,count,count
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean
0,0008ed71,3,3.0
1,00091926,2,1.0
2,000ba417,6,2.0
3,000c79fe,1,1.0
4,000e4d9e,1,1.0


In [57]:
df_checkout = persons.merge(df_checkout, on='person', how='left')

In [58]:
df_checkout.shape

(38829, 3)

In [59]:
df_checkout.columns = ['person' , 'checkout_sum', 'checkout_mean']

# Viewed ppl

In [60]:
df_month['month'].value_counts()
df_month['day'] = df['timestamp'].dt.day

In [61]:
(df.loc[df['event'] == 'viewed product']).drop_duplicates('person').shape

(37130, 26)

In [62]:
df_viewed = (df.loc[df['event'] == 'viewed product'])

df_viewed['count'] = 1

In [63]:
df_viewed = df_viewed.groupby(['person', 'day','month']).agg({'count':'sum'}).reset_index()


In [64]:
df_viewed = df_viewed.groupby('person').agg({'count':['sum' , 'std','mean']}).reset_index()

In [65]:
df_viewed.head()

Unnamed: 0_level_0,person,count,count,count
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,std,mean
0,00091926,372,18.899735,18.6
1,00091a7a,3,,3.0
2,000ba417,153,16.093477,51.0
3,000c79fe,3,,3.0
4,000e4d9e,339,49.401417,37.666667


In [66]:
df_viewed = persons.merge(df_viewed, on='person', how='left')

In [67]:
df_viewed.shape

(38829, 4)

In [68]:
df_viewed.columns = ['person' , 'viewed_sum', 'viewed_mean', 'viewed_std']

# Conversion ppl

In [69]:
(df.loc[df['event'] == 'conversion']).drop_duplicates('person').shape

(4293, 26)

In [70]:
df_conversion = (df.loc[df['event'] == 'conversion'])
df_conversion = df_conversion.loc[df_conversion['month']==5]

df_conversion['count'] = 1

In [71]:
df_conversion = df_conversion.groupby(['person','month', 'day']).agg({'count':'sum'}).reset_index()


In [72]:
df_conversion = df_conversion.groupby('person').agg({'count':['sum','mean']}).reset_index()

In [73]:
df_conversion.head()

Unnamed: 0_level_0,person,count,count
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean
0,000ba417,1,1.0
1,001001be,1,1.0
2,001804a2,1,1.0
3,0019e639,1,1.0
4,001b0bf9,1,1.0


In [74]:
df_conversion = persons.merge(df_conversion, on='person', how='left')

In [75]:
df_conversion = df_conversion.fillna(0)
df_conversion.shape

(38829, 3)

In [76]:
df_conversion.columns = ['person' , 'conversion_sum', 'conversion_mean']

- En otros notebooks hice un analisis mas exhaustivo de cuales transformaciones generaban los features mas optimos

- Los mas optimos fueron __checkout__ y __visited site__

# Junto mis features

In [77]:
feat_dummie = pd.read_csv('feat_dummies.csv')

In [78]:
feat_082= pd.read_csv('feat_082.csv')
feat_082.columns = ['person', 'dif_5_check', 'last_day_check', 'first_day_check', 'last_week_check', 'first_week_check']
feat_082.head()


Unnamed: 0,person,dif_5_check,last_day_check,first_day_check,last_week_check,first_week_check
0,4886f805,0.0,18.0,18.0,4.0,4.0
1,ad93850f,0.0,14.0,14.0,0.0,0.0
2,0297fc1e,12.0,22.0,10.0,1.0,3.0
3,2d681dd8,0.0,27.0,27.0,6.0,6.0
4,cccea85e,0.0,11.0,11.0,4.0,4.0


In [79]:
feat_day_model_stats.head()

Unnamed: 0,person,viewed_model_mean,models_viewed_days_mean,models_viewed_days_max
0,4886f805,4.0,1.0,1.0
1,ad93850f,6.666667,1.666667,3.0
2,0297fc1e,12.090909,3.636364,14.0
3,2d681dd8,4.333333,1.0,1.0
4,cccea85e,12.964912,2.578947,10.0


In [None]:
features = persons
#features = features.merge(df_checkout, on='person', how='left')
features = features.merge(df_conversion, on='person', how='left')
#features = features.merge(pd.read_csv('BigD-Feat-Copy1.csv'), on='person', how='left')
features = features.merge(df_viewed, on='person', how='left')
features = features.merge(feat_082, on='person', how='left')
#features = features.merge(feat_dummie, on='person', how='left')



#features = features.merge(feature_caro, on='person', how='left')


#features = features.merge(feat_diff, on='person', how='left')
#features = features.merge(features_rance, on='person', how='left')

#features = features.merge(feat_date, on='person', how='left')


In [81]:
features.to_csv('features_mios.csv', index = False)

In [82]:
features.columns

Index(['person', 'conversion_sum', 'conversion_mean', 'viewed_sum',
       'viewed_mean', 'viewed_std', 'dif_5_check', 'last_day_check',
       'first_day_check', 'last_week_check', 'first_week_check'],
      dtype='object')

In [83]:
features.head()

Unnamed: 0,person,conversion_sum,conversion_mean,viewed_sum,viewed_mean,viewed_std,dif_5_check,last_day_check,first_day_check,last_week_check,first_week_check
0,4886f805,0.0,0.0,4.0,,4.0,0.0,18.0,18.0,4.0,4.0
1,ad93850f,0.0,0.0,20.0,7.371115,6.666667,0.0,14.0,14.0,0.0,0.0
2,0297fc1e,0.0,0.0,404.0,5.228295,7.214286,12.0,22.0,10.0,1.0,3.0
3,2d681dd8,0.0,0.0,13.0,0.707107,6.5,0.0,27.0,27.0,6.0,6.0
4,cccea85e,0.0,0.0,739.0,26.241971,56.846154,0.0,11.0,11.0,4.0,4.0


## XGboost entrenamiento

In [84]:
df_train = df_labels.merge(features , left_on='person', right_on='person' , how='inner')

In [102]:
df_train.shape

(19414, 9)

In [103]:
X, y = df_train.iloc[:,2:],df_train.iloc[:,1]
X.head()

Unnamed: 0,conversion_sum,conversion_mean,dif_5_check,last_day_check,first_day_check,last_week_check,first_week_check
0,1.0,1.0,0.0,23.0,23.0,2.0,2.0
1,0.0,0.0,0.0,23.0,22.0,2.0,2.0
2,0.0,0.0,0.0,29.0,29.0,1.0,1.0
3,0.0,0.0,0.0,23.0,22.0,2.0,2.0
4,0.0,0.0,0.0,18.0,18.0,4.0,4.0


In [104]:
data_dmatrix = xgb.DMatrix(data=X,label=y)


## Xgboost

Para evaluar usen esta medida que me da valores muy parecidos a los de kaggle, para hacer las predicciones usen el otro

In [105]:
import xgboost as xgb
model = xgb.XGBClassifier(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 7,
                subsample = 0.9,
                gamma = 1,
                n_estimators = 65)






Este es el arbol con sus hiperparametros

In [106]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [107]:
model.fit(X_train,y_train,early_stopping_rounds=10, eval_metric="auc", eval_set=[(X_test, y_test)], verbose=True)
#model.fit(X_train,y_train)

[0]	validation_0-auc:0.824396
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.824452
[2]	validation_0-auc:0.824375
[3]	validation_0-auc:0.823921
[4]	validation_0-auc:0.824084
[5]	validation_0-auc:0.824102
[6]	validation_0-auc:0.824393
[7]	validation_0-auc:0.824985
[8]	validation_0-auc:0.825
[9]	validation_0-auc:0.825267
[10]	validation_0-auc:0.829885
[11]	validation_0-auc:0.831074
[12]	validation_0-auc:0.835787
[13]	validation_0-auc:0.838035
[14]	validation_0-auc:0.838128
[15]	validation_0-auc:0.83882
[16]	validation_0-auc:0.838908
[17]	validation_0-auc:0.838465
[18]	validation_0-auc:0.838465
[19]	validation_0-auc:0.838465
[20]	validation_0-auc:0.838465
[21]	validation_0-auc:0.838465
[22]	validation_0-auc:0.838434
[23]	validation_0-auc:0.838434
[24]	validation_0-auc:0.838434
[25]	validation_0-auc:0.838434
[26]	validation_0-auc:0.838434
Stopping. Best iteration:
[16]	validation_0-auc:0.838908



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=1, learning_rate=0.1, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=65,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.9)

In [108]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,  model.predict_proba(X_test)[:,1])

0.8389080961807898

In [109]:


params = {"objective":"binary:logistic",'colsample_bytree':1,
          'learning_rate': 0.1, 'max_depth': 7, 'gamma': 1,'n_estimators': 65}
cv_val = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50, early_stopping_rounds=10,
                    metrics="auc", as_pandas=True, seed=123)

[12:13:27] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 6 pruned nodes, max_depth=5
[12:13:27] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 36 extra nodes, 12 pruned nodes, max_depth=7
[12:13:27] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 18 extra nodes, 10 pruned nodes, max_depth=5
[12:13:27] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 8 pruned nodes, max_depth=5
[12:13:27] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 22 pruned nodes, max_depth=6
[12:13:27] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 22 extra nodes, 8 pruned nodes, max_depth=6
[12:13:27] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 12 pruned nodes, max_depth=6
[12:13:27] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 26 pruned nodes, max_depth=6
[12:13:27] 

[12:13:27] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 56 pruned nodes, max_depth=7
[12:13:27] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 64 extra nodes, 56 pruned nodes, max_depth=7
[12:13:27] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 60 pruned nodes, max_depth=7
[12:13:27] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 62 pruned nodes, max_depth=7
[12:13:27] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 66 pruned nodes, max_depth=7
[12:13:27] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 68 pruned nodes, max_depth=7
[12:13:27] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 72 pruned nodes, max_depth=7
[12:13:27] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 40 extra nodes, 78 pruned nodes, max_depth=7
[12:13:2

In [110]:
cv_val.tail(1)

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
25,0.849331,0.006501,0.832666,0.011723


In [167]:
features.to_csv('features_nacho.csv', index=False)

In [168]:
features.head()

Unnamed: 0,person,checkout_sum,checkout_mean,conversion_sum,conversion_mean,count_x,count_y,viewed_sum,viewed_mean,viewed_std,dif_5_check,...,first_week_check,viewed_model_mean,models_viewed_days_mean,models_viewed_days_max,mayor_actividad_ult_semana,Checkout mean,diferencia 5,DAY(last month 5),DAY(first month 5),WEEKDAY(last month 5),WEEKDAY(first month 5)
0,4886f805,1.0,1.0,0.0,0.0,79569.0,4479.0,4.0,,4.0,0.0,...,4.0,4.0,1.0,1.0,0,1.0,0.0,18.0,18.0,4.0,4.0
1,ad93850f,1.0,1.0,0.0,0.0,86381.0,4990.0,20.0,7.371115,6.666667,0.0,...,0.0,6.666667,1.666667,3.0,0,1.0,7.0,22.0,14.0,1.0,0.0
2,0297fc1e,2.0,1.0,0.0,0.0,88954.0,5197.0,404.0,5.228295,7.214286,12.0,...,3.0,12.090909,3.636364,14.0,0,1.0,26.0,28.0,2.0,0.0,2.0
3,2d681dd8,1.0,1.0,0.0,0.0,79569.0,4479.0,13.0,0.707107,6.5,0.0,...,6.0,4.333333,1.0,1.0,0,1.0,9.0,27.0,18.0,6.0,4.0
4,cccea85e,1.0,1.0,0.0,0.0,108040.0,6057.0,739.0,26.241971,56.846154,0.0,...,4.0,12.964912,2.578947,10.0,1,1.0,23.0,31.0,7.0,3.0,0.0


In [254]:
X_predict = persons_to_predict.merge(features, on='person', how='left')
X_predict.head()


Unnamed: 0,person,checkout_sum,checkout_mean,conversion_sum,conversion_mean,diferencia 5_x,DAY(last month 5)_x,DAY(first month 5)_x,WEEKDAY(last month 5)_x,WEEKDAY(first month 5)_x,viewed_model_mean,models_viewed_days_mean,models_viewed_days_max,mayor_actividad_ult_semana,Checkout mean,diferencia 5_y,DAY(last month 5)_y,DAY(first month 5)_y,WEEKDAY(last month 5)_y,WEEKDAY(first month 5)_y
0,4886f805,1.0,1.0,0.0,0.0,0.0,18.0,18.0,4.0,4.0,4.0,1.0,1.0,0,1.0,0.0,18.0,18.0,4.0,4.0
1,0297fc1e,2.0,1.0,0.0,0.0,12.0,22.0,10.0,1.0,3.0,12.090909,3.636364,14.0,0,1.0,26.0,28.0,2.0,0.0,2.0
2,2d681dd8,1.0,1.0,0.0,0.0,0.0,27.0,27.0,6.0,6.0,4.333333,1.0,1.0,0,1.0,9.0,27.0,18.0,6.0,4.0
3,cccea85e,1.0,1.0,0.0,0.0,0.0,11.0,11.0,4.0,4.0,12.964912,2.578947,10.0,1,1.0,23.0,31.0,7.0,3.0,0.0
4,4c8a8b93,2.0,1.0,0.0,0.0,0.0,19.0,18.0,5.0,4.0,19.666667,2.111111,4.0,0,1.0,4.0,22.0,18.0,1.0,4.0


In [267]:
X_predict.isnull().sum()

person                                    0
checkout_sum                           3540
checkout_mean                          3540
conversion_sum                            0
conversion_mean                           0
Dourado                                3540
Preto                                  3540
Bom                                    3540
Muito Bom                              3540
Direct                                 3540
Email                                  3540
Organic                                3540
Paid                                   3540
Referral                               3540
Social                                 3540
Computer                               3540
Smartphone                             3540
Tablet                                 3540
viewed_sum                              855
viewed_mean                            9782
viewed_std                              855
diferencia 5_x                            0
DAY(last month 5)_x             

In [268]:
df_entrie = persons_to_predict.copy(deep=True)
df_entrie['label'] = model.predict_proba(X_predict.iloc[:,1:])[:,1]
df_entrie.head()

Unnamed: 0,person,label
0,4886f805,0.005125
2,0297fc1e,0.039907
3,2d681dd8,0.009619
4,cccea85e,0.180251
5,4c8a8b93,-0.000148


In [269]:
df_entrie.shape

(19415, 2)

In [270]:
num = df_entrie._get_numeric_data()
num[num < 0] = 0

Obtengo un resultado con los que separe para el test mas arriba en :
```python
    X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)
    ```
###### Lo que hace es sacar las predicciones para X_test y evaluarlos con y_test
    

In [305]:
df_entrie['label1'] = model.predict_proba(X_predict.iloc[:,1:])[:,1]

In [298]:
df_entrie.to_csv('submit_kagggleeee.csv',index= False)

In [447]:
1

1

In [84]:
features_rance = pd.read_csv('features_numericos_mes_5.csv')


In [85]:
features_rance.columns

Index(['person', 'cant_checkouts_5', '%checkouts', 'checkouts_ult_semana',
       'act_primera_semana', 'act_ultima_semana', 'mayor_actividad_ult_semana',
       'cant_modelos_distintos', 'cant_checkouts_dif_modelos',
       'MAX(cant_interacciones_por_modelo)',
       'MEAN(cant_interacciones_por_modelo)', 'Checkout max', 'Checkout mean'],
      dtype='object')

In [86]:
features_rance=features_rance[['person', 'mayor_actividad_ult_semana', 'Checkout mean']]


In [451]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
import lightgbm as lgb

# Linear Regression

In [452]:
from sklearn import linear_model

In [453]:
X = X.fillna(0)
X_train =X_train.fillna(0)
X_test = X_test.fillna(0)

In [454]:
clf_lasso = linear_model.LinearRegression()

In [455]:
clf_lasso.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [456]:
roc_auc_score(y_test,  clf_lasso.predict(X_test))

0.6596936633899247

In [457]:
X=X.fillna(0)

# ExtraTrees

In [458]:
ExtC = ExtraTreesClassifier()


## Search grid for optimal parameters
ex_param_grid = {"max_depth": [None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [False],
              "n_estimators" :[100,300],
              "criterion": ["gini"]}


gsExtC = GridSearchCV(ExtC,param_grid = ex_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)

gsExtC.fit(X,y)

ExtC_best = gsExtC.best_estimator_

# Best score
gsExtC.best_score_

Fitting 2 folds for each of 54 candidates, totalling 108 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   12.8s
[Parallel(n_jobs=4)]: Done 108 out of 108 | elapsed:   43.4s finished


0.8245073433517922

In [459]:
gsExtC.get_params

<bound method BaseEstimator.get_params of GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'max_depth': [None], 'max_features': [1, 3, 10], 'min_samples_split': [2, 3, 10], 'min_samples_leaf': [1, 3, 10], 'bootstrap': [False], 'n_estimators': [100, 300], 'criterion': ['gini']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)>

# LinearDiscriminantAnalysis

In [460]:
### META MODELING  WITH ADABOOST, RF, EXTRATREES and GRADIENTBOOSTING
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

adaDTC = LinearDiscriminantAnalysis()

ada_param_grid = { "solver":['svd','eigen','lsqr'],
                    "store_covariance": [True, False],
                 "n_components" : [1,8,10,15,5,6,7]}



gsadaDTC = GridSearchCV(adaDTC,param_grid = ada_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)

gsadaDTC.fit(X,y)

ada_best = gsadaDTC.best_estimator_
gsadaDTC.best_score_

Fitting 2 folds for each of 42 candidates, totalling 84 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  84 out of  84 | elapsed:    3.4s finished


0.6689499549412908

# MultipleLayerPerceptron

In [461]:
### META MODELING  WITH ADABOOST, RF, EXTRATREES and GRADIENTBOOSTING
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier()

mlp_param_grid = {'solver': ['lbfgs'],
                  'max_iter': [50,100],
                  'alpha': 10.0 ** -np.arange(5, 10), 
                  'hidden_layer_sizes':np.arange(13, 15),
                  'random_state':[1,5,6,9]}


gsmlp = GridSearchCV(mlp,param_grid = mlp_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)

gsmlp.fit(X,y)

mlp_best = gsmlp.best_estimator_
gsmlp.best_score_

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 2 folds for each of 80 candidates, totalling 160 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   16.9s
[Parallel(n_jobs=4)]: Done 160 out of 160 | elapsed:  1.2min finished


0.8225339490249826

# Random Forest

In [462]:
# RFC Parameters tunning 
RFC = RandomForestClassifier()


## Search grid for optimal parameters
rf_param_grid = {"max_depth": [None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [False],
              "n_estimators" :[50,65,100],
              "criterion": ["gini"]}


gsRFC = GridSearchCV(RFC,param_grid = rf_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)

gsRFC.fit(X,y)

RFC_best = gsRFC.best_estimator_

# Best score
gsRFC.best_score_

Fitting 2 folds for each of 81 candidates, totalling 162 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    9.0s
[Parallel(n_jobs=4)]: Done 162 out of 162 | elapsed:   45.0s finished


0.8217111017131167

# Gradient boosting

In [463]:
GBC = GradientBoostingClassifier()
gb_param_grid = {'loss' : ["deviance"],
              'n_estimators' : [50,65,100],
              'learning_rate': [0.1, 0.05, 0.01],
              'max_depth': [4, 8],
              'min_samples_leaf': [100,150],
              'max_features': [0.3, 0.1] 
              }

gsGBC = GridSearchCV(GBC,param_grid = gb_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)

gsGBC.fit(X,y)

GBC_best = gsGBC.best_estimator_

# Best score
gsGBC.best_score_

Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.3s
[Parallel(n_jobs=4)]: Done 144 out of 144 | elapsed:   14.2s finished


0.827882207456054

In [464]:
GBC_best

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=4,
              max_features=0.3, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=100, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [465]:
lgb_cl = lgb.LGBMClassifier(learning_rate=0.005,objective='binary',num_leaves=55,max_depth=13,
                        n_estimators=60,colsample_bytree=0.8,n_jobs=-1,
                        random_state=0,silent=False,subsample=0.8,
                        sumsample_freq=0.5)

In [466]:
lgb_cl.fit(X_train,y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.8,
        importance_type='split', learning_rate=0.005, max_depth=13,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=60, n_jobs=-1, num_leaves=55, objective='binary',
        random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent=False,
        subsample=0.8, subsample_for_bin=200000, subsample_freq=0,
        sumsample_freq=0.5)

In [467]:
roc_auc_score(y_test,  lgb_cl.predict_proba(X_test)[:,1])

0.8416315452116647

In [468]:
eclf3 = VotingClassifier(estimators=[
       ('gb', GBC_best),('lgb',lgb_cl),('mlp', mlp_best),('ext', ExtC_best), ('xgb',model)],
       voting='soft', weights=[2,0.5,1,0.5,2],
      flatten_transform=True)

In [469]:
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

In [470]:
eclf3.fit(X_train,y_train)

VotingClassifier(estimators=[('gb', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=4,
              max_features=0.3, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=100...      reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.9))],
         flatten_transform=True, n_jobs=None, voting='soft',
         weights=[2, 0.5, 1, 0.5, 2])

In [471]:
roc_auc_score(y_test,  eclf3.predict_proba(X_test)[:,1])


0.8462240417940587

In [472]:
X.isnull().sum()

Dourado       0
Preto         0
Bom           0
Muito Bom     0
Direct        0
Email         0
Organic       0
Paid          0
Referral      0
Social        0
Computer      0
Smartphone    0
Tablet        0
dtype: int64

In [169]:
df.shape

(2341681, 26)

In [174]:
checkout_ppl =(df.loc[df['event'] == 'checkout'])['person'].to_frame()
checkout_ppl.head()

Unnamed: 0,person
199,a66e8424
201,a66e8424
224,a66e8424
272,a66e8424
379,25950776


In [175]:
conversion_ppl = (df.loc[df['event'] == 'converson'])['person'].to_frame()

In [176]:
did_someone = (conversion_ppl.merge(checkout_ppl, on='person', how='inner'))

In [177]:
did_someone

Unnamed: 0,person
