In [188]:
import sys #access to system parameters https://docs.python.org/3/library/sys.html

import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features

import matplotlib #collection of functions for scientific and publication-ready visualization

import numpy as np #foundational package for scientific computing

import scipy as sp #collection of functions for scientific computing and advance mathematics

import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook

import sklearn #collection of machine learning algorithms

#misc libraries
import random
import time
import datetime as dt
import gc

#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)

import featuretools as ft
from sklearn.feature_extraction.text import CountVectorizer

-------------------------


In [8]:
#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
import xgboost as xgb

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn.model_selection import train_test_split

from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.tools.plotting import scatter_matrix

#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8



In [9]:
from mlxtend.preprocessing import TransactionEncoder

In [10]:
data_raw = pd.read_csv('events_up_to_01062018.csv')
data_val = pd.read_csv('labels_training_set.csv')

In [11]:
df_labels = data_val.copy(deep=True) 
df = data_raw.copy(deep=True)

In [12]:
persons = (df.drop_duplicates('person'))['person'].to_frame()
persons_to_train = df_labels['person'].to_frame()
persons_to_predict = persons.loc[~persons['person'].isin(persons_to_train['person'])]


print(persons_to_train.shape)
print(persons_to_predict.shape)

(19414, 1)
(19415, 1)


In [13]:
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [14]:
df_month = df.loc[df['timestamp'].dt.month == 5]

In [15]:
df_month.shape

(1713920, 23)

In [16]:
checkout_ppl = (df_month.loc[df['event'] == 'checkout'])['person'].drop_duplicates()
df_month = df_month.loc[df_month['person'].isin(checkout_ppl)]
df_month.shape

(1543313, 23)

In [379]:
pd.set_option('display.max_columns', 23)

In [380]:
data_raw.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,,,,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,,,,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,,,,,,,,,,,,,


# Dummies model

In [685]:
model_checkout = pd.get_dummies(df_month['model'])

In [686]:
model_checkout['person'] = df_month['person']

In [687]:
model_checkout.head()

Unnamed: 0,Asus Zenfone 3 Max 32 GB,Asus Zenfone 3 Max 16 GB,Asus Zenfone 6,Asus Zenfone Selfie,LG X Screen,LG G3 Beat D724,LG G3 D855,LG G3 Stylus D690,LG G4 Beat H736,LG G4 H815P,LG G4 H818P,...,iPhone 6,iPhone 6 Plus,iPhone 6S,iPhone 6S Plus,iPhone 7,iPhone 7 Plus,iPhone 8,iPhone 8 Plus,iPhone SE,iPhone X,person
0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,4886f805
1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,ad93850f
2,0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0297fc1e
3,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,2d681dd8
4,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,cccea85e


In [688]:
model_checkout = model_checkout.groupby('person').agg('sum',axis='columns').reset_index()

In [689]:
model_checkout.head()

Unnamed: 0,person,Asus Zenfone 3 Max 32 GB,Asus Zenfone 3 Max 16 GB,Asus Zenfone 6,Asus Zenfone Selfie,LG X Screen,LG G3 Beat D724,LG G3 D855,LG G3 Stylus D690,LG G4 Beat H736,LG G4 H815P,...,iPhone 5s,iPhone 6,iPhone 6 Plus,iPhone 6S,iPhone 6S Plus,iPhone 7,iPhone 7 Plus,iPhone 8,iPhone 8 Plus,iPhone SE,iPhone X
0,0008ed71,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0
1,00091926,0,0,0,0,0,0,0,0,0,1,...,0,5,41,96,51,45,9,0,2,3,1
2,000ba417,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0,0
3,000c79fe,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,4,0,0,0,0,0
4,000e4d9e,0,0,0,0,0,0,1,0,0,0,...,7,0,0,1,0,1,0,0,1,11,0


# Dummies Event

In [656]:
event_checkout = pd.get_dummies(df_month['event'])

In [657]:
event_checkout['person'] = df_month['person']

In [658]:
event_checkout.head()

Unnamed: 0,ad campaign hit,brand listing,checkout,conversion,generic listing,lead,search engine hit,searched products,staticpage,viewed product,visited site,person
0,0,0,0,0,0,0,0,0,0,1,0,4886f805
1,0,0,0,0,0,0,0,0,0,1,0,ad93850f
2,0,0,0,0,0,0,0,0,0,1,0,0297fc1e
3,0,0,0,0,0,0,0,0,0,1,0,2d681dd8
4,0,0,0,0,0,0,0,0,0,1,0,cccea85e


In [659]:
event_checkout = event_checkout.groupby('person').agg('sum',axis='columns').reset_index()

In [660]:
event_checkout.head()

Unnamed: 0,person,ad campaign hit,brand listing,checkout,conversion,generic listing,lead,search engine hit,searched products,staticpage,viewed product,visited site
0,0008ed71,0,0,3,0,1,0,0,0,0,0,2
1,00091926,15,25,2,0,0,0,0,0,0,372,34
2,000ba417,1,24,6,1,14,0,1,0,0,153,6
3,000c79fe,1,0,1,0,1,0,1,9,0,3,1
4,000e4d9e,19,17,1,0,17,0,5,0,0,339,13


# Dummies Channel

In [653]:
channel_checkout = pd.get_dummies(df_month['channel'])

channel_checkout['person'] = df_month['person']

channel_checkout.head()

Unnamed: 0,Direct,Email,Organic,Paid,Referral,Social,Unknown,person
0,0,0,0,0,0,0,0,4886f805
1,0,0,0,0,0,0,0,ad93850f
2,0,0,0,0,0,0,0,0297fc1e
3,0,0,0,0,0,0,0,2d681dd8
4,0,0,0,0,0,0,0,cccea85e


In [654]:
channel_checkout = channel_checkout.groupby('person').agg('sum',axis='columns').reset_index()

In [655]:
channel_checkoutel_checkoutnnel_checkout.head()

Unnamed: 0,person,Direct,Email,Organic,Paid,Referral,Social,Unknown
0,0008ed71,0,0,0,0,2,0,0
1,00091926,25,0,0,8,1,0,0
2,000ba417,6,0,0,0,0,0,0
3,000c79fe,0,0,0,1,0,0,0
4,000e4d9e,5,0,1,2,5,0,0


# Dummie color

In [633]:
color_checkout = pd.get_dummies(df_month['color'])

In [634]:
color_checkout['person'] = df_month['person']

In [635]:
color_checkout.head()

Unnamed: 0,Amarelo,Ametista,Azul,Azul Escuro,Azul Safira,Azul Topázio,Bambu,Black Piano,Branco,Branco Azul,Branco Azul Navy,...,Rouge,Roxo,Silver,Titânio,Turquesa,Verde,Verde Petroleo,Verde Água,Vermelho,Ônix,person
0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,4886f805
1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,ad93850f
2,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0297fc1e
3,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,2d681dd8
4,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,cccea85e


In [636]:
color_checkout = color_checkout.groupby('person').agg('sum',axis='columns').reset_index()

In [637]:
color_checkoutoutckout.head()

Unnamed: 0,person,Amarelo,Ametista,Azul,Azul Escuro,Azul Safira,Azul Topázio,Bambu,Black Piano,Branco,Branco Azul,...,Rose,Rouge,Roxo,Silver,Titânio,Turquesa,Verde,Verde Petroleo,Verde Água,Vermelho,Ônix
0,0008ed71,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0
1,00091926,0,2,3,2,0,0,1,1,3,0,...,0,0,0,0,0,0,1,0,1,2,0
2,000ba417,0,0,4,0,0,0,3,0,15,4,...,8,1,0,0,0,0,2,0,0,1,0
3,000c79fe,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0
4,000e4d9e,0,1,23,0,0,0,0,0,85,0,...,0,0,0,0,0,0,0,0,2,0,0


# Dummie device_type

In [662]:
dt_checkout = pd.get_dummies(df_month['device_type'])

In [663]:
dt_checkout['person'] = df_month['person']

In [664]:
dt_checkout.head()

Unnamed: 0,Computer,Smartphone,Tablet,Unknown,person
0,0,0,0,0,4886f805
1,0,0,0,0,ad93850f
2,0,0,0,0,0297fc1e
3,0,0,0,0,2d681dd8
4,0,0,0,0,cccea85e


In [665]:
dt_checkout = dt_checkout.groupby('person').agg('sum',axis='columns').reset_index()

In [666]:
dt_checkout.head()

Unnamed: 0,person,Computer,Smartphone,Tablet,Unknown
0,0008ed71,2,0,0,0
1,00091926,34,0,0,0
2,000ba417,6,0,0,0
3,000c79fe,0,1,0,0
4,000e4d9e,13,0,0,0


# Dummie static

In [667]:
static_checkout = pd.get_dummies(df_month['staticpage'])

In [668]:
static_checkout['person'] = df_month['person']

In [669]:
static_checkout.head()

Unnamed: 0,AboutUs,Conditions,CustomerService,FaqEcommerce,PrivacyEcommerce,Quiosks,TermsAndConditionsEcommerce,TermsAndConditionsReturnEcommerce,black_friday,club-trocafone,galaxy-s8,how-to-buy,how-to-sell,trust-trocafone,person
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4886f805
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,ad93850f
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0297fc1e
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2d681dd8
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,cccea85e


In [670]:
static_checkout = dt_checkout.groupby('person').agg('sum',axis='columns').reset_index()

In [671]:
static_checkout.head()

Unnamed: 0,person,Computer,Smartphone,Tablet,Unknown
0,0008ed71,2,0,0,0
1,00091926,34,0,0,0
2,000ba417,6,0,0,0
3,000c79fe,0,1,0,0
4,000e4d9e,13,0,0,0


# Dummies Event

In [994]:
condition_checkout = pd.get_dummies(df_month['condition'])

In [995]:
condition_checkout['person'] = df_month['person']

In [996]:
condition_checkout.head()

Unnamed: 0,Bom,Bom - Sem Touch ID,Excelente,Muito Bom,Novo,person
0,0,0,1,0,0,4886f805
1,0,0,0,1,0,ad93850f
2,0,0,0,1,0,0297fc1e
3,1,0,0,0,0,2d681dd8
4,0,0,1,0,0,cccea85e


In [997]:
condition_checkout = condition_checkout.groupby('person').agg('sum',axis='columns').reset_index()

In [998]:
condition_checkout.head()

Unnamed: 0,person,Bom,Bom - Sem Touch ID,Excelente,Muito Bom,Novo
0,0008ed71,2,0,0,1,0
1,00091926,102,0,108,163,1
2,000ba417,115,0,11,34,0
3,000c79fe,4,0,0,0,0
4,000e4d9e,124,0,53,163,0


# Junto dummies


In [999]:
color_checkout.shape

(31827, 63)

In [1000]:
dummie_features.shape

(38829, 12)

In [1001]:
feat_week = pd.read_csv('feat_week.csv')
feat_weekNum = pd.read_csv('feat_numero_semana.csv')

In [1202]:
dummie_features = persons.merge(color_checkout,on='person', how = 'left')
dummie_features = dummie_features.merge(dt_checkout,on='person', how = 'left')
dummie_features = dummie_features.merge(channel_checkout,on='person', how ='left' )
#dummie_features = dummie_features.merge(model_checkout,on='person', how ='left' )
#dummie_features = dummie_features.merge(feat_week,on='person', how ='left' )
dummie_features = dummie_features.merge(condition_checkout,on='person', how ='left' )

#dummie_features = dummie_features.merge(feat_weekNum,on='person', how ='left' )





dummie_features.head()

dummie_features = dummie_features.merge(event_checkout,on='person', how ='left' )


## XGboost entrenamiento

In [1215]:
df_train = df_labels.merge(dummie_features[feats], left_on='person', right_on='person' , how='inner')

In [1216]:
df_train.shape

(19414, 15)

In [1217]:
X, y = df_train.iloc[:,2:],df_train.iloc[:,1]
X.head()

Unnamed: 0,Dourado,Preto,Bom,Muito Bom,Direct,Email,Organic,Paid,Referral,Social,Computer,Smartphone,Tablet
0,3.0,7.0,16.0,6.0,8.0,0.0,1.0,0.0,8.0,0.0,0.0,17.0,0.0
1,,,,,,,,,,,,,
2,3.0,3.0,10.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0
3,,,,,,,,,,,,,
4,9.0,1.0,6.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [1218]:
data_dmatrix = xgb.DMatrix(data=X,label=y)

In [1219]:
gc.collect()

42

In [1220]:
X.columns

Index(['Dourado', 'Preto', 'Bom', 'Muito Bom', 'Direct', 'Email', 'Organic',
       'Paid', 'Referral', 'Social', 'Computer', 'Smartphone', 'Tablet'],
      dtype='object')

In [1221]:
X.shape

(19414, 13)

## Xgboost

Para evaluar usen esta medida que me da valores muy parecidos a los de kaggle, para hacer las predicciones usen el otro

In [1222]:
import xgboost as xgb
model = xgb.XGBClassifier(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 7,
                subsample = 0.9,
                gamma = 1,
                n_estimators = 65)



Este es el arbol con sus hiperparametros

In [1223]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [1224]:

model.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=1, learning_rate=0.1, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=65,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.9)

In [1225]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,  model.predict_proba(X_test)[:,1])

0.8405913734521508

# Random Forest feature importance
   - Algoritmo usado para sacar importancia de los features y ver cuales no nos estan sirviendo al modelo
 #### Es parecido a lo que hice en xgboost con algunas cosas magicas, que use para ponerlo en df y ver resultados
 
  ## NOTAR :
   - Que use la X de xgboost
   - Y que el codigo abajo del dataframe comentado puede funcionar para separar los df pidiendo que nos deje las columnas esas

In [1186]:
X = X.fillna(0)

In [1187]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
#Load boston housing dataset as an example

names = X.columns
rf = RandomForestRegressor()
rf.fit(X, y)
zipped = zip(map(lambda x: round(x, 4), rf.feature_importances_), names)
feature = sorted(zipped, key=lambda x: x[1])

In [1188]:
feat_importance = pd.DataFrame(feature, columns=['importance', 'feature'])
feat_importance.sort_values('importance', ascending=False).head(100)

Unnamed: 0,importance,feature
13,0.2557,checkout
0,0.1349,Bom
3,0.1131,Dourado
5,0.11,Muito Bom
8,0.1025,Preto
7,0.0533,Paid
1,0.0523,Computer
2,0.046,Direct
10,0.0428,Smartphone
9,0.0392,Referral


In [1128]:
feat_importance =feat_importance.loc[feat_importance['importance'].isin(feat_importance['importance'].nlargest(3))]
feats_servibles = feat_importance['feature'].tolist()
feats_servibles.append('person')

In [1129]:
feats_servibles

['Computer', 'Smartphone', 'Tablet', 'person']

In [965]:
feats_servibles_color = feats_servibles #0.82

In [1031]:
feats_servibles_condition =feats_servibles #0.83
feats_servibles_condition

['Bom', 'Muito Bom', 'person']

In [1092]:
feats_servibles_channel =feats_servibles
feats_servibles_channel

['Direct', 'Email', 'Organic', 'Paid', 'Referral', 'Social', 'person']

In [1131]:
feats_servibles_dt = feats_servibles
feats_servibles_dt

['Computer', 'Smartphone', 'Tablet', 'person']

In [1132]:
feats = feats_servibles_color.append(feats_servibles_condition)

In [1134]:
feats_servibles_color

['Dourado', 'Preto', 'person', ['Bom', 'Muito Bom', 'person']]

In [1214]:
feats = ['person','Dourado', 'Preto','Bom', 'Muito Bom','Direct', 'Email', 'Organic', 'Paid', 'Referral', 'Social','Computer', 'Smartphone', 'Tablet']

14

In [1228]:
dummie_features[feats].to_csv('dummie_fea')

Unnamed: 0,person,Dourado,Preto,Bom,Muito Bom,Direct,Email,Organic,Paid,Referral,Social,Computer,Smartphone,Tablet
0,4886f805,5.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,ad93850f,5.0,0.0,2.0,7.0,0.0,0.0,1.0,4.0,0.0,0.0,0.0,5.0,0.0
2,0297fc1e,31.0,0.0,73.0,48.0,17.0,0.0,0.0,4.0,8.0,0.0,0.0,29.0,0.0
3,2d681dd8,1.0,5.0,14.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0
4,cccea85e,32.0,204.0,200.0,347.0,1.0,0.0,14.0,6.0,1.0,0.0,22.0,0.0,0.0
