In [1]:
import sys #access to system parameters https://docs.python.org/3/library/sys.html

import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features

import matplotlib #collection of functions for scientific and publication-ready visualization

import numpy as np #foundational package for scientific computing

import scipy as sp #collection of functions for scientific computing and advance mathematics

import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook

import sklearn #collection of machine learning algorithms

#misc libraries
import random
import time
import datetime as dt

#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)

import featuretools as ft
from sklearn.feature_extraction.text import CountVectorizer

-------------------------


In [2]:
#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
import xgboost as xgb

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn.model_selection import train_test_split

from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.tools.plotting import scatter_matrix

#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8



In [8]:
data_raw = pd.read_csv('events_up_to_01062018.csv')
data_val = pd.read_csv('labels_training_set.csv')

In [9]:
data_raw.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,,


In [10]:
df_labels = data_val.copy(deep=True) 
df = data_raw.copy(deep=True)

In [11]:
pd.set_option('display.max_columns', 23)

In [12]:
df.describe(include= 'all')

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
count,2341681,2341681,2341681,191131,1320530.0,1321513,1320530,1320530,1320530,505949,113763,11201,191286,106406,204069,204069,204069,204069,204069,204069,204066,204069,204069
unique,1490912,11,38829,248,,208,5,8,63,52267,10964,14,23,4,7,2,2206,122,51,4,393,131,366
top,2018-05-31 01:59:16,viewed product,c76b8417,/,,iPhone 6,Bom,16GB,Preto,"2820,6706,6720,2750,6649,7251,6663,12604,7224,...",Iphone,CustomerService,google,Google,Paid,Returning,Unknown,Sao Paulo,Brazil,Smartphone,360x640,Windows 7,Chrome 66.0
freq,14,1248124,4438,64187,,107262,547617,442096,314925,2606,2577,5239,123354,105195,91753,165827,36866,57304,197699,103502,73234,46648,57953
mean,,,,,6899.178,,,,,,,,,,,,,,,,,,
std,,,,,4028.042,,,,,,,,,,,,,,,,,,
min,,,,,71.0,,,,,,,,,,,,,,,,,,
25%,,,,,2929.0,,,,,,,,,,,,,,,,,,
50%,,,,,7057.0,,,,,,,,,,,,,,,,,,
75%,,,,,10014.0,,,,,,,,,,,,,,,,,,


In [13]:
persons = (df.drop_duplicates('person'))['person'].to_frame()
persons_to_train = df_labels['person'].to_frame()
persons_to_predict = persons.loc[~persons['person'].isin(persons_to_train['person'])]


print(persons_to_train.shape)
print(persons_to_predict.shape)


(19414, 1)
(19415, 1)


In [14]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values(['timestamp','event'])
df = df.reset_index(drop=True)



In [15]:
df_dates_per_month = persons
df_months = df
df_months['month'] = df_months['timestamp'].dt.month

In [16]:
#for x in range(1,6):
#    df_month = df_months.loc[df['month'] == x]
 #   df_dates = df_month.groupby('person').agg({'timestamp':['max', 'min']}).reset_index()
  #  df_dates.columns = ['person', 'last month '+str(x), 'first month '+ str(x)]
   # df_dates['diferencia'+str(x)] = (df_dates['last month '+str(x)] - df_dates['first month '+ str(x)]).dt.days
    #print(df_dates.head(1))
     #df_dates_per_month = df_dates_per_month.merge(df_dates, on = 'person', how='left')
    

In [21]:
df_month = df_months.loc[df_months['month'] == 5]
df_month = df_month.loc[df_month['event'] == 'checkout'].reset_index(drop=True)
df_month['session'] = df_month.index
df_month.head()



Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,...,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version,month,session
0,2018-05-01 00:01:40,checkout,0ef8e58a,,2829.0,iPhone 6,Bom,16GB,Prateado,,,...,,,,,,,,,,5,0
1,2018-05-01 00:02:47,checkout,754cd2b0,,9944.0,iPhone 7,Bom,128GB,Preto Matte,,,...,,,,,,,,,,5,1
2,2018-05-01 00:08:18,checkout,72d959bc,,6369.0,Samsung Galaxy J5,Excelente,16GB,Dourado,,,...,,,,,,,,,,5,2
3,2018-05-01 00:10:13,checkout,72d959bc,,6369.0,Samsung Galaxy J5,Excelente,16GB,Dourado,,,...,,,,,,,,,,5,3
4,2018-05-01 00:17:56,checkout,66a98d8f,,291.0,iPhone 5s,Excelente,16GB,Dourado,,,...,,,,,,,,,,5,4


In [22]:
es = ft.EntitySet(id = 'person')
es = es.entity_from_dataframe(entity_id = 'person_id', dataframe = persons, index = 'person')
es = es.entity_from_dataframe(entity_id = 'session_id', dataframe = df_month, index = 'session')

In [24]:
r_client_previous = ft.Relationship(es['person_id']['person'],
                                    es['session_id']['person'])

es = es.add_relationship(r_client_previous)

In [25]:
%time features, feature_names = ft.dfs(entityset=es, target_entity='person_id', max_depth = 2)

CPU times: user 5min 1s, sys: 1.46 s, total: 5min 3s
Wall time: 5min


In [26]:
features.head()

Unnamed: 0_level_0,SUM(session_id.sku),SUM(session_id.month),STD(session_id.sku),STD(session_id.month),MAX(session_id.sku),MAX(session_id.month),SKEW(session_id.sku),SKEW(session_id.month),MIN(session_id.sku),MIN(session_id.month),MEAN(session_id.sku),...,MODE(session_id.WEEKDAY(campaign_source)),MODE(session_id.WEEKDAY(search_engine)),MODE(session_id.WEEKDAY(channel)),MODE(session_id.WEEKDAY(new_vs_returning)),MODE(session_id.WEEKDAY(city)),MODE(session_id.WEEKDAY(region)),MODE(session_id.WEEKDAY(country)),MODE(session_id.WEEKDAY(device_type)),MODE(session_id.WEEKDAY(screen_resolution)),MODE(session_id.WEEKDAY(operating_system_version)),MODE(session_id.WEEKDAY(browser_version))
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
0008ed71,19124.0,15.0,2626.717787,0.0,8247.0,5.0,-1.577862,0.0,3372.0,5.0,6374.666667,...,,,,,,,,,,,
00091926,13831.0,10.0,20.506097,0.0,6930.0,5.0,,,6901.0,5.0,6915.5,...,,,,,,,,,,,
00091a7a,0.0,0.0,,,,,,,,,,...,,,,,,,,,,,
000ba417,22760.0,30.0,1881.624901,0.0,7631.0,5.0,2.440513,0.0,2987.0,5.0,3793.333333,...,,,,,,,,,,,
000c79fe,9944.0,5.0,,,9944.0,5.0,,,9944.0,5.0,9944.0,...,,,,,,,,,,,


In [27]:
features['person'] = features.index
features = features.reset_index(drop=True)
features.head()

Unnamed: 0,SUM(session_id.sku),SUM(session_id.month),STD(session_id.sku),STD(session_id.month),MAX(session_id.sku),MAX(session_id.month),SKEW(session_id.sku),SKEW(session_id.month),MIN(session_id.sku),MIN(session_id.month),MEAN(session_id.sku),...,MODE(session_id.WEEKDAY(search_engine)),MODE(session_id.WEEKDAY(channel)),MODE(session_id.WEEKDAY(new_vs_returning)),MODE(session_id.WEEKDAY(city)),MODE(session_id.WEEKDAY(region)),MODE(session_id.WEEKDAY(country)),MODE(session_id.WEEKDAY(device_type)),MODE(session_id.WEEKDAY(screen_resolution)),MODE(session_id.WEEKDAY(operating_system_version)),MODE(session_id.WEEKDAY(browser_version)),person
0,19124.0,15.0,2626.717787,0.0,8247.0,5.0,-1.577862,0.0,3372.0,5.0,6374.666667,...,,,,,,,,,,,0008ed71
1,13831.0,10.0,20.506097,0.0,6930.0,5.0,,,6901.0,5.0,6915.5,...,,,,,,,,,,,00091926
2,0.0,0.0,,,,,,,,,,...,,,,,,,,,,,00091a7a
3,22760.0,30.0,1881.624901,0.0,7631.0,5.0,2.440513,0.0,2987.0,5.0,3793.333333,...,,,,,,,,,,,000ba417
4,9944.0,5.0,,,9944.0,5.0,,,9944.0,5.0,9944.0,...,,,,,,,,,,,000c79fe


In [28]:
features, feature_names = ft.encode_features(features, feature_names)

In [29]:
features.head()



Unnamed: 0,SUM(session_id.sku),SUM(session_id.month),STD(session_id.sku),STD(session_id.month),MAX(session_id.sku),MAX(session_id.month),SKEW(session_id.sku),SKEW(session_id.month),MIN(session_id.sku),MIN(session_id.month),MEAN(session_id.sku),...,MODE(session_id.WEEKDAY(search_engine)) is unknown,MODE(session_id.WEEKDAY(channel)) is unknown,MODE(session_id.WEEKDAY(new_vs_returning)) is unknown,MODE(session_id.WEEKDAY(city)) is unknown,MODE(session_id.WEEKDAY(region)) is unknown,MODE(session_id.WEEKDAY(country)) is unknown,MODE(session_id.WEEKDAY(device_type)) is unknown,MODE(session_id.WEEKDAY(screen_resolution)) is unknown,MODE(session_id.WEEKDAY(operating_system_version)) is unknown,MODE(session_id.WEEKDAY(browser_version)) is unknown,person
0,19124.0,15.0,2626.717787,0.0,8247.0,5.0,-1.577862,0.0,3372.0,5.0,6374.666667,...,1,1,1,1,1,1,1,1,1,1,0008ed71
1,13831.0,10.0,20.506097,0.0,6930.0,5.0,,,6901.0,5.0,6915.5,...,1,1,1,1,1,1,1,1,1,1,00091926
2,0.0,0.0,,,,,,,,,,...,1,1,1,1,1,1,1,1,1,1,00091a7a
3,22760.0,30.0,1881.624901,0.0,7631.0,5.0,2.440513,0.0,2987.0,5.0,3793.333333,...,1,1,1,1,1,1,1,1,1,1,000ba417
4,9944.0,5.0,,,9944.0,5.0,,,9944.0,5.0,9944.0,...,1,1,1,1,1,1,1,1,1,1,000c79fe


In [22]:
features['diferencia 5'].fillna(int(features['diferencia 5'].median()), inplace = True)
features['DAY(last month 5)'].fillna(int(features['DAY(last month 5)'].mean()), inplace = True)
features['DAY(first month 5)'].fillna(int(features['DAY(first month 5)'].mean()), inplace = True)
features['WEEKDAY(last month 5)'].fillna(int(features['WEEKDAY(last month 5)'].mean()), inplace = True)
features['WEEKDAY(first month 5)'].fillna(int(features['WEEKDAY(first month 5)'].mean()), inplace = True)
lista = ['diferencia 5','DAY(last month 5)', 'DAY(first month 5)','WEEKDAY(last month 5)','WEEKDAY(first month 5)','person']

features = features[lista]

 


In [23]:
person = (features.loc[features['diferencia 5'].isnull()])['person']


In [24]:
features = persons.merge(features, on='person', how='left')

In [25]:
features.shape

(38829, 6)

In [26]:
features.to_csv('date.csv')

## XGboost entrenamiento

In [129]:
df_train = df_labels.merge(features , left_on='person', right_on='person' , how='left')
print(df_train.shape)
df_train.head()

(19414, 12)


Unnamed: 0,person,label,MAX(session_id.sku),MEAN(session_id.sku),MIN(session_id.sku),MODE(session_id.DAY(timestamp)) is unknown,NUM_UNIQUE(session_id.condition),NUM_UNIQUE(session_id.storage),SKEW(session_id.sku),STD(session_id.sku),SUM(session_id.NUMWORDS(skus)),SUM(session_id.sku)
0,0566e9c1,0,2680.0,2680.0,2680.0,0,1.0,1.0,,,1.0,2680.0
1,6ec7ee77,0,,,,1,,,,,0.0,0.0
2,abe7a2fb,0,2832.0,2832.0,2832.0,0,1.0,1.0,,,1.0,2832.0
3,34728364,0,,,,1,,,,,0.0,0.0
4,87ed62de,0,6411.0,6411.0,6411.0,0,1.0,1.0,,0.0,2.0,12822.0


In [130]:
#df_train_1 = df_train.loc[df_train['label'] == 1]
#df_train_0 = df_train.loc[df_train['label'] == 0]

#df_train2 = pd.concat([df_train_1, df_train_0.sample(2000)])

Los labels me dan mi set para entrenar, los que no se encuentran en labels tengo que predecirlos

Si ven aca, de la columna label en adelante tenemos los features.

In [131]:
X, y = df_train.iloc[:,2:],df_train.iloc[:,1]
X.head()

Unnamed: 0,MAX(session_id.sku),MEAN(session_id.sku),MIN(session_id.sku),MODE(session_id.DAY(timestamp)) is unknown,NUM_UNIQUE(session_id.condition),NUM_UNIQUE(session_id.storage),SKEW(session_id.sku),STD(session_id.sku),SUM(session_id.NUMWORDS(skus)),SUM(session_id.sku)
0,2680.0,2680.0,2680.0,0,1.0,1.0,,,1.0,2680.0
1,,,,1,,,,,0.0,0.0
2,2832.0,2832.0,2832.0,0,1.0,1.0,,,1.0,2832.0
3,,,,1,,,,,0.0,0.0
4,6411.0,6411.0,6411.0,0,1.0,1.0,,0.0,2.0,12822.0


Separamos los datos para hacer xgboost de la siguiente forma


|Variable |Contiene|
|------------------------|-----------------------------------------------------|
|X| features que usa xgboost son solo numeros es decir que sacamos a la persona   |
|y| label de cada persona|


## Xgboost

Para evaluar usen esta medida que me da valores muy parecidos a los de kaggle, para hacer las predicciones usen el otro

In [132]:
my_classifier1 = xgb.XGBClassifier(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 6,
                subsample = 0.8,
                gamma = 1,
                n_estimators = 10)

Este es el arbol con sus hiperparametros

In [133]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

Aca vuelvo a separar los datos para poder realizar una metrica interna y ver masomenos como performan las cosas que hago

Obtengo las personas a predecir!

In [134]:
X_predict = features.loc[features['person'].isin(persons_to_predict['person'])]
X_predict.head()

Unnamed: 0,MAX(session_id.sku),MEAN(session_id.sku),MIN(session_id.sku),MODE(session_id.DAY(timestamp)) is unknown,NUM_UNIQUE(session_id.condition),NUM_UNIQUE(session_id.storage),SKEW(session_id.sku),STD(session_id.sku),SUM(session_id.NUMWORDS(skus)),SUM(session_id.sku),person
1,6930.0,6915.5,6901.0,1,2.0,1.0,,20.506097,2.0,13831.0,00091926
2,,,,1,,,,,0.0,0.0,00091a7a
3,7631.0,3793.333333,2987.0,0,2.0,1.0,2.440513,1881.624901,6.0,22760.0,000ba417
5,3360.0,3360.0,3360.0,1,1.0,1.0,,,1.0,3360.0,000e4d9e
6,8541.0,8541.0,8541.0,0,1.0,1.0,,,1.0,8541.0,000e619d


Necesito tener el mismo dataframe que tenia cuando lo entrene pero ahora para predecir, en este caso <b>X</b>

In [135]:
my_classifier1.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=1, learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=1, missing=None, n_estimators=10,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

Entreno al arbol!

In [136]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,  my_classifier1.predict_proba(X_test)[:,1])

0.7944345493014471

Obtengo un resultado con los que separe para el test mas arriba en :
```python
    X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)
    ```
###### Lo que hace es sacar las predicciones para X_test y evaluarlos con y_test
    

In [137]:
entrie = my_classifier1.predict_proba(X_predict.iloc[:,1:])[:,1]

ValueError: DataFrame.dtypes for data must be int, float or bool.
                Did not expect the data types in fields person

Valores a subir a kaggle, lo que sigue son transformaciones a dataframe para convertirlo en csv, y algunas cosas para darme idea tipo por donde andan los valores normales en los resultados que me da

In [138]:
X_predict['label'] = entrie

NameError: name 'entrie' is not defined

In [115]:
df_entrie = X_predict[['person', 'label']]

KeyError: "['label'] not in index"

In [93]:
df_entrie.head()

NameError: name 'df_entrie' is not defined

In [94]:
df_entrie.to_csv(path_or_buf = 'submit_kaggle.csv', index = False)

NameError: name 'df_entrie' is not defined

In [95]:
df_entrie.shape

NameError: name 'df_entrie' is not defined

 # Random Forest feature importance
   - Algoritmo usado para sacar importancia de los features y ver cuales no nos estan sirviendo al modelo
 #### Es parecido a lo que hice en xgboost con algunas cosas magicas, que use para ponerlo en df y ver resultados
 
  ## NOTAR :
   - Que use la X de xgboost
   - Y que el codigo abajo del dataframe comentado puede funcionar para separar los df pidiendo que nos deje las columnas esas

In [116]:
X = X.fillna(0)

In [117]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
#Load boston housing dataset as an example

names = X.columns
rf = RandomForestRegressor()
rf.fit(X, y)
zipped = zip(map(lambda x: round(x, 4), rf.feature_importances_), names)
feature = sorted(zipped, key=lambda x: x[1])

In [118]:
feat_importance = pd.DataFrame(feature, columns=['importance', 'feature'])
feat_importance.sort_values('importance', ascending=False).head(100)

Unnamed: 0,importance,feature
5,0.3396,SUM(session_id.sku)
2,0.206,MIN(session_id.sku)
1,0.1702,MEAN(session_id.sku)
0,0.1234,MAX(session_id.sku)
4,0.0994,STD(session_id.sku)
3,0.0614,SKEW(session_id.sku)


In [119]:
feat_importance =feat_importance.loc[feat_importance['importance'].isin(feat_importance['importance'].nlargest(6))]
feats_servibles = feat_importance['feature'].tolist()
feats_servibles.append('person')
features_importantes = features[feats_servibles]

In [120]:
features_importantes.head()

Unnamed: 0,SUM(session_id.sku),person
0,19124.0,0008ed71
1,13831.0,00091926
2,0.0,00091a7a
3,22760.0,000ba417
4,9944.0,000c79fe


In [72]:
df_month.shape

(1713920, 24)

In [74]:
df.shape

(2341681, 24)

In [139]:
features.to_csv('new_features.csv', index=False)

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7fb66d091400>>
Traceback (most recent call last):
  File "/home/amaherok/.local/lib/python3.6/site-packages/xgboost/core.py", line 482, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'
Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7fb66cf44f98>>
Traceback (most recent call last):
  File "/home/amaherok/.local/lib/python3.6/site-packages/xgboost/core.py", line 482, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'
Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7fb66cf58ba8>>
Traceback (most recent call last):
  File "/home/amaherok/.local/lib/python3.6/site-packages/xgboost/core.py", line 482, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'
