In [1]:
import sys #access to system parameters https://docs.python.org/3/library/sys.html

import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features

import matplotlib #collection of functions for scientific and publication-ready visualization

import numpy as np #foundational package for scientific computing

import scipy as sp #collection of functions for scientific computing and advance mathematics

import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook

import sklearn #collection of machine learning algorithms

#misc libraries
import random
import time
import datetime as dt

#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)

import featuretools as ft

-------------------------


In [2]:
#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
import xgboost as xgb

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn.model_selection import train_test_split

from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.tools.plotting import scatter_matrix

#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8



In [3]:
data_raw = pd.read_csv('events_up_to_01062018.csv')
data_val = pd.read_csv('labels_training_set.csv')

In [4]:
data_raw.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,,


In [5]:
df_labels = data_val.copy(deep=True) 
df = data_raw.copy(deep=True)

In [6]:
pd.set_option('display.max_columns', 23)

In [7]:
df.describe(include= 'all')

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
count,2341681,2341681,2341681,191131,1320530.0,1321513,1320530,1320530,1320530,505949,113763,11201,191286,106406,204069,204069,204069,204069,204069,204069,204066,204069,204069
unique,1490912,11,38829,248,,208,5,8,63,52267,10964,14,23,4,7,2,2206,122,51,4,393,131,366
top,2018-05-31 01:59:16,viewed product,c76b8417,/,,iPhone 6,Bom,16GB,Preto,"2820,6706,6720,2750,6649,7251,6663,12604,7224,...",Iphone,CustomerService,google,Google,Paid,Returning,Unknown,Sao Paulo,Brazil,Smartphone,360x640,Windows 7,Chrome 66.0
freq,14,1248124,4438,64187,,107262,547617,442096,314925,2606,2577,5239,123354,105195,91753,165827,36866,57304,197699,103502,73234,46648,57953
mean,,,,,6899.178,,,,,,,,,,,,,,,,,,
std,,,,,4028.042,,,,,,,,,,,,,,,,,,
min,,,,,71.0,,,,,,,,,,,,,,,,,,
25%,,,,,2929.0,,,,,,,,,,,,,,,,,,
50%,,,,,7057.0,,,,,,,,,,,,,,,,,,
75%,,,,,10014.0,,,,,,,,,,,,,,,,,,


In [8]:
persons = (df.drop_duplicates('person'))['person'].to_frame()
persons_to_train = df_labels['person'].to_frame()
persons_to_predict = persons.loc[~persons['person'].isin(persons_to_train['person'])]


print(persons_to_train.shape)
print(persons_to_predict.shape)


(19414, 1)
(19415, 1)


In [9]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values(['timestamp','event'])
df = df.reset_index(drop=True)



In [10]:
df_dates_per_month = persons
df_months = df
df_months['month'] = df_months['timestamp'].dt.month

In [11]:
#for x in range(1,6):
#    df_month = df_months.loc[df['month'] == x]
 #   df_dates = df_month.groupby('person').agg({'timestamp':['max', 'min']}).reset_index()
  #  df_dates.columns = ['person', 'last month '+str(x), 'first month '+ str(x)]
   # df_dates['diferencia'+str(x)] = (df_dates['last month '+str(x)] - df_dates['first month '+ str(x)]).dt.days
    #print(df_dates.head(1))
     #df_dates_per_month = df_dates_per_month.merge(df_dates, on = 'person', how='left')
    

In [12]:
df_month = df_months.loc[df['month'] == 5]
df_dates = df_month.groupby('person').agg({'timestamp':['max', 'min']}).reset_index()
df_dates.columns = ['person', 'last month 5', 'first month 5']
df_dates['diferencia 5'] = (df_dates['last month 5'] - df_dates['first month 5']).dt.days



In [13]:
df_dates_per_month =df_dates_per_month.merge(df_dates, on='person', how='left')

In [14]:
es = ft.EntitySet(id = 'person')
es = es.entity_from_dataframe(entity_id = 'person_id', dataframe = df_dates_per_month, index = 'person')

In [15]:
%time features, feature_names = ft.dfs(entityset=es, target_entity='person_id', max_depth = 2)

CPU times: user 1.06 s, sys: 3.91 ms, total: 1.06 s
Wall time: 1.08 s


In [16]:
features.head()

Unnamed: 0_level_0,diferencia 5,DAY(last month 5),DAY(first month 5),YEAR(last month 5),YEAR(first month 5),MONTH(last month 5),MONTH(first month 5),WEEKDAY(last month 5),WEEKDAY(first month 5)
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0008ed71,0.0,17.0,17.0,2018.0,2018.0,5.0,5.0,3.0,3.0
00091926,27.0,31.0,3.0,2018.0,2018.0,5.0,5.0,3.0,3.0
00091a7a,,,,,,,,,
000ba417,9.0,26.0,17.0,2018.0,2018.0,5.0,5.0,5.0,3.0
000c79fe,0.0,29.0,29.0,2018.0,2018.0,5.0,5.0,1.0,1.0


In [17]:
features['person'] = features.index
features = features.reset_index(drop=True)
features.head()

Unnamed: 0,diferencia 5,DAY(last month 5),DAY(first month 5),YEAR(last month 5),YEAR(first month 5),MONTH(last month 5),MONTH(first month 5),WEEKDAY(last month 5),WEEKDAY(first month 5),person
0,0.0,17.0,17.0,2018.0,2018.0,5.0,5.0,3.0,3.0,0008ed71
1,27.0,31.0,3.0,2018.0,2018.0,5.0,5.0,3.0,3.0,00091926
2,,,,,,,,,,00091a7a
3,9.0,26.0,17.0,2018.0,2018.0,5.0,5.0,5.0,3.0,000ba417
4,0.0,29.0,29.0,2018.0,2018.0,5.0,5.0,1.0,1.0,000c79fe


In [18]:
features.describe(include = 'all')



Unnamed: 0,diferencia 5,DAY(last month 5),DAY(first month 5),YEAR(last month 5),YEAR(first month 5),MONTH(last month 5),MONTH(first month 5),WEEKDAY(last month 5),WEEKDAY(first month 5),person
count,37143.0,37143.0,37143.0,37143.0,37143.0,37143.0,37143.0,37143.0,37143.0,38829
unique,,,,,,,,,,38829
top,,,,,,,,,,fb1db379
freq,,,,,,,,,,1
mean,4.732763,24.545379,19.564306,2018.0,2018.0,5.0,5.0,2.551813,2.571359,
std,7.61219,5.688888,8.168224,0.0,0.0,0.0,0.0,1.695478,1.79957,
min,0.0,1.0,1.0,2018.0,2018.0,5.0,5.0,0.0,0.0,
25%,0.0,21.0,15.0,2018.0,2018.0,5.0,5.0,1.0,1.0,
50%,0.0,25.0,21.0,2018.0,2018.0,5.0,5.0,2.0,2.0,
75%,7.0,30.0,26.0,2018.0,2018.0,5.0,5.0,3.0,4.0,


In [19]:
features['diferencia 5'].fillna(int(features['diferencia 5'].median()), inplace = True)
features['DAY(last month 5)'].fillna(int(features['DAY(last month 5)'].mean()), inplace = True)
features['DAY(first month 5)'].fillna(int(features['DAY(first month 5)'].mean()), inplace = True)
features['WEEKDAY(last month 5)'].fillna(int(features['WEEKDAY(last month 5)'].mean()), inplace = True)
features['WEEKDAY(first month 5)'].fillna(int(features['WEEKDAY(first month 5)'].mean()), inplace = True)
lista = ['diferencia 5','DAY(last month 5)', 'DAY(first month 5)','WEEKDAY(last month 5)','WEEKDAY(first month 5)','person']

features = features[lista]

 


In [20]:
person = (features.loc[features['diferencia 5'].isnull()])['person']


In [22]:
features = persons.merge(features, on='person', how='left')

In [33]:
features.shape

(38829, 6)

In [31]:
features.to_csv('date.csv')

## XGboost entrenamiento

In [23]:
df_train = df_labels.merge(features , left_on='person', right_on='person' , how='left')
print(df_train.shape)
df_train.head()

(19414, 7)


Unnamed: 0,person,label,diferencia 5,DAY(last month 5),DAY(first month 5),WEEKDAY(last month 5),WEEKDAY(first month 5)
0,0566e9c1,0,8.0,31.0,22.0,3.0,1.0
1,6ec7ee77,0,0.0,28.0,28.0,0.0,0.0
2,abe7a2fb,0,27.0,29.0,2.0,1.0,2.0
3,34728364,0,8.0,27.0,18.0,6.0,4.0
4,87ed62de,0,0.0,19.0,18.0,5.0,4.0


In [24]:
#df_train_1 = df_train.loc[df_train['label'] == 1]
#df_train_0 = df_train.loc[df_train['label'] == 0]

#df_train2 = pd.concat([df_train_1, df_train_0.sample(2000)])

Los labels me dan mi set para entrenar, los que no se encuentran en labels tengo que predecirlos

Si ven aca, de la columna label en adelante tenemos los features.

In [25]:
X, y = df_train.iloc[:,2:],df_train.iloc[:,1]
X.head()

Unnamed: 0,diferencia 5,DAY(last month 5),DAY(first month 5),WEEKDAY(last month 5),WEEKDAY(first month 5)
0,8.0,31.0,22.0,3.0,1.0
1,0.0,28.0,28.0,0.0,0.0
2,27.0,29.0,2.0,1.0,2.0
3,8.0,27.0,18.0,6.0,4.0
4,0.0,19.0,18.0,5.0,4.0


Separamos los datos para hacer xgboost de la siguiente forma


|Variable |Contiene|
|------------------------|-----------------------------------------------------|
|X| features que usa xgboost son solo numeros es decir que sacamos a la persona   |
|y| label de cada persona|


## Xgboost

Para evaluar usen esta medida que me da valores muy parecidos a los de kaggle, para hacer las predicciones usen el otro

In [26]:
my_classifier1 = xgb.XGBClassifier(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 6,
                subsample = 0.8,
                gamma = 1,
                n_estimators = 10)

Este es el arbol con sus hiperparametros

In [27]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

Aca vuelvo a separar los datos para poder realizar una metrica interna y ver masomenos como performan las cosas que hago

Obtengo las personas a predecir!

In [28]:
X_predict = features.loc[features['person'].isin(persons_to_predict['person'])]
X_predict.head()

Unnamed: 0,person,diferencia 5,DAY(last month 5),DAY(first month 5),WEEKDAY(last month 5),WEEKDAY(first month 5)
0,4886f805,0.0,18.0,18.0,4.0,4.0
2,0297fc1e,26.0,28.0,2.0,0.0,2.0
3,2d681dd8,9.0,27.0,18.0,6.0,4.0
4,cccea85e,23.0,31.0,7.0,3.0,0.0
5,4c8a8b93,4.0,22.0,18.0,1.0,4.0


Necesito tener el mismo dataframe que tenia cuando lo entrene pero ahora para predecir, en este caso <b>X</b>

In [29]:
my_classifier1.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=1, learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=1, missing=None, n_estimators=10,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

Entreno al arbol!

In [30]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,  my_classifier1.predict_proba(X_test)[:,1])

0.7081364546521982

Obtengo un resultado con los que separe para el test mas arriba en :
```python
    X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)
    ```
###### Lo que hace es sacar las predicciones para X_test y evaluarlos con y_test
    

In [None]:
entrie = my_classifier1.predict_proba(X_predict.iloc[:,1:])[:,1]

Valores a subir a kaggle, lo que sigue son transformaciones a dataframe para convertirlo en csv, y algunas cosas para darme idea tipo por donde andan los valores normales en los resultados que me da

In [None]:
X_predict['label'] = entrie

In [None]:
df_entrie = X_predict[['person', 'label']]

In [None]:
df_entrie.head()

In [None]:
df_entrie.to_csv(path_or_buf = 'submit_kaggle.csv', index = False)

In [None]:
df_entrie.shape

 # Random Forest feature importance
   - Algoritmo usado para sacar importancia de los features y ver cuales no nos estan sirviendo al modelo
 #### Es parecido a lo que hice en xgboost con algunas cosas magicas, que use para ponerlo en df y ver resultados
 
  ## NOTAR :
   - Que use la X de xgboost
   - Y que el codigo abajo del dataframe comentado puede funcionar para separar los df pidiendo que nos deje las columnas esas

In [232]:
X = X.fillna(0)

In [233]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
#Load boston housing dataset as an example

names = X.columns
rf = RandomForestRegressor()
rf.fit(X, y)
zipped = zip(map(lambda x: round(x, 4), rf.feature_importances_), names)
feature = sorted(zipped, key=lambda x: x[1])

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7f5289749c88>>
Traceback (most recent call last):
  File "/home/amaherok/.local/lib/python3.6/site-packages/xgboost/core.py", line 482, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'
Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7f52897499b0>>
Traceback (most recent call last):
  File "/home/amaherok/.local/lib/python3.6/site-packages/xgboost/core.py", line 482, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'
Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7f52891c6128>>
Traceback (most recent call last):
  File "/home/amaherok/.local/lib/python3.6/site-packages/xgboost/core.py", line 482, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'
Exception ignored i

In [234]:
feat_importance = pd.DataFrame(feature, columns=['importance', 'feature'])
feat_importance.sort_values('importance', ascending=False).head(100)

Unnamed: 0,importance,feature
5,0.3247,diff_max
6,0.3247,diff_mean
7,0.1002,session_count
4,0.0707,diferencia 5
0,0.0572,DAY(first month 5)
1,0.0499,DAY(last month 5)
2,0.0392,WEEKDAY(first month 5)
3,0.0335,WEEKDAY(last month 5)


In [230]:
feat_importance =feat_importance.loc[feat_importance['importance'].isin(feat_importance['importance'].nlargest(10))]
feats_servibles = feat_importance['feature'].tolist()
feats_servibles.append('person')
features = features[feats_servibles]

In [231]:
features.head()

Unnamed: 0,DAY(first month 5),DAY(last month 5),WEEKDAY(first month 5),WEEKDAY(last month 5),diferencia 5,person
0,18.0,18.0,4.0,4.0,0.0,4886f805
1,14.0,22.0,0.0,1.0,7.0,ad93850f
2,2.0,28.0,2.0,0.0,26.0,0297fc1e
3,18.0,27.0,4.0,6.0,9.0,2d681dd8
4,7.0,31.0,0.0,3.0,23.0,cccea85e


In [72]:
df_month.shape

(1713920, 24)

In [74]:
df.shape

(2341681, 24)

In [75]:
1713920/2341681 * 100

73.19186516011361

In [143]:
df_bl = df.loc[df['month']==4]
df_bl.shape

(309849, 24)

In [144]:
309849 /2341681 *100

13.23190477268253

In [145]:
df_bl = df.loc[df['month']==3]
df_bl.shape

(193790, 24)

In [146]:
193790 /2341681 *100

8.275678881965561

In [147]:
df_bl = df.loc[df['month']==2]
df_bl.shape

(73541, 24)

In [148]:
73541/2341681 *100

3.140521702144741

In [150]:
df_bl = df.loc[df['month']==1]
df_bl.shape

(50581, 24)

In [151]:
50581/2341681 *100

2.1600294830935556

In [None]:
df_