In [3]:
import sys #access to system parameters https://docs.python.org/3/library/sys.html

import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features

import matplotlib #collection of functions for scientific and publication-ready visualization

import numpy as np #foundational package for scientific computing

import scipy as sp #collection of functions for scientific computing and advance mathematics

import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook

import sklearn #collection of machine learning algorithms

#misc libraries
import random
import time
import datetime as dt

#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)

import featuretools as ft
from sklearn.feature_extraction.text import CountVectorizer

-------------------------


In [4]:
#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
import xgboost as xgb

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn.model_selection import train_test_split

from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.tools.plotting import scatter_matrix

#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8



In [5]:
data_raw = pd.read_csv('events_up_to_01062018.csv')
data_val = pd.read_csv('labels_training_set.csv')

In [6]:
data_raw.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,,


In [7]:
df_labels = data_val.copy(deep=True) 
df = data_raw.copy(deep=True)

In [8]:
pd.set_option('display.max_columns', 23)

In [9]:
persons = (df.drop_duplicates('person'))['person'].to_frame()
persons_to_train = df_labels['person'].to_frame()
persons_to_predict = persons.loc[~persons['person'].isin(persons_to_train['person'])]


print(persons_to_train.shape)
print(persons_to_predict.shape)


(19414, 1)
(19415, 1)


In [10]:
df.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,,,,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,,,,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,,,,,,,,,,,,,


In [11]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values(['timestamp','event'])
df = df.reset_index(drop=True)



In [12]:
df_dates_per_month = persons
df_months = df
df_months['month'] = df_months['timestamp'].dt.month
df_months['days'] = df_months['timestamp'].dt.day

In [13]:
df_month = df_months.loc[df['month'] == 5]
df_month.shape



(1713920, 25)

In [29]:
people_days = df_month.drop_duplicates(['person','days'])
people_days['count'] = 1

In [30]:
people_days = people_days.groupby('days').agg({'count':'sum'}).reset_index()
people_days

Unnamed: 0,days,count
0,1,889
1,2,1139
2,3,1229
3,4,1150
4,5,964
5,6,932
6,7,1324
7,8,1360
8,9,1477
9,10,1543


In [26]:
event_days = df_month
event_days['count'] = 1

In [28]:
event_days = df_month.groupby('days').agg({'count':'sum'}).reset_index()
event_days

Unnamed: 0,days,count
0,1,14812
1,2,18972
2,3,20158
3,4,19329
4,5,15091
5,6,15069
6,7,20657
7,8,20633
8,9,24140
9,10,26158


In [34]:
days_feat = df_month[['person','days']].merge(event_days, on='days', how='left')
days_feat.head()

Unnamed: 0,person,days,count
0,7dab1178,1,14812
1,4ba8900f,1,14812
2,7dab1178,1,14812
3,2cbd12ad,1,14812
4,3461e3cd,1,14812


In [35]:
days_feat = days_feat.merge(people_days, on='days', how='left')
days_feat.head()

Unnamed: 0,person,days,count_x,count_y
0,7dab1178,1,14812,889
1,4ba8900f,1,14812,889
2,7dab1178,1,14812,889
3,2cbd12ad,1,14812,889
4,3461e3cd,1,14812,889


In [36]:
most_events_day = days_feat.sort_values(by=['count_x'],ascending = [False]).drop_duplicates(subset='person',keep='first')
most_person_day = days_feat.sort_values(by=['count_y'],ascending = [False]).drop_duplicates(subset='person',keep='first')

In [41]:
most_events_day = most_events_day.drop(['days','count_y'],axis=1)

In [43]:
most_person_day = most_person_day.drop(['days','count_x'],axis=1)

In [44]:
most_events_day.shape

(37143, 2)

In [45]:
days_feat = most_events_day.merge(most_person_day, on='person', how='left')

In [47]:
days_feat.head()

Unnamed: 0,person,count_x,count_y
0,a5949369,109435,5961
1,631f3461,109435,6057
2,cd7b5c8e,109435,5961
3,3f8209df,109435,5961
4,01f8fa5b,109435,5961


In [48]:
features = persons.merge(days_feat, on='person', how = 'left')

## XGboost entrenamiento para evaluar feature solo

In [49]:
df_train = df_labels.merge(days_feat , left_on='person', right_on='person' , how='inner')

In [50]:
df_train.head()

Unnamed: 0,person,label,count_x,count_y
0,0566e9c1,0,109435,6057
1,6ec7ee77,0,88954,5197
2,abe7a2fb,0,109435,5961
3,34728364,0,79569,4726
4,87ed62de,0,79569,4479


In [51]:
X, y = df_train.iloc[:,2:],df_train.iloc[:,1]
X.head()

Unnamed: 0,count_x,count_y
0,109435,6057
1,88954,5197
2,109435,5961
3,79569,4726
4,79569,4479


In [52]:
import xgboost as xgb
model = xgb.XGBClassifier(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 7,
                subsample = 0.8,
                gamma = 1,
                n_estimators = 65)



Este es el arbol con sus hiperparametros

In [53]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [54]:
model.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=1, learning_rate=0.1, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=65,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

In [55]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,  model.predict_proba(X_test)[:,1])

0.6219102516635473

In [56]:
data_dmatrix = xgb.DMatrix(data=X,label=y)

In [57]:
params = {"objective":"binary:logistic",'colsample_bytree':1,
          'learning_rate': 0.1, 'max_depth': 7, 'gamma': 1,'n_estimators': 65}
cv_val = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50, early_stopping_rounds=10,
                    metrics="auc", as_pandas=True, seed=123)

[12:12:00] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 4 extra nodes, 0 pruned nodes, max_depth=2
[12:12:00] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 4 extra nodes, 0 pruned nodes, max_depth=2
[12:12:00] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 4 extra nodes, 0 pruned nodes, max_depth=2
[12:12:00] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 4 extra nodes, 0 pruned nodes, max_depth=2
[12:12:00] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 4 extra nodes, 0 pruned nodes, max_depth=2
[12:12:00] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 4 extra nodes, 0 pruned nodes, max_depth=2
[12:12:00] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 4 extra nodes, 0 pruned nodes, max_depth=2
[12:12:00] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 4 extra nodes, 0 pruned nodes, max_depth=2
[12:12:00] /workspace/sr

[12:12:01] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 18 extra nodes, 26 pruned nodes, max_depth=7
[12:12:01] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 16 extra nodes, 18 pruned nodes, max_depth=5
[12:12:01] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 8 extra nodes, 32 pruned nodes, max_depth=4
[12:12:01] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 20 extra nodes, 26 pruned nodes, max_depth=7
[12:12:01] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 20 pruned nodes, max_depth=5
[12:12:01] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 24 pruned nodes, max_depth=4
[12:12:01] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 22 extra nodes, 24 pruned nodes, max_depth=7
[12:12:01] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 16 extra nodes, 24 pruned nodes, max_depth=5
[12:12:01

In [58]:
cv_val

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.62471,0.005121,0.616113,0.012605
1,0.625705,0.006171,0.617675,0.010848
2,0.625705,0.006171,0.617675,0.010848
3,0.625781,0.006184,0.617671,0.010849
4,0.625787,0.006186,0.61767,0.010849
5,0.625815,0.006217,0.617648,0.010873
6,0.629142,0.00261,0.618637,0.012203
7,0.636585,0.006337,0.62708,0.0065
8,0.637978,0.0044,0.629329,0.009429
9,0.637978,0.0044,0.629329,0.009429


In [59]:
days_feat.to_csv('BigD-Feat.csv',index=False)