In [1]:
import sys #access to system parameters https://docs.python.org/3/library/sys.html

import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features

import matplotlib #collection of functions for scientific and publication-ready visualization

import numpy as np #foundational package for scientific computing

import scipy as sp #collection of functions for scientific computing and advance mathematics

import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook

import sklearn #collection of machine learning algorithms

#misc libraries
import random
import time
import datetime as dt

#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)

import featuretools as ft

-------------------------


In [2]:
#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
import xgboost as xgb

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn.model_selection import train_test_split

from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.tools.plotting import scatter_matrix

#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8



In [3]:
data_raw = pd.read_csv('events_up_to_01062018.csv')
data_val = pd.read_csv('labels_training_set.csv')

In [4]:
data_raw.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,...,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,...,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,...,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,...,,,,,,,,,,


In [5]:
df_labels = data_val.copy(deep=True) 
df = data_raw.copy(deep=True)

In [6]:
pd.set_option('display.max_columns', 23)

In [7]:
df.describe(include= 'all')

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
count,2341681,2341681,2341681,191131,1320530.0,1321513,1320530,1320530,1320530,505949,113763,11201,191286,106406,204069,204069,204069,204069,204069,204069,204066,204069,204069
unique,1490912,11,38829,248,,208,5,8,63,52267,10964,14,23,4,7,2,2206,122,51,4,393,131,366
top,2018-05-31 01:59:16,viewed product,c76b8417,/,,iPhone 6,Bom,16GB,Preto,"2820,6706,6720,2750,6649,7251,6663,12604,7224,...",Iphone,CustomerService,google,Google,Paid,Returning,Unknown,Sao Paulo,Brazil,Smartphone,360x640,Windows 7,Chrome 66.0
freq,14,1248124,4438,64187,,107262,547617,442096,314925,2606,2577,5239,123354,105195,91753,165827,36866,57304,197699,103502,73234,46648,57953
mean,,,,,6899.178,,,,,,,,,,,,,,,,,,
std,,,,,4028.042,,,,,,,,,,,,,,,,,,
min,,,,,71.0,,,,,,,,,,,,,,,,,,
25%,,,,,2929.0,,,,,,,,,,,,,,,,,,
50%,,,,,7057.0,,,,,,,,,,,,,,,,,,
75%,,,,,10014.0,,,,,,,,,,,,,,,,,,


In [8]:
persons = (df.drop_duplicates('person'))['person'].to_frame()
persons_to_train = df_labels['person'].to_frame()
persons_to_predict = persons.loc[~persons['person'].isin(persons_to_train['person'])]
print(persons_to_train.shape)
print(persons_to_predict.shape)


(19414, 1)
(19415, 1)


In [9]:
df_newr = df.loc[df['new_vs_returning'].notnull()]

In [10]:
df_newr = df_newr.groupby(['person','new_vs_returning']).size().reset_index()\
                                                                            .rename(columns={0:'newr'})

In [11]:
df_newr = df_newr.sort_values(by='newr',ascending=False)\
                           .drop_duplicates(subset='person',keep='first')

In [12]:
df_newr.shape

(38242, 3)

In [13]:
df_newr['new_vs_returning'].value_counts()

New          19631
Returning    18611
Name: new_vs_returning, dtype: int64

In [14]:
df_computer_smartphone = df.loc[df['device_type'].notnull()]

In [15]:
df_computer_smartphone = df_computer_smartphone.groupby(['person','device_type']).size().reset_index()\
                                                                            .rename(columns={0:'computer_smartphone'})

In [16]:
df_computer_smartphone = df_computer_smartphone.sort_values(by='computer_smartphone',ascending=False)\
                           .drop_duplicates(subset='person',keep='first')

In [17]:
df_computer_smartphone['device_type'].value_counts()

Smartphone    20101
Computer      17462
Tablet          620
Unknown          59
Name: device_type, dtype: int64

In [18]:
df_computer_smartphone['device_type'] = df_computer_smartphone['device_type'].replace(to_replace='Tablet', value='Smartphone')

In [19]:
df_computer_smartphone = df_computer_smartphone.loc[df_computer_smartphone['device_type'] != 'Unknown']

In [20]:
df_computer_smartphone['device_type'].value_counts()

Smartphone    20721
Computer      17462
Name: device_type, dtype: int64

In [21]:
df_computer_smartphone.head()

Unnamed: 0,person,device_type,computer_smartphone
12480,5059f7fd,Computer,295
434,02f14240,Smartphone,294
16117,67bdc946,Computer,203
39806,ffee0f18,Computer,173
33781,d9777589,Computer,172


In [22]:
df_features = df_computer_smartphone.merge(df_newr, on='person', how='inner')

In [23]:
df_features['device_type'] = df_features['device_type'].replace(to_replace='Computer', value=1)
df_features['device_type'] = df_features['device_type'].replace(to_replace='Smartphone', value=0)

df_features['new_vs_returning'] = df_features['new_vs_returning'].replace(to_replace='Returning', value=1)
df_features['new_vs_returning'] = df_features['new_vs_returning'].replace(to_replace='New', value=0)
df_features.head()

Unnamed: 0,person,device_type,computer_smartphone,new_vs_returning,newr
0,5059f7fd,1,295,1,294
1,02f14240,0,294,1,293
2,67bdc946,1,203,1,209
3,ffee0f18,1,173,1,172
4,d9777589,1,172,1,171


## XGboost entrenamiento

In [24]:
df_train = df_labels.merge(df_features , left_on='person', right_on='person' , how='left')

In [25]:
df_train.head()

Unnamed: 0,person,label,device_type,computer_smartphone,new_vs_returning,newr
0,0566e9c1,0,0.0,17.0,1.0,16.0
1,6ec7ee77,0,0.0,1.0,0.0,1.0
2,abe7a2fb,0,0.0,22.0,1.0,21.0
3,34728364,0,0.0,4.0,1.0,3.0
4,87ed62de,0,1.0,1.0,0.0,1.0


In [26]:
#df_train_1 = df_train.loc[df_train['label'] == 1]
#df_train_0 = df_train.loc[df_train['label'] == 0]

#df_train2 = pd.concat([df_train_1, df_train_0.sample(2000)])

Los labels me dan mi set para entrenar, los que no se encuentran en labels tengo que predecirlos

Si ven aca, de la columna label en adelante tenemos los features.

In [27]:
X, y = df_train.iloc[:,2:],df_train.iloc[:,1]
X.head()

Unnamed: 0,device_type,computer_smartphone,new_vs_returning,newr
0,0.0,17.0,1.0,16.0
1,0.0,1.0,0.0,1.0
2,0.0,22.0,1.0,21.0
3,0.0,4.0,1.0,3.0
4,1.0,1.0,0.0,1.0


Separamos los datos para hacer xgboost de la siguiente forma


|Variable |Contiene|
|------------------------|-----------------------------------------------------|
|X| features que usa xgboost son solo numeros es decir que sacamos a la persona   |
|y| label de cada persona|


## Xgboost

Para evaluar usen esta medida que me da valores muy parecidos a los de kaggle, para hacer las predicciones usen el otro

In [28]:
my_classifier1 = xgb.XGBClassifier(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 6,
                subsample = 0.8,
                gamma = 1,
                n_estimators = 10)

Este es el arbol con sus hiperparametros

In [29]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

Aca vuelvo a separar los datos para poder realizar una metrica interna y ver masomenos como performan las cosas que hago

Obtengo las personas a predecir!

In [30]:
X_predict = persons_to_predict.merge(df_features, on='person', how='left')
X_predict.head()

Unnamed: 0,person,device_type,computer_smartphone,new_vs_returning,newr
0,4886f805,0.0,1.0,0.0,1.0
1,0297fc1e,0.0,95.0,1.0,94.0
2,2d681dd8,1.0,2.0,1.0,1.0
3,cccea85e,1.0,22.0,1.0,21.0
4,4c8a8b93,0.0,20.0,1.0,19.0


Necesito tener el mismo dataframe que tenia cuando lo entrene pero ahora para predecir, en este caso <b>X</b>

In [31]:
my_classifier1.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=1, learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=1, missing=None, n_estimators=10,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

Entreno al arbol!

In [32]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,  my_classifier1.predict_proba(X_test)[:,1])

0.6729826198431021

Obtengo un resultado con los que separe para el test mas arriba en :
```python
    X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)
    ```
###### Lo que hace es sacar las predicciones para X_test y evaluarlos con y_test
    

In [33]:
entrie = my_classifier1.predict_proba(X_predict.iloc[:,1:])[:,1]

Valores a subir a kaggle, lo que sigue son transformaciones a dataframe para convertirlo en csv, y algunas cosas para darme idea tipo por donde andan los valores normales en los resultados que me da

In [34]:
X_predict['label'] = entrie

In [35]:
df_entrie = X_predict[['person', 'label']]

In [36]:
df_entrie.head()

Unnamed: 0,person,label
0,4886f805,0.193642
1,0297fc1e,0.310879
2,2d681dd8,0.206945
3,cccea85e,0.248417
4,4c8a8b93,0.22107


In [37]:
df_entrie.to_csv(path_or_buf = 'submit_kaggle.csv', index = False)

In [38]:
df_entrie.shape

(19415, 2)

In [39]:
df_entrie['label'].nlargest(5)

2869     0.636466
2964     0.636466
9859     0.636466
13278    0.636466
59       0.622409
Name: label, dtype: float32

 # Random Forest feature importance
   - Algoritmo usado para sacar importancia de los features y ver cuales no nos estan sirviendo al modelo
 #### Es parecido a lo que hice en xgboost con algunas cosas magicas, que use para ponerlo en df y ver resultados
 
  ## NOTAR :
   - Que use la X de xgboost
   - Y que el codigo abajo del dataframe comentado puede funcionar para separar los df pidiendo que nos deje las columnas esas

In [40]:
X = X.fillna(0)

In [41]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
#Load boston housing dataset as an example

names = X.columns
rf = RandomForestRegressor()
rf.fit(X, y)
zipped = zip(map(lambda x: round(x, 4), rf.feature_importances_), names)
feature = sorted(zipped, key=lambda x: x[1])

In [42]:
feat_importance = pd.DataFrame(feature, columns=['importance', 'feature'])
feat_importance.sort_values('importance', ascending=False).head(100)

Unnamed: 0,importance,feature
3,0.5099,newr
0,0.4065,computer_smartphone
1,0.0681,device_type
2,0.0155,new_vs_returning


In [48]:
df_features = df_features.merge(persons, on='person', how='right')
df_features.shape

(38829, 5)

In [49]:
df_features.to_csv('newr_devt.csv')