# Modelos de ML a utilizar

In [83]:
def xgb_classifier(X_train, X_test, y_train, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    alg = xgb.XGBClassifier(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 7,
                subsample = 0.9,
                gamma = 1,
                n_estimators = 50)
    
    print('\nXGBoost Classifier')
    alg.fit(X_train, y_train, eval_metric='auc')
    
    pred_proba = alg.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, pred_proba)
    
    print('El puntaje auc es: {}'.format(auc))
    
    return alg

In [84]:
def rf_classifier(X, y):
    RFC = RandomForestClassifier()


    rf_param_grid = {"max_depth": [None],
                  "max_features": [1, 3, 10],
                  "min_samples_split": [2, 3, 10],
                  "min_samples_leaf": [1, 3, 10],
                  "bootstrap": [False],
                  "n_estimators" :[50,65,100],
                  "criterion": ["gini"]}


    gsRFC = GridSearchCV(RFC,param_grid = rf_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)

    gsRFC.fit(X,y)

    RFC_best = gsRFC.best_estimator_

    # Best score
    print('Random classifier')
    print(gsRFC.best_score_)
    
    return RFC_best

In [85]:
def dt_classifier(X_train, X_test, y_train, y_test):
    clf = tree.DecisionTreeClassifier(random_state = 100,
    max_depth=8, min_samples_leaf=4)
    
    clf.fit(X_train, y_train)
    
    predict = clf.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, predict)
    print('\nDecision Tree')
    print('El puntaje auc es: {}'.format(auc))
    
    return clf

In [86]:
def knn_classifier(X_train, X_test, y_train, y_test):
    """Aplica KNN al test entregado, primero haciendo gridsearch para sacar el k optimo"""
    
    k_range = list(range(1, 2))
    param_grid = dict(n_neighbors=k_range)
    
    knn = KNeighborsClassifier(n_neighbors=5)
    grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
    
    grid.fit(X_train, y_train)
    
    knn_best = grid.best_estimator_
    
    
    predict = knn_best.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, predict)
    print('\nKNN con k=3')
    print('El puntaje auc es: {}'.format(auc))
    
    return knn_best

In [87]:
def ext_classifier(X, y):
    ExtC = ExtraTreesClassifier()


    ## Search grid for optimal parameters
    ex_param_grid = {"max_depth": [None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [False],
              "n_estimators" :[100,300],
              "criterion": ["gini"]}


    gsExtC = GridSearchCV(ExtC,param_grid = ex_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)

    gsExtC.fit(X, y)

    ExtC_best = gsExtC.best_estimator_

    # Best score
    print('Extra tree classifier')
    print(gsExtC.best_score_)
    return ExtC_best

In [88]:
def lgb_classifier(X_train, X_test, y_train, y_test):
    lgb_cl = lgb.LGBMClassifier(learning_rate=0.005,objective='binary',num_leaves=55,max_depth=13,
                        n_estimators=60,colsample_bytree=0.8,n_jobs=-1,
                        random_state=0,silent=False,subsample=0.8,
                        sumsample_freq=0.5)

    lgb_cl.fit(X_train,y_train)
    predict = lgb_cl.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, predict)
    print('\nLGB Classifier')
    print('El puntaje auc es: {}'.format(auc))
    
    return lgb_cl

In [89]:
def mlp_classifier(X,y):
    mlp = MLPClassifier()

    mlp_param_grid = {'solver': ['lbfgs'],
                  'max_iter': [50,100],
                  'alpha': 10.0 ** -np.arange(5, 10), 
                  'hidden_layer_sizes':np.arange(13, 15),
                  'random_state':[1,5,6,9]}


    gsmlp = GridSearchCV(mlp,param_grid = mlp_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)

    gsmlp.fit(X,y)

    mlp_best = gsmlp.best_estimator_
    print('\nRed Neuronal Classifier')
    print(gsmlp.best_score_)
    return mlp_best

In [90]:
def gb_classifier(X,y):
    GBC = GradientBoostingClassifier()
    gb_param_grid = {'loss' : ["deviance"],
                      'n_estimators' : [50,65,100],
                      'learning_rate': [0.1, 0.05, 0.01],
                      'max_depth': [4, 8],
                      'min_samples_leaf': [100,150],
                      'max_features': [0.3, 0.1],
                      }
                               
    gsGBC = GridSearchCV(GBC,param_grid = gb_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)
    gsGBC.fit(X,y)
  
    GBC_best = gsGBC.best_estimator_
  
    # Best score\n",
    print('Gradient Boosting')
    print(gsGBC.best_score_)
    return GBC_best

In [91]:
def ensamblador(estimadores,X_train, X_test, y_train, y_test):
    eclf3 = VotingClassifier(estimators=[
       ('xgb', estimadores[0]), ('lgb', estimadores[1]),('mlp', estimadores[2]),('gb',estimadores[3])],
       voting='soft', weights=[1, 0.5, 0.5, 1],
      flatten_transform=True)

    eclf3.fit(X_train,y_train)
    
    predict = eclf3.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, predict)
    
    print('Ensamble final')
    print('El puntaje auc es: {}'.format(auc))
    
    return eclf3

# Predicciones en varios modelos

In [92]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import datetime
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
import lightgbm as lgb
from sklearn.neural_network import MLPClassifier

In [93]:
df_original = pd.read_csv('events_up_to_01062018.csv', low_memory = False)
df_labels= pd.read_csv('labels_training_set.csv', low_memory = False)

In [94]:
df_features_numericos = pd.read_csv('features_numericos_merge_final.csv', low_memory=False)
df_features_bool = pd.read_csv('features_bool.csv', low_memory=False)
df_features_categoricos = pd.read_csv('features_categoricos_reducidos.csv', low_memory=False)

In [95]:
df_features_bool.shape

(38829, 6)

In [96]:
df_features_numericos.shape

(38829, 34)

In [97]:
df_features_categoricos.shape

(38829, 7)

In [98]:
df_original['person'].drop_duplicates().shape

(38829,)

## Trabajo en los features por separado, empiezo por los numericos

A los numericos hago un fillna con ceros

In [99]:
df_features_numericos = df_features_numericos.fillna(0)

In [100]:
df_features_numericos.isnull().sum()

person                                 0
conversion_sum                         0
conversion_mean                        0
viewed_sum                             0
viewed_mean                            0
viewed_std                             0
dif_5_check                            0
last_day_check                         0
first_day_check                        0
last_week_check                        0
first_week_check                       0
cant_checkouts_5                       0
%checkouts                             0
checkouts_ult_semana                   0
act_primera_semana                     0
act_ultima_semana                      0
mayor_actividad_ult_semana             0
cant_modelos_distintos                 0
cant_checkouts_dif_modelos             0
MAX(cant_interacciones_por_modelo)     0
MEAN(cant_interacciones_por_modelo)    0
Checkout max                           0
Checkout mean                          0
cant_lead_5to_mes                      0
cant_modelos_que

## Categoricos

Fabrico los dummies

In [101]:
df_features_categoricos.head()

Unnamed: 0,person,region_persona,pais,semana_mas_interactuante,device_type,channel_frecuente,marca_mas_buscada
0,4886f805,Rio de Janeiro,Brazil,Third,Smartphone,Organic,Samsung
1,ad93850f,Sao Paulo,Brazil,Third,Smartphone,Paid,iPhone
2,0297fc1e,Rio de Janeiro,Brazil,Fourth,Smartphone,Direct,iPhone
3,2d681dd8,Sao Paulo,Brazil,Fourth,Computer,Organic,iPhone
4,cccea85e,Sao Paulo,Brazil,Third,Computer,Organic,Motorola


In [102]:
df_features_categoricos = pd.concat([df_features_categoricos, pd.get_dummies(df_features_categoricos.iloc[:,1:])], axis=1)

In [103]:
df_features_categoricos.head()

Unnamed: 0,person,region_persona,pais,semana_mas_interactuante,device_type,channel_frecuente,marca_mas_buscada,region_persona_0,region_persona_Acre,region_persona_Alagoas,...,channel_frecuente_Organic,channel_frecuente_Paid,channel_frecuente_Referral,channel_frecuente_Social,channel_frecuente_Unknown,marca_mas_buscada_LG,marca_mas_buscada_Motorola,marca_mas_buscada_Other,marca_mas_buscada_Samsung,marca_mas_buscada_iPhone
0,4886f805,Rio de Janeiro,Brazil,Third,Smartphone,Organic,Samsung,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,ad93850f,Sao Paulo,Brazil,Third,Smartphone,Paid,iPhone,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,0297fc1e,Rio de Janeiro,Brazil,Fourth,Smartphone,Direct,iPhone,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2d681dd8,Sao Paulo,Brazil,Fourth,Computer,Organic,iPhone,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,cccea85e,Sao Paulo,Brazil,Third,Computer,Organic,Motorola,0,0,0,...,1,0,0,0,0,0,1,0,0,0


In [104]:
df_features_categoricos = df_features_categoricos.drop(columns=['region_persona','pais','semana_mas_interactuante','device_type','channel_frecuente','marca_mas_buscada'])

In [105]:
df_features_categoricos.head()

Unnamed: 0,person,region_persona_0,region_persona_Acre,region_persona_Alagoas,region_persona_Amapa,region_persona_Amazonas,region_persona_Arkansas,region_persona_Asuncion,region_persona_Aveiro,region_persona_Bahia,...,channel_frecuente_Organic,channel_frecuente_Paid,channel_frecuente_Referral,channel_frecuente_Social,channel_frecuente_Unknown,marca_mas_buscada_LG,marca_mas_buscada_Motorola,marca_mas_buscada_Other,marca_mas_buscada_Samsung,marca_mas_buscada_iPhone
0,4886f805,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,ad93850f,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,0297fc1e,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2d681dd8,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,cccea85e,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0


## Booleano

Los transformo a numericos (1 o 0)

In [106]:
df_features_bool.head()

Unnamed: 0,person,busco_top_5_visitas,visito_mas_que_el_promedio,siempre_incrementando,inc_ultimo_mes,mismo_interes_ultimos_dos_meses
0,4886f805,0.0,False,0,1,0
1,ad93850f,1.0,True,0,1,0
2,0297fc1e,1.0,True,0,1,1
3,2d681dd8,1.0,False,0,1,0
4,cccea85e,0.0,True,0,1,0


In [107]:
df_features_bool.iloc[:,1:] = df_features_bool.iloc[:,1:].astype(int)

In [108]:
df_features_bool.shape

(38829, 6)

# Mergeo los features 

In [326]:
df_features_totales = df_features_bool.merge(df_features_numericos, on='person', how='left')

In [327]:
df_features_totales = df_features_totales.merge(df_features_categoricos, on='person', how='left')

In [328]:
df_features_totales = df_features_totales.merge(pd.read_csv('neww_features.csv'), on='person', how='left')

In [329]:
df_features_totales.shape

(38829, 323)

In [330]:
df_original['person'].drop_duplicates().shape

(38829,)

Aparentemente tengo datos de todos los usuarios

In [331]:
df_features_totales = df_features_totales.T.drop_duplicates().T

In [332]:
df_features_totales.shape

(38829, 305)

# Data preprocessing

In [534]:
df_train = df_labels.merge(df_features_totales[f_m_i], on='person', how='left')

In [535]:
df_train.shape

(19414, 102)

In [536]:
df_train = df_train.fillna(0)

In [537]:
df_train_0 = df_train.loc[df_train['label'] == 0, :]
df_train_1 = df_train.loc[df_train['label'] == 1, :]
df_train_equal = pd.concat([df_train_1, shuffle(df_train_0).iloc[:980,:] ],axis=0)

In [538]:
X,y = df_train.iloc[:,2:], df_train.iloc[:,1]

In [539]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

# Modelaje y entrenamiento!

In [540]:
xgb_model = xgb_classifier(X_train, X_test, y_train, y_test,useTrainCV=True)
#rf_model = rf_classifier(X, y)
#ext_model = ext_classifier(X, y)
lgb_model = lgb_classifier(X_train, X_test, y_train, y_test)
mlp_model = mlp_classifier(X,y)
gb_model = gb_classifier(X, y)
#knn_model = knn_classifier(X_train,X_test, y_train, y_test)


XGBoost Classifier
El puntaje auc es: 0.8802646730159449

LGB Classifier
El puntaje auc es: 0.8750744499214376
Fitting 2 folds for each of 80 candidates, totalling 160 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   18.6s
[Parallel(n_jobs=4)]: Done 160 out of 160 | elapsed:  1.0min finished



Red Neuronal Classifier
0.842550643996342
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   39.1s
[Parallel(n_jobs=4)]: Done 144 out of 144 | elapsed:  2.0min finished


Gradient Boosting
0.8686567965582674


In [542]:
ensamble = ensamblador([xgb_model,lgb_model,mlp_model,gb_model],X_train, X_test, y_train, y_test)

Ensamble final
El puntaje auc es: 0.880551836998633


# Xgboost 
 |Features      |      XGBoost             | RandomForest           |
 |--------------|--------------------------|------------------------|
 | 308  |  0.8781084614817377 |   0.8485298904198764|
 | 250  |  0.8783623002614965 |   0.8495565536619335|
 | 200  |  0.8755176042157091 |   0.8495781973416469|
 | 150  |   0.8769683141134361 |   0.8502470479349384|
 | 100  |   0.8746185327834912 |   0.8506912692385189|
 | 50   |   0.8751496088897461 |   0.8483885145682447| 
 | 40   |   0.8662921953792833 |   0.845880394036751 |
 | 30   |   0.8630348340546704 |   0.8512362360589241|

In [530]:
zipped = zip(map(lambda x: round(x, 4), rf_model.feature_importances_), X_test.columns)
feature = sorted(zipped, key=lambda x: x[1])

In [531]:
feat_importance = pd.DataFrame(feature, columns=['importance', 'feature'])
f_m_i = list(feat_importance.sort_values('importance', ascending=False).head(100)['feature'].values)

In [532]:
f_m_i

['last_day_check',
 'first_day_check',
 'Checkout mean',
 'checkout + visited site',
 'Checkout max',
 '%checkouts',
 'PERCENTILE(visited site - checkout)',
 'cant_modelos_distintos',
 'PERCENTILE(checkout)',
 'MAX(cant_interacciones_por_modelo)',
 'visitas_mes5',
 'cant_checkouts_5',
 'checkout + viewed product',
 'Promedio_visitas_producto',
 'visited site + visited site - checkout',
 'horas_mirando_productos',
 'viewed product',
 'PERCENTILE(checkout - visited site)',
 'checkout + viewed product - visited site',
 'checkout + visited site - checkout',
 'viewed_std',
 'PERCENTILE(checkout + viewed product)',
 'MEAN(cant_interacciones_por_modelo)',
 'last_week_check',
 'promedio_ingreso_mensual',
 'viewed product + viewed product - visited site',
 'checkout - visited site + visited site',
 'act_ultima_semana',
 'Cantidad_visitas',
 'viewed product - visited site + visited site',
 'PERCENTILE(viewed product)',
 'device_type_Smartphone',
 'viewed_sum',
 'PERCENTILE(checkout - viewed prod

In [533]:
f_m_i.append('person')
f_m_i

['last_day_check',
 'first_day_check',
 'Checkout mean',
 'checkout + visited site',
 'Checkout max',
 '%checkouts',
 'PERCENTILE(visited site - checkout)',
 'cant_modelos_distintos',
 'PERCENTILE(checkout)',
 'MAX(cant_interacciones_por_modelo)',
 'visitas_mes5',
 'cant_checkouts_5',
 'checkout + viewed product',
 'Promedio_visitas_producto',
 'visited site + visited site - checkout',
 'horas_mirando_productos',
 'viewed product',
 'PERCENTILE(checkout - visited site)',
 'checkout + viewed product - visited site',
 'checkout + visited site - checkout',
 'viewed_std',
 'PERCENTILE(checkout + viewed product)',
 'MEAN(cant_interacciones_por_modelo)',
 'last_week_check',
 'promedio_ingreso_mensual',
 'viewed product + viewed product - visited site',
 'checkout - visited site + visited site',
 'act_ultima_semana',
 'Cantidad_visitas',
 'viewed product - visited site + visited site',
 'PERCENTILE(viewed product)',
 'device_type_Smartphone',
 'viewed_sum',
 'PERCENTILE(checkout - viewed prod

# Prediccion para el tp

In [543]:
personas_a_predecir = pd.read_csv('trocafone_kaggle_test.csv', low_memory=False)

In [544]:
df_features_totales[f_m_i].columns

Index(['last_day_check', 'first_day_check', 'Checkout mean',
       'checkout + visited site', 'Checkout max', '%checkouts',
       'PERCENTILE(visited site - checkout)', 'cant_modelos_distintos',
       'PERCENTILE(checkout)', 'MAX(cant_interacciones_por_modelo)',
       ...
       'conversion - visited site + viewed product - conversion',
       'checkout - visited site + viewed product - conversion',
       'conversion - visited site + visited site - viewed product',
       'conversion - visited site + viewed product - checkout',
       'checkout - viewed product + viewed product - visited site',
       'marca_mas_buscada_iPhone', 'mayor_actividad_ult_semana',
       'conversion + viewed product', 'PERCENTILE(conversion)', 'person'],
      dtype='object', length=101)

In [545]:
personas_a_predecir_con_features = personas_a_predecir.merge(df_features_totales[f_m_i], on='person', how='left')

In [546]:
personas_a_predecir_con_features = personas_a_predecir_con_features.fillna(0)

In [547]:
personas_a_predecir_con_features.head()

Unnamed: 0,person,last_day_check,first_day_check,Checkout mean,checkout + visited site,Checkout max,%checkouts,PERCENTILE(visited site - checkout),cant_modelos_distintos,PERCENTILE(checkout),...,PERCENTILE(conversion - viewed product),conversion - visited site + viewed product - conversion,checkout - visited site + viewed product - conversion,conversion - visited site + visited site - viewed product,conversion - visited site + viewed product - checkout,checkout - viewed product + viewed product - visited site,marca_mas_buscada_iPhone,mayor_actividad_ult_semana,conversion + viewed product,PERCENTILE(conversion)
0,4886f805,18.0,18.0,1.0,2.0,1.0,11.111111,0.270489,1.0,0.330125,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0
1,0297fc1e,22.0,10.0,1.0,102.0,1.0,1.156069,0.997522,11.0,0.964274,...,0.0,0.0,0.0,0.0,0.0,-88.0,1,0,0.0,0.0
2,2d681dd8,27.0,27.0,1.0,3.0,1.0,3.846154,0.530973,3.0,0.330125,...,0.0,0.0,0.0,0.0,0.0,-1.0,1,0,0.0,0.0
3,cccea85e,11.0,11.0,1.0,23.0,1.0,0.119617,0.96308,57.0,0.330125,...,0.0,0.0,0.0,0.0,0.0,-21.0,0,1,0.0,0.0
4,4c8a8b93,19.0,18.0,1.0,22.0,1.0,0.77821,0.953525,9.0,0.739987,...,0.0,0.0,0.0,0.0,0.0,-18.0,1,0,0.0,0.0


In [548]:
personas_a_predecir_con_features = personas_a_predecir_con_features.drop(columns = ['person'])

In [549]:
personas_a_predecir_con_features.columns

Index(['last_day_check', 'first_day_check', 'Checkout mean',
       'checkout + visited site', 'Checkout max', '%checkouts',
       'PERCENTILE(visited site - checkout)', 'cant_modelos_distintos',
       'PERCENTILE(checkout)', 'MAX(cant_interacciones_por_modelo)',
       'visitas_mes5', 'cant_checkouts_5', 'checkout + viewed product',
       'Promedio_visitas_producto', 'visited site + visited site - checkout',
       'horas_mirando_productos', 'viewed product',
       'PERCENTILE(checkout - visited site)',
       'checkout + viewed product - visited site',
       'checkout + visited site - checkout', 'viewed_std',
       'PERCENTILE(checkout + viewed product)',
       'MEAN(cant_interacciones_por_modelo)', 'last_week_check',
       'promedio_ingreso_mensual',
       'viewed product + viewed product - visited site',
       'checkout - visited site + visited site', 'act_ultima_semana',
       'Cantidad_visitas', 'viewed product - visited site + visited site',
       'PERCENTILE(viewed 

In [550]:
final_prediction = ensamble.predict_proba(personas_a_predecir_con_features)[:,1]

In [551]:
final_prediction_tp = pd.Series(final_prediction)

In [552]:
final_prediction_tp.sort_values()

7037     0.006523
4003     0.007530
16689    0.007972
6579     0.008984
15235    0.009080
18963    0.009116
4063     0.009376
10089    0.009684
17226    0.009711
1828     0.009714
12696    0.009720
6710     0.009727
18166    0.009753
8392     0.009793
18281    0.009819
11868    0.009844
13247    0.009867
4161     0.009948
3841     0.009979
4172     0.010079
18169    0.010079
13646    0.010080
3688     0.010080
14641    0.010080
13632    0.010080
4147     0.010080
7323     0.010081
7338     0.010081
3583     0.010081
14040    0.010081
           ...   
4880     0.423132
9128     0.423830
4768     0.424669
4813     0.424725
4789     0.426336
16126    0.426769
12849    0.429609
2872     0.430798
9107     0.434359
8705     0.438266
9127     0.438916
4869     0.440144
4502     0.442537
4512     0.445112
3147     0.446130
9012     0.446909
1519     0.447904
11555    0.452892
4309     0.455073
8321     0.457014
12704    0.461229
16103    0.472415
4793     0.473852
8902     0.475782
16111    0

In [553]:
personas_a_predecir['label'] = final_prediction_tp

In [554]:
num = personas_a_predecir._get_numeric_data()
num[num < 0] = 0

In [555]:
personas_a_predecir.to_csv(path_or_buf = 'submit_kaggle.csv', index = False)