# Modelos de ML a utilizar

In [71]:
def xgb_classifier(X, y):
    
    XGB = xgb.XGBClassifier(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 7,
                subsample = 0.9,
                gamma = 1,
                n_estimators = 50)
    
    
    xgb_param_grid = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
    }
    
    
    gsXGB = GridSearchCV(XGB, param_grid = xgb_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)
    print('\nXGBoost Classifier')
    gsXGB.fit(X_train, y_train, eval_metric='auc')
    
    
    XGB_best = gsXGB.best_estimator_

    # Best score
    print('Random classifier')
    print(gsXGB.best_score_)
    
    return XGB_best

In [68]:
def rf_classifier(X, y):
    RFC = RandomForestClassifier()


    rf_param_grid = {"max_depth": [None],
                  "max_features": [1, 3, 10],
                  "min_samples_split": [2, 3, 10],
                  "min_samples_leaf": [1, 3, 10],
                  "bootstrap": [False],
                  "n_estimators" :[50,65,100],
                  "criterion": ["gini"]}


    gsRFC = GridSearchCV(RFC,param_grid = rf_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)

    gsRFC.fit(X,y)

    RFC_best = gsRFC.best_estimator_

    # Best score
    print('Random classifier')
    print(gsRFC.best_score_)
    
    return RFC_best

In [3]:
def dt_classifier(X_train, X_test, y_train, y_test):
    clf = tree.DecisionTreeClassifier(random_state = 100,
    max_depth=8, min_samples_leaf=4)
    
    clf.fit(X_train, y_train)
    
    predict = clf.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, predict)
    print('\nDecision Tree')
    print('El puntaje auc es: {}'.format(auc))
    
    return clf

In [4]:
def knn_classifier(X_train, X_test, y_train, y_test):
    """Aplica KNN al test entregado, primero haciendo gridsearch para sacar el k optimo"""
    
    k_range = list(range(1, 2))
    param_grid = dict(n_neighbors=k_range)
    
    knn = KNeighborsClassifier(n_neighbors=5)
    grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
    
    grid.fit(X_train, y_train)
    
    knn_best = grid.best_estimator_
    
    
    predict = knn_best.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, predict)
    print('\nKNN con k=3')
    print('El puntaje auc es: {}'.format(auc))
    
    return knn_best

In [5]:
def ext_classifier(X, y):
    ExtC = ExtraTreesClassifier()


    ## Search grid for optimal parameters
    ex_param_grid = {"max_depth": [None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [False],
              "n_estimators" :[100,300],
              "criterion": ["gini"]}


    gsExtC = GridSearchCV(ExtC,param_grid = ex_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)

    gsExtC.fit(X, y)

    ExtC_best = gsExtC.best_estimator_

    # Best score
    print('Extra tree classifier')
    print(gsExtC.best_score_)
    return ExtC_best

In [65]:
def lgb_classifier(X, y):
    lgb_cl = lgb.LGBMClassifier(learning_rate=0.005,objective='binary',num_leaves=55,max_depth=13,
                        n_estimators=60,colsample_bytree=0.8,n_jobs=-1,
                        random_state=0,silent=False,subsample=0.8,
                        sumsample_freq=0.5)

    lgbm_grid_params = {
    'learning_rate': [0.005],
    'n_estimators': [40],
    'num_leaves': [6,8,12,16],
    'boosting_type' : ['gbdt'],
    'objective' : ['binary'],
    'random_state' : [501],
    'colsample_bytree' : [0.65, 0.66],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    }
    
    gsLgbm = GridSearchCV(lgb_cl, lgbm_grid_params, verbose=0, cv=2, n_jobs=2,scoring="roc_auc")
    
    
    gsLgbm.fit(X,y)    
  
    # Best score
    print('Extra tree classifier')
    print(gsLgbm.best_score_)
    
    return gsLgbm

In [7]:
def mlp_classifier(X,y):
    mlp = MLPClassifier()

    mlp_param_grid = {'solver': ['lbfgs'],
                  'max_iter': [50,100],
                  'alpha': 10.0 ** -np.arange(5, 10), 
                  'hidden_layer_sizes':np.arange(13, 15),
                  'random_state':[1,5,6,9]}


    gsmlp = GridSearchCV(mlp,param_grid = mlp_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)

    gsmlp.fit(X,y)

    mlp_best = gsmlp.best_estimator_
    print('\nRed Neuronal Classifier')
    print(gsmlp.best_score_)
    return mlp_best

In [8]:
def gb_classifier(X,y):
    GBC = GradientBoostingClassifier()
    gb_param_grid = {'loss' : ["deviance"],
                      'n_estimators' : [50,65,100],
                      'learning_rate': [0.1, 0.05, 0.01],
                      'max_depth': [4, 8],
                      'min_samples_leaf': [100,150],
                      'max_features': [0.3, 0.1],
                      }
                               
    gsGBC = GridSearchCV(GBC,param_grid = gb_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)
    gsGBC.fit(X,y)
  
    GBC_best = gsGBC.best_estimator_
  
    # Best score\n",
    print('Gradient Boosting')
    print(gsGBC.best_score_)
    return GBC_best

In [9]:
def ensamblador(estimadores,X_train, X_test, y_train, y_test):
    eclf3 = VotingClassifier(estimators=[
       ('xgb', estimadores[0]), ('lgb', estimadores[1]),('mlp', estimadores[2]),('gb',estimadores[3])],
       voting='soft', weights=[1, 0.5, 0.5, 1],
      flatten_transform=True)

    eclf3.fit(X_train,y_train)
    
    predict = eclf3.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, predict)
    
    print('Ensamble final')
    print('El puntaje auc es: {}'.format(auc))
    
    return eclf3

# Predicciones en varios modelos

In [10]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import datetime
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
import lightgbm as lgb
from sklearn.neural_network import MLPClassifier

In [11]:
df_original = pd.read_csv('events_up_to_01062018.csv', low_memory = False)
df_labels= pd.read_csv('labels_training_set.csv', low_memory = False)

In [12]:
df_features_numericos = pd.read_csv('features_numericos_merge_final.csv', low_memory=False)
df_features_bool = pd.read_csv('features_bool.csv', low_memory=False)
df_features_categoricos = pd.read_csv('features_categoricos_reducidos.csv', low_memory=False)

In [13]:
df_features_bool.shape

(38829, 6)

In [14]:
df_features_numericos.shape

(38829, 34)

In [15]:
df_features_categoricos.shape

(38829, 7)

In [16]:
df_original['person'].drop_duplicates().shape

(38829,)

## Trabajo en los features por separado, empiezo por los numericos

A los numericos hago un fillna con ceros

In [17]:
df_features_numericos = df_features_numericos.fillna(0)

In [18]:
df_features_numericos.isnull().sum()

person                                 0
conversion_sum                         0
conversion_mean                        0
viewed_sum                             0
viewed_mean                            0
viewed_std                             0
dif_5_check                            0
last_day_check                         0
first_day_check                        0
last_week_check                        0
first_week_check                       0
cant_checkouts_5                       0
%checkouts                             0
checkouts_ult_semana                   0
act_primera_semana                     0
act_ultima_semana                      0
mayor_actividad_ult_semana             0
cant_modelos_distintos                 0
cant_checkouts_dif_modelos             0
MAX(cant_interacciones_por_modelo)     0
MEAN(cant_interacciones_por_modelo)    0
Checkout max                           0
Checkout mean                          0
cant_lead_5to_mes                      0
cant_modelos_que

## Categoricos

Fabrico los dummies

In [19]:
df_features_categoricos.head()

Unnamed: 0,person,region_persona,pais,semana_mas_interactuante,device_type,channel_frecuente,marca_mas_buscada
0,4886f805,Rio de Janeiro,Brazil,Third,Smartphone,Organic,Samsung
1,ad93850f,Sao Paulo,Brazil,Third,Smartphone,Paid,iPhone
2,0297fc1e,Rio de Janeiro,Brazil,Fourth,Smartphone,Direct,iPhone
3,2d681dd8,Sao Paulo,Brazil,Fourth,Computer,Organic,iPhone
4,cccea85e,Sao Paulo,Brazil,Third,Computer,Organic,Motorola


In [20]:
df_features_categoricos = pd.concat([df_features_categoricos, pd.get_dummies(df_features_categoricos.iloc[:,1:])], axis=1)

In [21]:
df_features_categoricos.head()

Unnamed: 0,person,region_persona,pais,semana_mas_interactuante,device_type,channel_frecuente,marca_mas_buscada,region_persona_0,region_persona_Acre,region_persona_Alagoas,...,channel_frecuente_Organic,channel_frecuente_Paid,channel_frecuente_Referral,channel_frecuente_Social,channel_frecuente_Unknown,marca_mas_buscada_LG,marca_mas_buscada_Motorola,marca_mas_buscada_Other,marca_mas_buscada_Samsung,marca_mas_buscada_iPhone
0,4886f805,Rio de Janeiro,Brazil,Third,Smartphone,Organic,Samsung,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,ad93850f,Sao Paulo,Brazil,Third,Smartphone,Paid,iPhone,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,0297fc1e,Rio de Janeiro,Brazil,Fourth,Smartphone,Direct,iPhone,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2d681dd8,Sao Paulo,Brazil,Fourth,Computer,Organic,iPhone,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,cccea85e,Sao Paulo,Brazil,Third,Computer,Organic,Motorola,0,0,0,...,1,0,0,0,0,0,1,0,0,0


In [22]:
df_features_categoricos = df_features_categoricos.drop(columns=['region_persona','pais','semana_mas_interactuante','device_type','channel_frecuente','marca_mas_buscada'])

In [23]:
df_features_categoricos.head()

Unnamed: 0,person,region_persona_0,region_persona_Acre,region_persona_Alagoas,region_persona_Amapa,region_persona_Amazonas,region_persona_Arkansas,region_persona_Asuncion,region_persona_Aveiro,region_persona_Bahia,...,channel_frecuente_Organic,channel_frecuente_Paid,channel_frecuente_Referral,channel_frecuente_Social,channel_frecuente_Unknown,marca_mas_buscada_LG,marca_mas_buscada_Motorola,marca_mas_buscada_Other,marca_mas_buscada_Samsung,marca_mas_buscada_iPhone
0,4886f805,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,ad93850f,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,0297fc1e,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2d681dd8,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,cccea85e,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0


## Booleano

Los transformo a numericos (1 o 0)

In [27]:
df_features_bool.head()

Unnamed: 0,person,busco_top_5_visitas,visito_mas_que_el_promedio,siempre_incrementando,inc_ultimo_mes,mismo_interes_ultimos_dos_meses
0,4886f805,0,0,0,1,0
1,ad93850f,1,1,0,1,0
2,0297fc1e,1,1,0,1,1
3,2d681dd8,1,0,0,1,0
4,cccea85e,0,1,0,1,0


In [28]:
df_features_bool.iloc[:,1:] = df_features_bool.iloc[:,1:].astype(int)

In [29]:
df_features_bool.shape

(38829, 6)

# Mergeo los features 

In [30]:
df_features_totales = df_features_bool.merge(df_features_numericos, on='person', how='left')

In [31]:
df_features_totales = df_features_totales.merge(df_features_categoricos, on='person', how='left')

In [32]:
df_features_totales = df_features_totales.merge(pd.read_csv('neww_features.csv'), on='person', how='left')

In [33]:
df_features_totales.shape

(38829, 323)

In [34]:
df_original['person'].drop_duplicates().shape

(38829,)

Aparentemente tengo datos de todos los usuarios

In [35]:
df_features_totales = df_features_totales.T.drop_duplicates().T

In [36]:
df_features_totales.shape

(38829, 305)

# Data preprocessing

In [37]:
df_train = df_labels.merge(df_features_totales, on='person', how='left')

In [38]:
df_train.shape

(19414, 306)

In [39]:
df_train = df_train.fillna(0)

In [40]:
df_train_0 = df_train.loc[df_train['label'] == 0, :]
df_train_1 = df_train.loc[df_train['label'] == 1, :]
df_train_equal = pd.concat([df_train_1, shuffle(df_train_0).iloc[:980,:] ],axis=0)

In [41]:
X,y = df_train.iloc[:,2:], df_train.iloc[:,1]

In [42]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

# Random Forest para feature importance

Nos Quedaremos con los primeros 100 features

In [43]:
from sklearn.feature_selection import RFECV

# The "accuracy" scoring is proportional to the number of correct classifications
clf_rf_4 = RandomForestClassifier() 
rfecv = RFECV(estimator=clf_rf_4, step=1, cv=5,scoring='accuracy')   #5-fold cross-validation
%time rfecv = rfecv.fit(X_train, y_train)

print('Optimal number of features :', rfecv.n_features_)
print('Best features :', X_train.columns[rfecv.support_])

























































































CPU times: user 7min 25s, sys: 732 ms, total: 7min 25s
Wall time: 7min 26s
Optimal number of features : 303
Best features : Index(['busco_top_5_visitas', 'visito_mas_que_el_promedio',
       'siempre_incrementando', 'inc_ultimo_mes',
       'mismo_interes_ultimos_dos_meses', 'conversion_sum', 'conversion_mean',
       'viewed_sum', 'viewed_mean', 'viewed_std',
       ...
       'PERCENTILE(conversion - checkout)',
       'PERCENTILE(conversion - viewed product)',
       'PERCENTILE(conversion - visited site)',
       'PERCENTILE(checkout - visited site)',
       'PERCENTILE(checkout + viewed product)',
       'PERCENTILE(conversion + viewed product)',
       'PERCENTILE(checkout + conversion)',
       'PERCENTILE(conversion + visited site)',
       'PERCENTILE(viewed product + visited site)',
       'PERCENTILE(checkout + visited site)'],
      dtype='object', length=303)


In [44]:
f_m_i = list(X_train.columns[rfecv.support_])

In [45]:
f_m_i.append('person')

# Modelaje y entrenamiento!

Nos quedamos con los 100 elegidos

In [48]:
df_train = df_labels.merge(df_features_totales[f_m_i], on='person', how='left')

In [49]:
df_train.shape

(19414, 305)

In [50]:
df_train = df_train.fillna(0)

In [51]:
df_train_0 = df_train.loc[df_train['label'] == 0, :]
df_train_1 = df_train.loc[df_train['label'] == 1, :]
df_train_equal = pd.concat([df_train_1, shuffle(df_train_0).iloc[:980,:] ],axis=0)

In [52]:
X,y = df_train.iloc[:,2:], df_train.iloc[:,1]

In [53]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

Entrenamos los 4 algoritmos seleccionados

In [72]:
xgb_model = xgb_classifier(X, y)
lgb_model = lgb_classifier(X, y)
mlp_model = mlp_classifier(X,y)
gb_model = gb_classifier(X, y)


XGBoost Classifier
Fitting 2 folds for each of 405 candidates, totalling 810 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   29.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  5.8min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed: 11.9min
[Parallel(n_jobs=4)]: Done 810 out of 810 | elapsed: 12.3min finished


Random classifier
0.860099263944472


In [73]:
ensamble = ensamblador([xgb_model,lgb_model,mlp_model,gb_model],X_train, X_test, y_train, y_test)

Ensamble final
El puntaje auc es: 0.8803823747965036


# Prediccion para el tp

In [96]:
personas_a_predecir = pd.read_csv('trocafone_kaggle_test.csv', low_memory=False)

In [97]:
mejores_columnas_por_random_forest = ['last_day_check',
 'first_day_check',
 'Checkout mean',
 'checkout + visited site',
 'Checkout max',
 '%checkouts',
 'PERCENTILE(visited site - checkout)',
 'cant_modelos_distintos',
 'PERCENTILE(checkout)',
 'MAX(cant_interacciones_por_modelo)',
 'visitas_mes5',
 'cant_checkouts_5',
 'checkout + viewed product',
 'Promedio_visitas_producto',
 'visited site + visited site - checkout',
 'horas_mirando_productos',
 'viewed product',
 'PERCENTILE(checkout - visited site)',
 'checkout + viewed product - visited site',
 'checkout + visited site - checkout',
 'viewed_std',
 'PERCENTILE(checkout + viewed product)',
 'MEAN(cant_interacciones_por_modelo)',
 'last_week_check',
 'promedio_ingreso_mensual',
 'viewed product + viewed product - visited site',
 'checkout - visited site + visited site',
 'act_ultima_semana',
 'Cantidad_visitas',
 'viewed product - visited site + visited site',
 'PERCENTILE(viewed product)',
 'device_type_Smartphone',
 'viewed_sum',
 'PERCENTILE(checkout - viewed product)',
 'PERCENTILE(visited site - viewed product)',
 'PERCENTILE(viewed product - visited site)',
 'visited site + visited site - viewed product',
 'viewed product + viewed product - checkout',
 'device_type_Computer',
 'PERCENTILE(viewed product + visited site)',
 'viewed_mean',
 'viewed product + visited site',
 'checkout',
 'first_week_check',
 'viewed product - checkout + visited site - checkout',
 'checkout - viewed product + checkout - visited site',
 'PERCENTILE(viewed product - checkout)',
 'visited site - viewed product',
 'viewed product - visited site',
 'viewed product + visited site - viewed product',
 'checkout + checkout - visited site',
 'checkout - viewed product',
 'visited site',
 'checkout - viewed product + visited site - viewed product',
 'checkout - viewed product + viewed product',
 'PERCENTILE(visited site)',
 'checkout - visited site + visited site - viewed product',
 'visitas_mes4',
 'visited site - checkout + visited site - viewed product',
 'viewed product - visited site + visited site - checkout',
 'visitas_mes3',
 'checkout - visited site + viewed product - checkout',
 'conversion + conversion - viewed product',
 'checkout + checkout - viewed product',
 'viewed product + visited site - checkout',
 'viewed product - checkout + viewed product - visited site',
 'checkout - viewed product + conversion - visited site',
 'cant_checkouts_dif_modelos',
 'viewed product - conversion + visited site',
 'act_primera_semana',
 'conversion - viewed product + visited site - viewed product',
 'checkout - viewed product + visited site',
 'conversion_sum',
 'semana_mas_interactuante_Fifth',
 'viewed product - checkout',
 'marca_mas_buscada_Samsung',
 'viewed product - conversion + visited site - conversion',
 'inc_ultimo_mes',
 'checkout - visited site + viewed product - visited site',
 'checkout + viewed product - checkout',
 'conversion + viewed product - conversion',
 'busco_top_5_visitas',
 'PERCENTILE(checkout - conversion)',
 'checkouts_ult_semana',
 'channel_frecuente_Paid',
 'conversion - checkout + conversion - viewed product',
 'viewed product - checkout + viewed product - conversion',
 'checkout - viewed product + visited site - checkout',
 'PERCENTILE(checkout + visited site)',
 'channel_frecuente_Referral',
 'PERCENTILE(conversion - viewed product)',
 'conversion - visited site + viewed product - conversion',
 'checkout - visited site + viewed product - conversion',
 'conversion - visited site + visited site - viewed product',
 'conversion - visited site + viewed product - checkout',
 'checkout - viewed product + viewed product - visited site',
 'marca_mas_buscada_iPhone',
 'mayor_actividad_ult_semana',
 'conversion + viewed product',
 'PERCENTILE(conversion)', 'person']

In [98]:
df_features_totales[f_m_i].columns

Index(['busco_top_5_visitas', 'visito_mas_que_el_promedio',
       'siempre_incrementando', 'inc_ultimo_mes',
       'mismo_interes_ultimos_dos_meses', 'conversion_sum', 'conversion_mean',
       'viewed_sum', 'viewed_mean', 'viewed_std',
       ...
       'PERCENTILE(conversion - viewed product)',
       'PERCENTILE(conversion - visited site)',
       'PERCENTILE(checkout - visited site)',
       'PERCENTILE(checkout + viewed product)',
       'PERCENTILE(conversion + viewed product)',
       'PERCENTILE(checkout + conversion)',
       'PERCENTILE(conversion + visited site)',
       'PERCENTILE(viewed product + visited site)',
       'PERCENTILE(checkout + visited site)', 'person'],
      dtype='object', length=304)

In [99]:
personas_a_predecir_con_features = personas_a_predecir.merge(df_features_totales[f_m_i], on='person', how='left')

In [100]:
personas_a_predecir_con_features = personas_a_predecir_con_features.fillna(0)

In [101]:
personas_a_predecir_con_features.head()

Unnamed: 0,person,busco_top_5_visitas,visito_mas_que_el_promedio,siempre_incrementando,inc_ultimo_mes,mismo_interes_ultimos_dos_meses,conversion_sum,conversion_mean,viewed_sum,viewed_mean,...,PERCENTILE(conversion - checkout),PERCENTILE(conversion - viewed product),PERCENTILE(conversion - visited site),PERCENTILE(checkout - visited site),PERCENTILE(checkout + viewed product),PERCENTILE(conversion + viewed product),PERCENTILE(checkout + conversion),PERCENTILE(conversion + visited site),PERCENTILE(viewed product + visited site),PERCENTILE(checkout + visited site)
0,4886f805,0,0,0,1,0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.729542,0.254634,0.0,0.0,0.0,0.247665,0.16645
1,0297fc1e,1,1,0,1,1,0.0,0.0,404.0,5.228295,...,0.0,0.0,0.0,0.002509,0.991173,0.0,0.0,0.0,0.993528,0.997089
2,2d681dd8,1,0,0,1,0,0.0,0.0,13.0,0.707107,...,0.0,0.0,0.0,0.469058,0.521515,0.0,0.0,0.0,0.526929,0.406105
3,cccea85e,0,1,0,1,0,0.0,0.0,739.0,26.241971,...,0.0,0.0,0.0,0.036951,0.997604,0.0,0.0,0.0,0.997563,0.939928
4,4c8a8b93,1,1,0,1,0,0.0,0.0,177.0,40.339807,...,0.0,0.0,0.0,0.046506,0.962219,0.0,0.0,0.0,0.965828,0.93539


In [102]:
personas_a_predecir_con_features = personas_a_predecir_con_features.drop(columns = ['person'])

In [103]:
personas_a_predecir_con_features.columns

Index(['busco_top_5_visitas', 'visito_mas_que_el_promedio',
       'siempre_incrementando', 'inc_ultimo_mes',
       'mismo_interes_ultimos_dos_meses', 'conversion_sum', 'conversion_mean',
       'viewed_sum', 'viewed_mean', 'viewed_std',
       ...
       'PERCENTILE(conversion - checkout)',
       'PERCENTILE(conversion - viewed product)',
       'PERCENTILE(conversion - visited site)',
       'PERCENTILE(checkout - visited site)',
       'PERCENTILE(checkout + viewed product)',
       'PERCENTILE(conversion + viewed product)',
       'PERCENTILE(checkout + conversion)',
       'PERCENTILE(conversion + visited site)',
       'PERCENTILE(viewed product + visited site)',
       'PERCENTILE(checkout + visited site)'],
      dtype='object', length=303)

In [104]:
final_prediction = ensamble2.predict_proba(personas_a_predecir_con_features)[:,1]

In [105]:
final_prediction_tp = pd.Series(final_prediction)

In [106]:
final_prediction_tp.sort_values()

17994    0.010472
11909    0.010514
6632     0.010516
3085     0.010529
13868    0.010534
18315    0.010534
15908    0.010550
15297    0.010555
17740    0.010559
17726    0.010563
7777     0.010563
14536    0.010564
10675    0.010571
18817    0.010574
8863     0.010580
16657    0.010606
16011    0.010624
3854     0.010632
4175     0.010659
7364     0.010664
10879    0.010688
3866     0.010726
17898    0.010742
17941    0.010754
13890    0.010792
7791     0.010819
4020     0.010820
13945    0.010835
7518     0.010856
11667    0.010877
           ...   
9012     0.393238
4645     0.394165
4813     0.395466
8977     0.398224
4869     0.399017
4793     0.399090
4880     0.400504
4789     0.400571
4700     0.403403
4821     0.403511
4163     0.404133
16221    0.404467
15114    0.405304
11555    0.405959
8902     0.406184
2247     0.407406
9107     0.423097
4517     0.426597
12687    0.431949
9063     0.437056
4502     0.437952
2321     0.438381
4771     0.441227
9047     0.448756
4816     0

In [107]:
personas_a_predecir['label'] = final_prediction_tp

In [108]:
num = personas_a_predecir._get_numeric_data()
num[num < 0] = 0

In [109]:
personas_a_predecir.to_csv(path_or_buf = 'submit_kaggle.csv', index = False)

# Aca agrego el grid search con el ensamble

In [94]:
def ensamblador2(estimadores,X, y):
    eclf3 = VotingClassifier(estimators=[
       ('xgb', estimadores[0]), ('lgb', estimadores[1]),('mlp', estimadores[2]),('gb',estimadores[3])],
       voting='soft', weights=[1, 0.5, 0.5, 1],
      flatten_transform=True)

        
        
    votC_param_grid = {
        'weights': [[1, 0.5, 0.5, 1], [1, 1, 1, 1], [1, 0.5, 1, 1],[1, 1, 0.5, 1]],
        
    }
    
    gsvotC = GridSearchCV(eclf3, param_grid = votC_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)
    print('Voting Classifier')
    gsvotC.fit(X, y)
    
    
    votC_best = gsvotC.best_estimator_

    # Best score
    print('Ensamble final')
    print(gsvotC.best_score_)
    
    return votC_best

In [95]:
ensamble2 = ensamblador2([xgb_model,lgb_model,mlp_model,gb_model],X, y)

Voting Classifier
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   8 out of   8 | elapsed:  4.8min finished


Ensamble final
0.8688506486461354
