# Modelos de ML a utilizar

In [78]:
def xgb_classifier(X_train, X_test, y_train, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    alg = xgb.XGBClassifier(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 7,
                subsample = 0.9,
                gamma = 1,
                n_estimators = 50)
    
    print('\nXGBoost Classifier')
    alg.fit(X_train, y_train, eval_metric='auc')
    
    pred_proba = alg.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, pred_proba)
    
    print('El puntaje auc es: {}'.format(auc))
    
    return alg

In [2]:
def rf_classifier(X, y):
    RFC = RandomForestClassifier()


    rf_param_grid = {"max_depth": [None],
                  "max_features": [1, 3, 10],
                  "min_samples_split": [2, 3, 10],
                  "min_samples_leaf": [1, 3, 10],
                  "bootstrap": [False],
                  "n_estimators" :[50,65,100],
                  "criterion": ["gini"]}


    gsRFC = GridSearchCV(RFC,param_grid = rf_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)

    gsRFC.fit(X,y)

    RFC_best = gsRFC.best_estimator_

    # Best score
    print('Random classifier')
    print(gsRFC.best_score_)
    
    return RFC_best

In [3]:
def dt_classifier(X_train, X_test, y_train, y_test):
    clf = tree.DecisionTreeClassifier(random_state = 100,
    max_depth=8, min_samples_leaf=4)
    
    clf.fit(X_train, y_train)
    
    predict = clf.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, predict)
    print('\nDecision Tree')
    print('El puntaje auc es: {}'.format(auc))
    
    return clf

In [4]:
def knn_classifier(X_train, X_test, y_train, y_test):
    """Aplica KNN al test entregado, primero haciendo gridsearch para sacar el k optimo"""
    
    k_range = list(range(1, 2))
    param_grid = dict(n_neighbors=k_range)
    
    knn = KNeighborsClassifier(n_neighbors=5)
    grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
    
    grid.fit(X_train, y_train)
    
    knn_best = grid.best_estimator_
    
    
    predict = knn_best.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, predict)
    print('\nKNN con k=3')
    print('El puntaje auc es: {}'.format(auc))
    
    return knn_best

In [5]:
def ext_classifier(X, y):
    ExtC = ExtraTreesClassifier()


    ## Search grid for optimal parameters
    ex_param_grid = {"max_depth": [None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [False],
              "n_estimators" :[100,300],
              "criterion": ["gini"]}


    gsExtC = GridSearchCV(ExtC,param_grid = ex_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)

    gsExtC.fit(X, y)

    ExtC_best = gsExtC.best_estimator_

    # Best score
    print('Extra tree classifier')
    print(gsExtC.best_score_)
    return ExtC_best

In [6]:
def lgb_classifier(X_train, X_test, y_train, y_test):
    lgb_cl = lgb.LGBMClassifier(learning_rate=0.005,objective='binary',num_leaves=55,max_depth=13,
                        n_estimators=60,colsample_bytree=0.8,n_jobs=-1,
                        random_state=0,silent=False,subsample=0.8,
                        sumsample_freq=0.5)

    lgb_cl.fit(X_train,y_train)
    predict = lgb_cl.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, predict)
    print('\nLGB Classifier')
    print('El puntaje auc es: {}'.format(auc))
    
    return lgb_cl

In [7]:
def mlp_classifier(X,y):
    mlp = MLPClassifier()

    mlp_param_grid = {'solver': ['lbfgs'],
                  'max_iter': [50,100],
                  'alpha': 10.0 ** -np.arange(5, 10), 
                  'hidden_layer_sizes':np.arange(13, 15),
                  'random_state':[1,5,6,9]}


    gsmlp = GridSearchCV(mlp,param_grid = mlp_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)

    gsmlp.fit(X,y)

    mlp_best = gsmlp.best_estimator_
    print('\nRed Neuronal Classifier')
    print(gsmlp.best_score_)
    return mlp_best

In [8]:
def gb_classifier(X,y):
    GBC = GradientBoostingClassifier()
    gb_param_grid = {'loss' : ["deviance"],
                      'n_estimators' : [50,65,100],
                      'learning_rate': [0.1, 0.05, 0.01],
                      'max_depth': [4, 8],
                      'min_samples_leaf': [100,150],
                      'max_features': [0.3, 0.1],
                      }
                               
    gsGBC = GridSearchCV(GBC,param_grid = gb_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)
    gsGBC.fit(X,y)
  
    GBC_best = gsGBC.best_estimator_
  
    # Best score\n",
    print('Gradient Boosting')
    print(gsGBC.best_score_)
    return GBC_best

In [9]:
def ensamblador(estimadores,X_train, X_test, y_train, y_test):
    eclf3 = VotingClassifier(estimators=[
       ('xgb', estimadores[0]), ('lgb', estimadores[1]),('mlp', estimadores[2]),('gb',estimadores[3])],
       voting='soft', weights=[1, 0.5, 0.5, 1],
      flatten_transform=True)

    eclf3.fit(X_train,y_train)
    
    predict = eclf3.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, predict)
    
    print('Ensamble final')
    print('El puntaje auc es: {}'.format(auc))
    
    return eclf3

# Predicciones en varios modelos

In [10]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import datetime
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
import lightgbm as lgb
from sklearn.neural_network import MLPClassifier

In [11]:
df_original = pd.read_csv('events_up_to_01062018.csv', low_memory = False)
df_labels= pd.read_csv('labels_training_set.csv', low_memory = False)

In [12]:
df_features_numericos = pd.read_csv('features_numericos_merge_final.csv', low_memory=False)
df_features_bool = pd.read_csv('features_bool.csv', low_memory=False)
df_features_categoricos = pd.read_csv('features_categoricos_reducidos.csv', low_memory=False)

In [13]:
df_features_bool.shape

(38829, 8)

In [14]:
df_features_numericos.shape

(38829, 34)

In [15]:
df_features_categoricos.shape

(38829, 7)

In [16]:
df_original['person'].drop_duplicates().shape

(38829,)

## Trabajo en los features por separado, empiezo por los numericos

A los numericos hago un fillna con ceros

In [17]:
df_features_numericos = df_features_numericos.fillna(0)

In [18]:
df_features_numericos.isnull().sum()

person                                 0
conversion_sum                         0
conversion_mean                        0
viewed_sum                             0
viewed_mean                            0
viewed_std                             0
dif_5_check                            0
last_day_check                         0
first_day_check                        0
last_week_check                        0
first_week_check                       0
cant_checkouts_5                       0
%checkouts                             0
checkouts_ult_semana                   0
act_primera_semana                     0
act_ultima_semana                      0
mayor_actividad_ult_semana             0
cant_modelos_distintos                 0
cant_checkouts_dif_modelos             0
MAX(cant_interacciones_por_modelo)     0
MEAN(cant_interacciones_por_modelo)    0
Checkout max                           0
Checkout mean                          0
cant_lead_5to_mes                      0
cant_modelos_que

## Categoricos

Fabrico los dummies

In [19]:
df_features_categoricos.head()

Unnamed: 0,person,region_persona,pais,semana_mas_interactuante,device_type,channel_frecuente,marca_mas_buscada
0,4886f805,Rio de Janeiro,Brazil,Third,Smartphone,Organic,Samsung
1,ad93850f,Sao Paulo,Brazil,Third,Smartphone,Paid,iPhone
2,0297fc1e,Rio de Janeiro,Brazil,Fourth,Smartphone,Direct,iPhone
3,2d681dd8,Sao Paulo,Brazil,Fourth,Computer,Organic,iPhone
4,cccea85e,Sao Paulo,Brazil,Third,Computer,Organic,Motorola


In [20]:
df_features_categoricos = pd.concat([df_features_categoricos, pd.get_dummies(df_features_categoricos.iloc[:,1:])], axis=1)

In [21]:
df_features_categoricos.head()

Unnamed: 0,person,region_persona,pais,semana_mas_interactuante,device_type,channel_frecuente,marca_mas_buscada,region_persona_0,region_persona_Acre,region_persona_Alagoas,...,channel_frecuente_Organic,channel_frecuente_Paid,channel_frecuente_Referral,channel_frecuente_Social,channel_frecuente_Unknown,marca_mas_buscada_LG,marca_mas_buscada_Motorola,marca_mas_buscada_Other,marca_mas_buscada_Samsung,marca_mas_buscada_iPhone
0,4886f805,Rio de Janeiro,Brazil,Third,Smartphone,Organic,Samsung,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,ad93850f,Sao Paulo,Brazil,Third,Smartphone,Paid,iPhone,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,0297fc1e,Rio de Janeiro,Brazil,Fourth,Smartphone,Direct,iPhone,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2d681dd8,Sao Paulo,Brazil,Fourth,Computer,Organic,iPhone,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,cccea85e,Sao Paulo,Brazil,Third,Computer,Organic,Motorola,0,0,0,...,1,0,0,0,0,0,1,0,0,0


In [22]:
df_features_categoricos = df_features_categoricos.drop(columns=['region_persona','pais','semana_mas_interactuante','device_type','channel_frecuente','marca_mas_buscada'])

In [23]:
df_features_categoricos.head()

Unnamed: 0,person,region_persona_0,region_persona_Acre,region_persona_Alagoas,region_persona_Amapa,region_persona_Amazonas,region_persona_Arkansas,region_persona_Asuncion,region_persona_Aveiro,region_persona_Bahia,...,channel_frecuente_Organic,channel_frecuente_Paid,channel_frecuente_Referral,channel_frecuente_Social,channel_frecuente_Unknown,marca_mas_buscada_LG,marca_mas_buscada_Motorola,marca_mas_buscada_Other,marca_mas_buscada_Samsung,marca_mas_buscada_iPhone
0,4886f805,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,ad93850f,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,0297fc1e,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2d681dd8,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,cccea85e,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0


## Booleano

Los transformo a numericos (1 o 0)

In [24]:
df_features_bool.head()

Unnamed: 0,person,busco_top_5_visitas,visito_mas_que_el_promedio,siempre_incrementando,inc_ultimo_mes,mismo_interes_ultimos_dos_meses,ingreso_el_ultimo_mes,ingreso_ultima_quincena
0,4886f805,0.0,False,0,1,0,True,False
1,ad93850f,1.0,True,0,1,0,True,True
2,0297fc1e,1.0,True,0,1,1,True,True
3,2d681dd8,1.0,False,0,1,0,True,True
4,cccea85e,0.0,True,0,1,0,True,True


In [25]:
df_features_bool.iloc[:,1:] = df_features_bool.iloc[:,1:].astype(int)

In [26]:
df_features_bool.shape

(38829, 8)

# Mergeo los features 

In [27]:
df_features_totales = df_features_bool.merge(df_features_numericos, on='person', how='left')

In [28]:
df_features_totales = df_features_totales.merge(df_features_categoricos, on='person', how='left')

In [29]:
df_features_totales = df_features_totales.merge(pd.read_csv('neww_features.csv'), on='person', how='left')

In [30]:
df_features_totales.shape

(38829, 325)

In [31]:
df_original['person'].drop_duplicates().shape

(38829,)

Aparentemente tengo datos de todos los usuarios

In [133]:
df_features_totales = df_features_totales.T.drop_duplicates().T

In [134]:
df_features_totales.shape

(38829, 307)

# Data preprocessing

In [224]:
df_train = df_labels.merge(df_features_totales[f_m_i], on='person', how='left')

In [225]:
df_train.shape

(19414, 32)

In [226]:
df_train = df_train.fillna(0)

In [227]:
df_train_0 = df_train.loc[df_train['label'] == 0, :]
df_train_1 = df_train.loc[df_train['label'] == 1, :]
df_train_equal = pd.concat([df_train_1, shuffle(df_train_0).iloc[:980,:] ],axis=0)

In [228]:
X,y = df_train.iloc[:,2:], df_train.iloc[:,1]

In [229]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

# Modelaje y entrenamiento!

In [230]:
xgb_model = xgb_classifier(X_train, X_test, y_train, y_test,useTrainCV=True)
rf_model = rf_classifier(X, y)
#ext_model = ext_classifier(X, y)
#lgb_model = lgb_classifier(X_train, X_test, y_train, y_test)
#mlp_model = mlp_classifier(X,y)
#gb_model = gb_classifier(X, y,)
#knn_model = knn_classifier(X_train,X_test, y_train, y_test)


XGBoost Classifier
El puntaje auc es: 0.8630348340546704
Fitting 2 folds for each of 81 candidates, totalling 162 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    9.2s
[Parallel(n_jobs=4)]: Done 162 out of 162 | elapsed:  1.4min finished


Random classifier
0.8512362360589241


In [None]:
ensamble = ensamblador([xgb_model,lgb_model,mlp_model,gb_model],X_train, X_test, y_train, y_test)

In [178]:
#xgb_predict = xgb_model.predict_proba(X_test)[:,1]
#rf_predict = rf_model.predict_proba(X_test)[:,1]
#dt_predict = dt_model.predict_proba(X_test)[:,1]
#knn_predict = knn_model.predict_proba(X_test)[:,1]

# Xgboost 
 |Features      |      XGBoost             | RandomForest           |
 |--------------|--------------------------|------------------------|
 | 308  |  0.8770058935975904 |   0.8485298904198764|
 | 250  |  0.8801512255166115 |   0.8493670192390724|
 | 200  |  0.8755161861219674 |   0.8492912386827358|
 | 150  |   0.8785806866977134 |   0.8480886029143131|
 | 100  |   0.8755814184340842 |   0.8496581294989517|
 | 50   |   0.8751496088897461 |   0.8513477757382655| 
 | 40   |   0.8662921953792833 |   0.845880394036751 |
 | 30   |   0.8630348340546704 |   0.8512362360589241|

In [128]:
#final_prediction_test = ensamble.predict

In [129]:
#roc_auc_score(y_test, final_prediction_test)

In [200]:
zipped = zip(map(lambda x: round(x, 4), rf_model.feature_importances_), X_test.columns)
feature = sorted(zipped, key=lambda x: x[1])

In [221]:
feat_importance = pd.DataFrame(feature, columns=['importance', 'feature'])
f_m_i = list(feat_importance.sort_values('importance', ascending=False).head(30)['feature'].values)

In [222]:
f_m_i

['first_day_check',
 'last_day_check',
 'Checkout mean',
 'Promedio_visitas_producto',
 'Checkout max',
 'horas_mirando_productos',
 'cant_checkouts_dif_modelos',
 '%checkouts',
 'promedio_ingreso_mensual',
 'visitas_mes5',
 'PERCENTILE(checkout - visited site)',
 'act_ultima_semana',
 'cant_checkouts_5',
 'MEAN(cant_interacciones_por_modelo)',
 'viewed_std',
 'checkout',
 'viewed product + viewed product - visited site',
 'MAX(cant_interacciones_por_modelo)',
 'viewed_mean',
 'viewed product + visited site',
 'device_type_Computer',
 'PERCENTILE(viewed product + visited site)',
 'Cantidad_visitas',
 'visited site + visited site - viewed product',
 'viewed product',
 'cant_modelos_distintos',
 'PERCENTILE(viewed product)',
 'PERCENTILE(visited site - checkout)',
 'viewed product - visited site + visited site',
 'viewed_sum']

In [223]:
f_m_i.append('person')
len(f_m_i)

31

# Prediccion para el tp

In [57]:
personas_a_predecir = pd.read_csv('trocafone_kaggle_test.csv', low_memory=False)

In [58]:
personas_a_predecir_con_features = personas_a_predecir.merge(df_features_totales, on='person', how='left')

In [59]:
personas_a_predecir_con_features = personas_a_predecir_con_features.fillna(0)

In [60]:
personas_a_predecir_con_features = personas_a_predecir_con_features.drop(columns = ['person'])

In [61]:
final_prediction = ensamble.predict_proba(personas_a_predecir_con_features)[:,1]

In [62]:
final_prediction_tp = pd.Series(final_prediction)

In [63]:
final_prediction_tp.sort_values()

4063     0.005858
16182    0.007322
6553     0.007848
15235    0.007850
12108    0.007985
7419     0.008405
17226    0.008548
18281    0.008557
6710     0.008557
13247    0.008568
11868    0.008636
18166    0.008666
4532     0.008834
9824     0.008851
7694     0.008918
14928    0.009022
16691    0.009055
2009     0.009102
11909    0.009160
6632     0.009171
4853     0.009177
13868    0.009191
18315    0.009194
13945    0.009221
7364     0.009231
17994    0.009237
17726    0.009243
7777     0.009243
14536    0.009246
5472     0.009258
           ...   
12849    0.431580
16103    0.432986
16191    0.433673
4861     0.435511
5122     0.435653
4163     0.437733
9107     0.438005
3147     0.441370
4502     0.448425
11555    0.462041
4523     0.465245
1935     0.468180
4309     0.472175
9722     0.475489
4821     0.480180
9063     0.480449
9012     0.481349
16111    0.483368
9128     0.486773
4813     0.487310
8662     0.489026
4789     0.489217
12704    0.490580
4793     0.493527
12687    0

In [64]:
personas_a_predecir['label'] = final_prediction_tp

In [65]:
num = personas_a_predecir._get_numeric_data()
num[num < 0] = 0

In [66]:
personas_a_predecir.to_csv(path_or_buf = 'submit_kaggle.csv', index = False)