# Predicciones en varios modelos

In [2]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import datetime
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
import lightgbm as lgb
from sklearn.neural_network import MLPClassifier

  from numpy.core.umath_tests import inner1d


In [3]:
df_original = pd.read_csv('events_up_to_01062018.csv', low_memory = False)
df_labels= pd.read_csv('labels_training_set.csv', low_memory = False)

In [4]:
df_features_numericos = pd.read_csv('features_numericos_merge_final.csv', low_memory=False)
df_features_bool = pd.read_csv('features_bool.csv', low_memory=False)
df_features_categoricos = pd.read_csv('features_categoricos_reducidos.csv', low_memory=False)

In [5]:
df_features_bool.shape

(38829, 6)

In [6]:
df_features_numericos.shape

(38829, 49)

In [7]:
df_features_categoricos.shape

(38829, 7)

In [8]:
df_original['person'].drop_duplicates().shape

(38829,)

## Trabajo en los features por separado, empiezo por los numericos

A los numericos hago un fillna con ceros

In [9]:
df_features_numericos = df_features_numericos.fillna(0)

In [10]:
df_features_numericos.isnull().sum()

person                                 0
checkout_sum                           0
checkout_mean                          0
conversion_sum                         0
conversion_mean                        0
viewed_sum                             0
viewed_mean                            0
viewed_std                             0
dif_5_check                            0
last_day_check                         0
first_day_check                        0
last_week_check                        0
first_week_check                       0
viewed_model_mean                      0
models_viewed_days_mean                0
models_viewed_days_max                 0
diferencia 5                           0
DAY(last month 5)                      0
DAY(first month 5)                     0
WEEKDAY(last month 5)                  0
WEEKDAY(first month 5)                 0
cant_checkouts_5                       0
%checkouts                             0
checkouts_ult_semana                   0
act_primera_sema

## Categoricos

Fabrico los dummies

In [11]:
df_features_categoricos.head()

Unnamed: 0,person,region_persona,pais,semana_mas_interactuante,device_type,channel_frecuente,marca_mas_buscada
0,4886f805,Rio de Janeiro,Brazil,Third,Smartphone,Organic,Samsung
1,ad93850f,Sao Paulo,Brazil,Third,Smartphone,Paid,iPhone
2,0297fc1e,Rio de Janeiro,Brazil,Fourth,Smartphone,Direct,iPhone
3,2d681dd8,Sao Paulo,Brazil,Fourth,Computer,Organic,iPhone
4,cccea85e,Sao Paulo,Brazil,Third,Computer,Organic,Motorola


In [12]:
df_features_categoricos = pd.concat([df_features_categoricos, pd.get_dummies(df_features_categoricos.iloc[:,1:])], axis=1)

In [13]:
df_features_categoricos.head()

Unnamed: 0,person,region_persona,pais,semana_mas_interactuante,device_type,channel_frecuente,marca_mas_buscada,region_persona_0,region_persona_Acre,region_persona_Alagoas,...,channel_frecuente_Organic,channel_frecuente_Paid,channel_frecuente_Referral,channel_frecuente_Social,channel_frecuente_Unknown,marca_mas_buscada_LG,marca_mas_buscada_Motorola,marca_mas_buscada_Other,marca_mas_buscada_Samsung,marca_mas_buscada_iPhone
0,4886f805,Rio de Janeiro,Brazil,Third,Smartphone,Organic,Samsung,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,ad93850f,Sao Paulo,Brazil,Third,Smartphone,Paid,iPhone,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,0297fc1e,Rio de Janeiro,Brazil,Fourth,Smartphone,Direct,iPhone,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2d681dd8,Sao Paulo,Brazil,Fourth,Computer,Organic,iPhone,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,cccea85e,Sao Paulo,Brazil,Third,Computer,Organic,Motorola,0,0,0,...,1,0,0,0,0,0,1,0,0,0


In [14]:
df_features_categoricos = df_features_categoricos.drop(columns=['region_persona','pais','semana_mas_interactuante','device_type','channel_frecuente','marca_mas_buscada'])

In [15]:
df_features_categoricos

Unnamed: 0,person,region_persona_0,region_persona_Acre,region_persona_Alagoas,region_persona_Amapa,region_persona_Amazonas,region_persona_Arkansas,region_persona_Asuncion,region_persona_Aveiro,region_persona_Bahia,...,channel_frecuente_Organic,channel_frecuente_Paid,channel_frecuente_Referral,channel_frecuente_Social,channel_frecuente_Unknown,marca_mas_buscada_LG,marca_mas_buscada_Motorola,marca_mas_buscada_Other,marca_mas_buscada_Samsung,marca_mas_buscada_iPhone
0,4886f805,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,ad93850f,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,0297fc1e,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2d681dd8,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,cccea85e,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
5,4c8a8b93,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
6,1b9f7cf6,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
7,29ebb414,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
8,de8fe91b,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,45baf068,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0


## Booleano

Los transformo a numericos (1 o 0)

In [16]:
df_features_bool.head()

Unnamed: 0,person,busco_top_5_visitas,visito_mas_que_el_promedio,siempre_incrementando,inc_ultimo_mes,mismo_interes_ultimos_dos_meses
0,4886f805,0.0,False,0,1,0
1,ad93850f,1.0,True,0,1,0
2,0297fc1e,1.0,True,0,1,1
3,2d681dd8,1.0,False,0,1,0
4,cccea85e,0.0,True,0,1,0


In [17]:
df_features_bool.iloc[:,1:] = df_features_bool.iloc[:,1:].astype(int)

In [18]:
df_features_bool.shape

(38829, 6)

# Mergeo los features 

In [19]:
df_features_totales = df_features_bool.merge(df_features_numericos, on='person', how='left')

In [20]:
df_features_totales = df_features_totales.merge(df_features_categoricos, on='person', how='left')

In [21]:
df_features_totales.shape

(38829, 180)

In [22]:
df_original['person'].drop_duplicates().shape

(38829,)

Aparentemente tengo datos de todos los usuarios

# Data preprocessing

In [23]:
df_train = df_labels.merge(df_features_totales, on='person', how='left')

In [24]:
df_train.shape

(19414, 181)

In [25]:
df_train = df_train.fillna(0)

In [26]:
df_train_0 = df_train.loc[df_train['label'] == 0, :]
df_train_1 = df_train.loc[df_train['label'] == 1, :]
df_train_equal = pd.concat([df_train_1, shuffle(df_train_0).iloc[:980,:] ],axis=0)

In [27]:
X,y = df_train.iloc[:,2:], df_train.iloc[:,1]

In [28]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

# Modelaje y entrenamiento!

In [29]:
def xgb_classifier(X_train, X_test, y_train, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    alg = xgb.XGBClassifier(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 7,
                subsample = 0.9,
                gamma = 1,
                n_estimators = 50)
    
    print('\nXGBoost Classifier')
    if useTrainCV:
        print("Start Feeding Data")
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
        # xgtest = xgb.DMatrix(X_test.values, label=y_test.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    alg.fit(X_train, y_train, eval_metric='auc')
    
    pred_proba = alg.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, pred_proba)
    
    print('El puntaje auc es: {}'.format(auc))
    
    return alg

In [30]:
def rf_classifier(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(bootstrap=True, n_estimators = 1000, random_state = 42)
    
    rf.fit(X_train, y_train)
    
    pred_proba = rf.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, pred_proba)
    print('\nRandom Forest Classifier')
    print('El puntaje auc es: {}'.format(auc))
    
    return rf

In [31]:
def dt_classifier(X_train, X_test, y_train, y_test):
    clf = tree.DecisionTreeClassifier(random_state = 100,
    max_depth=8, min_samples_leaf=4)
    
    clf.fit(X_train, y_train)
    
    predict = clf.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, predict)
    print('\nDecision Tree')
    print('El puntaje auc es: {}'.format(auc))
    
    return clf

In [32]:
def knn_classifier(X_train, X_test, y_train, y_test):
    """Aplica KNN al test entregado, primero haciendo gridsearch para sacar el k optimo"""
    
    k_range = list(range(1, 2))
    param_grid = dict(n_neighbors=k_range)
    
    knn = KNeighborsClassifier(n_neighbors=5)
    grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
    
    grid.fit(X_train, y_train)
    
    knn_best = grid.best_estimator_
    
    
    predict = knn_best.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, predict)
    print('\nKNN con k=3')
    print('El puntaje auc es: {}'.format(auc))
    
    return knn_best

In [33]:
def ext_classifier(X_train, X_test, y_train, y_test):
    ExtC = ExtraTreesClassifier()


    ## Search grid for optimal parameters
    ex_param_grid = {"max_depth": [None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [False],
              "n_estimators" :[100,300],
              "criterion": ["gini"]}


    gsExtC = GridSearchCV(ExtC,param_grid = ex_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)

    gsExtC.fit(X_train,y_train)

    ExtC_best = gsExtC.best_estimator_

    # Best score
    print('Extra tree classifier')
    print(gsExtC.best_score_)
    return ExtC_best

In [34]:
def lgb_classifier(X_train, X_test, y_train, y_test):
    lgb_cl = lgb.LGBMClassifier(learning_rate=0.005,objective='binary',num_leaves=55,max_depth=13,
                        n_estimators=60,colsample_bytree=0.8,n_jobs=-1,
                        random_state=0,silent=False,subsample=0.8,
                        sumsample_freq=0.5)

    lgb_cl.fit(X_train,y_train)
    predict = lgb_cl.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, predict)
    print('\nLGB Classifier')
    print('El puntaje auc es: {}'.format(auc))
    
    return lgb_cl

In [35]:
def rn_classifier(X,y):
    mlp = MLPClassifier()

    mlp_param_grid = {'solver': ['lbfgs'],
                  'max_iter': [50,100],
                  'alpha': 10.0 ** -np.arange(5, 10), 
                  'hidden_layer_sizes':np.arange(13, 15),
                  'random_state':[1,5,6,9]}


    gsmlp = GridSearchCV(mlp,param_grid = mlp_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)

    gsmlp.fit(X,y)

    mlp_best = gsmlp.best_estimator_
    print('\nRed Neuronal Classifier')
    print(gsmlp.best_score_)
    return mlp_best

In [59]:
def ensamblador(estimadores,X_train, X_test, y_train, y_test):
    eclf3 = VotingClassifier(estimators=[
       ('xgb', estimadores[0]),('rf',estimadores[1]),('ext', estimadores[2]), ('lgb', estimadores[3])],
       voting='soft', weights=[2,1,2,2],
      flatten_transform=True)

    eclf3.fit(X_train,y_train)
    
    predict = eclf3.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, predict)
    
    print('Ensamble final')
    print('El puntaje auc es: {}'.format(auc))
    
    return eclf3

In [37]:
xgb_model = xgb_classifier(X_train, X_test, y_train, y_test,useTrainCV=True)
rf_model = rf_classifier(X_train, X_test, y_train, y_test)
ext_model = ext_classifier(X_train, X_test, y_train, y_test)
lgb_model = lgb_classifier(X_train, X_test, y_train, y_test)
mlp_model = rn_classifier(X,y)
#dt_model = dt_classifier(X_train, X_test, y_train, y_test)
#knn_model = knn_classifier(X_train,X_test, y_train, y_test)


XGBoost Classifier
Start Feeding Data
El puntaje auc es: 0.8716454992540827

Random Forest Classifier
El puntaje auc es: 0.8531868820656521
Fitting 2 folds for each of 54 candidates, totalling 108 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   35.7s
[Parallel(n_jobs=4)]: Done 108 out of 108 | elapsed:  1.7min finished


Extra tree classifier
0.8531094320283281

LGB Classifier
El puntaje auc es: 0.8670983816714221
Fitting 2 folds for each of 80 candidates, totalling 160 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   24.3s
[Parallel(n_jobs=4)]: Done 160 out of 160 | elapsed:  1.4min finished



Red Neuronal Classifier
0.8119217373398312


In [60]:
ensamble = ensamblador([xgb_model,rf_model,ext_model,lgb_model],X_train, X_test, y_train, y_test)

Ensamble final
El puntaje auc es: 0.8689326859262705


In [39]:
#xgb_predict = xgb_model.predict_proba(X_test)[:,1]
#rf_predict = rf_model.predict_proba(X_test)[:,1]
#dt_predict = dt_model.predict_proba(X_test)[:,1]
#knn_predict = knn_model.predict_proba(X_test)[:,1]

In [40]:
#final_prediction_test = ensamble.predict

In [41]:
#roc_auc_score(y_test, final_prediction_test)

In [42]:
zipped = zip(map(lambda x: round(x, 4), rf_model.feature_importances_), X_test.columns)
feature = sorted(zipped, key=lambda x: x[1])

In [43]:
feat_importance = pd.DataFrame(feature, columns=['importance', 'feature'])
f_m_i = list(feat_importance.sort_values('importance', ascending=False).head(100)['feature'].values)

In [44]:
f_m_i

['Promedio_visitas_producto',
 'promedio_ingreso_mensual',
 'horas_mirando_productos',
 'viewed_std',
 'visitas_mes5',
 'viewed_sum',
 'Cantidad_visitas',
 'MEAN(cant_interacciones_por_modelo)',
 'registros_semana_4',
 'viewed_model_mean',
 'DAY(first month 5)',
 'viewed_mean',
 'registros_semana_5',
 '%checkouts',
 'MAX(cant_interacciones_por_modelo)',
 'registros_semana_3',
 'cant_modelos_distintos',
 'DAY(last month 5)',
 'act_ultima_semana',
 'visitas_mes4',
 'registros_semana_2',
 'WEEKDAY(first month 5)',
 'diferencia 5',
 'WEEKDAY(last month 5)',
 'registros_semana_1',
 'checkout_mean',
 'models_viewed_days_mean',
 'visitas_mes3',
 'cant_checkouts_5',
 'checkout_sum',
 'Checkout mean',
 'Checkout max',
 'cant_checkouts_dif_modelos',
 'last_day_check',
 'models_viewed_days_max',
 'channel_frecuente_Paid',
 'device_type_Computer',
 'first_day_check',
 'device_type_Smartphone',
 'act_primera_semana',
 'visitas_mes1',
 'region_persona_Sao Paulo',
 'marca_mas_buscada_Samsung',
 'visi

# Prediccion para el tp

In [61]:
personas_a_predecir = pd.read_csv('trocafone_kaggle_test.csv', low_memory=False)

In [62]:
personas_a_predecir_con_features = personas_a_predecir.merge(df_features_totales, on='person', how='left')

In [63]:
personas_a_predecir_con_features = personas_a_predecir_con_features.fillna(0)

In [64]:
personas_a_predecir_con_features = personas_a_predecir_con_features.drop(columns = ['person'])

In [65]:
final_prediction = ensamble.predict_proba(personas_a_predecir_con_features)[:,1]

In [66]:
final_prediction_tp = pd.Series(final_prediction)

In [67]:
final_prediction_tp.sort_values()

18315    0.012559
13868    0.012559
17898    0.012626
10675    0.012639
11909    0.012657
6632     0.012659
8863     0.012672
12195    0.012678
8536     0.012683
18817    0.012684
7364     0.012690
4298     0.012699
15297    0.012702
17988    0.012706
16657    0.012725
16011    0.012728
2409     0.012728
15191    0.012728
13945    0.012736
11667    0.012752
3085     0.012778
18921    0.012780
11908    0.012780
2255     0.012808
9892     0.012817
13890    0.012860
10213    0.012867
17941    0.012868
15217    0.012868
17740    0.012890
           ...   
16357    0.319175
5989     0.321386
332      0.322234
9206     0.322692
16111    0.324034
11079    0.324490
19006    0.325109
2632     0.325873
5407     0.325904
14743    0.330630
7794     0.331587
9049     0.340164
1935     0.341082
9118     0.344543
4684     0.344545
16216    0.344723
17128    0.351328
4473     0.353130
6501     0.353146
5043     0.367044
863      0.368567
4816     0.370713
4651     0.371194
8924     0.375343
9404     0

In [68]:
personas_a_predecir['label'] = final_prediction_tp

In [57]:
num = personas_a_predecir._get_numeric_data()
num[num < 0] = 0

In [69]:
personas_a_predecir.to_csv(path_or_buf = 'submit_kaggle.csv', index = False)

# NO TOCAR!!!

In [1933]:
df_features_totales.to_csv(path_or_buf = 'features_0.84927.csv', index = False)
#df_features_totales que tiene el merge de nacho y mio