# Modelaje y entrenamiento!

In [263]:
def xgb_classifier(X_train, X_test, y_train, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    alg = xgb.XGBClassifier(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 7,
                subsample = 0.9,
                gamma = 1,
                n_estimators = 50)
    
    print('\nXGBoost Classifier')
    
    alg.fit(X_train, y_train, eval_metric='auc')
    
    pred_proba = alg.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, pred_proba)
    
    print('El puntaje auc es: {}'.format(auc))
    
    return alg

In [264]:
def rf_classifier(X, y):
    RFC = RandomForestClassifier()


    rf_param_grid = {"max_depth": [None],
                  "max_features": [1, 3, 10],
                  "min_samples_split": [2, 3, 10],
                  "min_samples_leaf": [1, 3, 10],
                  "bootstrap": [False],
                  "n_estimators" :[50,65,100],
                  "criterion": ["gini"]}


    gsRFC = GridSearchCV(RFC,param_grid = rf_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)

    gsRFC.fit(X,y)

    RFC_best = gsRFC.best_estimator_

    # Best score
    print('Random classifier')
    print(gsRFC.best_score_)
    
    return RFC_best

In [265]:
def dt_classifier(X_train, X_test, y_train, y_test):
    clf = tree.DecisionTreeClassifier(random_state = 100,
    max_depth=8, min_samples_leaf=4)
    
    clf.fit(X_train, y_train)
    
    predict = clf.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, predict)
    print('\nDecision Tree')
    print('El puntaje auc es: {}'.format(auc))
    
    return clf

In [266]:
def knn_classifier(X_train, X_test, y_train, y_test):
    """Aplica KNN al test entregado, primero haciendo gridsearch para sacar el k optimo"""
    
    k_range = list(range(1, 2))
    param_grid = dict(n_neighbors=k_range)
    
    knn = KNeighborsClassifier(n_neighbors=5)
    grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
    
    grid.fit(X_train, y_train)
    
    knn_best = grid.best_estimator_
    
    
    predict = knn_best.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, predict)
    print('\nKNN con k=3')
    print('El puntaje auc es: {}'.format(auc))
    
    return knn_best

In [267]:
def ext_classifier(X,y):
    ExtC = ExtraTreesClassifier()


    ## Search grid for optimal parameters
    ex_param_grid = {"max_depth": [None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [False],
              "n_estimators" :[100,300],
              "criterion": ["gini"],
              "max_features": [0.3, 0.1] 
                    }


    gsExtC = GridSearchCV(ExtC,param_grid = ex_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)

    gsExtC.fit(X, y)

    ExtC_best = gsExtC.best_estimator_

    # Best score
    print('Extra tree classifier')
    print(gsExtC.best_score_)
    return ExtC_best

In [268]:
def lgb_classifier(X_train, X_test, y_train, y_test):
    lgb_cl = lgb.LGBMClassifier(learning_rate=0.005,objective='binary',num_leaves=55,max_depth=13,
                        n_estimators=60,colsample_bytree=0.8,n_jobs=-1,
                        random_state=0,silent=False,subsample=0.8,
                        sumsample_freq=0.5)

    lgb_cl.fit(X_train,y_train)
    predict = lgb_cl.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, predict)
    print('\nLGB Classifier')
    print('El puntaje auc es: {}'.format(auc))
    
    return lgb_cl

In [269]:
def rn_classifier(X,y):
    mlp = MLPClassifier()

    mlp_param_grid = {'solver': ['lbfgs'],
                  'max_iter': [50,100],
                  'alpha': 10.0 ** -np.arange(5, 10), 
                  'hidden_layer_sizes':np.arange(13, 15),
                  'random_state':[1,5,6,9]}


    gsmlp = GridSearchCV(mlp,param_grid = mlp_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)

    gsmlp.fit(X,y)

    mlp_best = gsmlp.best_estimator_
    print('\nRed Neuronal Classifier')
    print(gsmlp.best_score_)
    return mlp_best

In [270]:
def gb_classifier(X,y):
    GBC = GradientBoostingClassifier()
    gb_param_grid = {'loss' : ["deviance"],
                  'n_estimators' : [50,65,100],
                  'learning_rate': [0.1, 0.05, 0.01],
                  'max_depth': [4, 8],
                  'min_samples_leaf': [100,150],
                  'max_features': [0.3, 0.1] 
                  }

    gsGBC = GridSearchCV(GBC,param_grid = gb_param_grid, cv=2, scoring="roc_auc", n_jobs= 4, verbose = 1)

    gsGBC.fit(X,y)

    GBC_best = gsGBC.best_estimator_

    # Best score
    print('\nGradient Boosting')
    print(gsGBC.best_score_)
    return GBC_best

In [271]:
def ensamblador(estimadores,X_train, X_test, y_train, y_test):
    eclf3 = VotingClassifier(estimators=[
       ('xgb', estimadores[0]), ('lgb', estimadores[3]),('gb',estimadores[4])],
       voting='soft', weights=[1, 0.5, 1],
      flatten_transform=True)

    eclf3.fit(X_train,y_train)
    
    predict = eclf3.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, predict)
    
    print('Ensamble final')
    print('El puntaje auc es: {}'.format(auc))
    
    return eclf3

# Predicciones en varios modelos

In [272]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import datetime
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
import lightgbm as lgb
from sklearn.neural_network import MLPClassifier

In [73]:
df_original = pd.read_csv('events_up_to_01062018.csv', low_memory = False)
df_labels= pd.read_csv('labels_training_set.csv', low_memory = False)

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7f6dcb88a4e0>>
Traceback (most recent call last):
  File "/home/amaherok/.local/lib/python3.6/site-packages/xgboost/core.py", line 482, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'


In [74]:
pd.set_option('display.max_rows', 100)

In [105]:
df_features_numericos = pd.read_csv('features_numericos_mes_5.csv', low_memory=False)
#df_features_bool = pd.read_csv('features_bool.csv', low_memory=False)
df_features_categoricos = pd.read_csv('features_categoricos.csv', low_memory=False)

In [106]:
#df_features_bool.shape

In [107]:
df_features_numericos.columns

Index(['person', 'cant_checkouts_5', '%checkouts', 'checkouts_ult_semana',
       'act_primera_semana', 'act_ultima_semana', 'mayor_actividad_ult_semana',
       'cant_modelos_distintos', 'cant_checkouts_dif_modelos',
       'MAX(cant_interacciones_por_modelo)',
       'MEAN(cant_interacciones_por_modelo)', 'Checkout max', 'Checkout mean',
       'cant_lead_5to_mes', 'cant_modelos_que_consulto_stock'],
      dtype='object')

In [108]:
df_features_categoricos.shape

(38829, 8)

In [109]:
df_original['person'].drop_duplicates().shape

(38829,)

## Trabajo en los features por separado, empiezo por los numericos

A los numericos hago un fillna con ceros

In [110]:
df_features_numericos = df_features_numericos.fillna(0)

In [111]:
df_features_numericos.isnull().sum()

person                                 0
cant_checkouts_5                       0
%checkouts                             0
checkouts_ult_semana                   0
act_primera_semana                     0
act_ultima_semana                      0
mayor_actividad_ult_semana             0
cant_modelos_distintos                 0
cant_checkouts_dif_modelos             0
MAX(cant_interacciones_por_modelo)     0
MEAN(cant_interacciones_por_modelo)    0
Checkout max                           0
Checkout mean                          0
cant_lead_5to_mes                      0
cant_modelos_que_consulto_stock        0
dtype: int64

## Categoricos

Fabrico los dummies

In [112]:
df_features_categoricos.head()

Unnamed: 0,person,modelo_mas_visitado,region_persona,pais,evento_predominante_mes_5,semana_mas_interactuante,device_type,channel_frecuente
0,4886f805,Samsung Galaxy J7 Prime,Rio de Janeiro,Brazil,viewed product,Third,Smartphone,Organic
1,ad93850f,iPhone 5s,Sao Paulo,Brazil,viewed product,Third,Smartphone,Paid
2,0297fc1e,iPhone 6,Rio de Janeiro,Brazil,viewed product,Fourth,Smartphone,Direct
3,2d681dd8,iPhone 7,Sao Paulo,Brazil,viewed product,Fourth,Computer,Organic
4,cccea85e,Motorola Moto G4 Plus,Sao Paulo,Brazil,viewed product,Third,Computer,Organic


In [113]:
df_features_categoricos = pd.concat([df_features_categoricos, pd.get_dummies(df_features_categoricos.iloc[:,1:])], axis=1)

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7f6dcbfa6e80>>
Traceback (most recent call last):
  File "/home/amaherok/.local/lib/python3.6/site-packages/xgboost/core.py", line 482, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'


In [114]:
df_features_categoricos.head()

Unnamed: 0,person,modelo_mas_visitado,region_persona,pais,evento_predominante_mes_5,semana_mas_interactuante,device_type,channel_frecuente,modelo_mas_visitado_0,modelo_mas_visitado_Asus Zenfone 2,...,device_type_Smartphone,device_type_Tablet,device_type_Unknown,channel_frecuente_Direct,channel_frecuente_Email,channel_frecuente_Organic,channel_frecuente_Paid,channel_frecuente_Referral,channel_frecuente_Social,channel_frecuente_Unknown
0,4886f805,Samsung Galaxy J7 Prime,Rio de Janeiro,Brazil,viewed product,Third,Smartphone,Organic,0,0,...,1,0,0,0,0,1,0,0,0,0
1,ad93850f,iPhone 5s,Sao Paulo,Brazil,viewed product,Third,Smartphone,Paid,0,0,...,1,0,0,0,0,0,1,0,0,0
2,0297fc1e,iPhone 6,Rio de Janeiro,Brazil,viewed product,Fourth,Smartphone,Direct,0,0,...,1,0,0,1,0,0,0,0,0,0
3,2d681dd8,iPhone 7,Sao Paulo,Brazil,viewed product,Fourth,Computer,Organic,0,0,...,0,0,0,0,0,1,0,0,0,0
4,cccea85e,Motorola Moto G4 Plus,Sao Paulo,Brazil,viewed product,Third,Computer,Organic,0,0,...,0,0,0,0,0,1,0,0,0,0


In [115]:
df_features_categoricos = df_features_categoricos.drop(columns=['region_persona','pais','semana_mas_interactuante','device_type','channel_frecuente','evento_predominante_mes_5','modelo_mas_visitado'])

In [116]:
df_features_categoricos.head()

Unnamed: 0,person,modelo_mas_visitado_0,modelo_mas_visitado_Asus Zenfone 2,modelo_mas_visitado_Asus Zenfone 2 Deluxe,modelo_mas_visitado_Asus Zenfone 2 Laser,modelo_mas_visitado_Asus Zenfone 3 Max 32 GB,modelo_mas_visitado_Asus Zenfone 3 Max 16 GB,modelo_mas_visitado_Asus Zenfone 5,modelo_mas_visitado_Asus Zenfone 6,modelo_mas_visitado_Asus Zenfone Go,...,device_type_Smartphone,device_type_Tablet,device_type_Unknown,channel_frecuente_Direct,channel_frecuente_Email,channel_frecuente_Organic,channel_frecuente_Paid,channel_frecuente_Referral,channel_frecuente_Social,channel_frecuente_Unknown
0,4886f805,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
1,ad93850f,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
2,0297fc1e,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
3,2d681dd8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,cccea85e,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


## Booleano

Los transformo a numericos (1 o 0)

In [103]:
df_features_bool.head()

NameError: name 'df_features_bool' is not defined

In [16]:
df_features_bool.iloc[:,1:] = df_features_bool.iloc[:,1:].astype(int)

In [17]:
df_features_bool.shape

(38829, 6)

# Mergeo los features 

In [133]:
df_features_totales = pd.read_csv('features_mios.csv').merge(df_features_numericos, on='person', how='left')

In [134]:
#df_features_totales = df_features_totales.merge(df_features_categoricos, on='person', how='left')

In [135]:
df_features_totales.shape

(38829, 22)

In [136]:
df_features_totales.head()

Unnamed: 0,person,conversion_sum,conversion_mean,dif_5_check,last_day_check,first_day_check,last_week_check,first_week_check,cant_checkouts_5,%checkouts,...,act_ultima_semana,mayor_actividad_ult_semana,cant_modelos_distintos,cant_checkouts_dif_modelos,MAX(cant_interacciones_por_modelo),MEAN(cant_interacciones_por_modelo),Checkout max,Checkout mean,cant_lead_5to_mes,cant_modelos_que_consulto_stock
0,4886f805,0.0,0.0,0.0,18.0,18.0,4.0,4.0,1.0,11.111111,...,0.0,0,1.0,1.0,5.0,5.0,1.0,1.0,0.0,0.0
1,ad93850f,0.0,0.0,0.0,14.0,14.0,0.0,0.0,1.0,1.538462,...,0.0,0,3.0,1.0,18.0,7.0,1.0,1.0,0.0,0.0
2,0297fc1e,0.0,0.0,12.0,22.0,10.0,1.0,3.0,2.0,1.156069,...,8.0,0,11.0,2.0,79.0,12.272727,1.0,1.0,0.0,0.0
3,2d681dd8,0.0,0.0,0.0,27.0,27.0,6.0,6.0,1.0,3.846154,...,0.0,0,3.0,1.0,6.0,4.666667,1.0,1.0,0.0,0.0
4,cccea85e,0.0,0.0,0.0,11.0,11.0,4.0,4.0,1.0,0.119617,...,252.0,1,57.0,1.0,253.0,12.982456,1.0,1.0,0.0,0.0


In [137]:
df_original['person'].drop_duplicates().shape

(38829,)

In [138]:
df_features_totales = df_features_totales.merge(pd.read_csv('neww_features.csv'), on='person', how = 'left')

In [139]:
df_features_totales.shape

(38829, 180)

Aparentemente tengo datos de todos los usuarios

# Data preprocessing

In [250]:
df_train = df_labels.merge(df_features_totales, on='person', how='left')

In [251]:
df_train.shape

(19414, 181)

In [252]:
df_train = df_train.fillna(0)

In [253]:
df_train_0 = df_train.loc[df_train['label'] == 0, :]
df_train_1 = df_train.loc[df_train['label'] == 1, :]
df_train_equal = pd.concat([df_train_1, shuffle(df_train_0).iloc[:980,:] ],axis=0)

In [254]:
X,y = df_train.iloc[:,2:], df_train.iloc[:,1]

In [255]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [256]:
rf_model = RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=3, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [257]:



xgb_model = xgb_classifier(X_train, X_test, y_train, y_test)
#rf_model = rf_classifier(X,y)
#ext_model = ext_classifier(X,y)
lgb_model = lgb_classifier(X_train, X_test, y_train, y_test)
gb_moodel = gb_classifier(X, y)
#mlp_model = rn_classifier(X,y)
#dt_model = dt_classifier(X_train, X_test, y_train, y_test)
#knn_model = knn_classifier(X_train,X_test, y_train, y_test)


XGBoost Classifier
El puntaje auc es: 0.865869603444266

LGB Classifier
El puntaje auc es: 0.8665311441747545
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   23.6s
[Parallel(n_jobs=4)]: Done 144 out of 144 | elapsed:  1.3min finished



Gradient Boosting
0.8605358222273394


In [258]:
rf_model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features=3, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [259]:
roc_auc_score(y_test,rf_model.predict_proba(X_test)[:,1:])

0.8522395954462173

In [212]:
#ensamble = ensamblador([xgb_model,rf_model,ext_model,lgb_model,gb_moodel],X_train, X_test, y_train, y_test)

In [243]:
zipped = zip(map(lambda x: round(x, 4), rf_model.feature_importances_), X_test.columns)
feature = sorted(zipped, key=lambda x: x[1])

In [244]:
feat_importance = pd.DataFrame(feature, columns=['importance', 'feature'])
f_m_i = list(feat_importance.sort_values('importance', ascending=False).head(100)['feature'].values)

In [260]:
data_dmatrix = xgb.DMatrix(data=X,label=y)

In [246]:
feat_importance.sort_values(by='importance' , ascending=False)

Unnamed: 0,importance,feature
19,0.0363,PERCENTILE(viewed product - checkout)
2,0.0338,Checkout mean
29,0.0338,cant_checkouts_5
0,0.0325,%checkouts
10,0.0311,PERCENTILE(checkout - visited site)
30,0.0296,cant_checkouts_dif_modelos
1,0.0296,Checkout max
4,0.0253,MEAN(cant_interacciones_por_modelo)
43,0.0217,checkout + viewed product - checkout
9,0.0204,PERCENTILE(checkout - viewed product)


In [247]:
#f_m_i =[ x for x in f_m_i if "_y" not in x ]

In [248]:
f_m_i.append('person')
#f_m_i.append('count_y')


In [249]:
len(f_m_i)

101

In [176]:
f_m_i

['cant_checkouts_5',
 'cant_checkouts_dif_modelos',
 'checkout',
 'Checkout max',
 'PERCENTILE(checkout + visited site)',
 'MEAN(cant_interacciones_por_modelo)',
 'PERCENTILE(checkout)',
 'Checkout mean',
 'PERCENTILE(checkout + viewed product)',
 'PERCENTILE(checkout - visited site)',
 '%checkouts',
 'cant_modelos_distintos',
 'MAX(cant_interacciones_por_modelo)',
 'act_ultima_semana',
 'PERCENTILE(viewed product - checkout)',
 'checkout + viewed product - visited site',
 'viewed product + viewed product - visited site',
 'PERCENTILE(viewed product + visited site)',
 'visited site + visited site - checkout',
 'PERCENTILE(visited site - checkout)',
 'visited site - viewed product',
 'last_day_check',
 'checkout + visited site - checkout',
 'checkout + viewed product',
 'checkout - visited site + viewed product',
 'viewed product',
 'checkout - visited site + visited site',
 'viewed product - visited site',
 'PERCENTILE(viewed product)',
 'viewed product + visited site',
 'PERCENTILE(ch

In [261]:
params = {"objective":"reg:linear",'colsample_bytree':1,
          'learning_rate': 0.1, 'max_depth': 7, 'gamma': 1,'n_estimators': 65}
cv_val = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50, early_stopping_rounds=10,
                    metrics="auc", as_pandas=True, seed=123)

[12:52:34] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 72 pruned nodes, max_depth=7
[12:52:34] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 64 extra nodes, 60 pruned nodes, max_depth=7
[12:52:35] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 82 extra nodes, 60 pruned nodes, max_depth=7
[12:52:35] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 60 extra nodes, 60 pruned nodes, max_depth=7
[12:52:35] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 62 pruned nodes, max_depth=7
[12:52:35] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 72 pruned nodes, max_depth=7
[12:52:35] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 64 extra nodes, 82 pruned nodes, max_depth=7
[12:52:35] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 80 pruned nodes, max_depth=7
[12:52:3

[12:52:41] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 98 pruned nodes, max_depth=0
[12:52:41] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 82 pruned nodes, max_depth=7
[12:52:41] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 104 pruned nodes, max_depth=0
[12:52:41] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 98 pruned nodes, max_depth=0
[12:52:42] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 22 extra nodes, 90 pruned nodes, max_depth=7
[12:52:42] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 104 pruned nodes, max_depth=0
[12:52:42] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 98 pruned nodes, max_depth=0
[12:52:42] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 20 extra nodes, 98 pruned nodes, max_depth=6
[12:52:42] 

In [262]:
cv_val

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.86144,0.007554,0.839244,0.009172
1,0.866708,0.00794,0.841118,0.009571
2,0.870525,0.008179,0.844012,0.005957
3,0.876728,0.004718,0.846018,0.00174
4,0.879545,0.003121,0.847646,0.00109
5,0.88219,0.003813,0.848248,0.001767
6,0.885928,0.002828,0.85098,0.003432
7,0.887441,0.002423,0.852894,0.002946
8,0.888959,0.002673,0.854705,0.002521
9,0.889706,0.002546,0.854673,0.00265


# Prediccion para el tp

In [384]:
personas_a_predecir = pd.read_csv('trocafone_kaggle_test.csv', low_memory=False)

In [385]:
personas_a_predecir_con_features = personas_a_predecir.merge(df_features_totales[f_m_i], on='person', how='left')

In [386]:
personas_a_predecir_con_features = personas_a_predecir_con_features.fillna(0)

In [387]:

personas_a_predecir_con_features = personas_a_predecir_con_features.drop(columns = ['person'])

In [388]:
final_prediction = ensamble.predict_proba(personas_a_predecir_con_features)[:,1]

In [389]:
final_prediction_tp = pd.Series(final_prediction)

In [390]:
final_prediction_tp.sort_values()

16595    0.006338
17896    0.006488
18025    0.006704
18896    0.006705
3911     0.006712
1924     0.006798
5730     0.006922
17655    0.006966
14635    0.006983
6916     0.007145
348      0.007160
13449    0.007183
11632    0.007231
3830     0.007394
15883    0.007542
9824     0.007578
14928    0.007578
10579    0.007585
4532     0.007600
18665    0.007612
1160     0.007621
11868    0.007631
18031    0.007639
14305    0.007664
18166    0.007684
10792    0.007725
18059    0.007736
15076    0.007830
18281    0.007834
13247    0.007834
           ...   
13124    0.443175
16191    0.444147
8662     0.444654
15794    0.448692
10052    0.449775
1760     0.452175
7864     0.452514
4813     0.454096
16126    0.456982
4651     0.459956
2865     0.461922
11079    0.464280
4667     0.465884
1519     0.471622
4502     0.476319
14826    0.478120
4819     0.481309
17128    0.482298
4515     0.487716
9128     0.487944
4880     0.489197
4821     0.507800
4700     0.508128
11555    0.514846
8902     0

In [391]:
personas_a_predecir['label'] = final_prediction_tp

In [392]:
num = personas_a_predecir._get_numeric_data()
num[num < 0] = 0

In [393]:
personas_a_predecir.to_csv(path_or_buf = 'submit_kaggle.csv', index = False)

In [59]:
df_features_totales[f_m_i].to_csv('features_to_sell.csv', index= False)

NameError: name 'f_m_i' is not defined

In [75]:
df_features_totales[f_m_i].to_csv('checkfeat_select3.csv', index= False)

# NO TOCAR!!!

In [1933]:
df_features_totales.to_csv(path_or_buf = 'features_0.84927.csv', index = False)
#df_features_totales que tiene el merge de nacho y mio

Index(['checkout_sum', 'checkout_mean', 'Checkout max', '%checkouts',
       'cant_checkouts_dif_modelos', 'Checkout mean_x', 'Checkout mean_y',
       'cant_checkouts_5', 'count_y', 'visitas_mes5_y', 'visitas_mes5_x',
       'horas_mirando_productos_x', 'viewed_sum', 'promedio_ingreso_mensual_x',
       'promedio_ingreso_mensual_y', 'horas_mirando_productos_y',
       'viewed_mean', 'Promedio_visitas_producto_x', 'count_x',
       'MEAN(cant_interacciones_por_modelo)', 'last_day_check',
       'DAY(first month 5)', 'Promedio_visitas_producto_y',
       'MAX(cant_interacciones_por_modelo)', 'Cantidad_visitas_x',
       'viewed_std', 'cant_modelos_distintos', 'viewed_model_mean',
       'Cantidad_visitas_y', 'visitas_mes4_y', 'first_day_check',
       'visitas_mes4_x', 'DAY(last month 5)', 'diferencia 5',
       'act_ultima_semana', 'visitas_mes3_y', 'models_viewed_days_mean',
       'last_week_check', 'visitas_mes3_x', 'models_viewed_days_max',
       'first_week_check', 'device_type_Computer', 'WEEKDAY(last month 5)',
       'WEEKDAY(first month 5)', 'inc_ultimo_mes', 'device_type_Smartphone',
       'visitas_mes1_y', 'visitas_mes2_y', 'visitas_mes2_x',
       'act_primera_semana', 'visitas_mes1_x', 'checkouts_ult_semana',
       'semana_mas_interactuante_Third', 'conversion_sum',
       'marca_mas_buscada_Samsung', 'channel_frecuente_Paid', 'dif_5_check',
       'semana_mas_interactuante_First', 'semana_mas_interactuante_Second',
       'conversion_mean', 'busco_top_5_visitas', 'channel_frecuente_Referral',
       'semana_mas_interactuante_Fourth', 'mayor_actividad_ult_semana_x',
       'marca_mas_buscada_iPhone', 'channel_frecuente_Direct',
       'region_persona_Unknown', 'visito_mas_que_el_promedio',
       'mayor_actividad_ult_semana_y', 'region_persona_Sao Paulo',
       'semana_mas_interactuante_Fifth', 'channel_frecuente_Organic',
       'region_persona_Minas Gerais', 'marca_mas_buscada_Motorola',
       'region_persona_Rio de Janeiro', 'region_persona_Bahia',
       'mismo_interes_ultimos_dos_meses', 'region_persona_Santa Catarina',
       'region_persona_Federal District', 'region_persona_Rio Grande do Sul',
       'region_persona_Para', 'region_persona_Maranhao',
       'region_persona_Ceara', 'pais_Brazil', 'region_persona_Pernambuco',
       'marca_mas_buscada_Other', 'pais_Another', 'region_persona_Parana',
       'region_persona_Espirito Santo', 'region_persona_Goias',
       'region_persona_Paraíba', 'device_type_Tablet', 'region_persona_0',
       'cant_modelos_que_consulto_stock', 'marca_mas_buscada_LG',
       'region_persona_Mato Grosso', 'cant_lead_5to_mes',
       'region_persona_Rio Grande do Norte', 'region_persona_Tocantins',
       'region_persona_Amazonas'],
      dtype='object')