# Predicciones en varios modelos

In [1]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import datetime
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

  from numpy.core.umath_tests import inner1d


In [2]:
df_original = pd.read_csv('events_up_to_01062018.csv', low_memory = False)
df_labels= pd.read_csv('labels_training_set.csv', low_memory = False)

In [3]:
df_features_numericos = pd.read_csv('features_numericos_y_nacho.csv', low_memory=False)
df_features_bool = pd.read_csv('features_bool.csv', low_memory=False)
df_features_categoricos = pd.read_csv('features_categoricos.csv', low_memory=False)

In [4]:
df_features_bool.shape

(38829, 6)

In [5]:
df_features_numericos.shape

(38829, 50)

In [6]:
df_features_categoricos.shape

(38829, 8)

In [7]:
df_original['person'].drop_duplicates().shape

(38829,)

## Trabajo en los features por separado, empiezo por los numericos

A los numericos hago un fillna con ceros

In [8]:
df_features_numericos = df_features_numericos.fillna(0)

In [9]:
df_features_numericos.isnull().sum()

person                                   0
checkout_sum                             0
checkout_mean                            0
conversion_sum                           0
conversion_mean                          0
viewed_sum                               0
viewed_mean                              0
viewed_std                               0
diferencia 5_x                           0
DAY(last month 5)_x                      0
DAY(first month 5)_x                     0
WEEKDAY(last month 5)_x                  0
WEEKDAY(first month 5)_x                 0
%checkouts_x                             0
mayor_actividad_ult_semana_x             0
MAX(cant_interacciones_por_modelo)_x     0
MEAN(cant_interacciones_por_modelo)_x    0
Checkout max_x                           0
Checkout mean_x                          0
diferencia 5_y                           0
DAY(last month 5)_y                      0
DAY(first month 5)_y                     0
WEEKDAY(last month 5)_y                  0
WEEKDAY(fir

## Categoricos

Fabrico los dummies

In [10]:
df_features_categoricos.head()

Unnamed: 0,person,modelo_mas_visitado,region_persona,pais,evento_predominante_mes_5,semana_mas_interactuante,device_type,channel_frecuente
0,4886f805,Samsung Galaxy J7 Prime,Rio de Janeiro,Brazil,viewed product,Third,Smartphone,Organic
1,ad93850f,iPhone 5s,Sao Paulo,Brazil,viewed product,Third,Smartphone,Paid
2,0297fc1e,iPhone 6,Rio de Janeiro,Brazil,viewed product,Fourth,Smartphone,Direct
3,2d681dd8,iPhone 7,Sao Paulo,Brazil,viewed product,Fourth,Computer,Organic
4,cccea85e,Motorola Moto G4 Plus,Sao Paulo,Brazil,viewed product,Third,Computer,Organic


In [11]:
df_features_categoricos = pd.concat([df_features_categoricos, pd.get_dummies(df_features_categoricos.iloc[:,1:])], axis=1)

In [12]:
df_features_categoricos.head()

Unnamed: 0,person,modelo_mas_visitado,region_persona,pais,evento_predominante_mes_5,semana_mas_interactuante,device_type,channel_frecuente,modelo_mas_visitado_0,modelo_mas_visitado_Asus Zenfone 2,...,device_type_Smartphone,device_type_Tablet,device_type_Unknown,channel_frecuente_Direct,channel_frecuente_Email,channel_frecuente_Organic,channel_frecuente_Paid,channel_frecuente_Referral,channel_frecuente_Social,channel_frecuente_Unknown
0,4886f805,Samsung Galaxy J7 Prime,Rio de Janeiro,Brazil,viewed product,Third,Smartphone,Organic,0,0,...,1,0,0,0,0,1,0,0,0,0
1,ad93850f,iPhone 5s,Sao Paulo,Brazil,viewed product,Third,Smartphone,Paid,0,0,...,1,0,0,0,0,0,1,0,0,0
2,0297fc1e,iPhone 6,Rio de Janeiro,Brazil,viewed product,Fourth,Smartphone,Direct,0,0,...,1,0,0,1,0,0,0,0,0,0
3,2d681dd8,iPhone 7,Sao Paulo,Brazil,viewed product,Fourth,Computer,Organic,0,0,...,0,0,0,0,0,1,0,0,0,0
4,cccea85e,Motorola Moto G4 Plus,Sao Paulo,Brazil,viewed product,Third,Computer,Organic,0,0,...,0,0,0,0,0,1,0,0,0,0


In [13]:
df_features_categoricos = df_features_categoricos.drop(columns = ['region_persona',\
                                                                  'modelo_mas_visitado',\
                                                                 'pais',\
                                                                 'evento_predominante_mes_5',\
                                                                 'semana_mas_interactuante',\
                                                                 'device_type',
                                                                 'channel_frecuente'])

In [14]:
df_features_categoricos.shape

(38829, 343)

## Booleano

Los transformo a numericos (1 o 0)

In [15]:
df_features_bool.head()

Unnamed: 0,person,busco_top_5_visitas,visito_mas_que_el_promedio,siempre_incrementando,inc_ultimo_mes,mismo_interes_ultimos_dos_meses
0,4886f805,0.0,False,0,1,0
1,ad93850f,1.0,True,0,1,0
2,0297fc1e,1.0,True,0,1,1
3,2d681dd8,1.0,False,0,1,0
4,cccea85e,0.0,True,0,1,0


In [16]:
df_features_bool.iloc[:,1:] = df_features_bool.iloc[:,1:].astype(int)

In [17]:
df_features_bool.shape

(38829, 6)

# Mergeo los features 

In [18]:
df_features_totales = df_features_bool.merge(df_features_numericos, on='person', how='left')

In [19]:
df_features_totales = df_features_totales.merge(df_features_categoricos, on='person', how='left')

In [20]:
df_features_totales.shape

(38829, 397)

In [21]:
df_original['person'].drop_duplicates().shape

(38829,)

Aparentemente tengo datos de todos los usuarios

# Data preprocessing

In [22]:
df_train = df_labels.merge(df_features_totales, on='person', how='left')

In [23]:
df_train.shape

(19414, 398)

In [24]:
df_train = df_train.fillna(0)

In [25]:
df_train_0 = df_train.loc[df_train['label'] == 0, :]
df_train_1 = df_train.loc[df_train['label'] == 1, :]
df_train_equal = pd.concat([df_train_1, shuffle(df_train_0).iloc[:980,:] ],axis=0)

In [26]:
X,y = df_train.iloc[:,2:], df_train.iloc[:,1]

In [27]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

# Modelaje y entrenamiento!

In [28]:
def xgb_classifier(X_train, X_test, y_train, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    alg = xgb.XGBClassifier(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 7,
                subsample = 0.9,
                gamma = 1,
                n_estimators = 50)
    
    print('\nXGBoost Classifier')
    if useTrainCV:
        print("Start Feeding Data")
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
        # xgtest = xgb.DMatrix(X_test.values, label=y_test.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    alg.fit(X_train, y_train, eval_metric='auc')
    
    pred_proba = alg.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, pred_proba)
    
    print('El puntaje auc es: {}'.format(auc))
    
    return alg

In [29]:
def rf_classifier(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(bootstrap=True, n_estimators = 1000, random_state = 42)
    
    rf.fit(X_train, y_train)
    
    pred_proba = rf.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, pred_proba)
    print('\nRandom Forest Classifier')
    print('El puntaje auc es: {}'.format(auc))
    
    return rf

In [30]:
def dt_classifier(X_train, X_test, y_train, y_test):
    clf = tree.DecisionTreeClassifier(random_state = 100,
    max_depth=8, min_samples_leaf=4)
    
    clf.fit(X_train, y_train)
    
    predict = clf.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, predict)
    print('\nDecision Tree')
    print('El puntaje auc es: {}'.format(auc))
    
    return clf

In [31]:
def knn_classifier(X_train, X_test, y_train, y_test):
    """Aplica KNN al test entregado, primero haciendo gridsearch para sacar el k optimo"""
    
    k_range = list(range(1, 2))
    param_grid = dict(n_neighbors=k_range)
    
    knn = KNeighborsClassifier(n_neighbors=5)
    grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
    
    grid.fit(X_train, y_train)
    
    knn_best = grid.best_estimator_
    
    
    predict = knn_best.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, predict)
    print('\nKNN con k=3')
    print('El puntaje auc es: {}'.format(auc))
    
    return knn_best

In [32]:
xgb_model = xgb_classifier(X_train, X_test, y_train, y_test,useTrainCV=True)
rf_model = rf_classifier(X_train, X_test, y_train, y_test)
#dt_model = dt_classifier(X_train, X_test, y_train, y_test)
#knn_model = knn_classifier(X_train,X_test, y_train, y_test)


XGBoost Classifier
Start Feeding Data
El puntaje auc es: 0.8699579677014969

Random Forest Classifier
El puntaje auc es: 0.8527727986930849


In [33]:
xgb_predict = xgb_model.predict_proba(X_test)[:,1]
rf_predict = rf_model.predict_proba(X_test)[:,1]
#dt_predict = dt_model.predict_proba(X_test)[:,1]
#knn_predict = knn_model.predict_proba(X_test)[:,1]

In [34]:
final_prediction_test = (xgb_predict * 0.8 + rf_predict * 0.2)

In [35]:
roc_auc_score(y_test, final_prediction_test)

0.8699246424985678

In [36]:
zipped = zip(map(lambda x: round(x, 4), rf_model.feature_importances_), X_test.columns)
feature = sorted(zipped, key=lambda x: x[1])

In [37]:
feat_importance = pd.DataFrame(feature, columns=['importance', 'feature'])
f_m_i = list(feat_importance.sort_values('importance', ascending=False).head(100)['feature'].values)

In [38]:
f_m_i

['promedio_ingreso_mensual',
 'Promedio_visitas_producto',
 'horas_mirando_productos',
 'viewed_sum',
 'viewed_std',
 'visitas_mes5',
 'Cantidad_visitas',
 'registros_semana_4',
 'MEAN(cant_interacciones_por_modelo)_x',
 'MEAN(cant_interacciones_por_modelo)_y',
 'viewed_mean',
 'DAY(first month 5)_y',
 'registros_semana_5',
 'registros_semana_3',
 'MAX(cant_interacciones_por_modelo)_y',
 'cant_modelos_distintos',
 'DAY(last month 5)_y',
 'MAX(cant_interacciones_por_modelo)_x',
 'act_ultima_semana',
 '%checkouts_x',
 '%checkouts_y',
 'registros_semana_2',
 'WEEKDAY(first month 5)_y',
 'visitas_mes4',
 'diferencia 5_y',
 'registros_semana_1',
 'WEEKDAY(last month 5)_y',
 'visitas_mes3',
 'Checkout mean_y',
 'Checkout mean_x',
 'cant_checkouts_5',
 'cant_checkouts_dif_modelos',
 'checkout_mean',
 'checkout_sum',
 'Checkout max_x',
 'Checkout max_y',
 'DAY(last month 5)_x',
 'device_type_Computer',
 'channel_frecuente_Paid',
 'DAY(first month 5)_x',
 'device_type_Smartphone',
 'act_primera

# Prediccion para el tp

In [69]:
personas_a_predecir = pd.read_csv('trocafone_kaggle_test.csv', low_memory=False)

In [70]:
personas_a_predecir_con_features = personas_a_predecir.merge(df_features_totales, on='person', how='left')

In [71]:
personas_a_predecir_con_features = personas_a_predecir_con_features.fillna(0)

In [72]:
personas_a_predecir_con_features = personas_a_predecir_con_features.drop(columns = ['person'])

In [73]:
xgb_predict = xgb_model.predict_proba(personas_a_predecir_con_features)[:,1]
rf_predict = rf_model.predict_proba(personas_a_predecir_con_features)[:,1]
#dt_predict = dt_model.predict_proba(personas_a_predecir_con_features)[:,1]
#knn_predict = knn_model.predict_proba(personas_a_predecir_con_features)[:,1]

In [74]:
final_prediction_tp = pd.Series(xgb_predict * 0.8 + rf_predict * 0.2)

In [75]:
personas_a_predecir['label'] = final_prediction_tp

In [51]:
num = personas_a_predecir._get_numeric_data()
num[num < 0] = 0

In [76]:
personas_a_predecir.to_csv(path_or_buf = 'submit_kaggle.csv', index = False)