In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
dfE = pd.read_csv('/home/soledad-escobar/Descargas/Orga_datos/fiuba-trocafone-tp2-final-set/events_up_to_01062018.csv')
dfT = pd.read_csv('/home/soledad-escobar/Descargas/Orga_datos/fiuba-trocafone-tp2-final-set/labels_training_set.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
#OHE a columna event
dfD = dfE[['person', 'event']]
dfD = pd.get_dummies(dfD, columns=['event']).groupby('person').sum()

In [5]:
#le agrego las features de event a las labels
dfTD = dfT.sort_values('person').merge(dfD, on='person')

In [6]:
#Agrego columna con ultima visita al sitio
temp = dfE.groupby('person')['timestamp'].max().to_frame()
dfTDb = dfTD.merge(temp, on='person')
dfTDb['month'] = dfTDb['timestamp'].str[5:7]
dfTDb['day'] = dfTDb['timestamp'].str[8:10]
dfTDb[['day','month']] = dfTDb[['day','month']].apply(pd.to_numeric)
dfTDb['last_visit'] = dfTDb['month']*30 + dfTDb['day']
dfTDb = dfTDb.drop(['timestamp','month','day'], axis=1)

In [9]:
#Agrego columna con ultima compra
temp = dfE.loc[dfE['event'] == 'conversion'].groupby('person')['timestamp'].max().to_frame()
dfTDc = dfTDb.merge(temp, how='left', on='person')
dfTDc = dfTDc.fillna('000000000000000000')
dfTDc['month'] = dfTDc['timestamp'].str[5:7]
dfTDc['day'] = dfTDc['timestamp'].str[8:10]
dfTDc[['day','month']] = dfTDc[['day','month']].apply(pd.to_numeric)
dfTDc['last_conversion'] = dfTDc['month']*30 + dfTDc['day']
dfTDc = dfTDc.drop(['timestamp','month','day'], axis=1)

# Gradient Boosting Classifier

In [8]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

df_y = dfTDc['label']
df_X = dfTDc.drop(['person', 'label'], axis=1)

GBC = GradientBoostingClassifier(random_state=23,learning_rate=0.06,subsample=0.75, min_samples_split=8)
scores = cross_val_score(GBC , df_X, df_y, scoring="roc_auc", cv=5)
scores.mean()

0.8545760253915411

# Agrego nuevos features

## Analizo las fechas y los horarios 

In [11]:
# Agrego columnas para separar los meses y la hora
datos = dfE.loc[:,['timestamp', 'event', 'person']]
datos['date_time'] = pd.to_datetime(datos['timestamp'], format='%Y%m%d %H:%M:%S.%f')
datos['mes'] = datos['date_time'].dt.month
datos['hora'] = datos['date_time'].dt.hour

In [13]:
#Agrego columnas para ver el momento del dia en el que se generaron las conversiones
features = datos.loc[ datos['event'] == 'conversion', ['person', 'hora']]
features['comp_mañana'] = features['hora'].apply(lambda x: 1 if ((x >= 6) & (x < 12)) else 0)
features['comp_tarde'] = features['hora'].apply(lambda x: 1 if ((x >= 12) & (x <20)) else 0)
features['comp_noche'] = features['hora'].apply(lambda x: 1 if ((x >= 20) & (x <= 23)) else 0)
features['comp_madrugada'] = features['hora'].apply(lambda x: 1 if ((x >= 0) & (x < 6)) else 0)
features = features.drop('hora', axis = 'columns').groupby('person').sum()

In [14]:
# Hago un join con los features anteriores
df_feat = pd.merge(dfTDc, features, on = 'person', how = 'left').fillna(0)
# Agrego la cantidad de eventos que generaron las personas
df_feat['total_events'] = (np.sum(df_feat.iloc[:,2:13], axis = 'columns'))

# XGBoost Classifier

In [15]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

y = df_feat['label']
X = df_feat.drop(['person', 'label'], axis = 'columns')

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size = 0.2)

In [37]:
XGB1 = xgb.XGBClassifier(objective ='binary:logistic', 
                colsample_bytree = 0.3, learning_rate = 0.05,
                max_depth = 5, gamma = 4, n_estimators = 100).fit(X_train,y_train)

Realizo las predicciones y luego calculo el score

In [38]:
from sklearn.metrics import roc_auc_score
preds = XGB1.predict_proba(X_test)[:,1]
roc_auc_score(y_test, preds)

0.8426087259548369

In [39]:
scores = cross_val_score(XGB1 , X, y, scoring = "roc_auc", cv = 5)
scores.mean()

0.8537345254558832

## Agrego más features sobre marcas y modelos de los productos

Como se tienen muchos modelos diferentes pero corresponden a una cantidad acotada de marcas, trabajo sobre las marcas para crear nuevos features. Creo una columna por cada marca existente y para cada persona coloco el promedio de compra para esa marca.

In [42]:
#Creo columnas para las marcas existentes
compras_por_marcas = dfE.loc[datos['event'] == 'conversion', ['model', 'person']]
compras_por_marcas['Samsung_mean'] = compras_por_marcas['model'].apply(lambda x: 1 if ('samsung' in x.lower()) else 0)
compras_por_marcas['iPhone_mean'] = compras_por_marcas['model'].apply(lambda x: 1 if ('iphone' in x.lower()) else 0)
compras_por_marcas['Motorola_mean'] = compras_por_marcas['model'].apply(lambda x: 1 if ('moto' in x.lower()) else 0)
compras_por_marcas['Lenovo_mean'] = compras_por_marcas['model'].apply(lambda x: 1 if ('lenovo' in x.lower()) else 0)
compras_por_marcas['LG_mean'] = compras_por_marcas['model'].apply(lambda x: 1 if ('lg' in x.lower()) else 0)
compras_por_marcas['Sony_mean'] = compras_por_marcas['model'].apply(lambda x: 1 if ('sony' in x.lower()) else 0)
compras_por_marcas['Asus_mean'] = compras_por_marcas['model'].apply(lambda x: 1 if ('asus' in x.lower()) else 0)
compras_por_marcas['iPad_mean'] = compras_por_marcas['model'].apply(lambda x: 1 if ('iPad' in x.lower()) else 0)
compras_por_marcas['Quantum_mean'] = compras_por_marcas['model'].apply(lambda x: 1 if ('quantum' in x.lower()) else 0)
new_features = compras_por_marcas.groupby('person').mean()

In [43]:
# Hago join con los features anteriores 
new_features = pd.merge(df_feat, new_features, on = 'person', how = 'left').fillna(0)

## Vuelvo a correr XGBoost para ver si con los nuevos features mejoran las predicciones

In [52]:
y = new_features['label']
X = new_features.drop(['person', 'label'], axis = 'columns')
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size = 0.2)

In [53]:
XGB2 = xgb.XGBClassifier(objective ='binary:logistic', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, gamma = 5, n_estimators = 100).fit(X_train,y_train)

Calculo las predicciones y los score

In [54]:
predicciones = XGB2.predict_proba(X_test)[:,1]
roc_auc_score(y_test, predicciones)

0.8707324872097599

In [47]:
scores = cross_val_score(XGB2 , X, y, scoring = "roc_auc", cv = 10)
scores.mean()

0.8519079473712315