In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

In [56]:
dfData = pd.read_csv('events_up_to_01062018.csv')
dfTrain = pd.read_csv('labels_training_set.csv')
dfTest = pd.read_csv('trocafone_kaggle_test.csv')
dfSample = pd.read_csv('trocafone_kaggle_submit_sample_all_0.csv')

In [57]:
#Crea un df con todos los usuarios
dfTest['label'] = 0
dfUsers = pd.concat([dfTrain,dfTest])
dfTest = dfTest.drop('label', axis=1)

In [58]:
#Le asigna a cada usuario su cantidad de eventos
def pesoStorage(almacenamiento):
    if almacenamiento == '32GB':
        return 2.8
    elif almacenamiento == '16GB':
        return 2.2
    elif almacenamiento == '64GB':
        return 1.2
    elif almacenamiento == '128GB':
        return 1
    else:
        return 1
    
dfPA = dfData[['person', 'event', 'storage']]
dfPA['puntajeAlmacenamiento'] = dfPA['storage'].apply(pesoStorage)
dfPA = pd.get_dummies(dfPA, columns=['event'])
columnasEventos = dfPA.columns.values[5:]
for eventoColumna in columnasEventos:
    dfPA[eventoColumna] *= dfPA['puntajeAlmacenamiento']
    
dfPA = dfPA.drop(['storage', 'puntajeAlmacenamiento'], axis=1).groupby('person').sum()
dfUsers = dfUsers.sort_values('person').merge(dfPA, on='person')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [59]:
#Le asigna a cada usuario su cantidad de eventos dandole mas valor si son mas recientes
def pesoTiempo(time):
    mes = int(time[6])
    dia = int(time[8:10])
    if mes < 5:
        return 1
    return 10 + dia * 0.7
    
dfPE = dfData[['person', 'event', 'timestamp']]
dfPE['puntajeTiempo'] = dfPE['timestamp'].apply(pesoTiempo)
dfPE = pd.get_dummies(dfPE, columns=['event'])
columnasEventos = dfPE.columns.values[3:]
for eventoColumna in columnasEventos:
    dfPE[eventoColumna] *= dfPE['puntajeTiempo']
dfPE = dfPE.drop(['timestamp', 'puntajeTiempo'], axis=1).groupby('person').sum()
dfUsers = dfUsers.sort_values('person').merge(dfPE, on='person')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [60]:
#Le asigna a cada usuario la region en la que mas eventos tiene
regionesMasEventos = dfData['region'].value_counts().head(9).index.tolist()
def filtrarRegiones(region):
    if pd.isnull(region):
        return 'null'
    if region not in regionesMasEventos:
        return 'Other'
    return region
    
dfPE = dfData[['person', 'region']]
dfPE['regionAcotada'] = dfPE['region'].apply(filtrarRegiones)

lDictRegiones = []
temp = dfPE.drop('region', axis=1).groupby('person')
for name, group in temp:
    regiones = {'null' : 1}
    for row in group.itertuples():
        region = row.regionAcotada
        if region == 'null':
            continue
        if region in regiones:
            regiones[region] += 1
        else:
            regiones[region] = 1
        rMax = max(regiones, key=regiones.get)
    lDictRegiones.append({'person' : name, 'region' : rMax})

dfRegiones = pd.DataFrame(lDictRegiones)
dfRegiones = pd.get_dummies(dfRegiones, columns=['region'])
dfUsers = dfUsers.merge(dfRegiones, on='person')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [61]:
#Casteo dfUsers a int32
personas = dfUsers['person']
dfUsers.drop('person', axis=1, inplace=True)
dfUsers = dfUsers.astype('int32')
dfUsers['person'] = personas

In [62]:
#Agrego columna con ultima visita al sitio
temp = dfData.groupby('person')['timestamp'].max().to_frame()
dfUsers = dfUsers.merge(temp, on='person')
dfUsers['month'] = dfUsers['timestamp'].str[5:7]
dfUsers['day'] = dfUsers['timestamp'].str[8:10]
dfUsers[['day','month']] = dfUsers[['day','month']].apply(pd.to_numeric)
dfUsers['last_visit'] = dfUsers['month']*30 + dfUsers['day']
dfUsers = dfUsers.drop(['timestamp','month','day'], axis=1)

In [63]:
#Agrego columna con ultima compra
temp = dfData.loc[dfData['event'] == 'conversion'].groupby('person')['timestamp'].max().to_frame()
dfUsers = dfUsers.merge(temp, how='left', on='person')
dfUsers = dfUsers.fillna('000000000000000000')
dfUsers['month'] = dfUsers['timestamp'].str[5:7]
dfUsers['day'] = dfUsers['timestamp'].str[8:10]
dfUsers[['day','month']] = dfUsers[['day','month']].apply(pd.to_numeric)
dfUsers['last_conversion'] = dfUsers['month']*30 + dfUsers['day']
dfUsers = dfUsers.drop(['timestamp','month','day'], axis=1)

In [64]:
#Columna con relacion entre el uso del celular y de la computadora
def filtrarDispositivo(deviceType):
    if deviceType == 'Computer':
        return 1
    elif deviceType == 'Smartphone':
        return 0
    else:
        return 2
    
dfPE = dfData[['person', 'device_type']]
dfPE['dispositivo'] = dfPE['device_type'].apply(filtrarDispositivo)
dfPE = dfPE.loc[dfPE['dispositivo'] != 2].drop('device_type', axis=1).groupby('person').mean()
dfUsers = dfUsers.merge(dfPE, on='person', how='left').fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [65]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

df_y = dfTrain['label']
df_X = dfTrain.drop('label', axis=1).merge(dfUsers, on='person').drop(['person','label'], axis=1)

GBC = GradientBoostingClassifier(random_state=23)
scores = cross_val_score(GBC , df_X, df_y, scoring="roc_auc", cv=5)
scores.mean()

0.8696929506356836

El score anterior era de 0.871, por lo que agregarle la feature de storage de los productos empeora el resultado

In [66]:
GBC2 = GradientBoostingClassifier(random_state=23)
GBC2.fit(df_X, df_y)
df_p = dfTest.merge(dfUsers, on='person').drop(['person','label'], axis=1)
temp = GBC2.predict_proba(df_p)
submit = []
for l in temp:
    submit.append(l[1])
dfTest['label'] = submit

In [67]:
dfTest.to_csv('trocafone_kaggle_submit.csv', index=False)

In [13]:
pd.read_csv('trocafone_kaggle_submit.csv')

Unnamed: 0,person,label
0,4886f805,0.006694
1,0297fc1e,0.038469
2,2d681dd8,0.009852
3,cccea85e,0.079501
4,4c8a8b93,0.033649
5,29ebb414,0.009196
6,3dc1950f,0.039878
7,8ea4c165,0.037185
8,d8cfe234,0.035438
9,d6bc64df,0.044293
