In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


pd.set_option('display.width', 400)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 200)

plt.style.use('default') # haciendo los graficos un poco mas bonitos en matplotlib
#plt.rcParams['figure.figsize'] = (20, 10)

sns.set(style="whitegrid") # seteando tipo de grid en seaborn

df = pd.read_csv('events_up_to_01062018.csv')
pd.options.mode.chained_assignment = None

  interactivity=interactivity, compiler=compiler, result=result)


Me quedo solo con las columnas que son de interés para este análisis

In [2]:
df_tiempos=  df[['timestamp','person']]
df_tiempos['timestamp'] = pd.to_datetime(df_tiempos['timestamp'])
df_tiempos= df_tiempos.sort_values(by=["timestamp"])
df_tiempos['just_date'] = df_tiempos['timestamp'].dt.date
df_tiempos.reset_index(drop = True, inplace = True)

Agrego la columna **'diff'** que me indica _diferencia de tiempo entre los distintos eventos temporales **por usuario**_

In [3]:
df_tiempos['diff'] = df_tiempos.groupby(['person'])['timestamp'].diff()

Paso la columna a string para detectar los valores "NaT" que indican el ingreso de un nuevo usuario 

In [4]:
df_tiempos['diff'] = df_tiempos['diff'].astype(str)
df_tiempos['new_user'] = df_tiempos['diff'] =='NaT'

Agrego la columna **'diff2'** que me indica la _diferencia entre eventos temporales_ (sin importar el usuario)

In [5]:
df_tiempos['diff2'] = (df_tiempos['timestamp'] - (df_tiempos['timestamp'].shift())) / np.timedelta64(1, 'h')

Reasigno la columna **'diff'** para volver a tener datos de tipo temporal y elimina los nulos

In [6]:
df_tiempos['diff'] = df_tiempos.groupby(['person'])['timestamp'].diff()

Lleno con 0s todos los nulos del data frame

In [7]:
df_tiempos=df_tiempos.fillna(0)

Identifico en la columna **'new_session_same_user'** los _comienzos de nuevas sesiones del mismo usuario_. Para ello se tiene en cuenta un umbral de 0.48 horas. Es decir, se considera que si el tiempo entre eventos es mayor a 0.48 horas, el evento corresponde a una nueva sesion del mismo usuario. En el informe adjunto se explica detalladamente la elección de este umbral.

In [8]:
df_tiempos['new_session_same_user'] = df_tiempos['diff2'] > 0.48

Identifico en la columna **'new_session_new_user'** los comienzos de _nuevas sesiones de un usuario nuevo._ Se tiene en cuenta el mismo umbral que antes. 

In [9]:
df_tiempos["diff"]=df_tiempos["diff"]/np.timedelta64(1, 'h')
df_tiempos["new_session_new_user"] = df_tiempos["diff"] > 0.48

Para contabilizar las sesiones nuevas (que pueden corresponder al mismo usuario o a un nuevo usuario) se realiza la operacion OR entre las columnas con datos booleanos calculadas previamente.

In [10]:
df_tiempos["new_session"]=df_tiempos["new_user"]|df_tiempos["new_session_same_user"]|df_tiempos["new_session_new_user"]

Enumero las sesiones para luego separarlas por agrupación

In [11]:
df_tiempos['sessionid'] = df_tiempos['new_session'].cumsum()

Como pusimos un umbral de 0.48 horas para la finalizacion de las sesiones, elimino de la columa **'diff'** los valores que superen ese valor (Me quedo solo con los False de la columna new_sesion). Estos valores se corresponderan al primer 'diff' de cada sesion, que debería ser siempre nulo.

In [12]:
df_tiempos=df_tiempos[df_tiempos.new_session==False]

Ahora si agrupo por **sessionid** y obtengo la información buscada

In [13]:
df_tiempos.head()

Unnamed: 0,timestamp,person,just_date,diff,new_user,diff2,new_session_same_user,new_session_new_user,new_session,sessionid
1,2018-01-01 08:09:31,0f4e2a4b,2018-01-01,0.0,False,0.0,False,False,False,1
2,2018-01-01 08:09:31,0f4e2a4b,2018-01-01,0.0,False,0.0,False,False,False,1
3,2018-01-01 08:09:44,0f4e2a4b,2018-01-01,0.003611,False,0.003611,False,False,False,1
5,2018-01-01 08:45:29,0f4e2a4b,2018-01-01,0.0,False,0.0,False,False,False,2
6,2018-01-01 08:45:29,0f4e2a4b,2018-01-01,0.0,False,0.0,False,False,False,2


In [14]:
tiempos_sesiones=df_tiempos.groupby(["sessionid"]).agg({'diff':'sum','person':'first'}).reset_index()

In [22]:
tiempos_sesiones['count'] = 1
tiempos_sesiones.head()

Unnamed: 0,sessionid,diff,person,count
0,1,0.003611,0f4e2a4b,1
1,2,0.070556,0f4e2a4b,1
2,3,0.0,7c7e0de9,1
3,4,1.136111,8af11dbc,1
4,5,0.024167,23252ece,1


In [23]:
session_mean = tiempos_sesiones.groupby(["person"]).agg({'diff':['mean','max'], 'count':'sum'}).reset_index()

In [24]:
session_mean.head()

Unnamed: 0_level_0,person,diff,diff,count
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,max,sum
0,0008ed71,0.001389,0.001389,1
1,00091926,0.14276,0.501667,31
2,00091a7a,0.171111,0.171111,1
3,000ba417,0.128333,0.464444,13
4,000c79fe,0.184444,0.333889,3


In [25]:
session_mean = df['person'].drop_duplicates().to_frame().merge(session_mean , left_on='person', right_on='person', how='left')
session_mean.columns= ['person' , 'diff_mean' , 'diff_max' , 'session_count']

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [26]:
session_mean.head()

Unnamed: 0,person,diff_mean,diff_max,session_count
0,4886f805,0.058056,0.058056,1.0
1,ad93850f,0.095324,0.4675,6.0
2,0297fc1e,0.120095,0.678889,70.0
3,2d681dd8,0.170093,0.376389,3.0
4,cccea85e,0.15207,0.847778,42.0


In [64]:
session_mean.to_csv('diff.csv', index=False)

## XgBoost

In [28]:
df_labels= pd.read_csv('labels_training_set.csv', low_memory = False)

In [29]:
df_train = df_labels.merge(session_mean , left_on='person', right_on='person', how='left')

In [30]:
df_train.head()

Unnamed: 0,person,label,diff_mean,diff_max,session_count
0,0566e9c1,0,0.059618,0.247778,8.0
1,6ec7ee77,0,,,
2,abe7a2fb,0,0.095119,0.346944,21.0
3,34728364,0,0.043148,0.080556,3.0
4,87ed62de,0,0.155278,0.172222,2.0


In [32]:
X, y = df_train.iloc[:,2:],df_train.iloc[:,1]
X.head()

Unnamed: 0,diff_mean,diff_max,session_count
0,0.059618,0.247778,8.0
1,,,
2,0.095119,0.346944,21.0
3,0.043148,0.080556,3.0
4,0.155278,0.172222,2.0


In [33]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [34]:
data_dmatrix = xgb.DMatrix(data=X,label=y)

In [35]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 4,
                subsample = 0.8,
                gamma = 1,
                n_estimators = 1500) #1200 era la mejro

In [36]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)


In [37]:
xg_reg.fit(X,y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=1, learning_rate=0.1, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=1500,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

In [38]:
preds = xg_reg.predict(X_test)

In [39]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 0.208400


In [40]:
persons = df_labels['person']
df_predict = session_mean.loc[~session_mean.person.isin(persons)]
ppl_to_predict = (df.loc[~df['person'].isin(persons)])['person'].to_frame()
ppl_to_predict = ppl_to_predict.drop_duplicates('person')
df_predict = ppl_to_predict.merge(df_predict, left_on = 'person' , right_on = 'person', how='left')
X_predict = df_predict.drop(['person'], axis=1)

In [41]:
entrie = xg_reg.predict(X_predict)

In [42]:
seriesita = pd.Series(entrie)

In [43]:
df_entrie = df_predict['person'].to_frame()
df_entrie['label'] = seriesita

In [44]:
df_entrie.head()

Unnamed: 0,person,label
0,4886f805,0.030504
1,0297fc1e,0.090241
2,2d681dd8,0.066517
3,cccea85e,0.143501
4,4c8a8b93,0.062313


In [45]:
df_entrie = df_entrie.fillna(0)
df_entrie.head()

Unnamed: 0,person,label
0,4886f805,0.030504
1,0297fc1e,0.090241
2,2d681dd8,0.066517
3,cccea85e,0.143501
4,4c8a8b93,0.062313


In [54]:
num = df_entrie._get_numeric_data()
num[num < 0] = 0

In [55]:
df_entrie.to_csv(path_or_buf = 'diff.csv', index = False)

In [47]:
df_entrie.shape

(19415, 2)

In [48]:
df_entrie['label'].nlargest(1)

16141    0.544922
Name: label, dtype: float32

## Scoring

Para evaluar usen esta medida que me da valores muy parecidos a los de kaggle, para hacer las predicciones usen el otro

In [49]:
my_classifier1 = xgb.XGBClassifier(objective ='reg:linear', 
                colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 6,
                subsample = 0.8,
                gamma = 1,
                n_estimators = 10)

In [50]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [51]:
my_classifier1.fit(X_train,y_train)
entrie = my_classifier1.predict_proba(X_predict)

In [52]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,  my_classifier1.predict_proba(X_test)[:,1])

0.6164474766439961

 # Random Forest feature importance
    -TEST THIS

In [59]:
X = X.fillna(0)

In [60]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
#Load boston housing dataset as an example

names = X.columns
rf = RandomForestRegressor()
rf.fit(X, y)
print ("Features sorted by their score:")
zipped = zip(map(lambda x: round(x, 4), rf.feature_importances_), names)
feature = sorted(zipped, key=lambda x: x[1])



Features sorted by their score:


In [61]:
X.shape

(19414, 3)

In [62]:
feat_importance = pd.DataFrame(feature, columns=['importance', 'feature'])
feat_importance

Unnamed: 0,importance,feature
0,0.4322,diff_max
1,0.4622,diff_mean
2,0.1056,session_count


In [118]:
feat_importance =feat_importance.loc[feat_importance['importance'].isin(feat_importance['importance'].nlargest(100))]
feats_servibles = feat_importance['feature'].tolist()
feats_servibles

['cantidad_vistos', 'conversion', 'diff', 'lead', 'searched', 'viewed']

In [119]:
df_top = df_top[feats_servibles]
df_top['person'] = subjects
df_top.head()

Unnamed: 0,cantidad_vistos,conversion,diff,lead,searched,viewed,person
0,4.0,,0.058056,,1.0,4.0,4886f805
1,20.0,,0.095324,,,15.0,ad93850f
2,404.0,,0.120095,1.0,2.0,26.0,0297fc1e
3,13.0,,0.170093,,1.0,7.0,2d681dd8
4,739.0,,0.15207,,1.0,94.0,cccea85e


In [121]:
feats_servibles

['cantidad_vistos', 'conversion', 'diff', 'lead', 'searched', 'viewed']

In [63]:
df

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-18 00:11:59,viewed product,4886f805,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,,,,,,
1,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,,,,,,,,,,,,
2,2018-05-18 00:11:16,viewed product,0297fc1e,,6888.0,iPhone 6S,Muito Bom,64GB,Prateado,,,,,,,,,,,,,,
3,2018-05-18 00:11:14,viewed product,2d681dd8,,11890.0,iPhone 7,Bom,128GB,Vermelho,,,,,,,,,,,,,,
4,2018-05-18 00:11:09,viewed product,cccea85e,,7517.0,LG G4 H818P,Excelente,32GB,Branco,,,,,,,,,,,,,,
5,2018-05-18 00:44:27,searched products,4c8a8b93,,,,,,,"10240,9987,10322,10085,9944,9931,13404,10154,1...",iPhone se,,,,,,,,,,,,
6,2018-05-18 00:44:14,viewed product,1b9f7cf6,,2831.0,iPhone 6,Bom,16GB,Dourado,,,,,,,,,,,,,,
7,2018-05-18 00:44:02,viewed product,29ebb414,,2845.0,iPhone 6 Plus,Bom,128GB,Cinza espacial,,,,,,,,,,,,,,
8,2018-05-18 00:43:59,viewed product,de8fe91b,,12548.0,Motorola Moto G5 Plus,Bom,32GB,Platinum,,,,,,,,,,,,,,
9,2018-05-18 00:43:40,ad campaign hit,45baf068,/,,,,,,,,,google,,,,,,,,,,
