In [7]:
'''
Nuevo enfoque del problema: 

Vamos a intentar clusterizar las sesiones. Se va a tratar de transformar el dataset de tal forma que cada registro sea referente a una única sesión. 

Por cada sesión se intentará guardar la siguiente información: 
Fecha - hora - productos vistos - usuario - country - device type - pagetype 

Luego para las sesiones de test se buscaran sesiones similares y se tratará de recomendar los productos que han comprado sesiones similares a la actual hasta llegar al total de 5.
'''

'\nNuevo enfoque del problema: \n\nVamos a intentar clusterizar las sesiones. Se va a tratar de transformar el dataset de tal forma que cada registro sea referente a una única sesión. \n\nPor cada sesión se intentará guardar la siguiente información: \n'

In [33]:
import pandas as pd 
import numpy as np
import os

In [34]:
products = pd.read_pickle('./raw/products.pkl')
train = pd.read_csv('./raw/train.csv')
test = pd.read_csv('./raw/test.csv')

In [35]:
train.head()

Unnamed: 0,session_id,date,timestamp_local,add_to_cart,user_id,country,partnumber,device_type,pagetype
0,64,2024-06-06,2024-06-06 16:43:17.389,0,,29,14327,1,24.0
1,117,2024-06-08,2024-06-08 15:11:02.782,0,,57,38422,1,24.0
2,117,2024-06-08,2024-06-08 15:11:44.797,0,,57,19763,1,24.0
3,579,2024-06-05,2024-06-05 19:24:48.397,0,,29,30253,1,24.0
4,1220,2024-06-04,2024-06-04 08:21:13.476,0,480729.0,25,1592,1,24.0


In [4]:
products.head()

Unnamed: 0,discount,embedding,partnumber,color_id,cod_section,family
0,0,"[-0.13401361, -0.1200429, -0.016117405, -0.167...",32776,85,4.0,73
1,0,"[-0.0949274, -0.107294075, -0.16559914, -0.174...",41431,135,4.0,73
2,0,"[-0.12904441, -0.07724628, -0.09799071, -0.164...",39419,339,4.0,73
3,1,"[-0.12783332, -0.133868, -0.10101265, -0.18888...",36087,135,4.0,73
4,1,"[-0.14092924, -0.1258284, -0.10809927, -0.1765...",34132,3,4.0,73


In [36]:
train.isnull().sum()

session_id                0
date                      0
timestamp_local           0
add_to_cart               0
user_id            39694715
country                   0
partnumber                0
device_type               0
pagetype               1197
dtype: int64

In [37]:
train['user_id'] = train['user_id'].fillna('invitado')
train['user_id'] = train['user_id'].apply(lambda x: 'registrado' if isinstance(x, float) else x)
train['user_id'].head(5)

0      invitado
1      invitado
2      invitado
3      invitado
4    registrado
Name: user_id, dtype: object

In [38]:
train['pagetype'] = train['pagetype'].fillna(24)
train['pagetype'] = train['pagetype'].apply(lambda x: x if x in [24, 8, 6] else 24)

In [39]:
train.isnull().sum()

session_id         0
date               0
timestamp_local    0
add_to_cart        0
user_id            0
country            0
partnumber         0
device_type        0
pagetype           0
dtype: int64

In [40]:
sessions_with_1 = train[train['add_to_cart'] == 1]['session_id'].unique()

#guardamos las sesiones que acabaron en compra en un df:
df_keep = train[train['session_id'].isin(sessions_with_1)] 
df_keep

Unnamed: 0,session_id,date,timestamp_local,add_to_cart,user_id,country,partnumber,device_type,pagetype
7,1222,2024-06-13,2024-06-13 06:22:03.307,0,invitado,57,10763,1,24.0
8,1222,2024-06-13,2024-06-13 06:17:33.149,0,invitado,57,14731,1,24.0
9,1222,2024-06-13,2024-06-13 06:17:57.411,0,invitado,57,5249,1,24.0
10,1222,2024-06-13,2024-06-13 06:17:32.601,0,invitado,57,33017,1,24.0
11,1222,2024-06-13,2024-06-13 06:19:09.272,1,invitado,57,5249,1,24.0
...,...,...,...,...,...,...,...,...,...
46551372,5164747,2024-06-06,2024-06-06 07:46:34.895,0,invitado,29,24522,3,24.0
46551373,5164747,2024-06-06,2024-06-06 07:45:49.484,1,invitado,29,2814,3,24.0
46551374,5164747,2024-06-06,2024-06-06 07:48:29.248,0,invitado,29,28265,3,24.0
46551375,5164747,2024-06-06,2024-06-06 07:48:42.565,0,invitado,29,42848,3,24.0


In [41]:
#Ahora agrupar por: session, dia comienzo, hora comienzo sesion, 
grouped_df = df_keep.groupby("session_id").agg(
    {
        "date": "first",
        "timestamp_local": "first",
        "user_id": "first",
        "country": "first",
        "device_type": "first",
        "pagetype": "first",
        "partnumber": lambda x: list(x),  # Crear lista de productos visitados
    }
).reset_index()
grouped_df

Unnamed: 0,session_id,date,timestamp_local,user_id,country,device_type,pagetype,partnumber
0,10,2024-06-14,2024-06-14 11:00:10.618,invitado,57,1,24.0,"[35936, 34417, 35936]"
1,24,2024-06-14,2024-06-14 20:43:08.786,registrado,25,1,24.0,"[11798, 9575, 22095, 32424, 37885, 40390, 1151..."
2,26,2024-06-02,2024-06-02 19:45:07.368,invitado,29,1,24.0,"[12624, 32425, 10857, 13904, 13904, 13904]"
3,31,2024-06-05,2024-06-05 22:10:58.880,invitado,29,1,24.0,"[16304, 17377, 37574, 29487, 7117, 25193, 2948..."
4,36,2024-06-02,2024-06-02 19:22:40.999,registrado,25,1,24.0,"[23319, 5620, 39156, 17892]"
...,...,...,...,...,...,...,...,...
1038043,5171824,2024-06-10,2024-06-10 19:52:59.899,invitado,34,3,24.0,"[5717, 15259, 21035, 29046]"
1038044,5171828,2024-06-12,2024-06-12 17:45:03.499,invitado,29,1,24.0,[7115]
1038045,5171837,2024-06-02,2024-06-02 22:56:25.516,invitado,34,1,24.0,"[13628, 31671, 24572, 40543, 36634, 22834, 258..."
1038046,5171842,2024-06-10,2024-06-10 22:51:34.851,invitado,57,1,24.0,"[17547, 17337, 28502, 43622, 9466, 7266, 28502..."


In [42]:
grouped_df['num_productos_visitados'] = grouped_df['partnumber'].apply(lambda x: len(x))

In [43]:
grouped_df['num_productos_visitados'].sort_values(ascending = False)

300972     3199
101474     2459
646005     2426
41081      2022
39843      2017
           ... 
461116        1
461121        1
461125        1
461127        1
1038047       1
Name: num_productos_visitados, Length: 1038048, dtype: int64

In [44]:
#quitar outliers con la media y la varianza: 
mean = grouped_df['num_productos_visitados'].mean()
std_dev = grouped_df['num_productos_visitados'].std()

lower_bound = mean - 1.5 * std_dev
upper_bound = mean + 1.5 * std_dev

filtered_df = grouped_df[(grouped_df['num_productos_visitados'] >= lower_bound) & (grouped_df['num_productos_visitados'] <= 10)]
#dataframe sin outliers: 
filtered_df

Unnamed: 0,session_id,date,timestamp_local,user_id,country,device_type,pagetype,partnumber,num_productos_visitados
0,10,2024-06-14,2024-06-14 11:00:10.618,invitado,57,1,24.0,"[35936, 34417, 35936]",3
2,26,2024-06-02,2024-06-02 19:45:07.368,invitado,29,1,24.0,"[12624, 32425, 10857, 13904, 13904, 13904]",6
4,36,2024-06-02,2024-06-02 19:22:40.999,registrado,25,1,24.0,"[23319, 5620, 39156, 17892]",4
5,40,2024-06-05,2024-06-05 09:38:25.753,invitado,57,1,24.0,"[11490, 39074, 4791, 9363, 13749]",5
6,44,2024-06-13,2024-06-13 02:57:35.535,invitado,34,1,24.0,[12383],1
...,...,...,...,...,...,...,...,...,...
1038042,5171817,2024-06-01,2024-06-02 01:21:41.374,invitado,34,1,24.0,"[26665, 26439, 26582]",3
1038043,5171824,2024-06-10,2024-06-10 19:52:59.899,invitado,34,3,24.0,"[5717, 15259, 21035, 29046]",4
1038044,5171828,2024-06-12,2024-06-12 17:45:03.499,invitado,29,1,24.0,[7115],1
1038046,5171842,2024-06-10,2024-06-10 22:51:34.851,invitado,57,1,24.0,"[17547, 17337, 28502, 43622, 9466, 7266, 28502...",8


In [47]:
# Expandir la columna 'partnumber' en columnas individuales
products_df = filtered_df["partnumber"].apply(pd.Series)
products_df.columns = [f"product_{i+1}" for i in products_df.columns] 

In [48]:
products_df

Unnamed: 0,product_1,product_2,product_3,product_4,product_5,product_6,product_7,product_8,product_9,product_10
0,35936.0,34417.0,35936.0,,,,,,,
2,12624.0,32425.0,10857.0,13904.0,13904.0,13904.0,,,,
4,23319.0,5620.0,39156.0,17892.0,,,,,,
5,11490.0,39074.0,4791.0,9363.0,13749.0,,,,,
6,12383.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
1038042,26665.0,26439.0,26582.0,,,,,,,
1038043,5717.0,15259.0,21035.0,29046.0,,,,,,
1038044,7115.0,,,,,,,,,
1038046,17547.0,17337.0,28502.0,43622.0,9466.0,7266.0,28502.0,37237.0,,


In [49]:
products_df.fillna(0, inplace = True)

Unnamed: 0,session_id,date,timestamp_local,user_id,country,device_type,pagetype,partnumber,num_productos_visitados
0,10,2024-06-14,2024-06-14 11:00:10.618,invitado,57,1,24.0,"[35936, 34417, 35936]",3
2,26,2024-06-02,2024-06-02 19:45:07.368,invitado,29,1,24.0,"[12624, 32425, 10857, 13904, 13904, 13904]",6
4,36,2024-06-02,2024-06-02 19:22:40.999,registrado,25,1,24.0,"[23319, 5620, 39156, 17892]",4
5,40,2024-06-05,2024-06-05 09:38:25.753,invitado,57,1,24.0,"[11490, 39074, 4791, 9363, 13749]",5
6,44,2024-06-13,2024-06-13 02:57:35.535,invitado,34,1,24.0,[12383],1
...,...,...,...,...,...,...,...,...,...
1038042,5171817,2024-06-01,2024-06-02 01:21:41.374,invitado,34,1,24.0,"[26665, 26439, 26582]",3
1038043,5171824,2024-06-10,2024-06-10 19:52:59.899,invitado,34,3,24.0,"[5717, 15259, 21035, 29046]",4
1038044,5171828,2024-06-12,2024-06-12 17:45:03.499,invitado,29,1,24.0,[7115],1
1038046,5171842,2024-06-10,2024-06-10 22:51:34.851,invitado,57,1,24.0,"[17547, 17337, 28502, 43622, 9466, 7266, 28502...",8


In [53]:
 filtered_df = pd.concat([filtered_df.drop(columns=["partnumber"]), products_df], axis=1)
 filtered_df.reset_index(inplace=True,drop = True)

In [54]:
filtered_df

Unnamed: 0,session_id,date,timestamp_local,user_id,country,device_type,pagetype,num_productos_visitados,product_1,product_2,product_3,product_4,product_5,product_6,product_7,product_8,product_9,product_10
0,10,2024-06-14,2024-06-14 11:00:10.618,invitado,57,1,24.0,3,35936.0,34417.0,35936.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,26,2024-06-02,2024-06-02 19:45:07.368,invitado,29,1,24.0,6,12624.0,32425.0,10857.0,13904.0,13904.0,13904.0,0.0,0.0,0.0,0.0
2,36,2024-06-02,2024-06-02 19:22:40.999,registrado,25,1,24.0,4,23319.0,5620.0,39156.0,17892.0,0.0,0.0,0.0,0.0,0.0,0.0
3,40,2024-06-05,2024-06-05 09:38:25.753,invitado,57,1,24.0,5,11490.0,39074.0,4791.0,9363.0,13749.0,0.0,0.0,0.0,0.0,0.0
4,44,2024-06-13,2024-06-13 02:57:35.535,invitado,34,1,24.0,1,12383.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
484359,5171817,2024-06-01,2024-06-02 01:21:41.374,invitado,34,1,24.0,3,26665.0,26439.0,26582.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
484360,5171824,2024-06-10,2024-06-10 19:52:59.899,invitado,34,3,24.0,4,5717.0,15259.0,21035.0,29046.0,0.0,0.0,0.0,0.0,0.0,0.0
484361,5171828,2024-06-12,2024-06-12 17:45:03.499,invitado,29,1,24.0,1,7115.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
484362,5171842,2024-06-10,2024-06-10 22:51:34.851,invitado,57,1,24.0,8,17547.0,17337.0,28502.0,43622.0,9466.0,7266.0,28502.0,37237.0,0.0,0.0


In [55]:
filtered_df['hour'] = pd.to_datetime(filtered_df['timestamp_local']).dt.hour
filtered_df['weekday'] = pd.to_datetime(filtered_df['timestamp_local']).dt.weekday
filtered_df.head()

Unnamed: 0,session_id,date,timestamp_local,user_id,country,device_type,pagetype,num_productos_visitados,product_1,product_2,product_3,product_4,product_5,product_6,product_7,product_8,product_9,product_10,hour,weekday
0,10,2024-06-14,2024-06-14 11:00:10.618,invitado,57,1,24.0,3,35936.0,34417.0,35936.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11,4
1,26,2024-06-02,2024-06-02 19:45:07.368,invitado,29,1,24.0,6,12624.0,32425.0,10857.0,13904.0,13904.0,13904.0,0.0,0.0,0.0,0.0,19,6
2,36,2024-06-02,2024-06-02 19:22:40.999,registrado,25,1,24.0,4,23319.0,5620.0,39156.0,17892.0,0.0,0.0,0.0,0.0,0.0,0.0,19,6
3,40,2024-06-05,2024-06-05 09:38:25.753,invitado,57,1,24.0,5,11490.0,39074.0,4791.0,9363.0,13749.0,0.0,0.0,0.0,0.0,0.0,9,2
4,44,2024-06-13,2024-06-13 02:57:35.535,invitado,34,1,24.0,1,12383.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,3


In [56]:
filtered_df = filtered_df.drop(columns=['timestamp_local','date'])

In [57]:
def convertir_dia(hour):
    if 0 <= hour < 9:
        return 'noche'
    elif 9 <= hour < 14:
        return 'mañana'
    else:
        return 'tarde'


#funcion para los fines de semana 
def convertir_semana(weekday):
    if 1 <= weekday <= 5:
        return 'weekday'
    else:
        return 'weekend'

filtered_df['hour'] = filtered_df['hour'].apply(lambda x: convertir_dia(x))
filtered_df['weekday'] = filtered_df['weekday'].apply(lambda x: convertir_semana(x))

In [58]:
#Normalización y onehot encoding 
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 484364 entries, 0 to 484363
Data columns (total 18 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   session_id               484364 non-null  int64  
 1   user_id                  484364 non-null  object 
 2   country                  484364 non-null  int64  
 3   device_type              484364 non-null  int64  
 4   pagetype                 484364 non-null  float64
 5   num_productos_visitados  484364 non-null  int64  
 6   product_1                484364 non-null  float64
 7   product_2                484364 non-null  float64
 8   product_3                484364 non-null  float64
 9   product_4                484364 non-null  float64
 10  product_5                484364 non-null  float64
 11  product_6                484364 non-null  float64
 12  product_7                484364 non-null  float64
 13  product_8                484364 non-null  float64
 14  prod

In [59]:
categorical =[var for var in filtered_df.columns if filtered_df[var].dtype=='O']
print(categorical)
new_categorical = ['country','device_type','pagetype']
filtered_df[new_categorical] = filtered_df[new_categorical].apply(lambda x: x.astype('object'))


['user_id', 'hour', 'weekday']


In [60]:
numerical = [var for var in filtered_df.columns if filtered_df[var].dtype != 'O']
print(numerical)

['session_id', 'num_productos_visitados', 'product_1', 'product_2', 'product_3', 'product_4', 'product_5', 'product_6', 'product_7', 'product_8', 'product_9', 'product_10']


In [61]:
from sklearn.preprocessing import OneHotEncoder, RobustScaler
encoder = OneHotEncoder()
encoder_cat = encoder.fit_transform(filtered_df[categorical])
encoded_df = pd.DataFrame(encoder_cat.toarray(),columns=encoder.get_feature_names_out().tolist())
encoded_df

Unnamed: 0,user_id_invitado,user_id_registrado,hour_mañana,hour_noche,hour_tarde,weekday_weekday,weekday_weekend
0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...
484359,1.0,0.0,0.0,1.0,0.0,0.0,1.0
484360,1.0,0.0,0.0,0.0,1.0,0.0,1.0
484361,1.0,0.0,0.0,0.0,1.0,1.0,0.0
484362,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [62]:
#normalizer:
scaler = RobustScaler()
encoder_num = scaler.fit_transform(filtered_df[numerical])
encoded_num = pd.DataFrame(encoder_num, columns=numerical)
encoded_num.shape
print(encoded_num.head(10))

   session_id  num_productos_visitados  product_1  product_2  product_3  \
0   -0.998688                     -0.2   0.668809   0.646157   0.835942   
1   -0.998681                      0.4  -0.424778   0.571648  -0.070517   
2   -0.998677                      0.0   0.076934  -0.430971   0.952326   
3   -0.998676                      0.2  -0.477975   0.820348  -0.289768   
4   -0.998674                     -0.6  -0.436084  -0.641182  -0.462934   
5   -0.998674                     -0.2  -0.442886   0.380101   0.382911   
6   -0.998665                      0.0  -0.886241  -0.385599   0.734124   
7   -0.998651                      0.4  -0.657363   0.680756   0.101782   
8   -0.998645                     -0.6  -0.375944  -0.641182  -0.462934   
9   -0.998645                      0.0   0.111742   0.426819   0.898001   

   product_4  product_5  product_6  product_7  product_8  product_9  \
0  -0.250864   0.000000   0.000000        0.0        0.0        0.0   
1   0.314362   0.667723   0.8961

In [63]:
encoded_num

Unnamed: 0,session_id,num_productos_visitados,product_1,product_2,product_3,product_4,product_5,product_6,product_7,product_8,product_9,product_10
0,-0.998688,-0.2,0.668809,0.646157,0.835942,-0.250864,0.000000,0.000000,0.000000,0.0,0.0,0.0
1,-0.998681,0.4,-0.424778,0.571648,-0.070517,0.314362,0.667723,0.896165,0.000000,0.0,0.0,0.0
2,-0.998677,0.0,0.076934,-0.430971,0.952326,0.476483,0.000000,0.000000,0.000000,0.0,0.0,0.0
3,-0.998676,0.2,-0.477975,0.820348,-0.289768,0.129761,0.660279,0.000000,0.000000,0.0,0.0,0.0
4,-0.998674,-0.6,-0.436084,-0.641182,-0.462934,-0.250864,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
484359,1.000280,-0.2,0.233898,0.347746,0.497849,-0.250864,0.000000,0.000000,0.000000,0.0,0.0,0.0
484360,1.000283,0.0,-0.748792,-0.070432,0.297358,0.929916,0.000000,0.000000,0.000000,0.0,0.0,0.0
484361,1.000284,-0.6,-0.683211,-0.641182,-0.462934,-0.250864,0.000000,0.000000,0.000000,0.0,0.0,0.0
484362,1.000290,0.8,-0.193836,0.007294,0.567246,1.522460,0.454593,0.468321,4.166198,37237.0,0.0,0.0


In [64]:
filtered_df['session_id']

0              10
1              26
2              36
3              40
4              44
           ...   
484359    5171817
484360    5171824
484361    5171828
484362    5171842
484363    5171847
Name: session_id, Length: 484364, dtype: int64

In [65]:
final_train = pd.concat([filtered_df['session_id'],encoded_num,encoded_df], axis = 1)
final_train.set_index(filtered_df['session_id'], inplace=True)
final_train.drop(columns=['session_id'], inplace = True)
final_train


Unnamed: 0_level_0,num_productos_visitados,product_1,product_2,product_3,product_4,product_5,product_6,product_7,product_8,product_9,product_10,user_id_invitado,user_id_registrado,hour_mañana,hour_noche,hour_tarde,weekday_weekday,weekday_weekend
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
10,-0.2,0.668809,0.646157,0.835942,-0.250864,0.000000,0.000000,0.000000,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
26,0.4,-0.424778,0.571648,-0.070517,0.314362,0.667723,0.896165,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
36,0.0,0.076934,-0.430971,0.952326,0.476483,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
40,0.2,-0.477975,0.820348,-0.289768,0.129761,0.660279,0.000000,0.000000,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
44,-0.6,-0.436084,-0.641182,-0.462934,-0.250864,0.000000,0.000000,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5171817,-0.2,0.233898,0.347746,0.497849,-0.250864,0.000000,0.000000,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
5171824,0.0,-0.748792,-0.070432,0.297358,0.929916,0.000000,0.000000,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
5171828,-0.6,-0.683211,-0.641182,-0.462934,-0.250864,0.000000,0.000000,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
5171842,0.8,-0.193836,0.007294,0.567246,1.522460,0.454593,0.468321,4.166198,37237.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [66]:
final_train.to_parquet('checkpoint1-KNN_train.parquet', engine='pyarrow', compression='snappy')