In [21]:
import pandas as pd
import numpy as np

In [22]:
# Input: CSV with the preprocessed interactions that this notebook loads.
PROCESSED_DS_PATH = './data/processed.csv'
# Output: location of the sessionized (and filtered) dataset CSV.
SESSIONIZED_DS_PATH = './data/sessionized.csv'

In [23]:
# Read the preprocessed user-item interaction log into memory.
df = pd.read_csv(filepath_or_buffer=PROCESSED_DS_PATH)

# Peek at the first rows as a quick sanity check of the columns.
df.head(n=5)

Unnamed: 0,userId,itemId,timestamp,y,relational_interval,set
0,0,0,0,0,[],train
1,1,1,5756,0,[],train
2,2,2,7709,0,[],train
3,3,3,8076,0,[],train
4,4,4,9970,0,[],train


In [24]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it must
# not be wrapped in print() - doing so only appends a stray "None" line to the output.
df.info()

# Fully identical rows would indicate duplicated interaction records.
print(f'Duplicate rows within dataset: {df.duplicated().sum()}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1583815 entries, 0 to 1583814
Data columns (total 6 columns):
 #   Column               Non-Null Count    Dtype 
---  ------               --------------    ----- 
 0   userId               1583815 non-null  int64 
 1   itemId               1583815 non-null  int64 
 2   timestamp            1583815 non-null  int64 
 3   y                    1583815 non-null  int64 
 4   relational_interval  1583815 non-null  object
 5   set                  1583815 non-null  object
dtypes: int64(4), object(2)
memory usage: 72.5+ MB
None
Duplicate rows within dataset: 0


In [25]:
# Drop the old 'set' column (sets are re-assigned later) and order the rows by user
# and time - for readability, it makes the eventual grouping into user profiles visible.
# The frame is rebound instead of mutated with inplace=True, and errors='ignore' is
# passed, so the cell can be re-run: a second execution no longer fails with a
# KeyError because 'set' has already been removed.
df = (
    df
    .drop(columns='set', errors='ignore')
    .sort_values(['userId', 'timestamp'])
)
df

Unnamed: 0,userId,itemId,timestamp,y,relational_interval
0,0,0,0,0,[]
7,0,7,15690,0,[]
16,0,15,38426,0,[]
19,0,5,45670,1,[0.0]
29,0,20,77618,0,[]
...,...,...,...,...,...
1577111,3622,579,10398504,1,"[711.3533333333334, 665.9480555555556, 644.134..."
1577128,3622,590,10398803,0,"[711.3533333333334, 665.9480555555556, 644.133..."
1577130,3622,591,10398827,1,"[711.2280555555556, 665.8227777777778, 644.008..."
1577138,3622,592,10398988,1,"[711.2280555555556, 665.8227777777778, 644.008..."


In [26]:
# Inactivity gap (in seconds) that closes a session: 1800 s = 30 minutes.
THRESHOLD = 1800
def sessionize(user_interactions, threshold=THRESHOLD):
    """
    Assign a session ID to every interaction of a single user.

    The interactions are ordered by 'timestamp'. Whenever the gap to the previous
    interaction is strictly greater than `threshold`, a new session begins. The ID
    has the form '<userId>_<n>', where n counts the user's sessions starting at 0.

    Args:
        user_interactions: Interactions of one user; must contain the columns
            'userId' and 'timestamp' -> Pandas DataFrame
        threshold: Maximum gap between two consecutive interactions of the same
            session, in the unit of 'timestamp' (seconds by default) -> int

    Returns:
        A chronologically sorted copy of the input with an additional 'session_id'
        column; the input frame itself is left unchanged -> Pandas DataFrame

    """

    # sort_values returns a new frame, so the caller's data is never modified
    ordered = user_interactions.sort_values('timestamp')

    # gap to the previous interaction; NaN for the very first row
    gaps = ordered['timestamp'].diff()

    # True marks the first row of a new session. The NaN of the first row compares
    # as False, so the running total starts at 0 and grows by one per session break.
    starts_new_session = gaps.gt(threshold)
    session_number = starts_new_session.cumsum()

    # prefix the per-user counter with the user ID to make the ID unique across users
    user_prefix = ordered['userId'].astype(str)
    ordered['session_id'] = user_prefix + '_' + session_number.astype(str)
    return ordered

In [27]:
# Sessionize every user separately and stack the results in ascending userId order.
# Iterating over the groups always hands the complete sub-frame - including the
# 'userId' column that sessionize() reads - to the function. groupby(...).apply()
# operating on the grouping column is deprecated since pandas 2.2, so relying on it
# would make sessionize() fail with a KeyError once the column is no longer passed.
per_user_frames = [sessionize(user_rows) for _, user_rows in df.groupby('userId')]

# pd.concat() rejects an empty list; fall back to sessionizing the (empty) frame directly.
sessionized_df = (
    pd.concat(per_user_frames, ignore_index=True)
    if per_user_frames
    else sessionize(df).reset_index(drop=True)
)

# Sessionizing only adds a column: no interaction may get lost or duplicated.
assert len(sessionized_df) == len(df), 'row count changed while sessionizing'

sessionized_df

Unnamed: 0,userId,itemId,timestamp,y,relational_interval,session_id
0,0,0,0,0,[],0_0
1,0,7,15690,0,[],0_1
2,0,15,38426,0,[],0_2
3,0,5,45670,1,[0.0],0_3
4,0,20,77618,0,[],0_4
...,...,...,...,...,...,...
1583810,3622,579,10398504,1,"[711.3533333333334, 665.9480555555556, 644.134...",3622_23
1583811,3622,590,10398803,0,"[711.3533333333334, 665.9480555555556, 644.133...",3622_23
1583812,3622,591,10398827,1,"[711.2280555555556, 665.8227777777778, 644.008...",3622_23
1583813,3622,592,10398988,1,"[711.2280555555556, 665.8227777777778, 644.008...",3622_23


In [28]:
# Filter out inactive users (n_sessions < 5), as we do not have enough data to model
# their behavior properly. These numbers are guided by general logic and boxplot
# results (which can vary for each dataset).

USER_SESSION_THRESHOLD = 5

session_cnt_per_user = sessionized_df.groupby('userId')['session_id'].nunique().reset_index(name='n_sessions')

# Select the actual userId values of the users to keep. After reset_index() the
# frame's index is just a positional 0..n-1 counter, so taking `.index` here only
# matched the user IDs by coincidence (while IDs are contiguous and start at 0).
keep_users_indices = session_cnt_per_user.loc[
    session_cnt_per_user['n_sessions'] >= USER_SESSION_THRESHOLD, 'userId'
]

filtered_df = sessionized_df[sessionized_df['userId'].isin(keep_users_indices)]

# Filter out sessions with few interactions (n_interactions < 5), as we do not have
# enough data to model the session properly.

SESSION_INTERACTION_THRESHOLD_MIN = 5

# One row per session with its number of interactions.
interaction_cnt_per_session = filtered_df.groupby('session_id').size().reset_index(name='n_interactions')

keep_session_indices = interaction_cnt_per_session[interaction_cnt_per_session['n_interactions'] >= SESSION_INTERACTION_THRESHOLD_MIN]

filtered_df = filtered_df[filtered_df['session_id'].isin(keep_session_indices['session_id'])]

# NOTE(review): dropping short sessions can leave a user with fewer than
# USER_SESSION_THRESHOLD sessions; the user filter is not re-applied - confirm this is intended.
filtered_df # result after the user-activity and minimum-session-length filters

Unnamed: 0,userId,itemId,timestamp,y,relational_interval,session_id
24,0,19,1269487,0,[],0_24
25,0,19,1269490,1,[0.0],0_24
26,0,50,1269609,1,[0.0],0_24
27,0,221,1269740,1,[0.0],0_24
28,0,51,1269858,1,[0.0],0_24
...,...,...,...,...,...,...
1583810,3622,579,10398504,1,"[711.3533333333334, 665.9480555555556, 644.134...",3622_23
1583811,3622,590,10398803,0,"[711.3533333333334, 665.9480555555556, 644.133...",3622_23
1583812,3622,591,10398827,1,"[711.2280555555556, 665.8227777777778, 644.008...",3622_23
1583813,3622,592,10398988,1,"[711.2280555555556, 665.8227777777778, 644.008...",3622_23


In [29]:
# Keep only users that still have at least USER_INTERACTION_THRESHOLD_MIN interactions.

USER_INTERACTION_THRESHOLD_MIN = 5

# one row per user with the number of interactions still present in filtered_df
interaction_cnt_per_user = (
    filtered_df
    .groupby('userId')
    .size()
    .reset_index(name='n_interactions')
)
keep_user_indices_min = interaction_cnt_per_user.loc[
    interaction_cnt_per_user['n_interactions'].ge(USER_INTERACTION_THRESHOLD_MIN)
]
filtered_df = filtered_df.loc[filtered_df['userId'].isin(keep_user_indices_min['userId'])]

filtered_df # filtered set after removing users with too few interactions


Unnamed: 0,userId,itemId,timestamp,y,relational_interval,session_id
24,0,19,1269487,0,[],0_24
25,0,19,1269490,1,[0.0],0_24
26,0,50,1269609,1,[0.0],0_24
27,0,221,1269740,1,[0.0],0_24
28,0,51,1269858,1,[0.0],0_24
...,...,...,...,...,...,...
1583810,3622,579,10398504,1,"[711.3533333333334, 665.9480555555556, 644.134...",3622_23
1583811,3622,590,10398803,0,"[711.3533333333334, 665.9480555555556, 644.133...",3622_23
1583812,3622,591,10398827,1,"[711.2280555555556, 665.8227777777778, 644.008...",3622_23
1583813,3622,592,10398988,1,"[711.2280555555556, 665.8227777777778, 644.008...",3622_23


In [30]:
# Remove extremely long sessions (too many interactions), since according to [c]
# they could have been generated by bots.
# interaction_cnt_per_session is not built in this cell; it has to exist already
# from the cell that filters sessions by their minimum length.

# distribution of the session lengths, shown to motivate the upper bound below
print(interaction_cnt_per_session['n_interactions'].describe())

# sessions with more than 100 interactions are discarded
SESSION_INTERACTION_THRESHOLD_MAX = 100

keep_session_indices_max = interaction_cnt_per_session.loc[
    interaction_cnt_per_session['n_interactions'].le(SESSION_INTERACTION_THRESHOLD_MAX)
]

filtered_df = filtered_df.loc[
    filtered_df['session_id'].isin(keep_session_indices_max['session_id'])
]

filtered_df # filtered set after removing overly long sessions


count    311509.000000
mean          5.084331
std           7.139391
min           1.000000
25%           1.000000
50%           2.000000
75%           6.000000
max         347.000000
Name: n_interactions, dtype: float64


Unnamed: 0,userId,itemId,timestamp,y,relational_interval,session_id
24,0,19,1269487,0,[],0_24
25,0,19,1269490,1,[0.0],0_24
26,0,50,1269609,1,[0.0],0_24
27,0,221,1269740,1,[0.0],0_24
28,0,51,1269858,1,[0.0],0_24
...,...,...,...,...,...,...
1583810,3622,579,10398504,1,"[711.3533333333334, 665.9480555555556, 644.134...",3622_23
1583811,3622,590,10398803,0,"[711.3533333333334, 665.9480555555556, 644.133...",3622_23
1583812,3622,591,10398827,1,"[711.2280555555556, 665.8227777777778, 644.008...",3622_23
1583813,3622,592,10398988,1,"[711.2280555555556, 665.8227777777778, 644.008...",3622_23


In [31]:
# store final dataset
PATH_SESSIONIZED_DATASET = './data/' # output directory, also used when saving the smaller sample dataset
# Write to the path declared once in the configuration cell (SESSIONIZED_DS_PATH)
# instead of rebuilding the same './data/sessionized.csv' string here, so the
# location is defined in a single place. index=False keeps the row index out of the CSV.
filtered_df.to_csv(SESSIONIZED_DS_PATH, sep=',', index=False)

In [32]:
# create smaller version of dataset for testing GRU4Rec model
SMALL_DATASET_N_USERS = 1000 # number of users to sample
SMALL_DATASET_SEED = 42 # fixed seed so every run selects the same users

unique_user_ids = filtered_df['userId'].unique()

# A seeded generator makes the sample reproducible across kernel restarts. The size
# is capped at the number of available users because sampling without replacement
# raises a ValueError when more items are requested than exist.
rng = np.random.default_rng(SMALL_DATASET_SEED)
random_samples_user_ids = rng.choice(
    unique_user_ids,
    size=min(SMALL_DATASET_N_USERS, len(unique_user_ids)),
    replace=False,
)
filtered_df_small = filtered_df[filtered_df['userId'].isin(random_samples_user_ids)]

# save smaller dataset
filtered_df_small.to_csv(PATH_SESSIONIZED_DATASET + 'sessionized_sm.csv', sep=',', index=False)

filtered_df_small

Unnamed: 0,userId,itemId,timestamp,y,relational_interval,session_id
24,0,19,1269487,0,[],0_24
25,0,19,1269490,1,[0.0],0_24
26,0,50,1269609,1,[0.0],0_24
27,0,221,1269740,1,[0.0],0_24
28,0,51,1269858,1,[0.0],0_24
...,...,...,...,...,...,...
1583495,3621,565,10490549,1,"[692.8044444444445, 650.5577777777778, 624.535...",3621_44
1583496,3621,566,10490716,1,"[692.8044444444445, 650.5577777777778, 622.066...",3621_44
1583497,3621,567,10490891,0,"[692.8044444444445, 650.5577777777778, 622.066...",3621_44
1583498,3621,568,10491037,1,"[692.7761111111112, 650.5291666666667, 622.038...",3621_44
