In [46]:
import pandas as pd
import numpy as np

In [47]:
# Input: processed interaction dataset (CSV), loaded with pd.read_csv in the next cell.
PROCESSED_DS_PATH = './data/processed.csv'
# NOTE(review): presumably the destination for the sessionized dataset; it is not
# referenced in the cells visible here — confirm against the rest of the notebook.
SESSIONIZED_DS_PATH = './data/sessionized.csv'

In [48]:
# Load the processed interactions from disk and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=PROCESSED_DS_PATH)

df.head(n=5)

Unnamed: 0,userId,itemId,timestamp,y,relational_interval,set
0,0,0,0,0,[],train
1,1,1,5756,0,[],train
2,2,2,7709,0,[],train
3,3,3,8076,0,[],train
4,4,4,9970,0,[],train


In [49]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None" line.
df.info()

# Count rows that are exact repeats of an earlier row (all columns compared).
print(f'Duplicate rows within dataset: {df.duplicated().sum()}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1583815 entries, 0 to 1583814
Data columns (total 6 columns):
 #   Column               Non-Null Count    Dtype 
---  ------               --------------    ----- 
 0   userId               1583815 non-null  int64 
 1   itemId               1583815 non-null  int64 
 2   timestamp            1583815 non-null  int64 
 3   y                    1583815 non-null  int64 
 4   relational_interval  1583815 non-null  object
 5   set                  1583815 non-null  object
dtypes: int64(4), object(2)
memory usage: 72.5+ MB
None
Duplicate rows within dataset: 0


In [50]:
# Drop the 'set' column (sets are re-assigned later) and order the rows by user
# and timestamp for readability - this makes the eventual grouping into user
# profiles more visible.
# Rebinding `df` instead of mutating it in place, together with errors='ignore',
# keeps this cell re-runnable: a second execution no longer raises KeyError on
# the already-removed column.
df = (
    df
    .drop(columns='set', errors='ignore')
    .sort_values(['userId', 'timestamp'])
)
df

Unnamed: 0,userId,itemId,timestamp,y,relational_interval
0,0,0,0,0,[]
7,0,7,15690,0,[]
16,0,15,38426,0,[]
19,0,5,45670,1,[0.0]
29,0,20,77618,0,[]
...,...,...,...,...,...
1577111,3622,579,10398504,1,"[711.3533333333334, 665.9480555555556, 644.134..."
1577128,3622,590,10398803,0,"[711.3533333333334, 665.9480555555556, 644.133..."
1577130,3622,591,10398827,1,"[711.2280555555556, 665.8227777777778, 644.008..."
1577138,3622,592,10398988,1,"[711.2280555555556, 665.8227777777778, 644.008..."


In [51]:
# Cutoff for sessionizing, in seconds: a gap of more than 30 minutes (1800 s)
# between two consecutive interactions of a user starts a new session.
THRESHOLD = 1800
def sessionize(user_interactions: pd.DataFrame, threshold: int = THRESHOLD) -> pd.DataFrame:
    """
    Split the interactions of one user into sessions and tag every row with a session ID.

    A new session starts whenever the gap between two consecutive timestamps is
    strictly greater than ``threshold``. Session IDs have the form
    ``"<userId>_<n>"``, where ``n`` counts that user's sessions starting at 0.

    Args:
        user_interactions: All interactions across all sessions for one user.
            Must contain the columns 'userId' and 'timestamp' -> Pandas DataFrame
        threshold: Cutoff value in s -> int

    Returns:
        A chronologically sorted copy of the user-item interactions with an added
        'session_id' column; the input frame is not modified -> Pandas DataFrame

    """

    # Sort chronologically (should be given by the dataset anyway, this is just a
    # precaution). A stable sort is used so that rows sharing the same timestamp
    # keep their incoming relative order instead of being shuffled arbitrarily.
    ordered = user_interactions.sort_values('timestamp', kind='mergesort')

    # Gap to the previous interaction: timestamp_j - timestamp_i.
    # The first row has no predecessor, so its gap is NaN.
    gaps = ordered['timestamp'].diff()

    # Each gap above the threshold marks a session start; the running sum of those
    # markers numbers the sessions 0, 1, 2, ... (NaN > threshold is False, so the
    # first row always falls into session 0). The counter tells which rows of the
    # sorted interactions belong to which session.
    session_index = (gaps > threshold).cumsum()

    # Prefix with the user ID so session IDs stay unique across users.
    ordered['session_id'] = ordered['userId'].astype(str) + '_' + session_index.astype(str)
    return ordered

In [52]:
# Sessionize each user's interactions separately and stitch the results back
# together in ascending userId order (groupby sorts its keys by default).
# Iterating over the groups instead of calling groupby().apply() guarantees that
# every group frame still contains the 'userId' column, which sessionize() needs
# to build the session IDs; passing the grouping column to apply() is deprecated
# since pandas 2.2 and the column is excluded from the groups in later versions.
per_user_frames = [sessionize(user_rows) for _, user_rows in df.groupby('userId')]
sessionized_df = pd.concat(per_user_frames, ignore_index=True)

# Sessionizing only adds a column; it must never add or drop interactions.
assert len(sessionized_df) == len(df)
sessionized_df

Unnamed: 0,userId,itemId,timestamp,y,relational_interval,session_id
0,0,0,0,0,[],0_0
1,0,7,15690,0,[],0_1
2,0,15,38426,0,[],0_2
3,0,5,45670,1,[0.0],0_3
4,0,20,77618,0,[],0_4
...,...,...,...,...,...,...
1583810,3622,579,10398504,1,"[711.3533333333334, 665.9480555555556, 644.134...",3622_23
1583811,3622,590,10398803,0,"[711.3533333333334, 665.9480555555556, 644.133...",3622_23
1583812,3622,591,10398827,1,"[711.2280555555556, 665.8227777777778, 644.008...",3622_23
1583813,3622,592,10398988,1,"[711.2280555555556, 665.8227777777778, 644.008...",3622_23
