In [1]:
# installs necessary libraries

#!pip install pandas

In [2]:
# necessary imports for notebook to run
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
paths = ['./data/new_release_stream.csv',
         './data/processed.csv',
         './data/sessionized.csv',
         './data/sessionized_sm.csv',
         './data/sessionized_GRU4Rec_train.csv',
         './data/sessionized_GRU4Rec_test.csv',
         './data/sessionized_GRU4Rec_train_optim.csv',
         './data/sessionized_GRU4Rec_valid.csv'
         ]

dfs = {}
for path in paths:
    df_name = path.split('/')[-1].split('.')[0] # split out name of path string
    dfs[df_name] = pd.read_csv(path)

# universal naming
for df_name, df in dfs.items():
    if "ItemId" in df.columns:
        df.rename(columns={'ItemId': 'itemId'}, inplace=True)
    if "Time" in df.columns:
        df.rename(columns={'Time': 'timestamp'}, inplace=True)
    if "SessionId" in df.columns:
        df.rename(columns={'SessionId': 'session_id'}, inplace=True)

In [4]:
for df_name, df in dfs.items():
    print(f"Stats for {df_name}")
    print("Number of events/interactions in total: ", len(df))
    if "userId" in df.columns:
        print("Number of unique users: ", df.userId.nunique())
    print("Number of unique items: ", df.itemId.nunique())

Stats for new_release_stream
Number of events/interactions in total:  1583815
Number of unique users:  3623
Number of unique items:  879
Stats for processed
Number of events/interactions in total:  1583815
Number of unique users:  3623
Number of unique items:  879
Stats for sessionized
Number of events/interactions in total:  1206315
Number of unique users:  3623
Number of unique items:  879
Stats for sessionized_sm
Number of events/interactions in total:  335445
Number of unique users:  1000
Number of unique items:  879
Stats for sessionized_GRU4Rec_train
Number of events/interactions in total:  1205353
Number of unique items:  879
Stats for sessionized_GRU4Rec_test
Number of events/interactions in total:  962
Number of unique items:  377
Stats for sessionized_GRU4Rec_train_optim
Number of events/interactions in total:  1204682
Number of unique items:  879
Stats for sessionized_GRU4Rec_valid
Number of events/interactions in total:  671
Number of unique items:  333


In [5]:
for df_name, df in dfs.items():
    item_interaction_counts = df.groupby('itemId').size().reset_index(name='n_interactions') 
    avg_item_counts = item_interaction_counts['n_interactions'].mean().round(2)
    print("Stats for set ", df_name)
    print(f"Avg. events per item: ", avg_item_counts)

Stats for set  new_release_stream
Avg. events per item:  1801.84
Stats for set  processed
Avg. events per item:  1801.84
Stats for set  sessionized
Avg. events per item:  1372.37
Stats for set  sessionized_sm
Avg. events per item:  381.62
Stats for set  sessionized_GRU4Rec_train
Avg. events per item:  1371.28
Stats for set  sessionized_GRU4Rec_test
Avg. events per item:  2.55
Stats for set  sessionized_GRU4Rec_train_optim
Avg. events per item:  1370.51
Stats for set  sessionized_GRU4Rec_valid
Avg. events per item:  2.02


In [6]:
for df_name, df in dfs.items():
    if "userId" in df.columns:
        user_interaction_counts = df.groupby('userId').size().reset_index(name='n_interactions') 
        avg_user_counts = user_interaction_counts['n_interactions'].mean().round(2)
        print("Stats for set ", df_name)
        print(f"Avg. events per user: ", avg_user_counts)

Stats for set  new_release_stream
Avg. events per user:  437.16
Stats for set  processed
Avg. events per user:  437.16
Stats for set  sessionized
Avg. events per user:  332.96
Stats for set  sessionized_sm
Avg. events per user:  335.44


In [7]:
for df_name, df in dfs.items():
    if "userId" in df.columns:
        user_item_same_pairs_counts_per_set = df.groupby(['userId', 'itemId']).size().reset_index(name='n_reps') # counts the number of same user-item pairs per set

        average_repetitions_per_set = user_item_same_pairs_counts_per_set.groupby('userId')['n_reps'].mean() # averages them for each set

        average_repetitions_per_set = average_repetitions_per_set.mean().round(2)
        

        print("Stats for set ", df_name)
        print("Avg. reps per user: ", average_repetitions_per_set)

Stats for set  new_release_stream
Avg. reps per user:  11.09
Stats for set  processed
Avg. reps per user:  11.09
Stats for set  sessionized
Avg. reps per user:  8.49
Stats for set  sessionized_sm
Avg. reps per user:  8.61


In [21]:
for df_name, df in dfs.items():
    item_cnt = df.groupby('itemId').size().reset_index(name='n_interactions')
    rep_items = item_cnt[item_cnt['n_interactions'] > 1]
    rep_items['n_repetitions'] = rep_items['n_interactions'] - 1 # 2 interactions means it was repeated once
    avg_reps_per_item = rep_items['n_repetitions'].mean().round(2)

    print("Stats for set ", df_name)
    print("Avg. reps of an item: ", avg_reps_per_item)

Stats for set  new_release_stream
Avg. reps of an item:  1800.84
Stats for set  processed
Avg. reps of an item:  1800.84
Stats for set  sessionized
Avg. reps of an item:  1371.37
Stats for set  sessionized_sm
Avg. reps of an item:  380.62
Stats for set  sessionized_GRU4Rec_train
Avg. reps of an item:  1370.28
Stats for set  sessionized_GRU4Rec_test
Avg. reps of an item:  2.79
Stats for set  sessionized_GRU4Rec_train_optim
Avg. reps of an item:  1369.51
Stats for set  sessionized_GRU4Rec_valid
Avg. reps of an item:  2.07


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rep_items['n_repetitions'] = rep_items['n_interactions'] - 1 # 2 interactions means it was repeated once
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rep_items['n_repetitions'] = rep_items['n_interactions'] - 1 # 2 interactions means it was repeated once


In [8]:
for df_name, df in dfs.items():
    if "session_id" in df.columns: # for all session-based datasets
        print("Stats for set ", df_name)
        print("Number of sessions: ", df.session_id.nunique())

Stats for set  sessionized
Number of sessions:  102290
Stats for set  sessionized_sm
Number of sessions:  28580
Stats for set  sessionized_GRU4Rec_train
Number of sessions:  102190
Stats for set  sessionized_GRU4Rec_test
Number of sessions:  100
Stats for set  sessionized_GRU4Rec_train_optim
Number of sessions:  102109
Stats for set  sessionized_GRU4Rec_valid
Number of sessions:  81


In [9]:
for df_name, df in dfs.items():
    # count id's across users and calculate average of that count across all users
    if ("userId" in df.columns) and ("session_id" in df.columns):
        session_counts_user_overall = df.groupby('userId')['session_id'].nunique()

        avg_session_counts_user_overall = session_counts_user_overall.mean().round(2)

        print("Stats for set ", df_name)
        print("Avg. sessions per user: ", avg_session_counts_user_overall)

Stats for set  sessionized
Avg. sessions per user:  28.23
Stats for set  sessionized_sm
Avg. sessions per user:  28.58


In [10]:
for df_name, df in dfs.items():
    if "session_id" in df.columns:
        events_per_session = df.groupby('session_id').size().reset_index(name='n_interactions')
        avg_events_per_session = events_per_session['n_interactions'].mean().round(2)
        print("Stats for set ",df_name)
        print("Avg. interactions per session: ", avg_events_per_session)

Stats for set  sessionized
Avg. interactions per session:  11.79
Stats for set  sessionized_sm
Avg. interactions per session:  11.74
Stats for set  sessionized_GRU4Rec_train
Avg. interactions per session:  11.8
Stats for set  sessionized_GRU4Rec_test
Avg. interactions per session:  9.62
Stats for set  sessionized_GRU4Rec_train_optim
Avg. interactions per session:  11.8
Stats for set  sessionized_GRU4Rec_valid
Avg. interactions per session:  8.28


In [11]:
for df_name, df in dfs.items():
    if "session_id" in df.columns:
        # group by itemId and count the sessions for each item
        item_sessions_count = df.groupby('itemId')['session_id'].nunique()

        # average sessions per item
        avg_sessions_per_item = item_sessions_count.mean().round(2)
        print("Stats for set ", df_name)
        print("Avg. sessions per item: ", avg_sessions_per_item)


Stats for set  sessionized
Avg. sessions per item:  956.48
Stats for set  sessionized_sm
Avg. sessions per item:  265.94
Stats for set  sessionized_GRU4Rec_train
Avg. sessions per item:  955.46
Stats for set  sessionized_GRU4Rec_test
Avg. sessions per item:  2.38
Stats for set  sessionized_GRU4Rec_train_optim
Avg. sessions per item:  954.79
Stats for set  sessionized_GRU4Rec_valid
Avg. sessions per item:  1.77


##### **Intra-, and Inter-session repetition rate**

In [19]:
for df_name, df in dfs.items():
    if ("session_id" in df.columns):
        # INTRA rep avg
        reps_per_item_per_session_intra = df.groupby(['session_id', 'itemId']).size().reset_index(name='n_reps')

        repeated_items = reps_per_item_per_session_intra[reps_per_item_per_session_intra['n_reps'] > 1]

        repeated_items['n_reps'] = repeated_items['n_reps'] - 1

        intra_session_rep_rate = repeated_items.groupby('session_id')['n_reps'].sum()

        intra_session_rep_rate = intra_session_rep_rate.mean().round(2)

        print("Stats for set ", df_name)
        print("Intra-session repetition rate: ", intra_session_rep_rate)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repeated_items['n_reps'] = repeated_items['n_reps'] - 1


Stats for set  sessionized
Intra-session repetition rate:  5.29
Stats for set  sessionized_sm
Intra-session repetition rate:  5.32


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repeated_items['n_reps'] = repeated_items['n_reps'] - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repeated_items['n_reps'] = repeated_items['n_reps'] - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repeated_items['n_reps'] = repeated_items['n_reps'] - 1


Stats for set  sessionized_GRU4Rec_train
Intra-session repetition rate:  5.29
Stats for set  sessionized_GRU4Rec_test
Intra-session repetition rate:  1.91
Stats for set  sessionized_GRU4Rec_train_optim
Intra-session repetition rate:  5.3
Stats for set  sessionized_GRU4Rec_valid
Intra-session repetition rate:  2.5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repeated_items['n_reps'] = repeated_items['n_reps'] - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repeated_items['n_reps'] = repeated_items['n_reps'] - 1


In [26]:
# INTER rep avg
for df_name, df in dfs.items():
    if ('session_id' in df.columns) and ('userId' in df.columns):
        reps_per_item_per_session_inter = df.groupby(['userId', 'itemId', 'session_id']).size().reset_index(name='reps')

        reps_per_user_across_sessions = reps_per_item_per_session_inter.groupby('userId')['reps'].sum().reset_index(name='reps')

        inter_session_rep_rate = reps_per_user_across_sessions['reps'].mean().round(2)

        print("Stats for set ", df_name)
        print("Inter session repetition rate per user: ", inter_session_rep_rate)

Stats for set  sessionized
Inter session repetition rate per user:  332.96
Stats for set  sessionized_sm
Inter session repetition rate per user:  335.44


In [29]:
inter_session_rep_rate = reps_per_user_across_sessions['reps'].mean().round(2)
inter_session_rep_rate

437.16

In [24]:
#avg length per single user session 
for df_name, df in dfs.items():
    if ("session_id" in df.columns) and ("timestamp" in df.columns):
        session_lengths = df.groupby('session_id')['timestamp'].agg(sess_length=lambda ts: ts.max() - ts.min()) # calc length of one session per session

        avg_session_lengths = session_lengths['sess_length'].mean()
        print("Stats for set ", df_name)
        print("Average session length in minutes: ", (avg_session_lengths / 60).round(2))

Stats for set  sessionized
Average session length in minutes:  38.02
Stats for set  sessionized_sm
Average session length in minutes:  38.33
Stats for set  sessionized_GRU4Rec_train
Average session length in minutes:  38.02
Stats for set  sessionized_GRU4Rec_test
Average session length in minutes:  38.67
Stats for set  sessionized_GRU4Rec_train_optim
Average session length in minutes:  38.02
Stats for set  sessionized_GRU4Rec_valid
Average session length in minutes:  37.68
