In [8]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split

In [9]:
# Load the dataset.
# The file is read as tab-separated text. Because column names are passed via
# `names`, pandas does not consume a header row: the first line is read as data.
file_path = './ml-100k.rating.data'
columns = ['user', 'item', 'rating', 'timestamp']
ratings = pd.read_csv(file_path, sep='\t', names=columns, engine='python')

# Step 1: Keep only interactions with a positive rating (rating > 0).
# NOTE: this keeps every rating above zero, not only ratings of 4 and above.
# Change the comparison to `>= 4` if only high ratings should count as
# positive interactions; doing so will change every count computed downstream.
positive_ratings = ratings[ratings['rating'] > 0]

# Step 2: Apply k-core filtering
def filter_k_core(df, k=10):
    """Return the rows of *df* that survive iterative k-core filtering.

    On every pass, users and items that occur in fewer than ``k`` rows of the
    current frame are dropped. Passes repeat until one removes nothing, so in
    the result every remaining user and every remaining item appears in at
    least ``k`` rows (or the result is empty).

    Args:
        df: DataFrame with ``'user'`` and ``'item'`` columns.
        k: Minimum number of rows each kept user and item must appear in.

    Returns:
        A filtered DataFrame; the original index labels and row order are
        kept, and the input frame is not modified.
    """
    rows_seen = None
    while rows_seen != df.shape[0]:
        rows_seen = df.shape[0]

        # Both tallies are taken on the frame as it stands at the start of
        # the pass; anything the user filter newly invalidates on the item
        # side is picked up by the next pass.
        per_user = df['user'].value_counts()
        per_item = df['item'].value_counts()

        keep_user = df['user'].isin(per_user[per_user >= k].index)
        keep_item = df['item'].isin(per_item[per_item >= k].index)
        df = df[keep_user & keep_item]

    return df

# Steps 2-3: restrict to the 10-core, then order the interactions by timestamp.
# (To bypass k-core filtering, start from positive_ratings instead.)
# NOTE(review): sort_values defaults to a non-stable sort, so rows sharing a
# timestamp have no guaranteed relative order; consider kind='stable' if the
# exact split has to be reproducible across environments.
filtered_ratings = filter_k_core(positive_ratings, k=10).sort_values(by='timestamp')

# Step 4: Re-index users and items densely from 0.
# unique() returns values in order of first appearance, so IDs follow the
# order in which each user/item first shows up in the sorted data.
unique_user_ids = filtered_ratings['user'].unique()
unique_item_ids = filtered_ratings['item'].unique()

user_id_map = dict(zip(unique_user_ids, range(len(unique_user_ids))))
item_id_map = dict(zip(unique_item_ids, range(len(unique_item_ids))))

filtered_ratings['user'] = filtered_ratings['user'].map(user_id_map)
filtered_ratings['item'] = filtered_ratings['item'].map(item_id_map)

# Number the interactions 0..N-1 in their sorted order.
total_interactions = len(filtered_ratings)
filtered_ratings['interactionId'] = range(total_interactions)

# Step 5: Block sizes: 60% base, then three incremental blocks sharing the
# remaining 40%; the last block absorbs whatever integer truncation leaves.
base_count = int(total_interactions * 0.6)
inc1_count = inc2_count = int(total_interactions * 0.4/3)
inc3_count = total_interactions - (base_count + inc1_count + inc2_count)

# Step 6: Cut the sorted frame into consecutive position ranges.
inc1_start = base_count
inc2_start = inc1_start + inc1_count
inc3_start = inc2_start + inc2_count

base_data = filtered_ratings.iloc[:inc1_start]
inc1_data = filtered_ratings.iloc[inc1_start:inc2_start]
inc2_data = filtered_ratings.iloc[inc2_start:inc3_start]
inc3_data = filtered_ratings.iloc[inc3_start:]

In [10]:
# Per-block summary: for each of the four consecutive blocks, print the
# largest user ID + 1, the largest item ID + 1, and the number of rows.
#
# User and item IDs were remapped to 0..n-1 in order of first appearance in
# the timestamp-sorted data, so "max ID + 1" within a block equals the number
# of distinct users/items seen from the start of the data through that block,
# provided the block contains the most recently introduced ID.
# NOTE(review): if a block introduced no new user (or item), its own max
# would understate the accumulated count; confirm that cannot happen for the
# data in use, or switch to a running maximum.
print(f"base block accum_user: {base_data['user'].max() + 1}")
print(f"base block accum_item: {base_data['item'].max() + 1}")
print(f"base block num_interactions: {len(base_data)}\n\n")

# First incremental block.
print(f"inc1 accum_user: {inc1_data['user'].max() + 1}")
print(f"inc1 accum_item: {inc1_data['item'].max() + 1}")
print(f"inc1 num_interactions: {len(inc1_data)}\n\n")

# Second incremental block.
print(f"inc2 accum_user: {inc2_data['user'].max() + 1}")
print(f"inc2 accum_item: {inc2_data['item'].max() + 1}")
print(f"inc2 num_interactions: {len(inc2_data)}\n\n")

# Third incremental block (also holds the rows left over by truncation).
print(f"inc3 block accum_user: {inc3_data['user'].max() + 1}")
print(f"inc3 accum_item: {inc3_data['item'].max() + 1}")
print(f"inc3 num_interactions: {len(inc3_data)}")

base block accum_user: 587
base block accum_item: 1136
base block num_interactions: 58771


inc1 accum_user: 697
inc1 accum_item: 1146
inc1 num_interactions: 13060


inc2 accum_user: 827
inc2 accum_item: 1148
inc2 num_interactions: 13060


inc3 block accum_user: 943
inc3 accum_item: 1152
inc3 num_interactions: 13062
