### Implementation of problem 4

##### Problem4 - (a)

##### Load dataset

In [1]:
import os

import numpy as np
import pandas as pd

# Load MovieLens-1M dataset (assuming ratings.dat is used)
column_names = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv("/Users/luninghao/Desktop/CSCI-SHU-381-Recsys/CA1_ProblemSet/ml-1m/ratings.dat", sep="::", names=column_names, engine="python")

# Convert timestamp to datetime
ratings["timestamp"] = pd.to_datetime(ratings["timestamp"], unit="s")

# Sort by user and timestamp
ratings = ratings.sort_values(by=["user_id", "timestamp"])

##### Leave-One-Last Split

In [2]:
def leave_one_last_split(ratings):
    train_list = []
    valid_list = []
    test_list = []

    # Group by user_id
    for user, data in ratings.groupby("user_id"):
        if len(data) >= 2:
            train = data.iloc[:-2]  # All but last two interactions
            valid = data.iloc[-2:-1]  # Second-to-last interaction
            test = data.iloc[-1:]  # Last interaction
        else:
            train = data.iloc[:-1]  # If only one interaction, assign to train
            valid = data.iloc[-1:]  # Assign the last to validation
            test = pd.DataFrame(columns=ratings.columns)  # No test sample

        train_list.append(train)
        valid_list.append(valid)
        test_list.append(test)

    # Combine all users
    train_set = pd.concat(train_list)
    valid_set = pd.concat(valid_list)
    test_set = pd.concat(test_list)

    return train_set, valid_set, test_set


##### Temporal Global Split

In [3]:
def temporal_global_split(ratings, cutoff="2002-01-01 00:00:00"):
    cutoff_date = pd.to_datetime(cutoff)

    # Train set: interactions before the cutoff
    train_set = ratings[ratings["timestamp"] < cutoff_date]

    # Split last 10% of train interactions as validation
    valid_size = int(len(train_set) * 0.1)
    valid_set = train_set.tail(valid_size)
    train_set = train_set.iloc[:-valid_size]

    # Test set: interactions after the cutoff
    test_set = ratings[ratings["timestamp"] >= cutoff_date]

    return train_set, valid_set, test_set



In [4]:
# Apply the function
train_lol, valid_lol, test_lol = leave_one_last_split(ratings)
train_tg, valid_tg, test_tg = temporal_global_split(ratings)

##### Dataset Statistics

In [5]:
# Compute statistics
def dataset_statistics(train, valid, test, name):
    num_users = len(pd.concat([train, valid, test])["user_id"].unique())
    num_items = len(pd.concat([train, valid, test])["movie_id"].unique())
    num_interactions = (len(train), len(valid), len(test))
    return [name, num_users, num_items] + list(num_interactions)

# Create statistics table
stats_table = pd.DataFrame(
    [
        dataset_statistics(train_lol, valid_lol, test_lol, "Leave-One-Last"),
        dataset_statistics(train_tg, valid_tg, test_tg, "Temporal Global"),
    ],
    columns=["Data Split", "# Users", "# Items", "# Interactions (train)", "# Interactions (valid)", "# Interactions (test)"]
)

stats_table

Unnamed: 0,Data Split,# Users,# Items,# Interactions (train),# Interactions (valid),# Interactions (test)
0,Leave-One-Last,6040,3706,988129,6040,6040
1,Temporal Global,6040,3706,875534,97281,27394


##### Problem4 - (b)

In [None]:
import time
from annoy import AnnoyIndex
from sklearn.preprocessing import normalize