# Explore the dataset

In [20]:

import pandas as pd
from river import datasets
import numpy as np

# Stream the MovieLens 100K dataset once and collect three summary facts:
# the set of distinct rating values, the first example and the last example.
# Timestamps are converted to datetimes only for the two boundary examples
# (in the following cell) rather than for every row of the stream.
dataset = datasets.MovieLens100K()

ratings = set()      # distinct rating values observed in the stream
first_item = None    # (features, rating) of the first example
last_item = None     # (features, rating) of the most recently seen example

for x, y in dataset:
    ratings.add(y)
    if first_item is None:
        # Only the very first example reaches this branch.
        first_item = (x, y)
    last_item = (x, y)

print(ratings)


{1.0, 2.0, 3.0, 4.0, 5.0}


In [21]:

# Convert the timestamps of the first and last examples (interpreted as
# nanoseconds since the epoch via unit='ns') into pandas Timestamps.
datetime_first, datetime_last = (
    pd.to_datetime(example[0]["timestamp"], unit='ns')
    for example in (first_item, last_item)
)
print("first item datetime = ", datetime_first, "last item datetime = ", datetime_last)

first item datetime =  1997-09-20 05:05:10 last item datetime =  1998-04-23 01:10:38


In [3]:
# Time span covered by the dataset: timestamp of the last example minus the
# timestamp of the first. The difference is a pandas Timedelta (whole days plus
# an hours:minutes:seconds remainder) and is displayed as the cell's output.
datetime_last - datetime_first

Timedelta('214 days 20:05:28')

# Matrix factorization (BiasedMF) with exponential time decay

In [23]:
import csv
import pandas as pd
import numpy as np
from river import reco, optim, datasets, metrics

# --- Model: biased matrix factorization trained with plain SGD ---
# The optimizers and initializers are built first so each hyperparameter is
# visible on its own line; both optimizers use the same learning rate (0.01).
sgd_for_biases = optim.SGD(0.01)
sgd_for_latents = optim.SGD(0.01)
zeros_init = optim.initializers.Zeros()
gaussian_init = optim.initializers.Normal(mu=0., sigma=0.1, seed=42)  # seeded for reproducibility

model = reco.BiasedMF(
    n_factors=10,
    bias_optimizer=sgd_for_biases,
    latent_optimizer=sgd_for_latents,
    loss=optim.losses.Squared(),
    l2_bias=0.1,
    l2_latent=0.1,
    weight_initializer=zeros_init,
    latent_initializer=gaussian_init,
    seed=42
)

# --- Regression metrics accumulated over the stream ---
mae, rmse = metrics.MAE(), metrics.RMSE()

# --- Data: a fresh handle on the dataset for this experiment ---
dataset = datasets.MovieLens100K()

# --- CSV layout: column order of the result file ---
headers = [
    # interaction and movie fields
    "user", "item", "timestamp", "title", "release_date", "genres",
    # user demographics, including the derived age buckets
    "age", "age_2groups", "age_4groups", "gender", "occupation", "zip_code",
    # target and model outputs
    "rating", "prediction", "prediction_float", "datetime",
    # derived evaluation columns
    "rating_binary", "prediction_binary", "diff", "diff_binary_correctness",
]

# --- Exponential decay: lambda and the resulting multiplicative factor ---
decay_rate = 0.001  # tune as needed
decay_factor = np.exp(-decay_rate)

def get_integer(x):
    """Map a real-valued rating prediction onto the integer scale 1..5.

    Each star value owns a half-open bucket ending at its upper bound
    (1.5, 2.5, 3.5, 4.5). Anything below 1.5 maps to 1, and anything that is
    not below 4.5 maps to 5 (this includes values above 5 and NaN, for which
    every comparison is false).

    Args:
        x: numeric prediction.

    Returns:
        int: the star rating in {1, 2, 3, 4, 5}.
    """
    for stars, upper_bound in enumerate((1.5, 2.5, 3.5, 4.5), start=1):
        if x < upper_bound:
            return stars
    return 5

# Replay the rating stream in order. For every interaction:
#   1. shrink the latent factors of the user and item involved (time decay),
#   2. predict the rating BEFORE learning from it, so the metrics measure
#      error on examples the model has not yet been trained on,
#   3. update the model with the true rating,
#   4. write one enriched row to the result CSV and update MAE / RMSE.
with open("result_incremental_mf_time_decay_2.csv", "w", newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=headers)
    writer.writeheader()

    for x, y in dataset:
        user_id = x['user']
        item_id = x['item']

        # Exponential forgetting: multiply the existing latent vectors of this
        # user and this item by decay_factor. The membership tests skip ids
        # that have no stored vector yet.
        if user_id in model.u_latents:
            model.u_latents[user_id] *= decay_factor
        if item_id in model.i_latents:
            model.i_latents[item_id] *= decay_factor

        # Note: learn_one is called without any per-sample weight, so time
        # decay enters the model only through the factor shrinkage above.
        pred = model.predict_one(user=user_id, item=item_id)
        model.learn_one(user=user_id, item=item_id, y=y)

        # Fall back to the mid-scale rating 3.0 if the model returns no prediction.
        prediction_float = pred if pred is not None else 3.0
        # Discretize to the 1..5 star scale; get_integer always returns an int.
        prediction = get_integer(prediction_float)
        abs_error = abs(y - prediction)

        # Enrich the feature dict in place; its keys must match `headers`.
        x['rating'] = y
        x['prediction_float'] = prediction_float
        x["prediction"] = prediction
        x["rating_binary"] = int(y >= 4)            # 1 for ratings of 4 or more
        x["prediction_binary"] = int(prediction >= 4)
        x["diff"] = abs_error
        x["diff_binary_correctness"] = int(abs_error <= 1)  # within one star counts as correct
        x["datetime"] = pd.to_datetime(x['timestamp'], unit='ns').strftime('%Y-%m-%d')

        # Age buckets: a coarse 2-way split and a finer 4-way split.
        age = x["age"]
        x["age_2groups"] = "7-30" if age <= 30 else "31-73"
        if age <= 24:
            x["age_4groups"] = "7-24"
        elif age <= 30:
            x["age_4groups"] = "25-30"
        elif age <= 40:
            x["age_4groups"] = "31-40"
        else:
            x["age_4groups"] = "41-73"

        writer.writerow(x)

        # Metrics use the raw float prediction, not the rounded star value.
        if pred is not None:
            mae.update(y, pred)
            rmse.update(y, pred)

# Output the final results
print(f"MAE: {mae.get():.4f}")
print(f"RMSE: {rmse.get():.4f}")


MAE: 0.7793
RMSE: 0.9751
