# explore the dataset

In [16]:

import pandas as pd
from river import datasets
import numpy as np

dataset = datasets.MovieLens100K()
# Initialize last item variable
last_item = None
# Inspect the first few examples
first_item = None
for i, (x, y) in enumerate(dataset):
    if i == 0:
        first_item = (x, y)
    # print(f"Example {i+1}")
    # print("Features:", x)
    datetime = pd.to_datetime(x["timestamp"], unit='ns')
    # print(datetime)
    # print("Rating:", y)
    # print()  # Blank line for readability
    last_item = (x, y)




In [17]:

datetime_first = pd.to_datetime(first_item[0]["timestamp"], unit='ns')
datetime_last = pd.to_datetime(last_item[0]["timestamp"], unit='ns')
print("first item datetime = ", datetime_first, "last item datetime = ", datetime_last)

first item datetime =  1997-09-20 05:05:10 last item datetime =  1998-04-23 01:10:38


In [18]:
# how many days apart
datetime_last - datetime_first

Timedelta('214 days 20:05:28')

In [37]:
from collections import defaultdict
from itertools import combinations
from river import datasets

# Initialize data structures
user_ratings = defaultdict(dict)

# Load the MovieLens 100K dataset
dataset = datasets.MovieLens100K()

# Build user_ratings structure from MovieLens data
def load_movielens_data():
    for x, y in dataset:
        user_id = x['user']   # Extract the user ID
        item_id = x['item']   # Extract the item ID
        rating = y            # Extract the rating value
        user_ratings[user_id][item_id] = rating  # Store the rating in user_ratings

# Create user pairs and count common items
def compute_common_items_distribution(user_ratings):
    common_item_distribution = defaultdict(int)  # Key: number of common items, Value: frequency
    
    # Get all unique pairs of users
    user_pairs = combinations(user_ratings.keys(), 2)
    
    for user1, user2 in user_pairs:
        # Find common items between two users
        common_items = set(user_ratings[user1]) & set(user_ratings[user2])
        
        # Count the number of common items
        common_item_count = len(common_items)
        
        # Update the distribution
        common_item_distribution[common_item_count] += 1
    
    return common_item_distribution

# Function to print the distribution in a readable format
def print_common_item_distribution(common_item_distribution):
    print(f"{'Common Items':<15} {'User Pair Count':<15}")
    print("-" * 30)
    for common_items, count in sorted(common_item_distribution.items()):
        print(f"{common_items:<15} {count:<15}")

# Load the data
load_movielens_data()

# Run the common item computation
common_item_distribution = compute_common_items_distribution(user_ratings)

# Print the result
print_common_item_distribution(common_item_distribution)


Common Items    User Pair Count
------------------------------
0               15043          
1               22359          
2               26088          
3               27027          
4               25725          
5               24279          
6               22423          
7               20617          
8               18598          
9               16941          
10              15307          
11              14062          
12              12483          
13              11413          
14              10029          
15              9148           
16              8215           
17              7668           
18              6753           
19              6410           
20              5800           
21              5370           
22              4925           
23              4716           
24              4309           
25              4069           
26              3766           
27              3491           
28              3296           
29       

# collaborative_filtering with exponential time decay (GPT)

In [40]:
import csv
import pandas as pd
import numpy as np
import time
from collections import defaultdict
from river import datasets, metrics

# Initialize data structures
user_ratings = defaultdict(dict)
user_timestamps = defaultdict(dict)
similarity = defaultdict(dict)

# Initialize classification metrics
accuracy = metrics.Accuracy()
precision = metrics.Precision()
recall = metrics.Recall()
f1 = metrics.F1()

# Load the dataset
dataset = datasets.MovieLens100K()

headers = ["user", "item", "timestamp", "title", "release_date", "genres",
           "age", "age_2groups", "age_4groups", "gender", "occupation", "zip_code",
           "rating", "prediction", "datetime", "rating_binary", "prediction_binary",
           "diff", "diff_binary_correctness"]

# Function to convert nanoseconds timestamp to days
def convert_to_days(nanosecond_timestamp):
    return nanosecond_timestamp / (86400 * 1e9)

def time_decay_weight(new_time_in_days, last_time_in_days, alpha):
    """
    Calculate the time decay weight for an interaction.

    Parameters:
    - new_time_in_days: The current interaction timestamp in days.
    - last_time_in_days: The previous interaction timestamp in days.
    - alpha: The decay rate.

    Returns:
    - A float representing the time decay weight.
    """
    time_difference = new_time_in_days - last_time_in_days
    return alpha ** time_difference

def compute_similarity(user1, user2, current_time_in_days, decay_rate):
    """
    Compute the similarity between two users using time-decayed ratings.

    Parameters:
    - user1, user2: The user IDs.
    - current_time_in_days: The current timestamp in days.
    - decay_rate: The time decay rate.

    Returns:
    - A float representing the similarity between user1 and user2.
    """
    common_items = set(user_ratings[user1]) & set(user_ratings[user2])
    # print("user1 = ", user1, "user2 = ", user2, "common items = ", common_items)
    if not common_items:
        return 0
    if len(common_items) == 1:
        return 0
    print("common items more than 1")
    print(common_items)
    ratings1 = []
    ratings2 = []
    for item in common_items:
        # Calculate time decay weights for both users
        time1 = convert_to_days(user_timestamps[user1][item])
        time2 = convert_to_days(user_timestamps[user2][item])
        weight1 = time_decay_weight(time1, current_time_in_days, decay_rate)
        weight2 = time_decay_weight(time2, current_time_in_days, decay_rate)
        ratings1.append(user_ratings[user1][item] * weight1)
        ratings2.append(user_ratings[user2][item] * weight2)
    print("ratings1 = ", ratings1)
    print("ratings2 = ", ratings2)
    # Use Pearson correlation coefficient
    mean1 = sum(ratings1) / len(ratings1)
    mean2 = sum(ratings2) / len(ratings2)
    numerator = sum((r1 - mean1) * (r2 - mean2) for r1, r2 in zip(ratings1, ratings2))
    denominator = (sum((r - mean1) ** 2 for r in ratings1) * sum((r - mean2) ** 2 for r in ratings2)) ** 0.5
    print("numerator = ", numerator, "denominator = ", denominator)
    return numerator / denominator if denominator != 0 else 0


def run(result_file_name, decay_rate, accuracy, precision, recall, f1, dataset, headers):
    print("run with alpha = ", decay_rate, "result file name = ", result_file_name)
    
   # Convert dataset to a list to sort by timestamp
    dataset_list = list(dataset)
    
    # Sort the dataset by timestamp
    dataset_list.sort(key=lambda x: x[0]['timestamp'])
    
    # Initialize the clock with the earliest timestamp in days
    last_time_in_days = convert_to_days(dataset_list[0][0]['timestamp'])


   # Open the CSV file for writing and write the header
    with open(result_file_name, "w", newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers)
        writer.writeheader()

        for x, y in dataset_list:
            user_id = x['user']
            item_id = x['item']

            # Convert the current interaction timestamp to days
            new_time_in_days = convert_to_days(x['timestamp'])

            # Calculate the time decay weight between the last and the current interaction
            decay_weight = time_decay_weight(new_time_in_days, last_time_in_days, alpha)

            # Update the last_time_in_days to the current interaction time
            last_time_in_days = new_time_in_days

            # Initialize prediction
            pred = None

            # Only proceed if user_id is already in user_ratings
            if user_id in user_ratings:
                similarities = {}
                for other_user in user_ratings:
                    if other_user != user_id:
                        # Check if similarity has been computed before
                        if user_id in similarity and other_user in similarity[user_id]:
                            sim = similarity[user_id][other_user]
                            # print("sim = ", sim)
                        else:
                            sim = compute_similarity(user_id, other_user, new_time_in_days, alpha)
                            # Store the computed similarity
                            similarity[user_id][other_user] = sim
                            similarity[other_user][user_id] = sim
                            # print("sim = ", sim)
                        # Only consider users with positive similarity who have rated the item
                        if sim > 0 and item_id in user_ratings[other_user]:
                            similarities[other_user] = sim

                if similarities:
                    numerator = 0.0
                    denominator = 0.0
                    for other_user in similarities:
                        # Apply time decay to the neighbor's rating
                        time_of_rating_in_days = convert_to_days(user_timestamps[other_user][item_id])
                        weight = time_decay_weight(time_of_rating_in_days, new_time_in_days, alpha)
                        numerator += similarities[other_user] * user_ratings[other_user][item_id] * weight
                        denominator += abs(similarities[other_user]) * weight
                    pred = numerator / denominator if denominator != 0 else None

            # Default prediction if None
            if pred is None:
                pred = 3.0  # Average rating in MovieLens dataset

            # Add the actual rating and the prediction to the dictionary
            x['rating'] = y
            x['prediction'] = pred
            preds = pred

            x["rating_binary"] = int(y >= 4)
            x["prediction_binary"] = int(preds >= 4) if preds is not None else None
            x["diff"] = abs(y - preds)
            x["diff_binary_correctness"] = int(abs(y - preds) <= 1)
            x["datetime"] = pd.to_datetime(x['timestamp'], unit='ns').strftime('%Y-%m-%d')

            # Age groupings
            if x["age"] <= 30:
                x["age_2groups"] = "7-30"
            else:
                x["age_2groups"] = "31-73"

            if x["age"] <= 24:
                x["age_4groups"] = "7-24"
            elif x["age"] <= 30:
                x["age_4groups"] = "25-30"
            elif x["age"] <= 40:
                x["age_4groups"] = "31-40"
            else:
                x["age_4groups"] = "41-73"

            writer.writerow(x)

            # Update the user ratings and timestamps AFTER making the prediction
            user_ratings[user_id][item_id] = y
            user_timestamps[user_id][item_id] = x['timestamp']

            # Update classification metrics
            if preds is not None:
                y_binary = int(y >= 4)
                pred_binary = int(preds >= 4)
                accuracy.update(y_binary, pred_binary)
                precision.update(y_binary, pred_binary)
                recall.update(y_binary, pred_binary)
                f1.update(y_binary, pred_binary)

    # Output the final results
    print(f"Accuracy: {accuracy.get():.4f}")
    print(f"Precision: {precision.get():.4f}")
    print(f"Recall: {recall.get():.4f}")
    print(f"F1 Score: {f1.get():.4f}")


In [41]:
import math

# find the best alpha value
for alpha in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
    accuracy = metrics.Accuracy()
    precision = metrics.Precision()
    recall = metrics.Recall()
    f1 = metrics.F1()
     # Load the dataset
    dataset = datasets.MovieLens100K()

    headers = ["user", "item", "timestamp", "title", "release_date", "genres",
               "age", "age_2groups", "age_4groups", "gender", "occupation", "zip_code",
               "rating", "prediction", "datetime", "rating_binary", "prediction_binary",
               "diff", "diff_binary_correctness"]
    
    result_file = "movielens_online_cf_time_decay_alpha_10e" + str(int(math.log10(alpha))) + ".csv"
    decay_rate = alpha  # Adjust this value as needed
    run(result_file, decay_rate, accuracy, precision, recall, f1, dataset, headers)
    break
    

run with alpha =  0.1 result file name =  movielens_online_cf_time_decay_alpha_10e-1.csv
Accuracy: 0.4462
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000


# collaborative_filtering with exponential time decay (github)  FAIL
https://github.com/gucino/Temporal-Collaborative-Filtering-using-decay-function-to-track-dynamic-interest-of-user/blob/master/Temporal_CF_decay_funnction.py

In [26]:
# -*- coding: utf-8 -*-
"""
Created on Sun May 10 21:20:54 2020

@author: Tisana
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from river import datasets

# Load the dataset
data_set = datasets.MovieLens100K()


##clean data
########################################################################
########################################################################
########################################################################
#compute user rating matrix and timestamp matrix
num_user=943 #user id 1 to 943
num_movie=1682 #movie id 1 to 1682

user_rating_dict={}
#key is user id : value are rating of all movie 
for user_id in range(1,num_user+1):
    user_rating_dict[user_id]=np.array([0]*num_movie)
    
user_timestamp_dict={}
for user_id in range(1,num_user+1):
    user_timestamp_dict[user_id]=np.array([0]*num_movie)

#append rating data set to user rating dict
data_set_list=data_set.tolist()
for each_row in data_set_list:
    user_id=each_row[0]
    mmovie_id=each_row[1]
    rating=each_row[2]
    movie_index=mmovie_id-1
    timestamp=each_row[3]
    #append to dictionary
    user_rating_dict[user_id][movie_index]=rating
    user_timestamp_dict[user_id][movie_index]=timestamp

user_rating_array=[]
for each in user_rating_dict.values():
    user_rating_array.append(each)
user_rating_array=np.array(user_rating_array) #index by user index (user id -1)

#convert rating matrix to user-like matrix
user_like_matrix=[]
for i in range(0,num_user):
    row_list=[]
    for j in range(0,num_movie):
        rating=user_rating_array[i,j]
        if rating>=3:
            row_list.append(1)
        else:
            row_list.append(0)
    user_like_matrix.append(np.array(row_list))
user_like_matrix=np.array(user_like_matrix)

#convert user-like matrix to user-user network
user_user_network=[]
for i in range(0,num_user):
    if i%10==0:
        print(i)
    row_list=[]
    for j in range(0,num_user):
        common_prefered_item=user_like_matrix[i,:]*user_like_matrix[j,:]
        row_list.append(common_prefered_item)
    row_list=np.array(row_list).sum(axis=1)
    user_user_network.append(row_list)
user_user_network=np.array(user_user_network)
#normalization
row_mean=np.mean(user_rating_array,axis=1)
row_mean=row_mean[:,np.newaxis]
user_rating_array=(user_rating_array-row_mean)*(user_rating_array)/(user_rating_array)

for each_row in range(0,num_user):
    for each_column in range(0,num_movie):
        if np.isnan(user_rating_array[each_row,each_column])==True:
            user_rating_array[each_row,each_column]=0

########################################################################
########################################################################
########################################################################
#get timestamp matrix
user_timestamp_array=[]
for each in user_timestamp_dict.values():
    user_timestamp_array.append(each)
user_timestamp_array=np.array(user_timestamp_array) 

########################################################################
########################################################################
########################################################################
#compute user similarity matrix
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr


user_similarity_matrix=[]
for i in range(0,num_user):
    if i%10==0:
        print(i," out of ",num_user)
    user_1_id=i+1
    row=[]
    for j in range(0,num_user):
        user_2_id=j+1
        similarity=pearsonr(user_rating_array[user_1_id-1],user_rating_array[user_2_id-1])[0]
        #similarity=cosine_similarity([user_rating_array[user_1_id-1]],[user_rating_array[user_2_id-1]])[0][0]
        row.append(similarity)
    user_similarity_matrix.append(np.array(row))    
user_similarity_matrix=np.array(user_similarity_matrix)    


########################################################################
########################################################################
########################################################################   
#prediction function

def faster_rating_prediction(k,user_similarity_matrix,time,alpha,user_timestamp_array):
    #avg rating of all user matrix
    avg_rating_user_matrix=np.mean(user_rating_array,axis=1)
    avg_rating_user_matrix=avg_rating_user_matrix[:,np.newaxis]
    avg_rating_user_matrix=np.repeat(avg_rating_user_matrix,num_movie,axis=1)
    
    predicted_rating_array=[]
    for target_user_index in range(0,num_user):
        
        #avg rating of target user
        avg_rating_of_target_user=avg_rating_user_matrix[target_user_index,:]
        
        #find k similar user
        lst=pd.Series(list(user_similarity_matrix[target_user_index,:]))
        i=lst.nlargest(k+1)
        similar_user_index_list=i.index.values.tolist()
        similar_user_index_list=similar_user_index_list[1:] #exclude yourself
        
        #avg rating of similar user
        avg_rating_of_similar_user=avg_rating_user_matrix[similar_user_index_list,:]
        rating_of_similar_user=user_rating_array[similar_user_index_list,:]
        diff_of_similar_user=rating_of_similar_user-avg_rating_of_similar_user
        
        
        
        time_diff=weighted_time(target_user_index,similar_user_index_list,alpha,user_timestamp_array)
        
        #check for time
        if time==True:
            diff_of_similar_user=diff_of_similar_user*time_diff
        
        #second term
        similarity_to_target_user=user_similarity_matrix[target_user_index,similar_user_index_list]
        similarity_to_target_user=similarity_to_target_user[:,np.newaxis]
        numerator=sum(diff_of_similar_user*similarity_to_target_user)
        
        if time==True:
            denominator=sum(similarity_to_target_user*time_diff)
        else:
            denominator=sum(similarity_to_target_user)
        
        
        second_term=numerator/denominator
        
        #prediction
        predicted_rating_of_target_user=avg_rating_of_target_user+second_term
        predicted_rating_array.append(predicted_rating_of_target_user)

    predicted_rating_array=np.array(predicted_rating_array)
    return predicted_rating_array

########################################################################
########################################################################
######################################################################## 
#MAE function
def MAE_calculator(predicted_user_rating_array,user_rating_array):
    #change predict matrix to have only known value
    filter_matrix=np.copy(user_rating_array)
    filter_matrix[filter_matrix>0]=1
    predicted_user_rating_array=predicted_user_rating_array*filter_matrix
    
    num_predict=np.count_nonzero(predicted_user_rating_array)
    MAE=(abs(predicted_user_rating_array-user_rating_array).sum())/num_predict
    return MAE


########################################################################
########################################################################
########################################################################     

#generate abs time diff matrix
def weighted_time(target_user_index,similar_user_index_list,alpha,user_timestamp_array):


    a=user_timestamp_array[target_user_index,:]
    b=user_timestamp_array[similar_user_index_list,:]
    time_diff_matrix=abs(a-b)
    
    #standardization
    from sklearn.preprocessing import StandardScaler
    scaler=StandardScaler()
    time_diff_matrix = scaler.fit_transform(time_diff_matrix)
    lam_matrix=np.exp(-1*time_diff_matrix*alpha)
    return lam_matrix

########################################################################
########################################################################
######################################################################## 
#find best value of alpha (1.7)
k=3
MAE_list=[]
alpha_list=[]
alpha=0
for i in range(0,100):
    predicted_rating=faster_rating_prediction(k,user_similarity_matrix,True,alpha,user_timestamp_array)
    MAE=MAE_calculator(predicted_rating,user_rating_array)
    MAE_list.append(MAE)
    alpha_list.append(alpha)
    print(" MAE : ",MAE)
    alpha+=0.1
plt.title("find best value of alpha")
plt.ylabel("MAE")
plt.xlabel("alpha")
plt.plot(alpha_list,MAE_list)
best_alpha_index=MAE_list.index(min(MAE_list))
best_alpha=alpha_list[best_alpha_index]

########################################################################
########################################################################
######################################################################## 
#compare performance of no time and time
alpha=best_alpha
MAE_time_list=[]
MAE_no_time_list=[]
k_list=[]
for k in range(1,100,10):
    print("k : ",k)
    time=faster_rating_prediction(k,user_similarity_matrix,True,alpha,user_timestamp_array)
    no_time=faster_rating_prediction(k,user_similarity_matrix,False,alpha,user_timestamp_array)
    
    MAE_time=MAE_calculator(time,user_rating_array)
    MAE_no_time=MAE_calculator(no_time,user_rating_array)
    
    MAE_time_list.append(MAE_time)
    MAE_no_time_list.append(MAE_no_time)
    k_list.append(k)
plt.figure()
plt.xlabel("number of neighbourhood")
plt.ylabel("MAE")
plt.plot(k_list,MAE_time_list,c="green",label="consider dynamic user interest")
plt.plot(k_list,MAE_no_time_list,c="red",label="do not consider dynamic user interest")
plt.legend()
plt.show()

AttributeError: 'MovieLens100K' object has no attribute 'tolist'

In [15]:
import matplotlib.pyplot as plt

Matplotlib is building the font cache; this may take a moment.
