In [47]:
# import packages
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise import accuracy
from sklearn.cluster import KMeans
from mlxtend.frequent_patterns import apriori, association_rules,fpgrowth

import datetime
import os

In [2]:
# define folder where your data is stored
directory = 'C:/D_disk/02_Learning/27_Ughent_courses/32_Master_dissertation/Melika_Git/Data'

# get all files in directory
all_files = os.listdir(directory)
all_files

['film_links_by_genre.csv',
 'README.md',
 'reviews_10000.csv',
 'reviews_4000.csv',
 'usernames.csv']

In [3]:
# read files
film = pd.read_csv('C:/D_disk/02_Learning/27_Ughent_courses/32_Master_dissertation/Melika_Git/Data/film_links_by_genre.csv')
reviews = pd.read_csv('C:/D_disk/02_Learning/27_Ughent_courses/32_Master_dissertation/Melika_Git/Data/reviews_10000.csv')

In [9]:
# clean reviews

# check reviews null value
print(reviews.isnull().sum())

# show
reviews.head(5)

user_id     0
film        0
comment    88
date        0
rating      0
dtype: int64


Unnamed: 0,user_id,film,comment,date,rating
0,/film_addiction/,quills,Watched this trash only because Joaquin Phoeni...,03 Sep 2024,★
1,/film_addiction/,children-of-divorce,This was the first Gary Coopers movie that I w...,25 Aug 2024,★★★★★
2,/film_addiction/,morocco,Spoilers !A classic without a doubt. Dietrich ...,25 Aug 2024,★★★
3,/film_addiction/,design-for-living,Absolutely loved this movie ! The humor and th...,25 Aug 2024,★★★★★
4,/dustymoth/,jfk,It was so long that I stopped caring who kille...,22 Oct 2024,★★


In [4]:
# Transfer reviews date time
# Check for out-of-bounds dates
invalid_dates = reviews[~reviews['date'].str.match(r'^\d{2} \w{3} \d{4}$')]

# Attempt to convert the 'date' column to datetime, while handling errors
reviews['date'] = pd.to_datetime(reviews['date'], errors='coerce')

# Check for rows with NaT (Not a Time) which indicates failed conversions
invalid_date_rows = reviews[reviews['date'].isna()]

# Optionally, drop or fill NaT values
reviews = reviews.dropna(subset=['date']) 


In [5]:
print('The first date:',reviews['date'].max())
print('The last date:',reviews['date'].min())

The first date: 2024-11-08 00:00:00
The last date: 1776-07-04 00:00:00


In [5]:
# remove the irreliable date   letterboxed.com was found in 2011-10-24
reviews = reviews[reviews['date']>='2011-10-24 00:00:00']

In [6]:
# Transfer the rating
# Define a function to convert star ratings to numerical values
def convert_rating(rating):
    # Remove any fractions from the ratings
    if '½' in rating:
        return (rating.count('★') + 0.5)  # Count stars and add 0.5 for half star
    return rating.count('★')  # Count stars

# Apply the conversion function to the 'rating' column
reviews['rating'] = reviews['rating'].apply(convert_rating)

In [30]:
# get the rating distribution
reviews['rating'].value_counts()

rating
4.0    172513
3.0    131675
3.5    131198
5.0    119327
4.5     77958
0.0     68549
2.5     63154
2.0     55145
1.0     24051
1.5     23295
0.5     14290
Name: count, dtype: int64

In [7]:
# split train and test
date = reviews[['user_id','date']].drop_duplicates().groupby('user_id')['date'].count().reset_index().sort_values(by=['date']).rename(columns={'date':'num_date'})
#only keep the number of watched date >2
date = date[date['num_date']>2]
reviews = date.merge(reviews, how='inner',on='user_id')
reviews.head()

Unnamed: 0,user_id,num_date,film,comment,date,rating
0,/camomila_loures/,3,deadpool,Filme ruim,2024-08-16,2.0
1,/camomila_loures/,3,a-boy-called-christmas,Odeio esse filme,2024-08-16,1.0
2,/camomila_loures/,3,black-widow-2021,AMOOOOOOO DEMAIS,2024-08-16,5.0
3,/camomila_loures/,3,clueless,É meio chatinho mas até que é legal,2024-08-16,3.0
4,/camomila_loures/,3,coraline,Amo muito,2024-08-16,5.0


In [8]:
# Cleaning the userid
reviews['user_id']=reviews['user_id'].str.replace('/','')

# last_watched 
last_watched = reviews.groupby('user_id')['date'].max().reset_index()
last_watched_df = reviews[['user_id','date','film','rating']].merge(last_watched,how='inner',on=['user_id','date'])

# training set
merged_df = reviews.merge(last_watched_df[['user_id', 'date', 'film']], on=['user_id', 'date', 'film'], how='left', indicator=True)
reviews = merged_df[merged_df['_merge']=='left_only']

# traing set = reviews
# test set = last_watched_df


In [9]:
# Get unique user_id and film
user_film = reviews[['user_id','film']].drop_duplicates()

# Get the unique films and unique users
print('Unique users: ',len(user_film['user_id'].unique()))
print('Unique films: ',len(user_film['film'].unique()))

Unique users:  8491
Unique films:  90418


In [10]:
# Check the film number per user
film_per_user = user_film.groupby('user_id')['film'].count().reset_index()
film_per_user = film_per_user.rename(columns={'film':'Count'})
print('The max number film per user:', film_per_user['Count'].max())
print('The min number film per user:', film_per_user['Count'].min())
print('The average number film per user:', film_per_user['Count'].mean())
print('The median number film per user:', film_per_user['Count'].median())
print('The 25 percent number film per user:', film_per_user['Count'].quantile(0.25))
print('The 75 percent number film per user:', film_per_user['Count'].quantile(0.75))

The max number film per user: 316
The min number film per user: 2
The average number film per user: 91.14968790484042
The median number film per user: 114.0
The 25 percent number film per user: 64.0
The 75 percent number film per user: 118.0


In [11]:
# check the percentage
data = film_per_user['Count']
bins = [0, 10, 30, 50, 70,90,110,130,150,170,200]  
labels = ['0-10', '10-30','30-50', '50-70','70-90','90-110','110-130','130-150','150-170','170-200']  
data_binned = pd.cut(data, bins=bins, labels=labels)
data_binned = data_binned.value_counts().reset_index().rename(columns={'Count':'Bin'})
data_binned['percentage'] = data_binned['count']/data_binned['count'].sum()*100
data_binned

Unnamed: 0,Bin,count,percentage
0,110-130,4611,54.317352
1,90-110,1044,12.298268
2,30-50,698,8.222405
3,10-30,695,8.187066
4,50-70,592,6.973731
5,70-90,511,6.019555
6,0-10,309,3.640005
7,130-150,23,0.270939
8,150-170,6,0.07068
9,170-200,0,0.0


In [12]:
# Check the user number per film
user_per_film = user_film.groupby('film')['user_id'].count().reset_index()
user_per_film = user_per_film.rename(columns={'user_id':'Count'})
print('The max number user per film:', user_per_film['Count'].max())
print('The min number user per film:', user_per_film['Count'].min())
print('The average number user per film:', user_per_film['Count'].mean())
print('The median number user per film:', user_per_film['Count'].median())
print('The 25 percent number user per film:', user_per_film['Count'].quantile(0.25))
print('The 75 percent number film per user:', user_per_film['Count'].quantile(0.75))

The max number user per film: 2384
The min number user per film: 1
The average number user per film: 8.559711561857153
The median number user per film: 1.0
The 25 percent number user per film: 1.0
The 75 percent number film per user: 4.0


In [13]:
# check the percentage
data_film = user_per_film['Count']
bins = [0, 1, 3, 5, 10,20,50,100,200,500,1000,10000]  
labels = ['0-1', '1-3','3-5', '5-10','10-20','20-50','50-100','100-200','200-500','500-1000','1000-10000']  
data_film_binned = pd.cut(data_film, bins=bins, labels=labels)
data_film_binned = data_film_binned.value_counts().reset_index().rename(columns={'Count':'Bin'})
data_film_binned['percentage'] = data_film_binned['count']/data_film_binned['count'].sum()*100
data_film_binned

Unnamed: 0,Bin,count,percentage
0,0-1,45943,50.811785
1,1-3,20094,22.223451
2,3-5,6788,7.507355
3,5-10,6773,7.490765
4,10-20,4618,5.10739
5,20-50,3499,3.869805
6,50-100,1402,1.550576
7,100-200,794,0.878144
8,200-500,399,0.441284
9,500-1000,81,0.089584


## Filtering

1. Filter users and film by isolation forest method

In [14]:
# define function
def insolationforest(df):
    # Use insolation Forest to test outliers
    contamination_levels = [0.1, 0.2, 0.3,0.4]
    best_contamination = None
    best_silhouette = -1  # Initialize with a low silhouette score

    for contamination in contamination_levels:
        model = IsolationForest(contamination=contamination)
        df['anomaly'] = model.fit_predict(df[['Count']])
        
        # Calculate the Silhouette Score
        if (df['anomaly'] == -1).sum() > 0:  # Ensure there are outliers
            score = silhouette_score(df[['Count']], df['anomaly'])
            print(f"Contamination: {contamination}, Silhouette Score: {score}")
            
            # Track the best contamination level
            if score > best_silhouette:
                best_silhouette = score
                best_contamination = contamination
    print(f"Best contamination level: {best_contamination} with Silhouette Score: {best_silhouette}")

In [15]:
# get the filtered dataframe
def filtered_df_IF(df,key,param):
    # Fit the model
    model = IsolationForest(contamination=param)  # Adjust contamination based on expected outlier ratio
    df['anomaly'] = model.fit_predict(df[['Count']])

    # Identify outliers
    outliers_film = df[df['anomaly'] == -1]
    outliers_film = outliers_film.reset_index()[[key,'Count']]
    outliers_film.sort_values(by='Count')

    # Filter outliters from film_per_user
    merged = df.merge(outliers_film, on=[key, 'Count'], how='left', indicator=True)
    filtered_df_IF = merged[merged['_merge'] == 'left_only'].drop(columns='_merge')
    return filtered_df_IF

In [52]:
# filter user
insolationforest(film_per_user)

Contamination: 0.1, Silhouette Score: 0.5419593996270637
Contamination: 0.2, Silhouette Score: 0.46895582843318756
Contamination: 0.3, Silhouette Score: 0.6125667390095253
Contamination: 0.4, Silhouette Score: 0.67652317496699
Best contamination level: 0.4 with Silhouette Score: 0.67652317496699


In [53]:
# filter film
insolationforest(user_per_film)

Contamination: 0.1, Silhouette Score: 0.8398832164867056
Contamination: 0.2, Silhouette Score: 0.6910720047349558
Contamination: 0.3, Silhouette Score: 0.5701559043050332
Contamination: 0.4, Silhouette Score: 0.44985803534722807
Best contamination level: 0.1 with Silhouette Score: 0.8398832164867056


In [16]:
# get the filtered user table
filtered_user_IF = filtered_df_IF(film_per_user,'user_id',0.4)

# get the filtered user table
filtered_film_IF = filtered_df_IF(user_per_film,'film',0.1)

2. Filter user and film manually

In [17]:
# Filter the user and film below medium value
filtered_user_median=film_per_user[(film_per_user['Count']>=film_per_user['Count'].median())]
filtered_film_median=user_per_film[(user_per_film['Count']>=user_per_film['Count'].median())]

# Filter the user and film below MEAN value
filtered_user_mean=film_per_user[(film_per_user['Count']>=film_per_user['Count'].mean())]
filtered_film_mean=user_per_film[(user_per_film['Count']>=user_per_film['Count'].mean())]

# Filter the user and film between 0.25-0.75
user_25 = film_per_user['Count'].quantile(0.25)
user_75 = film_per_user['Count'].quantile(0.75)
film_25 = user_per_film['Count'].quantile(0.25)
film_75 = user_per_film['Count'].quantile(0.75)
filtered_user_quantile = film_per_user[(film_per_user['Count']>user_25)&(film_per_user['Count']<user_75)]
filtered_film_quantile = user_per_film[(user_per_film['Count']>film_25)&(user_per_film['Count']<film_75)]

# Filter the user and film predefined(65% of users)
filtered_user_65 = film_per_user[(film_per_user['Count']>=90)&(film_per_user['Count']<=130)]
filtered_film_50_100 = user_per_film[(user_per_film['Count']>=50)&(user_per_film['Count']<=100)]
filtered_film_50_200 = user_per_film[(user_per_film['Count']>=50)&(user_per_film['Count']<=200)]
filtered_film_100_500 = user_per_film[(user_per_film['Count']>=100)&(user_per_film['Count']<=500)]
filtered_film_100_1000 = user_per_film[(user_per_film['Count']>=100)&(user_per_film['Count']<=1000)]
filtered_film_200_1000 = user_per_film[(user_per_film['Count']>=200)&(user_per_film['Count']<=1000)]
filtered_film_200_500 = user_per_film[(user_per_film['Count']>=200)&(user_per_film['Count']<=500)]

## Check sparsity

In [18]:
# Define a funtion to calcualte the matrix 
def filtered_user_film_matrix(filtered_user,filtered_film):

    # merge the final table user-film-rating
    final_filtered_df = filtered_user.merge(reviews,how='left',on='user_id').merge(filtered_film,how='inner',on='film')
    final_filtered_df=final_filtered_df[['user_id','film','rating']]
    final_filtered_df = final_filtered_df.drop_duplicates()

    # Initialize separate LabelEncoders for user_id and Film
    #user_encoder = LabelEncoder()
    #film_encoder = LabelEncoder()

    # Encode the 'user_id' column
    #final_filtered_df['user_id'] = user_encoder.fit_transform(final_filtered_df['user_id'])

    # Encode the 'Film' column
    #final_filtered_df['film'] = film_encoder.fit_transform(final_filtered_df['film'])

    # Create user-item matrix
    user_item_matrix = final_filtered_df.pivot_table(index='user_id', columns='film', values='rating', fill_value=0)

    # Calculate sparsity
    num_non_zero_entries = (user_item_matrix != 0).sum().sum()  # Count of non-zero entries
    total_entries = user_item_matrix.size  # Total entries in the matrix
    sparsity = 1 - (num_non_zero_entries / total_entries)  # Sparsity value
    sparsity_percentage = sparsity * 100  # Sparsity percentage

    print(f'Matix size:',user_item_matrix.shape)
    print(f"Sparsity: {sparsity:.4f}")
    print(f"Sparsity Percentage: {sparsity_percentage:.2f}%")
    return user_item_matrix

In [57]:
# Check sparsity
df_userIF_filmIF = filtered_user_film_matrix(filtered_user_IF,filtered_film_IF)

Matix size: (5112, 72218)
Sparsity: 0.9996
Sparsity Percentage: 99.96%


In [58]:
# Check sparsity
df_userMEDI_filmMEDI = filtered_user_film_matrix(filtered_user_median,filtered_film_median)

Matix size: (4276, 78204)
Sparsity: 0.9986
Sparsity Percentage: 99.86%


In [59]:
# Check sparsity
df_userAVG_filmAVG = filtered_user_film_matrix(filtered_user_mean,filtered_film_mean)

Matix size: (5659, 12702)
Sparsity: 0.9934
Sparsity Percentage: 99.34%


In [60]:
# Check sparsity
df_userPERCEN_filmPERCEN = filtered_user_film_matrix(filtered_user_quantile,filtered_film_quantile)

Matix size: (3086, 14032)
Sparsity: 0.9996
Sparsity Percentage: 99.96%


In [61]:
# Check sparsity
df_user65_film50_200 = filtered_user_film_matrix(filtered_user_65,filtered_film_50_200)

Matix size: (5662, 2254)
Sparsity: 0.9870
Sparsity Percentage: 98.70%


In [62]:
# Check sparsity
df_userIF_film50_200 = filtered_user_film_matrix(filtered_user_IF,filtered_film_50_200)

Matix size: (5128, 2254)
Sparsity: 0.9869
Sparsity Percentage: 98.69%


In [63]:
# Check sparsity
df_userIF_film50_100 = filtered_user_film_matrix(filtered_user_IF,filtered_film_50_100)

Matix size: (5116, 1460)
Sparsity: 0.9902
Sparsity Percentage: 99.02%


In [64]:
# Check sparsity
df_userIF_film100_500 = filtered_user_film_matrix(filtered_user_IF,filtered_film_100_500)

Matix size: (5122, 1213)
Sparsity: 0.9742
Sparsity Percentage: 97.42%


In [65]:
# Check sparsity
df_userIF_film100_1000 = filtered_user_film_matrix(filtered_user_IF,filtered_film_100_1000)

Matix size: (5123, 1294)
Sparsity: 0.9703
Sparsity Percentage: 97.03%


In [66]:
# Check sparsity
df_userIF_film200_1000 = filtered_user_film_matrix(filtered_user_IF,filtered_film_200_1000)

Matix size: (5094, 483)
Sparsity: 0.9522
Sparsity Percentage: 95.22%


In [19]:
# Check sparsity
df_user65_film200_1000 = filtered_user_film_matrix(filtered_user_65,filtered_film_200_1000)

Matix size: (5624, 483)
Sparsity: 0.9523
Sparsity Percentage: 95.23%


In [68]:
# Check sparsity
df_userIF_film200_500 = filtered_user_film_matrix(filtered_user_IF,filtered_film_200_500)

Matix size: (5074, 402)
Sparsity: 0.9604
Sparsity Percentage: 96.04%


We will choose df_user65_film200_1000 as basetable to go further analysis

In [22]:
# filtered table
def merge_table(filtered_user,filtered_film,reviews):
    # Merge the final table user-film-rating
    final_filtered_df = filtered_user.merge(reviews, how='left', on='user_id') \
                                        .merge(filtered_film, how='inner', on='film')
    final_filtered_df = final_filtered_df[['user_id', 'film', 'date','rating']].drop_duplicates()
    return final_filtered_df

In [23]:
# save the train set which is used to build the model
final_filtered_df = merge_table(filtered_user_65,filtered_film_200_1000,reviews)
final_filtered_df.to_csv('train_df.csv')

In [24]:
# save the test set which is used to evaluate the model
last_watched_df2 = final_filtered_df.merge(last_watched_df,how='left',on='user_id')[['user_id','film_y']].rename(columns={'film_y':'film'})
last_watched_df2 = last_watched_df2.groupby('user_id').agg({
        'film': lambda x: set(x)       
    }).reset_index().rename(columns = {'film':'true_items'})
last_watched_df2.to_csv('last_watched.csv')

## SVD model

In [25]:
# user-based collaborative filtering by using SVD

# define the recommendation function 
def get_recommendations_SVD(final_filtered_df, user_ids, num_recommendations=5):
    """
    Generate top N recommendations for each user in user_ids using SVD.
    """

    # Load data into Surprise's Dataset format
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(final_filtered_df[['user_id', 'film', 'rating']], reader)
    
    # Build the trainset and train the SVD model
    trainset = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)

    # Initialize an empty list to store recommendations
    recommendations_list = []

    # Loop through each user and get recommendations
    for user_id in user_ids:
        try:
            # Get a list of all items
            all_items = trainset.all_items()
            all_item_ids = [trainset.to_raw_iid(item) for item in all_items]

            # Get items the user has already rated
            user_inner_id = trainset.to_inner_uid(user_id)
            user_rated_items = set([trainset.to_raw_iid(item[0]) for item in trainset.ur[user_inner_id]])

            # Filter out items the user has already rated
            items_to_predict = [item for item in all_item_ids if item not in user_rated_items]

            # Predict ratings for all items the user has not rated yet
            predictions = [algo.predict(user_id, item_id) for item_id in items_to_predict]

            # Sort predictions by estimated rating and get the top N
            predictions.sort(key=lambda x: x.est, reverse=True)
            top_n_recommendations = predictions[:num_recommendations]

            # Append each recommendation to the list as a tuple (user_id, film, predicted_rating)
            for pred in top_n_recommendations:
                recommendations_list.append((user_id, pred.iid, pred.est))
        
        except ValueError:
            # Skip users that aren't in the training set without printing a message
            continue
    # Convert the list of recommendations to a DataFrame
    recommendations_df = pd.DataFrame(recommendations_list, columns=['user_id', 'film', 'predicted_rating'])

    return recommendations_df


In [26]:
# Assuming you have lists of filtered users and films and a list of user IDs    
user_ids = final_filtered_df['user_id'].unique()  # Get unique user IDs from your data
pred_list = get_recommendations_SVD(final_filtered_df, user_ids, num_recommendations=5)    # -- > no split dataset can work fast

# Convert the list of recommendations to a DataFrame
svd_df = pd.DataFrame(pred_list)

# transform the table
prediction_svd_df = svd_df.groupby('user_id').agg({
    'film': lambda x: set(x)}).reset_index().rename(columns={'film':'predicted_items'})

In [27]:
# Final table
# merge recommended table and last watched table
svd_recommended_df = prediction_svd_df.merge(last_watched_df2, how='left',on='user_id')

## Similarity Model

In [28]:
# calculate the similarity
user_similarity = cosine_similarity(df_user65_film200_1000.fillna(0))
user_similarity_df = pd.DataFrame(user_similarity, index=df_user65_film200_1000.index, columns=df_user65_film200_1000.index)

In [29]:
# Function to predict ratings for a single user
def predict_ratings(user_id, df, user_similarity_df, top_k=2):
    # Get similar users for the target user
    similar_users = user_similarity_df[user_id].sort_values(ascending=False).iloc[1:top_k+1]
    # Weighted average of similar users' ratings
    predicted_ratings = (similar_users.values[:, None] * df.loc[similar_users.index]).sum(axis=0) / similar_users.sum()
    return predicted_ratings

# Predict ratings for all users
def predict_all_users(df, user_similarity_df, top_k=2):
    predicted_ratings_all = {}
    for user_id in df.index:
        predicted_ratings_all[user_id] = predict_ratings(user_id, df, user_similarity_df, top_k)
    # Convert predictions into a DataFrame
    return pd.DataFrame(predicted_ratings_all).T

# Predict ratings for all users
predicted_ratings_df = predict_all_users(df_user65_film200_1000, user_similarity_df)


In [30]:
# extract top k recommended items
def get_top_k(predicted_df, k=5):
    return predicted_df.apply(lambda x: x.nlargest(k).index, axis=1)

top_k_recommendations = get_top_k(predicted_ratings_df, k=5)

# Transform data
def transform_data(data):
    # Convert each user's films into a set
    transformed_data = [{'user_id': user_id, 'film': set(films)} for user_id, films in data.items()]
    # Convert into a DataFrame
    return pd.DataFrame(transformed_data)

# Apply transformation
transformed_df = transform_data(top_k_recommendations)

# get the final table as data frame
similarity_df = transformed_df.reset_index().drop(columns=['index'])

In [31]:
# merge recommended table and last watched table
similarity_recommended_df = similarity_df.merge(last_watched_df2, how='left',on='user_id').rename(columns={'film':'predicted_items'})

## Matrix Factorization Model

In [32]:
# matrix factorization function
def matrix_factorization(final_filtered_df, num_factors=10, num_iterations=100, learning_rate=0.01, reg_param=0.02):
    """
    Perform matrix factorization using gradient descent.

    Parameters:
        final_filtered_df (pd.DataFrame): DataFrame with columns ['user_id', 'film', 'rating'].
        num_factors (int): Number of latent factors.
        num_iterations (int): Number of iterations for gradient descent.
        learning_rate (float): Learning rate for gradient updates.
        reg_param (float): Regularization parameter.

    Returns:
        dict: User latent factors (P) and Item latent factors (Q).
        pd.DataFrame: Predicted ratings DataFrame.
    """
    # Create a mapping for user and film IDs to matrix indices
    user_mapping = {user: idx for idx, user in enumerate(final_filtered_df['user_id'].unique())}
    film_mapping = {film: idx for idx, film in enumerate(final_filtered_df['film'].unique())}
    reverse_film_mapping = {idx: film for film, idx in film_mapping.items()}

    # Get the number of users and items
    num_users = len(user_mapping)
    num_items = len(film_mapping)

    # Initialize user and item latent factor matrices (random values)
    P = np.random.normal(scale=1.0 / num_factors, size=(num_users, num_factors))
    Q = np.random.normal(scale=1.0 / num_factors, size=(num_items, num_factors))

    # Create a ratings matrix
    R = np.zeros((num_users, num_items))
    for _, row in final_filtered_df.iterrows():
        R[user_mapping[row['user_id']], film_mapping[row['film']]] = row['rating']

    # Perform gradient descent
    for iteration in range(num_iterations):
        for u in range(num_users):
            for i in range(num_items):
                if R[u, i] > 0:  # Only update for non-zero ratings
                    # Compute the error for the prediction
                    error = R[u, i] - np.dot(P[u, :], Q[i, :])
                    
                    # Update user and item latent factors
                    P[u, :] += learning_rate * (error * Q[i, :] - reg_param * P[u, :])
                    Q[i, :] += learning_rate * (error * P[u, :] - reg_param * Q[i, :])
        
        # Compute total loss (optional, for monitoring)
        total_loss = 0
        for u in range(num_users):
            for i in range(num_items):
                if R[u, i] > 0:
                    total_loss += (R[u, i] - np.dot(P[u, :], Q[i, :])) ** 2
                    total_loss += reg_param * (np.linalg.norm(P[u, :]) ** 2 + np.linalg.norm(Q[i, :]) ** 2)
        print(f"Iteration {iteration + 1}/{num_iterations}, Loss: {total_loss:.4f}")

    # Predict ratings for all user-item pairs
    predicted_ratings = np.dot(P, Q.T)

    # Convert predictions back to a DataFrame
    recommendations = []
    for user_id, user_idx in user_mapping.items():
        for item_idx, predicted_rating in enumerate(predicted_ratings[user_idx]):
            film_id = reverse_film_mapping[item_idx]
            recommendations.append((user_id, film_id, predicted_rating))

    recommendations_df = pd.DataFrame(recommendations, columns=['user_id', 'film', 'predicted_rating'])

    return {'P': P, 'Q': Q}, recommendations_df


In [33]:
# train model
latent_factors, matrix_factorized_pred= matrix_factorization(final_filtered_df, num_factors=10, num_iterations=100, learning_rate=0.01, reg_param=0.02)

Iteration 1/100, Loss: 1334812.2122
Iteration 2/100, Loss: 158638.5386
Iteration 3/100, Loss: 113377.1348
Iteration 4/100, Loss: 107795.9137
Iteration 5/100, Loss: 104932.0882
Iteration 6/100, Loss: 102695.9267
Iteration 7/100, Loss: 100686.2910
Iteration 8/100, Loss: 98766.8722
Iteration 9/100, Loss: 96880.9683
Iteration 10/100, Loss: 95006.4936
Iteration 11/100, Loss: 93140.9287
Iteration 12/100, Loss: 91294.1729
Iteration 13/100, Loss: 89483.4530
Iteration 14/100, Loss: 87728.7321
Iteration 15/100, Loss: 86048.6996
Iteration 16/100, Loss: 84457.9539
Iteration 17/100, Loss: 82965.7427
Iteration 18/100, Loss: 81576.0628
Iteration 19/100, Loss: 80288.5716
Iteration 20/100, Loss: 79099.7723
Iteration 21/100, Loss: 78004.1400
Iteration 22/100, Loss: 76995.0429
Iteration 23/100, Loss: 76065.4231
Iteration 24/100, Loss: 75208.2601
Iteration 25/100, Loss: 74416.8547
Iteration 26/100, Loss: 73684.9835
Iteration 27/100, Loss: 73006.9639
Iteration 28/100, Loss: 72377.6637
Iteration 29/100, Los

In [34]:
# transform the table
prediction_MF_df = matrix_factorized_pred.groupby('user_id').agg({'film': lambda x: set(x)}).reset_index().rename(columns={'film':'predicted_items'})

# Final table
# merge recommended table and last watched table
MF_recommended_df = prediction_MF_df.merge(last_watched_df2, how='left',on='user_id')

## Clustering Model

In [38]:
# clustering function
def clustering_recommendations(final_filtered_df, num_clusters=5, top_n=5):
    """
    Generate recommendations using clustering-based methods.

    Parameters:
        final_filtered_df (pd.DataFrame): DataFrame with columns ['user_id', 'film', 'rating'].
        num_clusters (int): Number of clusters for K-Means.
        top_n (int): Number of recommendations per user.

    Returns:
        pd.DataFrame: DataFrame with recommendations for each user.
    """
    # Create a user-item interaction matrix
    interaction_matrix = final_filtered_df.pivot_table(
        index='user_id', columns='film', values='rating', fill_value=0
    )

    # Perform K-Means clustering on users
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    user_clusters = kmeans.fit_predict(interaction_matrix)

    # Add cluster information to the DataFrame
    interaction_matrix['cluster'] = user_clusters
    cluster_centroids = pd.DataFrame(kmeans.cluster_centers_, columns=interaction_matrix.columns[:-1])

    # Generate recommendations for each user
    recommendations = []
    for user_id, row in interaction_matrix.iterrows():
        user_cluster = row['cluster']
        user_ratings = row.drop('cluster')

        # Find the cluster centroid and suggest top items not rated by the user
        cluster_mean = cluster_centroids.loc[user_cluster]
        unrated_items = user_ratings[user_ratings == 0].index
        recommended_items = cluster_mean[unrated_items].sort_values(ascending=False).head(top_n)

        for film, score in recommended_items.items():
            recommendations.append((user_id, film, score))

    # Convert recommendations to a DataFrame
    recommendations_df = pd.DataFrame(recommendations, columns=['user_id', 'film', 'predicted_rating'])

    return recommendations_df


In [39]:
# run the funciton
clustering_pred = clustering_recommendations(final_filtered_df, num_clusters=5, top_n=5)

In [40]:
# transform the table
prediction_Clustering_df = clustering_pred.groupby('user_id').agg({'film': lambda x: set(x)}).reset_index().rename(columns={'film':'predicted_items'})

# Final table
# merge recommended table and last watched table
Clustering_recommended_df = prediction_Clustering_df.merge(last_watched_df2, how='left',on='user_id')

## Association Rule

In [68]:
def association_rule_recommendations_fpgrowth(final_filtered_df, min_support=0.05, min_confidence=0.6, top_n=5):
    """
    Generate recommendations using FP-Growth for frequent itemset mining.
    """
    interaction_matrix = final_filtered_df.pivot_table(
        index='user_id', columns='film', values='rating', aggfunc='count', fill_value=0
    ).applymap(lambda x: 1 if x > 0 else 0)

    frequent_itemsets = fpgrowth(interaction_matrix, min_support=min_support, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence,num_itemsets=10)

    print(f"Number of rules generated: {len(rules)}")

    recommendations = []
    for user_id, row in interaction_matrix.iterrows():
        user_items = set(row[row == 1].index)
        applicable_rules = rules[rules['antecedents'].apply(lambda x: x.issubset(user_items))]

        print(f"User {user_id} - {len(applicable_rules)} applicable rules found")

        for _, rule in applicable_rules.nlargest(top_n, 'confidence').iterrows():
            for item in rule['consequents']:
                if item not in user_items:
                    recommendations.append((user_id, item, rule['confidence']))

    return pd.DataFrame(recommendations, columns=['user_id', 'film', 'confidence'])


In [79]:
# apply
AR_pred = association_rule_recommendations_fpgrowth(final_filtered_df, min_support=0.03, min_confidence=0.5, top_n=5)

  interaction_matrix = final_filtered_df.pivot_table(


Number of rules generated: 35
User 00skywalkerr - 9 applicable rules found
User 0dsanyu - 1 applicable rules found
User 0nly0ne0f - 0 applicable rules found
User 0nn109 - 2 applicable rules found
User 10aflyviper - 2 applicable rules found
User 10romeo - 0 applicable rules found
User 11us - 14 applicable rules found
User 13visions - 6 applicable rules found
User 15digitusername - 4 applicable rules found
User 1_lick_walls - 0 applicable rules found
User 1carly - 1 applicable rules found
User 1so - 0 applicable rules found
User 1zzythe_strange - 0 applicable rules found
User 2bladesbalin - 0 applicable rules found
User 2ndrules - 1 applicable rules found
User 2old2dieyoung - 0 applicable rules found
User 2spiderheads - 2 applicable rules found
User 360smash - 0 applicable rules found
User 3a_kd - 1 applicable rules found
User 3goldballs - 1 applicable rules found
User 3lillen - 2 applicable rules found
User 46tomatoes - 1 applicable rules found
User 47ronin100 - 4 applicable rules found

In [77]:
# transform the table
prediction_AR_df = AR_pred.groupby('user_id').agg({'film': lambda x: set(x)}).reset_index().rename(columns={'film':'predicted_items'})

# Final table
# merge recommended table and last watched table
AR_recommended_df = prediction_AR_df.merge(last_watched_df2, how='left',on='user_id')

## Evaluation

In [36]:
# Evalution function
# Function to calculate precision, recall, F1, and accuracy
def calculate_metrics_with_accuracy(df, k):
    results = []
    for _, row in df.iterrows():
        predicted_items = set(row['predicted_items'])
        true_items = set(row['true_items'])
        
        predicted_k = set(list(predicted_items)[:k])
        intersection = predicted_k.intersection(true_items)

        # Precision, Recall, and F1
        precision = len(intersection) / len(predicted_k) if len(predicted_k) > 0 else 0
        recall = len(intersection) / len(true_items) if len(true_items) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        # Accuracy
        total_items = len(predicted_k.union(true_items))
        accuracy = len(intersection) / total_items if total_items > 0 else 0
        
        results.append({
            'user_id': row['user_id'], 
            'precision_at_k': precision, 
            'recall_at_k': recall, 
            'f1_score': f1,
            'accuracy': accuracy
        })

    return pd.DataFrame(results)


def evaluator(df,k=5):
    metrics = calculate_metrics_with_accuracy(df,k)
    # Calculate overall precision, recall, and F1-score by taking the mean
    overall_precision = metrics['precision_at_k'].mean()
    overall_recall = metrics['recall_at_k'].mean()
    overall_f1 = metrics['f1_score'].mean()
    overall_accuracy = metrics['accuracy'].mean()

    # Print the overall accuracy metrics
    print(f"Overall Precision: {overall_precision:.4f}")
    print(f"Overall Recall: {overall_recall:.4f}")
    print(f"Overall F1-Score: {overall_f1:.4f}")
    print(f"Overall Accuracy: {overall_accuracy:.4f}")

In [None]:
# SVD model
evaluator(svd_recommended_df)

Overall Precision: 0.0006
Overall Recall: 0.0025
Overall F1-Score: 0.0009
Overall Accuracy: 0.0006


In [None]:
# Similarity
evaluator(similarity_recommended_df)

Overall Precision: 0.0013
Overall Recall: 0.0046
Overall F1-Score: 0.0020
Overall Accuracy: 0.0012


In [81]:
# Matrix Factorization
print(MF_recommended_df.shape)
evaluator(MF_recommended_df)

(5624, 3)
Overall Precision: 0.0002
Overall Recall: 0.0006
Overall F1-Score: 0.0003
Overall Accuracy: 0.0002


In [82]:
# Clustering
print(Clustering_recommended_df.shape)
evaluator(Clustering_recommended_df)

(5624, 3)
Overall Precision: 0.0032
Overall Recall: 0.0116
Overall F1-Score: 0.0048
Overall Accuracy: 0.0028


In [83]:
# Association Rule
print(AR_recommended_df.shape)
evaluator(AR_recommended_df)

(421, 3)
Overall Precision: 0.0000
Overall Recall: 0.0000
Overall F1-Score: 0.0000
Overall Accuracy: 0.0000
