In [1]:
!pip install surprise

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import os
# Directory holding the raw data files (u.data, u.user, u.item) that are read further down.
# The path is relative — presumably to the notebook's working directory; adjust if run elsewhere.
folder_path = '../data/raw/ml-100k'


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

# Column layouts of the three raw files (none of them ships a header row).
rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
genre_cols = [f'genre{i}' for i in range(19)]  # 19 genre columns: genre0 .. genre18
movie_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDB_URL'] + genre_cols

# Ratings are tab-separated; users and movies are pipe-separated.
ratings_df = pd.read_csv(f'{folder_path}/u.data', sep='\t', names=rating_cols)
users_df = pd.read_csv(f'{folder_path}/u.user', sep='|', names=user_cols)
# u.item is read as latin-1 rather than pandas' default encoding.
movies_df = pd.read_csv(f'{folder_path}/u.item', sep='|', encoding='latin-1', names=movie_cols)

# Pull the first four-digit number wrapped in parentheses out of the title, e.g. "(1995)" -> "1995".
# The extracted value stays a string; titles without such a group yield NaN.
movies_df['release_year'] = movies_df['title'].str.extract(r'\((\d{4})\)')


In [4]:
from surprise import Dataset, Reader, KNNWithMeans, accuracy
from surprise.model_selection import train_test_split

# Wrap the (user, item, rating) triples in a Surprise dataset; ratings are on a 1-5 scale.
data = Dataset.load_from_df(ratings_df[['user_id', 'movie_id', 'rating']], Reader(rating_scale=(1, 5)))

# Hold out 20% of the ratings for evaluation.
# The seed is fixed so that the split — and therefore the RMSE reported below —
# is reproducible across "Restart Kernel -> Run All".
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# User-based k-NN on cosine similarity, with each user's mean rating taken into account
# (KNNWithMeans).
knn_model = KNNWithMeans(sim_options={'name': 'cosine', 'user_based': True})

# Fit on the training split only; the held-out ratings stay unseen by the model.
knn_model.fit(trainset)

# Predict every held-out (user, item) pair.
predictions = knn_model.test(testset)

# accuracy.rmse() prints its own rounded "RMSE: ..." line and returns the value,
# which is then printed at full precision.
rmse = accuracy.rmse(predictions)
print(f"Test RMSE: {rmse}")


Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9596
Test RMSE: 0.9595528113887499


In [5]:
# User to generate recommendations for; compared against ratings_df['user_id'] in the next cell.
user_id = 1

In [6]:
# Recommend the top-N movies for `user_id` among the movies that user has not rated yet.

# Movie ids the user has already rated (taken from the full ratings table).
rated_by_user = ratings_df.loc[ratings_df['user_id'] == user_id, 'movie_id']

# Every other movie id that appears in the ratings, in order of first appearance.
user_unrated_movies = ratings_df.loc[~ratings_df['movie_id'].isin(rated_by_user), 'movie_id'].unique()

# Predict a rating for each candidate movie.
# knn_model was fit on the training split only, so a movie (or user) that is absent from
# that split cannot be estimated: Surprise then falls back to a default estimate and flags
# the prediction with details['was_impossible']. Such fallbacks are not real predictions,
# so they are left out of the ranking.
predicted_ratings = []
for movie_id in user_unrated_movies:
    predicted = knn_model.predict(uid=user_id, iid=movie_id)
    if predicted.details.get('was_impossible', False):
        continue
    predicted_ratings.append((movie_id, predicted.est))

# Highest estimates first; the sort is stable, so ties keep their first-appearance order.
predicted_ratings.sort(key=lambda x: x[1], reverse=True)
top_n = 10  # Number of top recommendations
top_movies = predicted_ratings[:top_n]

# Build the id -> title lookup once instead of scanning movies_df for every recommendation.
# If an id occurs more than once, the first row wins (same as taking .iloc[0] of a filter).
titles_by_id = movies_df.drop_duplicates(subset='movie_id').set_index('movie_id')['title']

# Pair each recommended movie's title with its estimated rating; an id that is missing from
# movies_df gets a placeholder title instead of raising.
recommended_movies = [
    (titles_by_id.get(m_id, f"<unknown movie {m_id}>"), rating)
    for m_id, rating in top_movies
]

# Display the recommendations with estimated ratings
if not recommended_movies:
    print(f"No recommendations could be computed for user {user_id}.")
for movie, rating in recommended_movies:
    print(f"Movie: {movie}, Estimated Rating: {rating}")


Movie: Santa with Muscles (1996), Estimated Rating: 5
Movie: Boys, Les (1997), Estimated Rating: 5
Movie: Great Day in Harlem, A (1994), Estimated Rating: 5
Movie: Someone Else's America (1995), Estimated Rating: 5
Movie: Saint of Fort Washington, The (1993), Estimated Rating: 5
Movie: Anna (1996), Estimated Rating: 4.950557333627635
Movie: Faust (1994), Estimated Rating: 4.875776249504647
Movie: Entertaining Angels: The Dorothy Day Story (1996), Estimated Rating: 4.860756890168655
Movie: The Deadly Cure (1996), Estimated Rating: 4.778645453218508
Movie: Star Kid (1997), Estimated Rating: 4.771326869306938
