# Setup Environment

In [2]:
# Install the surprise library
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357245 sha256=9c3e37eec78b0aa2146a9a8d893b12e711a78eb87c73ff7cdbd21950a0e22477
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [30]:
!pip install joblib



In [31]:
# Import libraries
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split as surprise_train_test_split
from surprise import accuracy
from sklearn.metrics import accuracy_score
import joblib
from surprise.model_selection import LeaveOneOut

# Load Dataset

In [4]:
# Load the MovieLens dataset
url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
dataset_path = '/content/ml-latest-small.zip'

# Download and unzip the dataset
!wget -nc $url -O $dataset_path
!unzip -n $dataset_path -d /content/

# Load data into pandas dataframes
ratings = pd.read_csv('/content/ml-latest-small/ratings.csv')
movies = pd.read_csv('/content/ml-latest-small/movies.csv')

--2024-06-30 06:36:51--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘/content/ml-latest-small.zip’


2024-06-30 06:36:53 (806 KB/s) - ‘/content/ml-latest-small.zip’ saved [978202/978202]

Archive:  /content/ml-latest-small.zip
   creating: /content/ml-latest-small/
  inflating: /content/ml-latest-small/links.csv  
  inflating: /content/ml-latest-small/tags.csv  
  inflating: /content/ml-latest-small/ratings.csv  
  inflating: /content/ml-latest-small/README.txt  
  inflating: /content/ml-latest-small/movies.csv  


# Exploring Data

In [5]:
# Display the first few rows of the ratings dataset
print("Ratings Data:")
print(ratings.head())

Ratings Data:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [6]:
# Display the first few rows of the movies dataset
print("\nMovies Data:")
print(movies.head())



Movies Data:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [7]:
# Check for missing values
print("\nMissing Values in Ratings:")
print(ratings.isnull().sum())


Missing Values in Ratings:
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64


In [8]:
print("\nMissing Values in Movies:")
print(movies.isnull().sum())


Missing Values in Movies:
movieId    0
title      0
genres     0
dtype: int64


# Data Preprocessing

In [9]:
# Merge ratings with movies to get movie titles in the ratings dataframe
ratings = pd.merge(ratings, movies, on='movieId')
print("Merged Ratings Data:")
print(ratings.head())


Merged Ratings Data:
   userId  movieId  rating   timestamp             title  \
0       1        1     4.0   964982703  Toy Story (1995)   
1       5        1     4.0   847434962  Toy Story (1995)   
2       7        1     4.5  1106635946  Toy Story (1995)   
3      15        1     2.5  1510577970  Toy Story (1995)   
4      17        1     4.5  1305696483  Toy Story (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1  Adventure|Animation|Children|Comedy|Fantasy  
2  Adventure|Animation|Children|Comedy|Fantasy  
3  Adventure|Animation|Children|Comedy|Fantasy  
4  Adventure|Animation|Children|Comedy|Fantasy  


In [10]:
# Create user-item interaction matrix
user_item_matrix = ratings.pivot_table(index='userId', columns='title', values='rating')
print("User-Item Interaction Matrix:")
user_item_matrix.head()


User-Item Interaction Matrix:


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [11]:
# Load the dataset into the surprise library format
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [12]:
# Define the LeaveOneOut cross-validator
loo = LeaveOneOut(n_splits=1, random_state=42)


In [27]:
# Split the dataset using LeaveOneOut and train the model
for trainset, testset in loo.split(data):
    # Initialize the SVD algorithm
    svd = SVD()

    # Train the algorithm on the training set
    svd.fit(trainset)

    # Predict ratings for the test set
    predictions = svd.test(testset)

    # Calculate RMSE
    rmse = accuracy.rmse(predictions)
    print(f"Test RMSE: {rmse}")

    # Calculate accuracy
    threshold = 3.5
    y_true = [1 if true_r >= threshold else 0 for (_, _, true_r, _, _) in predictions]
    y_pred = [1 if est >= threshold else 0 for (_, _, _, est, _) in predictions]

    acc = accuracy_score(y_true, y_pred)
    print(f"Test Accuracy: {acc}")

RMSE: 0.8562
Test RMSE: 0.8562435749668361
Test Accuracy: 0.7475409836065574


## Hyper parameter tuning

In [28]:
# Split the dataset using LeaveOneOut and train the model
for trainset, testset in loo.split(data):
    # Initialize the SVD algorithm
    svd2 = SVD(n_factors=100, n_epochs=40, lr_all=0.01, reg_all=0.2)

    # Train the algorithm on the training set
    svd2.fit(trainset)

    # Predict ratings for the test set
    predictions = svd2.test(testset)

    # Calculate RMSE
    rmse = accuracy.rmse(predictions)
    print(f"Test RMSE: {rmse}")

    # Calculate accuracy
    threshold = 3.5
    y_true = [1 if true_r >= threshold else 0 for (_, _, true_r, _, _) in predictions]
    y_pred = [1 if est >= threshold else 0 for (_, _, _, est, _) in predictions]

    acc = accuracy_score(y_true, y_pred)
    print(f"Test Accuracy: {acc}")

RMSE: 0.8740
Test RMSE: 0.8740234432363941
Test Accuracy: 0.7131147540983607


In [29]:
# Split the dataset using LeaveOneOut and train the model
for trainset, testset in loo.split(data):
    # Initialize the SVD algorithm
    svd3 = SVD(n_factors=10, n_epochs=40, lr_all=0.001, reg_all=0.2)

    # Train the algorithm on the training set
    svd3.fit(trainset)

    # Predict ratings for the test set
    predictions = svd3.test(testset)

    # Calculate RMSE
    rmse = accuracy.rmse(predictions)
    print(f"Test RMSE: {rmse}")

    # Calculate accuracy
    threshold = 3.5
    y_true = [1 if true_r >= threshold else 0 for (_, _, true_r, _, _) in predictions]
    y_pred = [1 if est >= threshold else 0 for (_, _, _, est, _) in predictions]

    acc = accuracy_score(y_true, y_pred)
    print(f"Test Accuracy: {acc}")

RMSE: 0.8808
Test RMSE: 0.88079012058602
Test Accuracy: 0.740983606557377


# Save model

In [34]:
# Save the trained model
joblib.dump(svd, 'svd_model.pkl')


['svd_model.pkl']