### Import the relevant packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn.metrics as metrics
#from surprise import Dataset, Reader
import warnings
warnings.filterwarnings('ignore')

### Loading the Data and Data Preparation

In [3]:
# Read each CSV from the working directory and report its (rows, columns) shape.
links = pd.read_csv("links.csv")
print(links.shape)

movies = pd.read_csv("movies.csv")
print(movies.shape)

ratings = pd.read_csv("ratings.csv")
print(ratings.shape)

# Step 1: pair every link row with its movie row; the inner join keeps only
# movieIds present in both tables.
links_movies = pd.merge(left=links, right=movies, how="inner", on="movieId")

# Step 2: attach the ratings, again keeping only movieIds found on both sides.
links_movies_ratings = pd.merge(
    left=links_movies,
    right=ratings,
    how="inner",
    on="movieId",
)

# Report the merged shape and preview the first five rows.
print(links_movies_ratings.shape)
links_movies_ratings.head(5)

(9742, 3)
(9742, 3)
(43084, 4)
(43084, 8)


Unnamed: 0,movieId,imdbId,tmdbId,title,genres,userId,rating,timestamp
0,1,114709,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982700.0
1,1,114709,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847435000.0
2,1,114709,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106636000.0
3,1,114709,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510578000.0
4,1,114709,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696000.0


In [4]:
# Remove, in place, every row that contains at least one missing value.
links_movies_ratings.dropna(axis=0, how="any", inplace=True)

# Sanity check: per-column count of remaining nulls (all zeros expected).
links_movies_ratings.isnull().sum(axis=0)

Unnamed: 0,0
movieId,0
imdbId,0
tmdbId,0
title,0
genres,0
userId,0
rating,0
timestamp,0


In [5]:
# Drop the external-ID columns (imdbId, tmdbId); the Surprise dataset built
# later only reads userId, movieId and rating.
#
# The result is rebound to the same name instead of using inplace=True, and
# errors="ignore" is passed, so this cell can be re-run safely: a second
# execution no longer raises KeyError because the columns are already gone.
links_movies_ratings = links_movies_ratings.drop(
    columns=["imdbId", "tmdbId"],
    errors="ignore",
)
links_movies_ratings.head(5)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696000.0


#### Preparing the data for Surprise

In [6]:
# Lowest rating present in the data (lower bound for the rating scale).
links_movies_ratings.loc[:, "rating"].min()

0.5

In [7]:
# Highest rating present in the data (upper bound for the rating scale).
links_movies_ratings.loc[:, "rating"].max()

5.0

In [8]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m122.9/154.4 kB[0m [31m4.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2505166 sha256=b77767b0a8a4f80cbb481260a62fb59682a4f2bccf4bd1f2fa2f86a6f8cfb196
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6

In [15]:
from surprise import Dataset, Reader

# Keep only the three columns handed to Surprise, in user / item / rating order.
rating_triplets = links_movies_ratings.loc[:, ["userId", "movieId", "rating"]]

# The reader carries the rating bounds; 0.5 and 5.0 are the observed min and max.
reader = Reader(rating_scale=(0.5, 5.0))

# NOTE: despite its name, `df` is a Surprise Dataset, not a pandas DataFrame.
df = Dataset.load_from_df(rating_triplets, reader)


### Split the data

In [18]:
from surprise.model_selection import train_test_split

# Hold out 20% of the ratings as the test set; random_state is fixed at 42 so
# the same split is produced on every run.
trainset, testset = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)


### Item-based collaborative filtering

#### Configure the model

In [20]:
# Option A: item-based KNN using cosine similarity
from surprise import KNNBasic

# 'name' selects the similarity measure; 'user_based': False makes the
# similarities item-to-item (True would give user-based filtering).
sim_options = dict(name='cosine', user_based=False)

# Instantiate the basic KNN algorithm with those similarity settings.
item_cf_model = KNNBasic(sim_options=sim_options)

# Fit on the training split; fit() returns the model, so the cell shows its repr.
item_cf_model.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7eb1a0282350>

In [None]:
# Option B: item-based KNN using Pearson correlation.
# Running this cell replaces the `item_cf_model` built by the cosine cell.
from surprise import KNNBasic

# 'name' selects the similarity measure (Pearson here); 'user_based': False
# makes the similarities item-to-item (True would give user-based filtering).
sim_options = dict(name='pearson', user_based=False)

# Instantiate the basic KNN algorithm with those similarity settings.
item_cf_model = KNNBasic(sim_options=sim_options)

# Fit on the training split.
item_cf_model.fit(trainset)