## 1. Project Summary

## 2. Imports and Loading of Data

In [140]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD

from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate


In [141]:
# Read the ratings table and the movie metadata table from the data directory.
ratings, movies = map(pd.read_csv, ('../data/ratings.csv', '../data/movies.csv'))

In [142]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [143]:
# Column dtypes and non-null counts for the ratings table.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [144]:
# Preview the first rows of the movie metadata table.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [145]:
# Column dtypes and non-null counts for the movie metadata table.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


## 3. Data Preparation
### Data Cleaning

In [146]:
# Drop the timestamp column, which the steps that follow do not need.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
ratings = ratings.drop(columns='timestamp', errors='ignore')
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [147]:
# Reader and Dataset are already imported in the imports cell at the top.

# Surprise's Reader assumes a 1-5 rating scale unless told otherwise, so take
# the scale from the data itself instead of relying on that default.
reader = Reader(rating_scale=(ratings["rating"].min(), ratings["rating"].max()))

# load_from_df expects exactly the user, item and rating columns, in that
# order. Selecting them explicitly keeps this cell re-runnable after later
# cells add helper columns to `ratings`.
data = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader)

### Checking how sparse the user-item data is

In [148]:
# Build a Surprise trainset from every available rating and report its size.
dataset = data.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

# Sparsity = share of user-item pairs that have no rating at all.
density = dataset.n_ratings / (dataset.n_users * dataset.n_items)
print(f'\nSparsity of the user-item matrix: {1 - density:.2%}')

Number of users:  610 

Number of items:  9724


### The data is highly sparse, so we remove sparsely represented users and items

In [149]:
# Keep only users who have rated at least 5 movies.
ratings_per_user = ratings["userId"].map(ratings["userId"].value_counts())
ratings = ratings[ratings_per_user >= 5]

# Keep only movies with at least 20 ratings, counted after the user filter.
movie_counts = ratings.groupby("movieId").size()
popular_movies = movie_counts.index[movie_counts >= 20]
ratings = ratings[ratings["movieId"].isin(popular_movies)]

### Feature Engineering via Mean Centering

In [150]:
# `ratings` is a filtered slice at this point, so adding columns with
# ratings[...] = ... can trigger SettingWithCopyWarning. assign() builds a new
# frame instead and leaves the cell safe to re-run.
ratings = ratings.assign(
    # Each user's average rating, broadcast back onto that user's rows.
    user_mean=lambda df: df.groupby("userId")["rating"].transform("mean"),
    # Rating relative to the user's own average (removes per-user rating bias).
    rating_centered=lambda df: df["rating"] - df["user_mean"],
)

### Item - User Matrix Pipeline

In [None]:
class UserItemTransformer(BaseEstimator, TransformerMixin):
    """Pivot a long ratings table into a wide user x item matrix.

    Parameters
    ----------
    index : str, default "userId"
        Column whose values become the rows of the matrix.
    columns : str, default "movieId"
        Column whose values become the columns of the matrix.
    values : str, default "rating_centered"
        Column that fills the cells. The transformer does not centre anything
        itself; it pivots whatever this column already contains.

    Notes
    -----
    User/item pairs with no row in the input come out as NaN. Duplicate
    (index, columns) pairs are averaged, which is pivot_table's default.
    """

    def __init__(self, index="userId", columns="movieId", values="rating_centered"):
        # Stored under the same names as the arguments, as scikit-learn's
        # get_params / set_params / clone machinery requires.
        self.index = index
        self.columns = columns
        self.values = values

    def fit(self, X, y=None):
        """Nothing to learn; present only to satisfy the estimator API."""
        return self

    def transform(self, X):
        """Return the pivoted matrix for the long-format frame ``X``."""
        return X.pivot_table(
            index=self.index,
            columns=self.columns,
            values=self.values,
        )

# Single-step pipeline that turns the long ratings table into a wide matrix.
pipeline = Pipeline(steps=[("user_item", UserItemTransformer())])

# Pivot first, then replace the NaN cells (no rating) with 0.
user_item = pipeline.fit_transform(ratings)
user_item = user_item.fillna(0)

### Final Sparsity Check

In [None]:
# `user_item` has already had its missing cells filled with 0, so calling
# notna() on it would mark every cell as rated. Build the rated/unrated mask
# from the unfilled pivot instead (same rows and columns as `user_item`).
user_item_binary = pipeline.transform(ratings).notna().astype(int)

# Entry (u, v) is the number of movies rated by both user u and user v.
user_overlap = user_item_binary.dot(user_item_binary.T)
MIN_OVERLAP = 3  # Minimum co-rated movies for Pearson to reduce noise from sparsity

### Users with fewer than 5 ratings and movies with fewer than 20 ratings were removed. Ratings were mean-centered per user and missing values filled with zero. Pearson similarity calculations require at least 3 co-rated movies to reduce noise from sparsity.

## 4. Model 1: User-Based Collaborative Filtering

In [None]:
# Compute three user-user similarity matrices (rows/columns follow user_item.index).

# Cosine and Euclidean need a dense matrix, so they use the zero-filled table.
cosine_sim = cosine_similarity(user_item)

# Pearson should only compare movies that both users actually rated. On the
# zero-filled table every pair shares every column, so min_periods would never
# take effect. Use the unfilled pivot (NaN = not rated) so that pairs with
# fewer than MIN_OVERLAP co-rated movies come out as NaN and are then set to 0.
user_item_observed = pipeline.transform(ratings)
pearson_sim = (
    user_item_observed.T
    .corr(method="pearson", min_periods=MIN_OVERLAP)
    .fillna(0)
    .values
)

# Map distance to a similarity in (0, 1] so it is comparable with the others.
euclidean_sim = 1 / (1 + euclidean_distances(user_item))


In [None]:
# Top-N recommendation function for user-based collaborative filtering
def recommend_cf(user_id, sim_matrix, n=5, k=10):
    """Recommend up to ``n`` unrated movies for ``user_id``.

    Parameters
    ----------
    user_id : label in ``user_item.index``
        User to recommend for. Raises KeyError if the user is not in the matrix.
    sim_matrix : 2-D array
        User-user similarities, with rows and columns in ``user_item.index`` order.
    n : int, default 5
        Maximum number of movies to return.
    k : int, default 10
        Number of most similar other users to use as neighbours.

    Returns
    -------
    pandas.DataFrame
        ``title`` and ``genres`` rows from ``movies``, ordered from the
        highest-scoring recommendation to the lowest.
    """
    user_idx = user_item.index.get_loc(user_id)

    # Rank users by similarity and remove the target user by identity rather
    # than assuming they sit in first place (ties or a zero self-similarity
    # would otherwise drop a real neighbour and keep the user themselves).
    ranked_users = np.argsort(sim_matrix[user_idx])[::-1]
    neighbors = ranked_users[ranked_users != user_idx][:k]

    # Plain (unweighted) mean of the neighbours' centred ratings. Cells the
    # neighbours never rated are 0 in user_item, so they pull the score to 0.
    neighbor_mean = user_item.iloc[neighbors].mean()

    # A centred rating of 0 can be a real rating equal to the user's mean, so
    # decide what is unseen from the ratings table, not from `== 0`.
    seen_ids = ratings.loc[ratings["userId"] == user_id, "movieId"]
    unseen_mask = ~user_item.columns.isin(seen_ids)

    recs = neighbor_mean[unseen_mask].sort_values(ascending=False).head(n)

    # isin() alone returns rows in `movies` order; re-sort them by rank so the
    # best recommendation comes first.
    rank = {movie_id: position for position, movie_id in enumerate(recs.index)}
    matched = movies[movies["movieId"].isin(recs.index)]
    matched = matched.assign(_rank=matched["movieId"].map(rank)).sort_values("_rank")
    return matched[["title", "genres"]]


In [None]:
# Sample recommendation: top movies for user 1 using the cosine similarity matrix
recommend_cf(user_id=1, sim_matrix=cosine_sim)

Unnamed: 0,title,genres
474,Blade Runner (1982),Action|Sci-Fi|Thriller
896,One Flew Over the Cuckoo's Nest (1975),Drama
951,Chinatown (1974),Crime|Film-Noir|Mystery|Thriller
975,Cool Hand Luke (1967),Drama
2195,Ferris Bueller's Day Off (1986),Comedy


### Cosine similarity was chosen above as the primary metric because of the sparsity of the matrix. Only the 10 most similar users contribute to each prediction.