## 1. Project Summary

## 2. Imports and Loading of Data

In [40]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD

from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate


In [41]:
ratings = pd.read_csv('../data/ratings.csv')
movies = pd.read_csv('../data/movies.csv')

In [42]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [43]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [44]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [45]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


## 3. Data Preparation
### Data Cleaning

In [46]:
#drop unnecessary columns
ratings = ratings.drop('timestamp', axis = 1)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [47]:
from surprise import Reader, Dataset
# reading in values as Surprise dataset 
reader = Reader()
data = Dataset.load_from_df(ratings, reader)

### We want to see how Sparse are our Users and Items 

In [48]:
dataset = data.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  610 

Number of items:  9724


### Data is highly sparsed, we need to remove Sparse Users and Items

In [49]:
# Filtering users who are inactive 
ratings = ratings.groupby("userId").filter(lambda x: len(x) >= 5)

# Filtering movies which are rarely rated 
movie_counts = ratings.groupby("movieId").size()
ratings = ratings[ratings["movieId"].isin(movie_counts[movie_counts >= 20].index)]

### Feature Engineering via Mean Centering

In [50]:
ratings["user_mean"] = ratings.groupby("userId")["rating"].transform("mean")
ratings["rating_centered"] = ratings["rating"] - ratings["user_mean"]

ratings.describe()

Unnamed: 0,userId,movieId,rating,user_mean,rating_centered
count,67898.0,67898.0,67898.0,67898.0,67898.0
mean,316.273189,12869.939512,3.62306,3.62306,-2.2394790000000002e-17
std,181.360247,26771.478528,1.005625,0.430541,0.9087993
min,1.0,1.0,0.5,1.315789,-4.063636
25%,164.0,849.0,3.0,3.389535,-0.52
50%,313.0,2193.0,4.0,3.628571,0.09243697
75%,474.0,5418.0,4.0,3.921348,0.6032316
max,610.0,168252.0,5.0,5.0,3.431818


### Item - User Matrix Pipeline

In [51]:
class UserItemTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.pivot_table(
            index="userId",
            columns="movieId",
            values="rating_centered"  #We will centre the values here
        )

pipeline = Pipeline([
    ("user_item", UserItemTransformer())
])

user_item = pipeline.fit_transform(ratings).fillna(0)

### Final Sparsity Check

In [52]:
user_item_binary = user_item.notna().astype(int)
user_overlap = user_item_binary.dot(user_item_binary.T)
MIN_OVERLAP = 3  # Minimum co-rated movies for Pearson to reduce noise from sparsity


### Users with very few ratings and rarely-rated movies were removed. Ratings were mean-centered and missing values filled with zero. Pearson similarity calculations require at least 3 co-rated movies to reduce noise from sparsity.

## 4. Model 1: User-Based Collaborative Filtering

### Similarity Metrics

In [None]:
# Here we will Compute Similarity Metrics
cosine_sim = cosine_similarity(user_item)
pearson_sim = user_item.T.corr(method="pearson", min_periods=MIN_OVERLAP).fillna(0).values
euclidean_sim = 1 / (1 + euclidean_distances(user_item))  # Here we have scaled for comparison

### CF Top-5 Recommendation Function

In [54]:
# Here we will have the Top-5 Recommendation Function
def recommend_cf(user_id, sim_matrix, n=5):
    user_idx = user_item.index.get_loc(user_id)
    scores = sim_matrix[user_idx]
    neighbors = np.argsort(scores)[::-1][1:11]  # Here we will deal with 10 neighbors

    weighted_ratings = user_item.iloc[neighbors].mean()
    unseen_mask = user_item.iloc[user_idx] == 0

    recs = weighted_ratings[unseen_mask].sort_values(ascending=False).head(n)
    return movies[movies["movieId"].isin(recs.index)][["title", "genres"]]


### CF sample Recommendation

In [55]:
# Here we will do a sample Recommendation
recommend_cf(user_id=1, sim_matrix=cosine_sim)

Unnamed: 0,title,genres
474,Blade Runner (1982),Action|Sci-Fi|Thriller
896,One Flew Over the Cuckoo's Nest (1975),Drama
951,Chinatown (1974),Crime|Film-Noir|Mystery|Thriller
975,Cool Hand Luke (1967),Drama
2195,Ferris Bueller's Day Off (1986),Comedy


### Above Cosine similarity was chosen as the primary metric due to the sparsity of the matrix. Only the top 10 most similar users contribute to predictions.

## 5. Model 2: Matrix Factorization - SVD

In [56]:
# SVD here learns latent user and item factors. 
# Cross-validation ensures proper estimation of model performance on unseen data.

reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader)

svd_model = SVD(n_factors=50, random_state=42) # Here we use 

cv_results = cross_validate(svd_model, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8443  0.8348  0.8513  0.8412  0.8401  0.8423  0.0054  
MAE (testset)     0.6479  0.6363  0.6529  0.6405  0.6462  0.6448  0.0058  
Fit time          7.37    6.63    6.85    5.85    8.10    6.96    0.75    
Test time         0.29    0.19    0.30    0.34    0.35    0.30    0.06    


### SVD Top-5 Recommendation Function

In [57]:
svd_model.fit(data.build_full_trainset())

def recommend_svd(user_id, n=5):
    user_movies = ratings[ratings["userId"] == user_id]["movieId"]
    all_movies = movies["movieId"].values
    unseen_movies = [m for m in all_movies if m not in user_movies]

    predictions = [ (m, svd_model.predict(user_id, m).est) for m in unseen_movies ]
    top_n = sorted(predictions, key=lambda x: x[1], reverse=True)[:n]
    top_movie_ids = [m[0] for m in top_n]

    return movies[movies["movieId"].isin(top_movie_ids)][["title", "genres"]]


### SVD Function For Top 5 Recommendations

In [58]:
svd_model.fit(data.build_full_trainset())

def recommend_svd(user_id, n=5):
    user_movies = ratings[ratings["userId"] == user_id]["movieId"]
    all_movies = movies["movieId"].values
    unseen_movies = [m for m in all_movies if m not in user_movies]

    predictions = [ (m, svd_model.predict(user_id, m).est) for m in unseen_movies ]
    top_n = sorted(predictions, key=lambda x: x[1], reverse=True)[:n]
    top_movie_ids = [m[0] for m in top_n]

    return movies[movies["movieId"].isin(top_movie_ids)][["title", "genres"]]


### SVD Top 5 Recommendations

In [59]:
recommend_svd(user_id=1, n=5)

Unnamed: 0,title,genres
210,Hoop Dreams (1994),Documentary
224,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
277,"Shawshank Redemption, The (1994)",Crime|Drama
413,In the Name of the Father (1993),Drama
602,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War
