In [165]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity  # To compute similarity scores
from sklearn.decomposition import TruncatedSVD, NMF  # For matrix factorization

Tasks:

Task 1: EDA
Task 2: Data Wrangling / Feature Engineering

Task 2: Data Wrangling and Feature Engineering

Check for outliers, missing data, null values, etc...

Can you come up with any new features?

Handle the 'genres' feature




In [166]:
# Load the movie metadata (one row per movie: movieId, title, pipe-separated genres)
# and preview the first rows.
df_movie = pd.read_csv('movies.csv')
df_movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [167]:
# Inspect column dtypes and non-null counts of the movie table.
df_movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [168]:
# Count missing values per column of the movie table.
df_movie.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [169]:
# Feature engineering: turn the pipe-separated 'genres' string into one 0/1
# indicator column per genre label.
#   .str          -> apply a string operation to every value of the column
#   get_dummies   -> split each string on the separator ('|') and emit one binary column per token
# Every distinct token becomes a column, including the '(no genres listed)' placeholder.
genre_features = df_movie['genres'].str.get_dummies('|')

# Put the identifying columns side by side with the genre indicators (row-aligned on the index).
df_model = pd.concat(
    objs=[df_movie.loc[:, ['movieId', 'title']], genre_features],
    axis='columns',
)
df_model.head()

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [170]:
# Load the ratings table (one row per user/movie rating: userId, movieId, rating, timestamp)
# and preview the first rows.
df_ratin = pd.read_csv('ratings.csv')
df_ratin.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [171]:
# Inspect column dtypes and non-null counts of the ratings table.
df_ratin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [172]:
# Count missing values per column of the ratings table.
df_ratin.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [173]:
# Skewness of every column of the ratings table.
# NOTE(review): userId, movieId and timestamp look like identifiers / epoch values, so
# presumably only the 'rating' skew is meaningful here -- confirm before interpreting the others.
skew_rat = df_ratin.skew()
skew_rat

userId      -0.079036
movieId      2.210804
rating      -0.637199
timestamp   -0.008777
dtype: float64

Merge both data frames on movieId, and call this DataFrame, Merged_df

Create a new DataFrame to hold the User-Item Matrix data, called UI_Matrix_df

index = 'userId'
columns = 'title'
values = 'rating'

In [174]:
# Merge both tables into "Merged_df": attach the title and one-hot genre columns to every rating row.
# how='left' keeps every rating row even if its movieId has no match in df_model
# (such rows would carry NaN in the title/genre columns).
Merged_df = df_ratin.merge(df_model, on='movieId', how='left')
Merged_df.head(10)


Unnamed: 0,userId,movieId,rating,timestamp,title,(no genres listed),Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,964982703,Toy Story (1995),0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,3,4.0,964981247,Grumpier Old Men (1995),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,6,4.0,964982224,Heat (1995),0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
5,1,70,3.0,964982400,From Dusk Till Dawn (1996),0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
6,1,101,5.0,964980868,Bottle Rocket (1996),0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
7,1,110,4.0,964982176,Braveheart (1995),0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,1,151,5.0,964984041,Rob Roy (1995),0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
9,1,157,5.0,964984100,Canadian Bacon (1995),0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [175]:
# Build the User-Item matrix (UI_Matrix_df) from the long-format ratings:
#   rows    -> users  (userId)
#   columns -> movies (title)
#   cells   -> rating, NaN where the user has not rated the movie
# pivot_table aggregates with the mean by default, so several ratings that land in the
# same (userId, title) cell are averaged. Because columns are keyed on title rather than
# movieId, distinct movieIds sharing an identical title end up in one column.
UI_Matrix_df = Merged_df.pivot_table(values='rating', index='userId', columns='title')

print("USER-item MATRIX : ", UI_Matrix_df.shape)
UI_Matrix_df

USER-item MATRIX :  (610, 9719)


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609,,,,,,,,,,,...,,,,,,,,,,


## Task 3: Model Based Collaborative Filtering:

Imputing (Predict) the missing rating data

Create three separate Model based Collaborative Filtering matrices. Ensure that each model's prediction doesn't interfere with the others. 

SVD
NMF

In [176]:
# Model 1: truncated SVD with 50 latent features (seeded so reruns give the same factors).
svd_model = TruncatedSVD(n_components=50, random_state=315)

# Replace every unrated cell (NaN) with 0 before factorizing.
# NOTE(review): this makes "not rated" indistinguishable from a rating of 0, which pulls the
# reconstructed values toward 0 (see the magnitudes in the output below).
filled_UI_MATRIX = UI_Matrix_df.fillna(value=0)

# Fit the model and project each user onto the latent features -> U, shape (n_users, 50).
user_features = svd_model.fit_transform(filled_UI_MATRIX)
print(user_features.shape)

# Reconstruct a dense ratings matrix: predicted = U x V, where V = svd_model.components_
# has shape (50, n_movies). Every previously missing cell now holds a model-based estimate.
predicted_ratings = user_features @ svd_model.components_
print(svd_model.components_.shape)

# Wrap the array in a DataFrame that reuses the user/movie labels of the User-Item matrix.
predicted_ratings_svd = pd.DataFrame(
    data=predicted_ratings,
    columns=UI_Matrix_df.columns,
    index=UI_Matrix_df.index,
)
predicted_ratings_svd.head()


(610, 50)
(50, 9719)


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.08274,0.048794,-0.01379,-0.039568,-0.056897,0.015895,0.691834,-0.015127,-0.385778,0.15806,...,0.024886,-0.475251,0.010278,-0.071406,-0.041261,0.45585,-0.199114,-0.197554,1.837912,0.028208
2,-0.028178,-0.010367,-0.007643,0.000993,-0.002397,-0.006548,0.020958,0.009001,0.115253,-0.006367,...,0.005711,-0.00315,-0.025738,-0.03395,0.015034,0.047643,-0.114609,-0.010136,0.060009,-0.001486
3,0.017417,0.00026,0.00139,0.00084,0.001359,-0.000268,0.087615,-0.006015,0.02904,-0.001896,...,0.000101,0.03923,0.024782,0.023442,-0.002092,0.030693,-0.008811,0.011097,0.059605,0.003108
4,-0.001392,0.00446,-0.000501,0.001076,0.040952,-0.02484,-0.276581,0.01226,-0.27681,-0.095794,...,-0.000678,-0.0747,-0.050944,-0.037873,-0.007254,0.101067,-0.190319,0.026171,0.105872,-0.010201
5,0.016179,0.002953,-0.001306,-0.007883,-0.003863,-0.008193,-0.082238,0.002894,0.004599,-0.085393,...,0.003168,0.039157,0.015396,0.00968,-0.005165,-0.031679,-0.03354,0.03551,-0.082131,0.000799


In [177]:
# Model 2: non-negative matrix factorization with 50 latent features (seeded for reproducibility).
# It is fitted on the same zero-filled User-Item matrix as the SVD model but stores its
# results under separate *_nmf names.
nmf_model = NMF(n_components=50, random_state=315)

# Fit the model and obtain the user factors W, shape (n_users, 50).
user_features_nmf = nmf_model.fit_transform(filled_UI_MATRIX)
print(user_features_nmf.shape)

# Reconstruct the ratings matrix: predicted = W x H, with H = nmf_model.components_.
prediction_rating_nmf = user_features_nmf @ nmf_model.components_
print(nmf_model.components_.shape)

# Wrap the array in a DataFrame that reuses the user/movie labels of the User-Item matrix.
predicted_ratings_nmf = pd.DataFrame(
    data=prediction_rating_nmf,
    columns=UI_Matrix_df.columns,
    index=UI_Matrix_df.index,
)

predicted_ratings_nmf


(610, 50)
(50, 9719)


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0.008734,0.008512,0.001853,0.001900,0.000968,0.302830,0.0,0.000000,5.207064e-04,...,0.0,0.023138,0.027870,0.000000,0.000000,0.541027,0.000000,0.005061,0.563850,0.019278
2,0.000000,0.003589,0.002615,0.000125,0.000112,0.004405,0.005770,0.0,0.120562,0.000000e+00,...,0.0,0.024616,0.009366,0.000000,0.005778,0.023245,0.000000,0.007747,0.006447,0.000000
3,0.006128,0.000291,0.002383,0.001639,0.001586,0.000032,0.052875,0.0,0.006084,6.826951e-06,...,0.0,0.008431,0.006469,0.005711,0.000000,0.019213,0.004135,0.002284,0.037408,0.000995
4,0.000000,0.016574,0.020983,0.008025,0.030379,0.000098,0.006530,0.0,0.252343,1.658123e-03,...,0.0,0.023398,0.002225,0.000000,0.000000,0.407980,0.019374,0.002671,0.157411,0.003637
5,0.000000,0.004612,0.004562,0.000737,0.003320,0.000000,0.000000,0.0,0.045866,9.690567e-04,...,0.0,0.005336,0.006692,0.000000,0.000000,0.008068,0.003639,0.001843,0.008696,0.000677
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.000000,0.025649,0.020833,0.015853,0.082234,0.000203,0.000319,0.0,0.064706,0.000000e+00,...,0.0,0.000000,0.000000,0.000000,0.000313,0.141076,0.000000,0.006716,0.000084,0.000000
607,0.000000,0.004853,0.005980,0.002263,0.003312,0.000721,0.115761,0.0,0.035079,1.017550e-03,...,0.0,0.030365,0.012109,0.000000,0.000053,0.571570,0.176211,0.013984,0.147285,0.014990
608,0.000000,0.000000,0.000000,0.000002,0.000002,0.000826,0.001282,0.0,0.007824,0.000000e+00,...,0.0,0.123229,0.000000,0.000000,0.001252,5.249919,3.009389,0.019768,0.000336,0.000000
609,0.000000,0.001906,0.001389,0.000000,0.000000,0.000006,0.000502,0.0,0.005823,0.000000e+00,...,0.0,0.003966,0.004973,0.000000,0.000534,0.022654,0.000000,0.001104,0.000215,0.000133


## Task 4-A:  Memory Based Collaborative Filtering:

1 - Create a User-based Collaborative Filtering Method (User-User Matrix) 
   - Create two distinct User-User Matrices, one from each of the model-based Collaborative Filtering matrices created in Task 3
   - Populate the row-wise NaN's in UI_Matrix_df based on the User's Ratings
   - Try both Cosine and Pearson Similarities

2 - Write a function to predict the Rating that a UserId (UID) might give for a given MovieId (MID)
   - Rank order the similarity of UID to all other UserIds 
   - Predicted Rating = weighted avg. (based on similarity score) of k similar users' ratings of MID
   
3 - Test your function by predicting what User 1 might rate movieId 32 based on the top 50 users. 

  

In [178]:
# 1 - Creating the User-User Matrix (cosine similarity, SVD-based predictions)
#    - Rows of the matrix are users, so comparing rows gives user-user similarity:
#        cosine_similarity(UI)   -> user-user (rows compared)
#        cosine_similarity(UI.T) -> item-item (columns compared)
#    - The input is the dense SVD reconstruction, so there are no NaNs left to handle.

user_similarity = cosine_similarity(predicted_ratings_svd)

# Label both axes with userIds so that .loc[user_a, user_b] returns their similarity.
# The frame is stored under a model-specific name because the NMF cosine cell further
# down rebinds `user_similarity_df`; without a separate name the SVD result would be lost.
user_similarity_svd_df = pd.DataFrame(
    user_similarity,
    index=UI_Matrix_df.index,
    columns=UI_Matrix_df.index
)

# Original name kept bound for backward compatibility with existing cells.
user_similarity_df = user_similarity_svd_df
user_similarity_df

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,-0.007480,0.423977,0.484500,0.330827,0.199530,0.379868,0.254239,0.199127,0.004448,...,0.160156,0.309284,0.339554,0.170715,0.439189,0.229989,0.635607,0.484623,0.250141,0.194065
2,-0.007480,1.000000,0.031666,-0.034407,0.044846,0.080896,0.061307,0.079249,0.079289,0.364617,...,0.631031,0.053717,0.039630,0.083174,-0.020377,0.059690,0.095221,0.175323,0.118014,0.227142
3,0.423977,0.031666,1.000000,-0.063874,0.008660,0.093251,0.077799,0.000996,0.070322,0.046630,...,-0.003377,0.059067,0.204723,0.069367,0.121715,0.091538,0.429703,0.264419,-0.025681,0.253097
4,0.484500,-0.034407,-0.063874,1.000000,0.349405,0.165571,0.298051,0.215570,0.219201,0.107527,...,0.251070,0.250340,0.599464,0.143110,0.319141,0.360601,0.349042,0.274424,0.111720,0.192319
5,0.330827,0.044846,0.008660,0.349405,1.000000,0.774650,0.299550,0.932983,0.013924,0.083604,...,0.144226,0.906987,0.186024,0.794951,0.376550,0.178227,0.434465,0.249940,0.845987,0.106721
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.229989,0.059690,0.091538,0.360601,0.178227,0.137151,0.358065,0.130490,0.270493,0.178243,...,0.257124,0.146830,0.304656,0.111179,0.299477,1.000000,0.252001,0.306857,0.128894,0.203840
607,0.635607,0.095221,0.429703,0.349042,0.434465,0.288553,0.518778,0.443493,0.241302,0.050960,...,0.263289,0.448258,0.342255,0.319832,0.329414,0.252001,1.000000,0.547836,0.460996,0.233300
608,0.484623,0.175323,0.264419,0.274424,0.249940,0.299223,0.602001,0.268147,0.404695,0.176272,...,0.279620,0.303516,0.292754,0.364540,0.397781,0.306857,0.547836,1.000000,0.228375,0.380942
609,0.250141,0.118014,-0.025681,0.111720,0.845987,0.690212,0.320397,0.932434,0.041315,0.086850,...,0.145273,0.861118,0.099454,0.756482,0.287823,0.128894,0.460996,0.228375,1.000000,0.100181


In [179]:
# Pearson similarity between users (SVD-based predictions).
# Pearson is a good fit when:
#   - the data are ratings,
#   - you want to compare rating behaviour,
#   - you care about relative preferences rather than absolute levels.
#
# DataFrame.corr() correlates COLUMNS with each other. The prediction matrix has users as
# rows, so it is transposed first to turn every user into a column; the result is a
# user x user matrix. (For item-item similarity no transpose would be needed.)
# 'pearson' is the default correlation method of DataFrame.corr().
Pearson_user_similarity = predicted_ratings_svd.transpose().corr()
Pearson_user_similarity


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,-0.033117,0.397385,0.457476,0.315597,0.166900,0.350254,0.241368,0.164169,-0.037603,...,0.133737,0.286570,0.297571,0.142490,0.407315,0.174222,0.617581,0.450423,0.236211,0.131392
2,-0.033117,1.000000,0.006672,-0.065532,0.031449,0.060010,0.036408,0.069098,0.056972,0.350639,...,0.624711,0.035911,0.004763,0.065564,-0.054017,0.021493,0.070459,0.145118,0.107760,0.199275
3,0.397385,0.006672,1.000000,-0.121534,-0.016557,0.055475,0.032774,-0.019061,0.028868,0.005502,...,-0.036001,0.026934,0.152058,0.036867,0.069664,0.022566,0.400712,0.210428,-0.047583,0.194484
4,0.457476,-0.065532,-0.121534,1.000000,0.333316,0.125878,0.258775,0.200054,0.179413,0.063432,...,0.224310,0.221855,0.569536,0.109165,0.272723,0.306005,0.310991,0.212066,0.091365,0.117179
5,0.315597,0.031449,-0.016557,0.333316,1.000000,0.771682,0.282756,0.932540,-0.009780,0.062512,...,0.129271,0.905992,0.159742,0.791825,0.360962,0.148853,0.421797,0.225495,0.844389,0.070331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.174222,0.021493,0.022566,0.306005,0.148853,0.080251,0.309776,0.106297,0.220285,0.122503,...,0.222837,0.101889,0.228786,0.062409,0.232511,1.000000,0.191604,0.220620,0.102881,0.096807
607,0.617581,0.070459,0.400712,0.310991,0.421797,0.256766,0.493423,0.435780,0.204855,0.006761,...,0.238337,0.429116,0.295374,0.294919,0.286574,0.191604,1.000000,0.513985,0.452880,0.167145
608,0.450423,0.145118,0.210428,0.212066,0.225495,0.255450,0.575003,0.252934,0.365714,0.120610,...,0.247059,0.270230,0.215926,0.334682,0.340853,0.220620,0.513985,1.000000,0.208934,0.298208
609,0.236211,0.107760,-0.047583,0.091365,0.844389,0.687428,0.308028,0.931837,0.022352,0.069593,...,0.132991,0.860518,0.074409,0.754165,0.273130,0.102881,0.452880,0.208934,1.000000,0.070594


In [180]:

# Cosine similarity between users (NMF-based predictions).

user_similarity_nmf = cosine_similarity(predicted_ratings_nmf)

# Label both axes with userIds so that .loc[user_a, user_b] returns their similarity.
# The frame gets its own model-specific name so the NMF result can always be told apart
# from the SVD-based user-user matrix built in the earlier cosine cell.
user_similarity_nmf_df = pd.DataFrame(
    user_similarity_nmf,
    index=UI_Matrix_df.index,
    columns=UI_Matrix_df.index
)

# Backward compatibility: this cell has always rebound `user_similarity_df` to the NMF
# result (replacing the SVD-based frame of the same name). Prefer
# `user_similarity_nmf_df` in new code to avoid mixing up the two models.
user_similarity_df = user_similarity_nmf_df
user_similarity_df

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.463379,0.841989,0.618202,0.428222,0.407595,0.697312,0.380135,0.677629,0.431788,...,0.474134,0.415200,0.389438,0.409553,0.650064,0.343166,0.919700,0.551361,0.372181,0.337501
2,0.463379,1.000000,0.346015,0.370966,0.301179,0.214738,0.619006,0.284210,0.715989,0.815827,...,0.918051,0.253960,0.178061,0.229399,0.646477,0.305139,0.424337,0.396966,0.269086,0.506600
3,0.841989,0.346015,1.000000,0.551535,0.355311,0.358009,0.607988,0.309796,0.608518,0.351667,...,0.364292,0.355427,0.384715,0.373665,0.580021,0.308503,0.861693,0.507904,0.307700,0.373627
4,0.618202,0.370966,0.551535,1.000000,0.381654,0.335196,0.537700,0.298144,0.712314,0.394120,...,0.426730,0.348290,0.699614,0.327775,0.562078,0.446406,0.594999,0.417198,0.268441,0.284081
5,0.428222,0.301179,0.355311,0.381654,1.000000,0.914287,0.430242,0.990747,0.380069,0.278571,...,0.329759,0.972149,0.205328,0.967201,0.526092,0.206484,0.564053,0.315598,0.987138,0.149588
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.343166,0.305139,0.308503,0.446406,0.206484,0.163934,0.382471,0.170714,0.446957,0.313717,...,0.330361,0.180248,0.312124,0.169732,0.381646,1.000000,0.333862,0.291368,0.156740,0.206215
607,0.919700,0.424337,0.861693,0.594999,0.564053,0.530495,0.722244,0.523282,0.658176,0.391432,...,0.465291,0.552208,0.393879,0.546844,0.635703,0.333862,1.000000,0.564952,0.519513,0.313955
608,0.551361,0.396966,0.507904,0.417198,0.315598,0.297577,0.620467,0.284248,0.566899,0.390545,...,0.376309,0.306738,0.255990,0.312172,0.524710,0.291368,0.564952,1.000000,0.273972,0.337737
609,0.372181,0.269086,0.307700,0.268441,0.987138,0.880094,0.371440,0.998547,0.307105,0.222469,...,0.286186,0.952537,0.143419,0.949956,0.450856,0.156740,0.519513,0.273972,1.000000,0.126773


In [181]:
# Pearson similarity between users, computed from the NMF-based predictions.
# corr() correlates columns, so the matrix is transposed to make each user a column.
Pearson_user_similarity_nmf = predicted_ratings_nmf.T.corr(method='pearson')
Pearson_user_similarity_nmf 


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.408938,0.824032,0.571563,0.398852,0.375025,0.664816,0.356811,0.635655,0.362931,...,0.427179,0.384844,0.327346,0.376927,0.604907,0.267407,0.912017,0.507172,0.349554,0.255885
2,0.408938,1.000000,0.270067,0.294583,0.263802,0.169662,0.578988,0.255368,0.681347,0.795291,...,0.911358,0.213369,0.096188,0.185091,0.602246,0.226689,0.369376,0.338728,0.240569,0.447741
3,0.824032,0.270067,1.000000,0.486818,0.317928,0.318625,0.559814,0.280059,0.544404,0.260344,...,0.299745,0.317923,0.313161,0.335382,0.513825,0.216405,0.847930,0.452948,0.279242,0.284129
4,0.571563,0.294583,0.486818,1.000000,0.345651,0.292577,0.478245,0.266775,0.662621,0.304377,...,0.366552,0.309118,0.664310,0.284215,0.488904,0.369038,0.549161,0.348876,0.236113,0.176470
5,0.398852,0.263802,0.317928,0.345651,1.000000,0.911381,0.400462,0.991133,0.341917,0.233988,...,0.297302,0.971232,0.161576,0.966106,0.504097,0.157859,0.544064,0.279925,0.987601,0.093051
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.267407,0.226689,0.216405,0.369038,0.157859,0.109485,0.308459,0.131150,0.358394,0.220051,...,0.264007,0.129385,0.234280,0.115497,0.286714,1.000000,0.261804,0.213704,0.117639,0.096285
607,0.912017,0.369376,0.847930,0.549161,0.544064,0.506933,0.694447,0.508529,0.617712,0.322391,...,0.420090,0.531399,0.335875,0.524162,0.592715,0.261804,1.000000,0.524561,0.505557,0.234754
608,0.507172,0.338728,0.452948,0.348876,0.279925,0.258923,0.581836,0.255937,0.512024,0.320519,...,0.322979,0.270399,0.183964,0.274262,0.465522,0.213704,0.524561,1.000000,0.246260,0.260431
609,0.349554,0.240569,0.279242,0.236113,0.987601,0.877897,0.348479,0.998527,0.276721,0.186872,...,0.260747,0.952103,0.107437,0.949721,0.435151,0.117639,0.505557,0.246260,1.000000,0.082962


In [182]:
# 2 - Write a function to predict the Rating, that a UserId (UID) might give for a given MovieId (MID) --> “If user U has not rated movie M, how can we estimate what rating they would give?”
#    - Rank order the similarity of UID to all other UserIds 
#    - Predicted Rating = weighted avg. (based on similarity score) of k similar users' ratings of MID

In [191]:
# Worked example: the 5 users most similar to the target user (SVD-based Pearson similarity).
userId = 10

neighbors = (
    Pearson_user_similarity.loc[userId]   # similarity of the target user to every user
    .dropna()                             # ignore undefined correlations
    .drop(index=userId)                   # a user is not their own neighbour
    .loc[lambda s: s > 0]                 # keep positively correlated users only
    .sort_values(ascending=False)         # most similar first
    .head(5)
)
neighbors


userId
143    0.822243
490    0.740309
466    0.737137
598    0.722503
159    0.707216
Name: 10, dtype: float64

In [None]:
# Look up the movieId(s) that carry a given title.
# Note: in this cell MID holds a *title* string, whereas predict_rating_uid_mid further
# down takes a numeric movieId under the same parameter name.
MID="Toy Story (1995)"
movie_ids = Merged_df.loc[Merged_df['title'] == MID, "movieId"].unique() # .loc selects by label / boolean mask: rows with this title, column "movieId"
movie_ids


In [186]:
# Check whether the target user has already rated the target movie.
user_id = 10
title = "Toy Story (1995)"

# .loc[row_label, column_label] selects by label (.iloc would select by position).
# In UI_Matrix_df the row labels are userIds and the column labels are movie titles.
rating = UI_Matrix_df.loc[user_id, title]

# A NaN cell means there is no rating for this (user, movie) pair.
if pd.notna(rating):
    print("User already rated it:", rating)
else:
    print("User has NOT rated this movie")


User has NOT rated this movie


In [None]:
# Only neighbours who actually rated the target movie can contribute to the prediction,
# so start from that movie's column: index = userId, value = rating or NaN.
movie_ratings = UI_Matrix_df[title]
movie_ratings

# Similarity tells you whom to trust.
# Ratings tell you what they said.

userId
1      4.0
2      NaN
3      NaN
4      NaN
5      4.0
      ... 
606    2.5
607    4.0
608    2.5
609    3.0
610    5.0
Name: Toy Story (1995), Length: 610, dtype: float64

In [None]:
# From the similar users, keep only those who actually rated this movie.
# The boolean mask is indexed by every userId while `neighbors` holds only a few of them;
# pandas aligns the mask to the index of `neighbors` before filtering.

neighbors_who_rated = neighbors[movie_ratings.notna()]
neighbors_who_rated

# "Name: 10" in the output is the target userId (the similarity row the Series was taken from).
# An empty Series means none of the selected neighbours rated the movie.

Series([], Name: 10, dtype: float64)

In [None]:
# With neighbors_who_rated computed, predict the rating as the similarity-weighted
# average of what those neighbours gave the movie.

# index -> userIds of the usable neighbours, values -> the ratings they gave this movie
neighbor_ratings = movie_ratings.loc[neighbors_who_rated.index]

if not neighbors_who_rated.empty and neighbors_who_rated.abs().sum() != 0:
    # weighted sum of ratings divided by the total weight
    predicted_rating = (
        (neighbors_who_rated * neighbor_ratings).sum()
        / neighbors_who_rated.abs().sum()
    )
else:
    # No usable neighbour: fall back to the target user's own mean rating.
    predicted_rating = UI_Matrix_df.loc[userId].mean()

print("Predicted rating:", predicted_rating)

# Weighted-average formula:
#   numerator   = sum(similarity x rating) over the neighbours   (weighted_sum)
#   denominator = sum(|similarity|)                              (total_weight)
#   abs() measures the strength of a similarity regardless of its sign.


Predicted rating: 3.2785714285714285


In [None]:
# How weights help with rating prediction (numeric example)
# Assume these users rated the same movie:
# | User | Similarity (weight) | Rating |
# |    A |                 0.9 |      5 |
# |    B |                 0.6 |      4 |
# |    C |                 0.2 |      1 |

# Step 1: Multiply rating × weight

# | User | weight × rating |
# | ---: | --------------: |
# |    A |   0.9 × 5 = 4.5 |
# |    B |   0.6 × 4 = 2.4 |
# |    C |   0.2 × 1 = 0.2 |

# Step 2: Sum them (weighted sum)

# weighted_sum = 4.5 + 2.4 + 0.2 = 7.1

# Step 3: Sum the weights

# total_weight = 0.9 + 0.6 + 0.2 = 1.7

# Step 4: Divide
# predicted_rating = 7.1 / 1.7 ≈ 4.18



2 - Write a function to predict the Rating that a UserId (UID) might give for a given MovieId (MID)
   - Rank order the similarity of UID to all other UserIds 
   - Predicted Rating = weighted avg. (based on similarity score) of k similar users' ratings of MID

In [None]:
import numpy as np
import pandas as pd

def predict_rating_uid_mid(UID, MID, Merged_df, UI_Matrix_df, Pearson_user_similarity, k=50):
    """Predict the rating user ``UID`` would give movie ``MID`` (user-based CF).

    The prediction is the similarity-weighted average of the ratings that the
    most similar users gave to the movie.

    Parameters
    ----------
    UID : label
        Target user; must be a row label of ``UI_Matrix_df`` and of
        ``Pearson_user_similarity``.
    MID : label
        Target ``movieId``; translated to a title through ``Merged_df`` because
        the columns of ``UI_Matrix_df`` are titles.
    Merged_df : pandas.DataFrame
        Long-format ratings with at least ``movieId`` and ``title`` columns.
    UI_Matrix_df : pandas.DataFrame
        User-item matrix (index = userId, columns = title, NaN = not rated).
    Pearson_user_similarity : pandas.DataFrame
        User-user similarity matrix (any similarity measure works).
    k : int, default 50
        Number of most similar users considered.

    Returns
    -------
    float
        * the stored rating if the user has already rated the movie;
        * otherwise the weighted average over those of the ``k`` most similar
          users (positive similarity only) who rated the movie;
        * otherwise, when none of them rated it, the user's own mean rating.

    Raises
    ------
    ValueError
        If ``MID`` does not occur in ``Merged_df`` or its title is not a column
        of ``UI_Matrix_df``.
    KeyError
        Propagated from the ``.loc`` lookups when ``UID`` is not a row label.

    Notes
    -----
    The ``k`` nearest users are selected first and only then restricted to
    those who rated the movie, so fewer than ``k`` users may contribute.
    """
    # 1) MID -> title (UI_Matrix_df columns are titles, not movieIds).
    title_series = Merged_df.loc[Merged_df["movieId"] == MID, "title"]
    if title_series.empty:
        raise ValueError(f"MID={MID} not found in Merged_df.")
    title = title_series.iloc[0]

    if title not in UI_Matrix_df.columns:
        raise ValueError(f"Title '{title}' not found as a column in UI_Matrix_df.")

    # 2) If the user already rated the movie there is nothing to predict.
    existing = UI_Matrix_df.loc[UID, title]
    if not pd.isna(existing):
        return float(existing)

    # 3) Rank-order the similarity of UID to all other users:
    #    drop undefined values and the user itself, keep positive similarities
    #    only, then take the k largest.
    sim = Pearson_user_similarity.loc[UID].dropna().drop(index=UID, errors="ignore")
    sim = sim[sim > 0]
    topk = sim.sort_values(ascending=False).head(k)

    # 4) Among the top-k, keep the users who actually rated this movie.
    #    reindex() aligns the movie's ratings explicitly to the top-k users; a user
    #    that is missing from UI_Matrix_df simply yields NaN and is dropped, instead
    #    of raising the IndexingError an unalignable boolean mask would cause.
    neighbor_ratings = UI_Matrix_df[title].reindex(topk.index).dropna()
    neighbors_who_rated = topk.loc[neighbor_ratings.index]

    # 5) No usable neighbour (or zero total weight): fall back to the user's mean rating.
    if neighbors_who_rated.empty or neighbors_who_rated.abs().sum() == 0:
        return float(UI_Matrix_df.loc[UID].mean())

    # 6) Similarity-weighted average: more similar users influence the prediction more.
    weighted_sum = (neighbors_who_rated * neighbor_ratings).sum()
    total_weight = neighbors_who_rated.abs().sum()
    return float(weighted_sum / total_weight)


In [None]:
# Test: predict what User 1 might rate movieId 32, using the 50 most similar users
# according to the SVD-based Pearson similarity matrix.
pred = predict_rating_uid_mid(
    UID=1,
    MID=32,
    Merged_df=Merged_df,
    UI_Matrix_df=UI_Matrix_df,
    Pearson_user_similarity=Pearson_user_similarity,
    k=50
)

print("Predicted rating for (User 1, movieId 32):", pred)


Predicted rating for (User 1, movieId 32): 4.303182113322988


Task 4-B:  Create an Item-based Collaborative Filtering Method:
 1 - Creating the Item-Item Matrix
   - Populate column-wise NaN's in UI_Matrix_df based on the Movie's Ratings
   - Try both Cosine and Pearson Similarities
 2 - Write a function to predict the top-N most similar movies to MovieId (MID) based on all ratings
   - Rank order the similarity of MID to all other movies
   - Return the top-N most similar movies
 3 - Test your function by predicting what are the most similar movies to Jurassic Park (1993)  