# Data Preprocessing:
## Importing Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Reading Data from CSV Files

In [2]:
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


non-greedy match

## Data cleaning
we preprocess the movie data by extracting the year from the title and splitting the genres into separate columns.

In [3]:
movies = movies_df.copy()

# Using regular expressions with the str.extract() method,
# we extract the year from the title of each movie and store it in a new column called 'year'.
movies['year'] = movies.title.str.extract('(\d{4})', expand=False)
# We remove the year from the title using regular expressions and the str.replace() method. 
# Then, we strip any leading or trailing whitespace from the titles.
movies['title'] = movies.title.str.replace('\(\d{4}\)', '', regex=True)
movies['title'] = movies['title'].str.strip()
# We split the genres listed for each movie, which are separated by '|',
# into a list of genres and store them in 'genres'.
movies['genres'] = movies_df.genres.str.split('|')
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


## Encoding Genres as Binary Features

we encode movie genres as binary features in a new DataFrame.

In [4]:
GenresMovies_df = movies.copy()
# We iterate through each row of the DataFrame and for each movie, 
# we set the value of the corresponding genre column to 1, indicating the presence of that genre.
for index, row in movies.iterrows():
    for genre in row['genres']:
        GenresMovies_df.at[index, genre] = 1
# We fill missing values (NaN) with 0, indicating that the movie does not belong to that genre.
GenresMovies_df = GenresMovies_df.fillna(0)
GenresMovies_df = GenresMovies_df.set_index(GenresMovies_df['movieId'])
GenresMovies = GenresMovies_df.drop(columns=['movieId', 'title', 'genres', 'year'])
GenresMovies

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Content-Based recommendation system
## Creating User Input Movies

In [5]:
userInput = [
            {'title':'Breakfast Club, The', 'rating':5},
            {'title':'Toy Story', 'rating':3.5},
            {'title':'Jumanji', 'rating':2},
            {'title':"Pulp Fiction", 'rating':5},
            {'title':'Akira', 'rating':4.5}
            ] 
inputMovies = pd.DataFrame(userInput)
# We retrieve the 'movieId' of each user input movie from the 'movies' DataFrame by matching the movie titles.
inputId = movies[movies['title'].isin(inputMovies['title'])]
inputMovies.insert(0, 'movieId', inputId['movieId'].tolist())
inputMovies

Unnamed: 0,movieId,title,rating
0,1,"Breakfast Club, The",5.0
1,2,Toy Story,3.5
2,296,Jumanji,2.0
3,1274,Pulp Fiction,5.0
4,1968,Akira,4.5


## Extracting User Input Movies

we extract the encoded genre features for the user input movies.

In [6]:
userMovies = GenresMovies_df[GenresMovies_df['movieId'].isin(inputMovies['movieId'])]
userMovies = userMovies.drop(columns=['movieId', 'title', 'genres', 'year'])
userMovies = userMovies.reset_index(drop=True)
userMovies

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Creating User Profile

In this section, we create a user profile based on the input movie ratings and encoded genre features.
We transpose the 'userMovies' DataFrame to have genres as rows and movies as columns, then perform dot product with input movie ratings to calculate the weighted sum of genre features.

In [7]:
userProfile = userMovies.transpose().dot(inputMovies['rating'])
userProfile = userProfile/sum(userProfile)
userProfile

Adventure             0.186207
Animation             0.137931
Children              0.117241
Comedy                0.158621
Fantasy               0.117241
Romance               0.000000
Drama                 0.089655
Action                0.068966
Crime                 0.027586
Thriller              0.027586
Horror                0.000000
Mystery               0.000000
Sci-Fi                0.068966
War                   0.000000
Musical               0.000000
Documentary           0.000000
IMAX                  0.000000
Western               0.000000
Film-Noir             0.000000
(no genres listed)    0.000000
dtype: float64

## Generating Movie Recommendations

In this section, we generate movie recommendations based on content similarity.
We calculate the dot product of the 'GenresMovies' DataFrame (containing movie genre features) and the user profile ('userProfile') to compute a weighted sum of genre scores for each movie.

In [8]:
recommendationTable_df = (GenresMovies*userProfile).sum(axis=1)
recommendationTable_df = recommendationTable_df.sort_values(ascending=False)
recommendationTable_df

movieId
134853    0.806897
26340     0.786207
51939     0.786207
673       0.786207
108932    0.786207
            ...   
104272    0.000000
50658     0.000000
8577      0.000000
50740     0.000000
100044    0.000000
Length: 9742, dtype: float64

## Top Recommended Movies

In [9]:
movies_df.loc[movies_df['movieId'].isin(recommendationTable_df.head(20).keys())]

Unnamed: 0,movieId,title,genres
478,546,Super Mario Bros. (1993),Action|Adventure|Children|Comedy|Fantasy|Sci-Fi
559,673,Space Jam (1996),Adventure|Animation|Children|Comedy|Fantasy|Sc...
2250,2987,Who Framed Roger Rabbit? (1988),Adventure|Animation|Children|Comedy|Crime|Fant...
2355,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
5490,26340,"Twelve Tasks of Asterix, The (Les douze travau...",Action|Adventure|Animation|Children|Comedy|Fan...
5819,32031,Robots (2005),Adventure|Animation|Children|Comedy|Fantasy|Sc...
6047,40339,Chicken Little (2005),Action|Adventure|Animation|Children|Comedy|Sci-Fi
6448,51939,TMNT (Teenage Mutant Ninja Turtles) (2007),Action|Adventure|Animation|Children|Comedy|Fan...
6455,52287,Meet the Robinsons (2007),Action|Adventure|Animation|Children|Comedy|Sci-Fi
6462,52462,Aqua Teen Hunger Force Colon Movie Film for Th...,Action|Adventure|Animation|Comedy|Fantasy|Myst...


# Collaborative-Based recommendation system

In [10]:
Movies = movies.drop(columns=['genres'])
Movies

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,2017
9738,193583,No Game No Life: Zero,2017
9739,193585,Flint,2017
9740,193587,Bungo Stray Dogs: Dead Apple,2018


In [11]:
ratings_df = ratings_df.drop(columns=['timestamp'])
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


## Creating User Subset for Collaborative Filtering
we create a subset of user ratings for collaborative filtering.

In [12]:
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
userSubset

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
16,1,296,3.0
320,4,296,1.0
422,4,1968,4.0
516,5,1,4.0
...,...,...,...
99510,609,296,4.0
99534,610,1,5.0
99552,610,296,5.0
99636,610,1274,5.0


## Grouping and Filtering User Subset
we group and filter the user subset for collaborative filtering based on user engagement.

In [13]:
# We group the user subset DataFrame by 'userId' to organize ratings into groups based on users.
userSubsetGroup = userSubset.groupby(['userId'])
# We sort the user groups based on the number of ratings each user has provided, prioritizing users with more ratings.
userSubsetGroup = sorted(
    userSubsetGroup, key=lambda x: len(x[1]), reverse=True)
# We filter the sorted user groups to include only those with more than 2 ratings.
userSubsetGroup = [x for x in userSubsetGroup if len(x[1]) > 2]
userSubsetGroup

[((91,),
         userId  movieId  rating
  14121      91        1     4.0
  14122      91        2     3.0
  14173      91      296     4.5
  14316      91     1274     5.0
  14383      91     1968     3.0),
 ((177,),
         userId  movieId  rating
  24900     177        1     5.0
  24901     177        2     3.5
  24930     177      296     5.0
  25069     177     1274     2.0
  25129     177     1968     3.5),
 ((219,),
         userId  movieId  rating
  31524     219        1     3.5
  31525     219        2     2.5
  31554     219      296     4.0
  31628     219     1274     2.5
  31680     219     1968     3.0),
 ((274,),
         userId  movieId  rating
  39229     274        1     4.0
  39230     274        2     3.5
  39288     274      296     5.0
  39448     274     1274     4.0
  39549     274     1968     4.0),
 ((298,),
         userId  movieId  rating
  44535     298        1     2.0
  44536     298        2     0.5
  44555     298      296     4.5
  44620     298    

## Calculating Correlation Coefficient

we calculate the Spearman's rank correlation coefficient between user ratings in the user subset and input movie ratings.

In [14]:
from scipy.stats.mstats import spearmanr

pearsonCorrelationDict = {}

for name, group in userSubsetGroup:
# We iterate over each user group in the 'userSubsetGroup'.
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')
    # We create temporary DataFrames containing ratings for movies common to both the user group and input movies, 
    # and extract the rating lists from these DataFrames.
    temp_df = inputMovies[inputMovies['movieId'].isin(
        group['movieId'].tolist())]
    tempRatingList = temp_df['rating'].tolist()
    tempGroupList = group['rating'].tolist()
    # We calculate the Spearman's rank correlation coefficient between the ratings from the user group and the input movies,
    # and store the result in the 'pearsonCorrelationDict' dictionary.
    corr, _ = spearmanr(tempRatingList, tempGroupList)
    pearsonCorrelationDict[name] = format(corr, '.3f')

pearsonCorrelationDict

{(91,): '0.263',
 (177,): '-0.351',
 (219,): '-0.342',
 (274,): '-0.229',
 (298,): '-0.205',
 (414,): '-0.270',
 (474,): '-0.289',
 (477,): '0.057',
 (480,): '-0.162',
 (483,): '-0.649',
 (599,): '-0.026',
 (608,): '-0.289',
 (50,): '-0.056',
 (57,): '0.389',
 (68,): '0.632',
 (103,): '-0.775',
 (135,): '0.258',
 (182,): '-0.544',
 (202,): '0.258',
 (217,): '0.894',
 (226,): '-0.400',
 (288,): '-0.200',
 (307,): '-0.211',
 (318,): '0.316',
 (322,): '-0.316',
 (330,): '0.316',
 (357,): '0.400',
 (434,): '-0.056',
 (448,): '0.000',
 (469,): '-0.943',
 (561,): '-0.775',
 (600,): '-0.949',
 (606,): '-0.833',
 (610,): '0.272',
 (18,): '-0.500',
 (19,): '0.500',
 (21,): '0.000',
 (45,): '-0.866',
 (63,): '0.000',
 (64,): '-0.500',
 (66,): '-0.866',
 (107,): '0.500',
 (122,): '-0.500',
 (132,): '-0.866',
 (140,): '-1.000',
 (141,): '0.000',
 (144,): '-0.500',
 (153,): '-0.866',
 (156,): '-0.500',
 (160,): '-0.866',
 (166,): '0.500',
 (198,): '-0.500',
 (200,): '-1.000',
 (201,): '0.000',
 (23

## Creating DataFrame of Similar Users

we create a DataFrame containing similar users based on their similarity index.

In [15]:
keys = list(pearsonCorrelationDict.keys())
values = list(pearsonCorrelationDict.values())
clean_keys = [value for (value,) in keys]
# We create a DataFrame 'pearsonDF' with columns 'userId' and 'SimilarityIndex' to store user similarity indices.
pearsonDF = pd.DataFrame({'userId': clean_keys, 'SimilarityIndex': values})
# We convert the 'SimilarityIndex' column to numeric data type, handling any errors that may occur during conversion.
pearsonDF['SimilarityIndex'] = pd.to_numeric(pearsonDF['SimilarityIndex'], errors='coerce')
# We sort the DataFrame 'pearsonDF' by 'SimilarityIndex' in descending order to prioritize similar users with higher similarity indices.
topUsers = pearsonDF.sort_values(by='SimilarityIndex', ascending=False)
# We filter the DataFrame to include only those users with a similarity index greater than 0.5, ensuring significant similarity.
topUsers = topUsers[topUsers['SimilarityIndex'] > 0.5]
topUsers = topUsers.reset_index(drop=True)
topUsers.head()

Unnamed: 0,userId,SimilarityIndex
0,605,1.0
1,559,1.0
2,484,1.0
3,217,0.894
4,282,0.866


## Merging Similar Users with Ratings Data
We merge the 'topUsers' DataFrame containing similar users with the 'ratings_df' DataFrame based on the 'userId' column, using an inner join to retain only the overlapping rows.

In [16]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,userId,SimilarityIndex,movieId,rating
0,605,1.0,1,4.0
1,605,1.0,2,3.5
2,605,1.0,28,4.0
3,605,1.0,73,3.0
4,605,1.0,110,3.0


## Calculating Weighted Ratings for Similar Users
We calculate the weighted rating for each user by multiplying their similarity index with their respective rating.

In [17]:
topUsersRating['weightedRating'] = topUsersRating['SimilarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,userId,SimilarityIndex,movieId,rating,weightedRating
0,605,1.0,1,4.0,4.0
1,605,1.0,2,3.5,3.5
2,605,1.0,28,4.0,4.0
3,605,1.0,73,3.0,3.0
4,605,1.0,110,3.0,3.0


## Aggregating Weighted Ratings for Movies

we aggregate the weighted ratings for each movie based on similar users' ratings.
We group the 'topUsersRating' DataFrame by 'movieId' and calculate the sum of 'SimilarityIndex' and 'weightedRating' columns for each movie.

In [18]:
tempTopUsersRating = topUsersRating.groupby(
    'movieId').sum()[['SimilarityIndex', 'weightedRating']]
tempTopUsersRating.columns = ['sum similarity Index', 'sum weighted Rating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum similarity Index,sum weighted Rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,9.722,40.739
2,7.99,26.791
3,3.258,8.22
5,1.498,3.862
6,4.258,15.378


## Generating Movie Recommendations

we generate movie recommendations based on weighted average recommendation scores that calculated by dividing the sum of weighted ratings by the sum of similarity indices. We also include the 'movieId' column to identify movies.

In [19]:
recommendation_df = pd.DataFrame()
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum weighted Rating'] / \
    tempTopUsersRating['sum similarity Index']
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df = recommendation_df.sort_values(
    by='weighted average recommendation score', ascending=False)
recommendation_df = recommendation_df.reset_index(drop=True)
recommendation_df

Unnamed: 0,weighted average recommendation score,movieId
0,5.0,69069
1,5.0,74946
2,5.0,1753
3,5.0,93320
4,5.0,1641
...,...,...
2023,1.0,3433
2024,0.5,118900
2025,0.5,2953
2026,0.5,42018


## Top Recommended Movies

In [20]:
movies_df.loc[movies_df['movieId'].isin(
    recommendation_df.head(20)['movieId'].tolist())]

Unnamed: 0,movieId,title,genres
228,265,Like Water for Chocolate (Como agua para choco...,Drama|Fantasy|Romance
286,328,Tales from the Crypt Presents: Demon Knight (1...,Horror|Thriller
340,383,Wyatt Earp (1994),Western
599,745,Wallace & Gromit: A Close Shave (1995),Animation|Children|Comedy
759,998,Set It Off (1996),Action|Crime
821,1081,Victor/Victoria (1982),Comedy|Musical|Romance
996,1298,Pink Floyd: The Wall (1982),Drama|Musical
1084,1408,"Last of the Mohicans, The (1992)",Action|Romance|War|Western
1232,1641,"Full Monty, The (1997)",Comedy|Drama
1307,1753,Half Baked (1998),Comedy
