In [157]:
import pandas as pd
import numpy as np
import scipy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer



## Data Preparations


In [158]:
#read csv(s)
movies_df = pd.read_csv("dataset/small_dataset/movies_full_2.csv")
ratings_df = pd.read_csv("dataset/small_dataset/ratings.csv")
tags_df = pd.read_csv("dataset/small_dataset/tags.csv")
links_df = pd.read_csv("dataset/small_dataset/links.csv")


In [159]:
# print(ratings_df.duplicated)
# print(ratings_df.shape)
# print(ratings_df['userId'].nunique())

In [160]:
#drop timestamp as it is only consuming memory
ratings_df.drop('timestamp', axis=1, inplace=True)
tags_df.drop('timestamp', axis=1, inplace=True)
#movies_df.drop(columns=(['imdbId', 'url', 'titleLower']), inplace=True)


In [161]:
movies_df.head()

Unnamed: 0,movieId,title,genres,imdbId,year,url,titleLower
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,tt0114709,1995.0,https://m.media-amazon.com/images/M/MV5BMDU2ZW...,toy story
1,2,Jumanji,Adventure|Children|Fantasy,tt0113497,1995.0,https://m.media-amazon.com/images/M/MV5BZTk2Zm...,jumanji
2,3,Grumpier Old Men,Comedy|Romance,tt0113228,1995.0,https://m.media-amazon.com/images/M/MV5BMjQxM2...,grumpier old men
3,4,Waiting to Exhale,Comedy|Drama|Romance,tt0114885,1995.0,https://m.media-amazon.com/images/M/MV5BYzcyMD...,waiting to exhale
4,5,Father of the Bride Part II,Comedy,tt0113041,1995.0,https://m.media-amazon.com/images/M/MV5BOTEyNz...,father of the bride part ii


In [162]:


movies_2 = movies_df.copy()




movies_df['genres'] = movies_df['genres'].str.split('|')
movies_df['genres'] = movies_df['genres'].apply(lambda x: ', '.join(x)) 
# genres_encoded = movies_df['genres'].str.get_dummies(sep='|')
# movies_df['year'] = movies_df['title'].str.extract('\((\d{4})\)')


# movies_df['year'] = movies_df['year'].astype(int)
movies_df['decade'] = (movies_df['year'] // 10) * 10

movies_df['title'].replace('\((\d{4})\)', '', regex=True, inplace=True)
movies_df.head()


# Flatten the list of genres
movies_2['genres'] = movies_2['genres'].str.split('|')
all_genres = [genre for sublist in movies_2['genres'] for genre in sublist]

# Extract unique genres
unique_genres = list(set(all_genres))

# Sort the unique genres for better readability (optional)
unique_genres.sort()

# Display the unique genres
# print(unique_genres)

del all_genres
del movies_2



# print(len(unique_genres))


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies_df['title'].replace('\((\d{4})\)', '', regex=True, inplace=True)


In [163]:
print(ratings_df.isnull().sum())
ratings_df.head()

userId     0
movieId    0
rating     0
dtype: int64


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [164]:
#print(tags_df.isnull().sum()) #It has 16 null tags
#print(tags_df[tags_df['tag'].isnull()].iloc[0])
tags_df.dropna(inplace=True)
print(tags_df.isnull().sum())
tags_df.head()

userId     0
movieId    0
tag        0
dtype: int64


Unnamed: 0,userId,movieId,tag
0,2,60756,funny
1,2,60756,Highly quotable
2,2,60756,will ferrell
3,2,89774,Boxing story
4,2,89774,MMA


In [165]:
print(links_df.isnull().sum())
links_df.head()

movieId    0
imdbId     0
tmdbId     8
dtype: int64


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [166]:
movies_rating_user_df = pd.merge(movies_df, ratings_df, on="movieId", how="inner")
movies_rating_user_df.head()

Unnamed: 0,movieId,title,genres,imdbId,year,url,titleLower,decade,userId,rating
0,1,Toy Story,"Adventure, Animation, Children, Comedy, Fantasy",tt0114709,1995.0,https://m.media-amazon.com/images/M/MV5BMDU2ZW...,toy story,1990.0,1,4.0
1,1,Toy Story,"Adventure, Animation, Children, Comedy, Fantasy",tt0114709,1995.0,https://m.media-amazon.com/images/M/MV5BMDU2ZW...,toy story,1990.0,5,4.0
2,1,Toy Story,"Adventure, Animation, Children, Comedy, Fantasy",tt0114709,1995.0,https://m.media-amazon.com/images/M/MV5BMDU2ZW...,toy story,1990.0,7,4.5
3,1,Toy Story,"Adventure, Animation, Children, Comedy, Fantasy",tt0114709,1995.0,https://m.media-amazon.com/images/M/MV5BMDU2ZW...,toy story,1990.0,15,2.5
4,1,Toy Story,"Adventure, Animation, Children, Comedy, Fantasy",tt0114709,1995.0,https://m.media-amazon.com/images/M/MV5BMDU2ZW...,toy story,1990.0,17,4.5


In [167]:

movies_rating_df = movies_rating_user_df[['movieId', 'title', 'rating', 'genres', 'year', 'decade']].groupby(['movieId', 'title', 'genres', 'year', 'decade'])['rating'].agg(['count', 'mean']).round(1)
movies_rating_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count,mean
movieId,title,genres,year,decade,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Toy Story,"Adventure, Animation, Children, Comedy, Fantasy",1995.0,1990.0,215,3.9
2,Jumanji,"Adventure, Children, Fantasy",1995.0,1990.0,110,3.4
3,Grumpier Old Men,"Comedy, Romance",1995.0,1990.0,52,3.3
4,Waiting to Exhale,"Comedy, Drama, Romance",1995.0,1990.0,7,2.4
5,Father of the Bride Part II,Comedy,1995.0,1990.0,49,3.1


In [168]:
movies_rating_df.sort_values('count', ascending=False, inplace=True)
movies_rating_df.rename(columns={'count' : 'Num_ratings', 'mean': 'Average_rating'}, inplace=True)
movies_rating_df.head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Num_ratings,Average_rating
movieId,title,genres,year,decade,Unnamed: 5_level_1,Unnamed: 6_level_1
356,Forrest Gump,"Comedy, Drama, Romance, War",1994.0,1990.0,329,4.2
318,"Shawshank Redemption, The","Crime, Drama",1994.0,1990.0,317,4.4
296,Pulp Fiction,"Comedy, Crime, Drama, Thriller",1994.0,1990.0,307,4.2
593,"Silence of the Lambs, The","Crime, Horror, Thriller",1991.0,1990.0,279,4.2
2571,"Matrix, The","Action, Sci-Fi, Thriller",1999.0,1990.0,278,4.2
260,Star Wars: Episode IV - A New Hope,"Action, Adventure, Sci-Fi",1977.0,1970.0,251,4.2
480,Jurassic Park,"Action, Adventure, Sci-Fi, Thriller",1993.0,1990.0,238,3.8
110,Braveheart,"Action, Drama, War",1995.0,1990.0,237,4.0
589,Terminator 2: Judgment Day,"Action, Sci-Fi",1991.0,1990.0,224,4.0
527,Schindler's List,"Drama, War",1993.0,1990.0,220,4.2


# Data Preprocessing

In [169]:
#let's use the beysian average to calculate a more accurate rating
#this is because a review of 5 with only 1 review is worthless, while a 4.2 with multiple reviews is more reliable

def calculate_weighted_rating(df, C, m):
    """
    Calculate Bayesian weighted rating for each movie in the DataFrame.

    Parameters:
    df (DataFrame): DataFrame containing movie ratings.
    C (float): Average rating across all movies (prior assumption).
    m (int): Minimum number of ratings required to be considered.

    Returns:
    DataFrame: DataFrame with Bayesian weighted rating column added.
    """
    
    # Add the Bayesian weighted rating as a new column in the DataFrame
    df['Bayesian_rating'] = (df['Num_ratings'] / (df['Num_ratings'] + m)) * df['Average_rating'] + (m / (df['Num_ratings'] + m)) * C

    return df

C = round(ratings_df['rating'].mean(), 2)
movies_rating_df = calculate_weighted_rating(movies_rating_df, C, 500)
movies_rating_df.drop(columns='Average_rating', inplace=True)
movies_rating_df.sort_values(by='Bayesian_rating', ascending=False, inplace=True)
movies_rating_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Num_ratings,Bayesian_rating
movieId,title,genres,year,decade,Unnamed: 5_level_1,Unnamed: 6_level_1
318,"Shawshank Redemption, The","Crime, Drama",1994.0,1990.0,317,3.849204
356,Forrest Gump,"Comedy, Drama, Romance, War",1994.0,1990.0,329,3.777805
296,Pulp Fiction,"Comedy, Crime, Drama, Thriller",1994.0,1990.0,307,3.766295
593,"Silence of the Lambs, The","Crime, Horror, Thriller",1991.0,1990.0,279,3.750706
2571,"Matrix, The","Action, Sci-Fi, Thriller",1999.0,1990.0,278,3.750129


# Non-personalized Recommendation System



### This will be applied to the website if there isn't any user information, aka, one is not logged in.

Despite not being logged in, recommendations will be provided such as:
- Best movies overall
- Best movies for each genre
- Spoting trends
- etc

In [170]:
# let's recommend the best movies of all time (doesn't matter the genre, only it's popularity)
def non_personalized_recommendations_overall_rating(num_movies, df):
    movies_best = df.copy()
    movies_best = movies_best.iloc[:num_movies]
    movies_best.reset_index(inplace=True)
    
    return movies_best[['movieId', 'title', 'Num_ratings', 'Bayesian_rating']]

print(non_personalized_recommendations_overall_rating(10, movies_rating_df))

   movieId                                           title  Num_ratings  \
0      318                       Shawshank Redemption, The          317   
1      356                                    Forrest Gump          329   
2      296                                    Pulp Fiction          307   
3      593                       Silence of the Lambs, The          279   
4     2571                                     Matrix, The          278   
5     2959                                      Fight Club          218   
6      260              Star Wars: Episode IV - A New Hope          251   
7      858                                  Godfather, The          192   
8      527                                Schindler's List          220   
9     1196  Star Wars: Episode V - The Empire Strikes Back          211   

   Bayesian_rating  
0         3.849204  
1         3.777805  
2         3.766295  
3         3.750706  
4         3.750129  
5         3.742897  
6         3.733955  
7     

In [171]:


#unique_genres is a list with all available genres

# Function to find the best movie for each genre
def find_best_movies_for_genres(df, possible_genres, num_of_movies):
    df_copy = df.copy()
    df_copy.reset_index(inplace=True)
    df_copy['genres'] = df_copy['genres'].str.split(', ')
    best_movies_for_genres = {}
    for genre in possible_genres:
        # Filter DataFrame by genre

        genre_df = df_copy[df_copy['genres'].apply(lambda x: genre in x)]
        if not genre_df.empty:
           
            # Select the top movie for the genre
            best_movie = genre_df[['movieId', 'title', 'Num_ratings', 'Bayesian_rating']].iloc[:num_of_movies]
            best_movies_for_genres[genre] = best_movie
    del df_copy
    del genre_df
    return best_movies_for_genres

# Find the best movie for each genre
best_movies_for_genres = find_best_movies_for_genres(movies_rating_df, unique_genres, 10)

# Display the best movies for each genre
for genre, movie in best_movies_for_genres.items():
    print(f"Best movie for genre '{genre}':")
    print(movie[['movieId', 'title']])
    print()


Best movie for genre '(no genres listed)':
      movieId                                              title
1172   166024                                           Whiplash
1341   122896   Pirates of the Caribbean: Dead Men Tell No Tales
1359   171495                                             Cosmos
1588   176601                                       Black Mirror
1607   147250  The Adventures of Sherlock Holmes and Doctor W...
1696   171749                              Death Note: Desu nôto
2123   169034                                           Lemonade
2201   159161                               Ali Wong: Baby Cobra
2220   173535  The Adventures of Sherlock Holmes and Doctor W...
2291   156605                                           Paterson

Best movie for genre 'Action':
    movieId                                              title
4      2571                                        Matrix, The
5      2959                                         Fight Club
6       260          

In [172]:
#unique_genres is a list with all available genres

# Function to find the best movie for each genre
def find_best_movies_for_genres(df, genre, num_of_movies):
    df_copy = df.copy()
    df_copy.reset_index(inplace=True)
    df_copy['genres'] = df_copy['genres'].str.split(', ')
    best_movies_for_genres = {}
   

    genre_df = df_copy[df_copy['genres'].apply(lambda x: genre in x)]
    if not genre_df.empty:
        
        # Select the top movie for the genre
        best_movie = genre_df[['movieId', 'title','genres', 'Num_ratings', 'Bayesian_rating']].iloc[:num_of_movies]
        best_movies_for_genres[genre] = best_movie
    del df_copy
    del genre_df
    return best_movies_for_genres

# Find the best movie for each genre
#print(movies_rating_df)
genre = 'Crime'
best_movies_for_genres = find_best_movies_for_genres(movies_rating_df, genre, 10)
#print(best_movies_for_genres)

# Display the best movies for each genre

print(f"Best movie for genre '{genre}':")
movie_genre_df = best_movies_for_genres[genre][['movieId', 'title','genres', 'Num_ratings', 'Bayesian_rating']]
movie_genre_df.head()

Best movie for genre 'Crime':


Unnamed: 0,movieId,title,genres,Num_ratings,Bayesian_rating
0,318,"Shawshank Redemption, The","[Crime, Drama]",317,3.849204
2,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",307,3.766295
3,593,"Silence of the Lambs, The","[Crime, Horror, Thriller]",279,3.750706
5,2959,Fight Club,"[Action, Crime, Drama, Thriller]",218,3.742897
7,858,"Godfather, The","[Crime, Drama]",192,3.721965


In [173]:
#best movies of a given year

def recommend_movies_best_year(df, year, num_movies):
    movies_year = df.copy()
    movies_year.reset_index(inplace=True)
    movies_year = movies_year[movies_year['year'] == year]
    movies_year_best = movies_year.iloc[:num_movies]
    del movies_year 
    
    
    return movies_year_best[['movieId', 'title', 'Num_ratings', 'Bayesian_rating', 'year']]

    
best_year_movies = recommend_movies_best_year(movies_rating_df, 2010, 10)
best_year_movies.head()


Unnamed: 0,movieId,title,Num_ratings,Bayesian_rating,year
35,79132,Inception,143,3.633437,2010.0
127,78499,Toy Story 3,55,3.559459,2010.0
129,74458,Shutter Island,67,3.559083,2010.0
156,81845,"King's Speech, The",58,3.551971,2010.0
199,81834,Harry Potter and the Deathly Hallows: Part 1,47,3.542962,2010.0


In [174]:
#recommend the bets movies of a given decade

def recommend_movies_best_decade(df, decade, num_movies):
    movies_decade = df.copy()
    movies_decade.reset_index(inplace=True)
    movies_decade = movies_decade[movies_decade['decade'] == decade]
    movies_decade_best = movies_decade.iloc[:num_movies]
    del movies_decade 
    
    
    return movies_decade_best[['movieId', 'title', 'Num_ratings', 'Bayesian_rating', 'decade']]#'decade','year',  'Bayesian_rating']]

    
best_year_movies = recommend_movies_best_decade(movies_rating_df, 2010, 10)
best_year_movies.head()

Unnamed: 0,movieId,title,Num_ratings,Bayesian_rating,decade
35,79132,Inception,143,3.633437,2010.0
108,91529,"Dark Knight Rises, The",76,3.565972,2010.0
113,109487,Interstellar,73,3.5637,2010.0
114,112852,Guardians of the Galaxy,59,3.563327,2010.0
127,78499,Toy Story 3,55,3.559459,2010.0
