# **Movie Recommendation System: Version 1**

In [7]:
# Importing libraries/packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Reading in the data
df = pd.read_csv('/content/Movies - Large Dataset.csv')
df

Unnamed: 0,name,year,movie_rated,run_length,genres,release_date,rating,num_raters,num_reviews
0,The Dark Knight,2008,PG-13,2h 32min,Action; Crime; Drama;,18 July 2008 (USA),9.0,2224522,6836
1,Inception,2010,PG-13,2h 28min,Action; Adventure; Sci-Fi;,16 July 2010 (USA),8.8,1981675,3820
2,The Matrix,1999,R,2h 16min,Action; Sci-Fi;,31 March 1999 (USA),8.7,1619761,4281
3,The Lord of the Rings: The Fellowship of the Ring,2001,PG-13,2h 58min,Action; Adventure; Drama;,19 December 2001 (USA),8.8,1609165,5365
4,The Dark Knight Rises,2012,PG-13,2h 44min,Action; Adventure;,20 July 2012 (USA),8.4,1470329,2979
...,...,...,...,...,...,...,...,...,...
1695,Catch-22,1970,R,2h 2min,Comedy; Drama; War;,24 June 1970 (USA),7.1,21424,145
1696,The Great Raid,2005,R,2h 12min,Action; Drama; War;,12 August 2005 (USA),6.7,20965,194
1697,Saints and Soldiers,2003,PG-13,1h 30min,Action; Drama; War;,25 March 2005 (USA),6.7,19730,163
1698,Stop-Loss,2008,R,1h 52min,Drama; War;,28 March 2008 (USA),6.4,19456,102


In [3]:
# Listing column names
df.columns

Index(['name', 'year', 'movie_rated', 'run_length', 'genres', 'release_date',
       'rating', 'num_raters', 'num_reviews'],
      dtype='object')

In [10]:
# Updating column names
df = df.rename(columns={'name': 'Movie_Name', 'year': 'Year', 'movie_rated': 'Rating',
                   'run_length': 'Run_Length', 'genres': 'Genres',
                   'release_date': 'Release_Date', 'rating': 'Review_Rating',
                   'num_raters': 'Num_Raters', 'num_reviews': 'Num_Reviews'})

In [11]:
# Verifying update of column names
df.head()

Unnamed: 0,Movie_Name,Year,Rating,Run_Length,Genres,Release_Date,Review_Rating,Num_Raters,Num_Reviews
0,The Dark Knight,2008,PG-13,2h 32min,Action; Crime; Drama;,18 July 2008 (USA),9.0,2224522,6836
1,Inception,2010,PG-13,2h 28min,Action; Adventure; Sci-Fi;,16 July 2010 (USA),8.8,1981675,3820
2,The Matrix,1999,R,2h 16min,Action; Sci-Fi;,31 March 1999 (USA),8.7,1619761,4281
3,The Lord of the Rings: The Fellowship of the Ring,2001,PG-13,2h 58min,Action; Adventure; Drama;,19 December 2001 (USA),8.8,1609165,5365
4,The Dark Knight Rises,2012,PG-13,2h 44min,Action; Adventure;,20 July 2012 (USA),8.4,1470329,2979


In [12]:
# Checking data types
df.dtypes

Unnamed: 0,0
Movie_Name,object
Year,int64
Rating,object
Run_Length,object
Genres,object
Release_Date,object
Review_Rating,float64
Num_Raters,int64
Num_Reviews,int64


In [13]:
# Cleaning spaces in genre/converting to lower case
df['Genres'] = df['Genres'].apply(lambda x: [i.strip().lower() for i in x.split(";")])

print(df['Genres'].head())

0         [action, crime, drama, ]
1    [action, adventure, sci-fi, ]
2               [action, sci-fi, ]
3     [action, adventure, drama, ]
4            [action, adventure, ]
Name: Genres, dtype: object


In [20]:
# Convert lists of genres back to strings for TF-IDF
df['Genre_String'] = df['Genres'].apply(lambda x: ' '.join(x))

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Genre_String'])

print(tfidf_matrix.shape) # (number of movies, number of unique genres)

(1700, 19)


In [21]:
# Calculating Cosine Simularity (similarity between two vectors-->movie genres)
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim.shape) # a similarity matrix for number of movies

(1700, 1700)


In [37]:
def get_recommendations(movie_title, cosine_sim=cosine_sim, df=df):
    """
    Recommends movies similar to the given movie title based on genre.

    Args:
        movie_title: The title of the movie to find recommendations for.
        cosine_sim: Cosine similarity matrix.

    Returns:
        list: A list of recommended movie titles.
    """
    # Get the index of the movie
    try:
        movie_indices = df[df['Movie_Name'] == movie_title].index.tolist() # Get all indices
        print(f"Movie Indices: {movie_indices}")

        if not movie_indices:
            return "Movie not found." # handle when movie is not found

        idx = movie_indices[0] # Pick the first index if multiple exist

        # Get the pairwise similarity scores
        sim_scores = list(enumerate(cosine_sim[idx]))

        # Sort the movies based on the similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Remove ALL instances of the movie from the recommendations
        sim_scores = [item for item in sim_scores if item[0] not in movie_indices]

        # Get the scores of the 10 most similar movies
        sim_scores = sim_scores[1:11] # NOW get the top 10

        # Get the movie indices
        movie_indices = [i[0] for i in sim_scores]

        # Return the top 10 most similar movies
        return df['Movie_Name'].iloc[movie_indices].tolist()

    except IndexError as e:
        print(f"IndexError: {e}")
        return "Movie not found or index out of range."
    except KeyError as e:
        print(f"KeyError: {e}")
        return "Column 'name' not found in DataFrame."
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return "An error occurred while generating recommendations."

In [46]:
movie_title = "The Matrix"
recommendations = get_recommendations(movie_title)

print(f"Recommendations for '{movie_title}':")
if isinstance(recommendations, str):
    print(recommendations)  # Print "Movie not found" message
else:
    for movie in recommendations:
        print(movie)

Movie Indices: [2, 1301]
Recommendations for 'The Matrix':
The Terminator
The Matrix Reloaded
Terminator 2: Judgment Day
The Terminator
The Matrix Reloaded
The Matrix Revolutions
Terminator 3: Rise of the Machines
V for Vendetta
Logan
Looper


The movie recommendation system worked for the most part. I tested it with recommendation for 'The Dark Knight' which returned the following:

* Léon: The Professional
* Baby Driver
* Sicario
* Man on Fire
* Lord of War
* 3:10 to Yuma
* Law Abiding Citizen
* Léon: The Professional
* Pulp Fiction
* The Godfather

However, recommendations for movies similar in genre to The Matrix resulted in this:

* ***The Terminator***
* ***The Matrix Reloaded***
* Terminator 2: Judgment Day
* ***The Terminator***
* ***The Matrix Reloaded***
* The Matrix Revolutions
* Terminator 3: Rise of the Machines
* V for Vendetta
* Logan
* Looper

# Conclusion

I believe implementing randomization into the system would resolve the static ten recommendations received for each movie title. Plus, it should help to eliminate duplicate recommendations. I would also like to tweak the code, updating it with user input and interactivity.