# **Movie Recommendation System: Version 2**

#### Changes made:

*   Implemented randomization
*   Updated for user input and interactivity



In [72]:
# Importing libraries/packages
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [73]:
# Reading in the data
df = pd.read_csv('/content/Movies - Large Dataset.csv')
df

Unnamed: 0,name,year,movie_rated,run_length,genres,release_date,rating,num_raters,num_reviews
0,The Dark Knight,2008,PG-13,2h 32min,Action; Crime; Drama;,18 July 2008 (USA),9.0,2224522,6836
1,Inception,2010,PG-13,2h 28min,Action; Adventure; Sci-Fi;,16 July 2010 (USA),8.8,1981675,3820
2,The Matrix,1999,R,2h 16min,Action; Sci-Fi;,31 March 1999 (USA),8.7,1619761,4281
3,The Lord of the Rings: The Fellowship of the Ring,2001,PG-13,2h 58min,Action; Adventure; Drama;,19 December 2001 (USA),8.8,1609165,5365
4,The Dark Knight Rises,2012,PG-13,2h 44min,Action; Adventure;,20 July 2012 (USA),8.4,1470329,2979
...,...,...,...,...,...,...,...,...,...
1695,Catch-22,1970,R,2h 2min,Comedy; Drama; War;,24 June 1970 (USA),7.1,21424,145
1696,The Great Raid,2005,R,2h 12min,Action; Drama; War;,12 August 2005 (USA),6.7,20965,194
1697,Saints and Soldiers,2003,PG-13,1h 30min,Action; Drama; War;,25 March 2005 (USA),6.7,19730,163
1698,Stop-Loss,2008,R,1h 52min,Drama; War;,28 March 2008 (USA),6.4,19456,102


In [74]:
# Listing column names
df.columns

Index(['name', 'year', 'movie_rated', 'run_length', 'genres', 'release_date',
       'rating', 'num_raters', 'num_reviews'],
      dtype='object')

In [75]:
# Updating column names
df = df.rename(columns={'name': 'Movie_Name', 'year': 'Year', 'movie_rated': 'Rating',
                   'run_length': 'Run_Length', 'genres': 'Genres',
                   'release_date': 'Release_Date', 'rating': 'Review_Rating',
                   'num_raters': 'Num_Raters', 'num_reviews': 'Num_Reviews'})

In [76]:
# Verifying update of column names
df.head()

Unnamed: 0,Movie_Name,Year,Rating,Run_Length,Genres,Release_Date,Review_Rating,Num_Raters,Num_Reviews
0,The Dark Knight,2008,PG-13,2h 32min,Action; Crime; Drama;,18 July 2008 (USA),9.0,2224522,6836
1,Inception,2010,PG-13,2h 28min,Action; Adventure; Sci-Fi;,16 July 2010 (USA),8.8,1981675,3820
2,The Matrix,1999,R,2h 16min,Action; Sci-Fi;,31 March 1999 (USA),8.7,1619761,4281
3,The Lord of the Rings: The Fellowship of the Ring,2001,PG-13,2h 58min,Action; Adventure; Drama;,19 December 2001 (USA),8.8,1609165,5365
4,The Dark Knight Rises,2012,PG-13,2h 44min,Action; Adventure;,20 July 2012 (USA),8.4,1470329,2979


In [77]:
# Checking data types
df.dtypes

Unnamed: 0,0
Movie_Name,object
Year,int64
Rating,object
Run_Length,object
Genres,object
Release_Date,object
Review_Rating,float64
Num_Raters,int64
Num_Reviews,int64


In [78]:
# Cleaning spaces in genre/converting to lower case
df['Genres'] = df['Genres'].apply(lambda x: [i.strip().lower() for i in x.split(";")])

print(df['Genres'].head())

0         [action, crime, drama, ]
1    [action, adventure, sci-fi, ]
2               [action, sci-fi, ]
3     [action, adventure, drama, ]
4            [action, adventure, ]
Name: Genres, dtype: object


In [79]:
# Convert lists of genres back to strings for TF-IDF
df['Genre_String'] = df['Genres'].apply(lambda x: ' '.join(x))

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Genre_String'])

print(tfidf_matrix.shape) # (number of movies, number of unique genres)

(1700, 19)


In [80]:
# Calculating Cosine Simularity (similarity between two vectors-->movie genres)
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim.shape) # a similarity matrix for number of movies

(1700, 1700)


In [81]:
def get_recommendations(movie_title, cosine_sim=cosine_sim, df=df):
    """
    Recommends movies similar to the given movie title based on genre.

    Args:
        movie_title (str): The title of the movie to find recommendations for.
        cosine_sim (numpy.ndarray): Cosine similarity matrix.
        df (pandas.DataFrame): DataFrame containing movie data.

    Returns:
        list: A list of recommended movie titles.
    """
    try:
        movie_indices = df[df['Movie_Name'] == movie_title].index.tolist()

        if not movie_indices:
            return "Movie not found."

        idx = movie_indices[0]

        # Get the pairwise similarity scores
        sim_scores = list(enumerate(cosine_sim[idx]))

        # Sort the movies based on the similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Remove all instances of the movie from the recommendations
        sim_scores = [item for item in sim_scores if item[0] not in movie_indices]

        # Shuffle the similar movies
        random.shuffle(sim_scores)

        # Get the scores of the 10 most similar movies (or fewer if there are less than 10)
        num_recommendations = min(10, len(sim_scores))
        sim_scores = sim_scores[:num_recommendations]

        # Get the movie indices
        movie_indices = [i[0] for i in sim_scores]

        # Return the most similar movies
        return df['Movie_Name'].iloc[movie_indices].tolist()

    except IndexError:
        return "Movie not found or index out of range."
    except KeyError:
        return "Column 'name' not found in DataFrame."
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return "An error occurred while generating recommendations."

In [None]:
def get_user_input_recommendations(df, cosine_sim):
    """
    Prompts the user for a movie title and provides recommendations.

    Args:
        df (pandas.DataFrame): DataFrame containing movie data.
        cosine_sim (numpy.ndarray): Cosine similarity matrix.
    """
    while True:
        movie_title = input("Enter a movie title (or type 'exit' to quit): ")
        if movie_title.lower() == 'exit':
            break  # Exit the loop

        recommendations = get_recommendations(movie_title, cosine_sim, df)

        print(f"\nRecommendations for '{movie_title}':")
        if isinstance(recommendations, str):
            print(recommendations)  # Prints "Movie not found" message
        else:
            for movie in recommendations:
                print(movie)
        print("\n")

if __name__ == "__main__":
    get_user_input_recommendations(df, cosine_sim)


# Conclusion

By implementing randomization and user interactivity into version 2, the recommendations returned were more fruitful. As predicted, the randomization removed duplicate and static recommendations, taking full advantage of the large 1700 movie title dataset.

Testing search results for 'The Matrix' twice (to ensure both returned randomized recommendations) resulted in the following:

**TEST 1**

***Recommendations for 'The Matrix':***
* War Dogs
* Parasite
* Crash
* Vertigo
* Begin Again
* The Perks of Being a Wallflower
* Unbroken
* The Sound of Music
* Aladdin
* Dumb and Dumber

**TEST 2**

***Recommendations for 'The Matrix':***
* Jackie
* Yesterday
* Silent Hill
* Wanted
* Avengers: Age of Ultron
* Blue Crush
* The Bourne Supremacy
* Deadpool
* The King's Speech
* Abraham Lincoln: Vampire Hunter