# Importing necessary packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Loading in the data

In [2]:
# Movie Titles
movies_df = pd.read_csv('movies.csv')

print(movies_df.shape)
display(movies_df.head())

(9742, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Movie Tags
tags_df = pd.read_csv('tags.csv',usecols=['movieId', 'tag'])

print(tags_df.shape)
display(tags_df.head())

(3683, 2)


Unnamed: 0,movieId,tag
0,60756,funny
1,60756,Highly quotable
2,60756,will ferrell
3,89774,Boxing story
4,89774,MMA


In [4]:
# Movie Ratings
ratings_df = pd.read_csv('ratings.csv',usecols=['movieId', 'rating'])

print(ratings_df.shape)
display(ratings_df.head())

(100836, 2)


Unnamed: 0,movieId,rating
0,1,4.0
1,3,4.0
2,6,4.0
3,47,5.0
4,50,5.0


In [5]:
# Taking the average ratings per movie ID
ratings_df = pd.DataFrame(ratings_df.groupby('movieId')['rating'].mean()).reset_index()
ratings_df.head()

Unnamed: 0,movieId,rating
0,1,3.92093
1,2,3.431818
2,3,3.259615
3,4,2.357143
4,5,3.071429


In [6]:
# Using replace on the genre feature to remove the symbol
movies_df['genres'] = [split_genre.replace('|',' ') for split_genre in movies_df['genres']]
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Merging the dataframes
combined_df = movies_df.merge(tags_df, on='movieId').merge(ratings_df, on='movieId')
print(combined_df.shape)
display(combined_df.head())

(3662, 5)


Unnamed: 0,movieId,title,genres,tag,rating
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar,3.92093
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar,3.92093
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,fun,3.92093
3,2,Jumanji (1995),Adventure Children Fantasy,fantasy,3.431818
4,2,Jumanji (1995),Adventure Children Fantasy,magic board game,3.431818


In [8]:
# Movie ID duplicates are present which may interfer with the system so duplicates are dropped
no_dup_df = combined_df.drop_duplicates('movieId')
print(no_dup_df.shape)
display(no_dup_df.head())

(1554, 5)


Unnamed: 0,movieId,title,genres,tag,rating
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar,3.92093
3,2,Jumanji (1995),Adventure Children Fantasy,fantasy,3.431818
7,3,Grumpier Old Men (1995),Comedy Romance,moldy,3.259615
9,5,Father of the Bride Part II (1995),Comedy,pregnancy,3.071429
11,7,Sabrina (1995),Comedy Romance,remake,3.185185


In [9]:
# Combining the tags and genres to create collective data that can be analyzed
no_dup_df['data'] = no_dup_df['genres']+' '+no_dup_df['tag'].reset_index(drop=True)

# Dropping the not needed columns and resetting the index
no_dup_df = no_dup_df.drop(columns=['genres', 'tag'])
no_dup_df.reset_index(drop=True, inplace=True)
no_dup_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_dup_df['data'] = no_dup_df['genres']+' '+no_dup_df['tag'].reset_index(drop=True)


Unnamed: 0,movieId,title,rating,data
0,1,Toy Story (1995),3.92093,Adventure Animation Children Comedy Fantasy pixar
1,2,Jumanji (1995),3.431818,Adventure Children Fantasy pregnancy
2,3,Grumpier Old Men (1995),3.259615,Comedy Romance Mafia
3,5,Father of the Bride Part II (1995),3.071429,Comedy Hollywood
4,7,Sabrina (1995),3.185185,Comedy Romance alcoholism


In [10]:
# Null fields for the data column were discover so they are replaced with a space
ready_df = no_dup_df.fillna(' ')

# Calculating similarity scores

In [11]:
# Creating a CountVectorizer instance
count_vect = CountVectorizer(stop_words='english')

# Applying the vectorizer
count_vectorized_features = count_vect.fit_transform(ready_df['data'])

In [12]:
count_vectorized_features.shape

(1554, 494)

In [13]:
# Computing the similarity score
similarity = cosine_similarity(count_vectorized_features)

# Recommender creation

In [14]:
# This function takes the respective movie index and analyzes the other indexes correlating similarites
def similarity_recommender(index):
    # Looking at the similarites between the argument vs other indexes
    # sorting the similarity scores in descending order
    sim_scores = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda vector:vector[1])
    # Printing the title of the movie selected by the user
    print('-'*40,f'\nRecommended Movies For: {no_dup_df.iloc[index][1]}\n','-'*40)
    # Looping over the similarity scores in the range and printing the title and rating
    for i in sim_scores[1:11]:
        print(no_dup_df.iloc[i[0]][1], round(no_dup_df.iloc[i[0]][2], 1))

In [15]:
# This function takes the user input and connects it to the exact or closest movie title
def movie_input_cleanser(input):
    # Creating a list of titles and the respective index
    indexed_title = list(enumerate(no_dup_df['title']))
    for i in range(len(indexed_title)):
        # Looping over every index/title combination, making it lower case, and then searching if the user input fits
        result = indexed_title[i][1].lower().find(input.lower())
        # results variable shows 0 for matches and -1 for no match
        if result == 0:
            similarity_recommender(i)
            break
    else:
        print('Sorry, please try spelling differently or a choose a different movie')
        # Sending the user back to the starting fuction to enter another title
        begin_recommender()

In [16]:
# Function created to collect user input
def begin_recommender():
    user_interact = input('Are you ready to use the recommender? (Y or N) ')  
    # Multiple responses added to catch multiple responses from the user
    if user_interact.lower() in ('y', 'yes', 'yup'):
        get_movie = input('Please enter the movie name or part: ')
        movie_input_cleanser(get_movie)        
    elif user_interact.lower() in ('n', 'no', 'nope'):
        print('Farewell//Session Has Ended')
    # Starting the user input over to give the user another chance to enter a valid response
    else:
        print('Invalid entry, try again')
        begin_recommender()

# <h1><center>Movie Recommender</center></h1>

In [None]:
# Run to start the recommender
begin_recommender()