In [1]:
import numpy as np 
import pandas as pd 

from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the preprocessed movie table from disk and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs on a machine where this exact file exists.
path = r'C:\Users\nesha\Desktop\movies_preprocessed1.csv'  # location of the CSV
data = pd.read_csv(filepath_or_buffer=path)
data.head(n=5)

Unnamed: 0,release_date,title,overview_lemm
0,1995-10-30,Toy Story,"['Led', 'toy', 'live', 'happily', 'room', 'bir..."
1,1995-12-15,Jumanji,"['When', 'sibling', 'discover', 'enchant', 'bo..."
2,1995-12-22,Grumpier Old Men,"['family', 'wedding', 'ancient', 'feud', 'next..."
3,1995-12-22,Waiting to Exhale,"['Cheated', 'mistreat', 'step', 'woman', 'be',..."
4,1995-02-10,Father of the Bride Part II,"['Just', 'when', 'recover', 'daughter', 's', '..."


In [3]:
#making movie title the index
# Guarded so that re-running this cell is a no-op: once 'title' has been moved
# out of the columns, an unconditional set_index('title') raises KeyError.
# The result is rebound rather than mutated in place so the cell stays idempotent.
if data.index.name != 'title':
    data = data.set_index('title')

In [4]:
#repeating the finalized topic modeling
# custom stop words on top of scikit-learn's built-in English list
my_stop_words = text.ENGLISH_STOP_WORDS.union(['this', 'when', 'each','film', 'by', 'as', 'live', 'make',
                                               'ago','later','earlier', 'spend','tell','say','base','based',
                                               'know', 'want', 'try','just', 'begin', 'special',
                                               'soon','look','come', 'decide', 's','kill', 'turn',])
# `stop_words` is documented as 'english', a list, or None; .union() returns a
# frozenset, which recent scikit-learn releases reject during parameter
# validation, so it is converted to a (sorted, hence deterministic) list here.
vectorizer = TfidfVectorizer(stop_words=sorted(my_stop_words))
doc_word = vectorizer.fit_transform(data['overview_lemm'])

#creating topic modeler (20 topics, fixed seed for reproducible factors)
nmf_model = NMF(n_components=20, random_state=10, max_iter=1000)
doc_topic = nmf_model.fit_transform(doc_word)

#creating a dataframe of topic weights: one row per movie, one column per topic
# (NMF weights are non-negative loadings; they are not normalized probabilities)
doc_topic_nmf = pd.DataFrame(doc_topic.round(5), index = data.index)

#finding similarity between each movie based on the degree to which they belong to each topic
# result is a square matrix with one row and one column per movie, in data.index order
cosine_sim = cosine_similarity(doc_topic_nmf, doc_topic_nmf)

In [5]:
#recommendation system function

#creating a series of the movie titles; position i here corresponds to row i of cosine_sim
# (the former bare `indices[:5]` preview was removed: it was not the last
# expression of the cell, so it never displayed anything)
indices = pd.Series(data.index)

#defining the function that takes in a movie title and returns the titles of its most similar movies
def recommendations(title, cosine_sim=None, top_n=5, titles=None):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title of the query movie. If several rows share this title,
        the first matching row is used.
    cosine_sim : array-like, optional
        Square similarity matrix in which row ``i`` holds the scores of
        movie ``i`` against every movie. Defaults to the notebook-level
        ``cosine_sim``, looked up when the function is called so that a
        re-computed matrix is picked up without redefining this function.
    top_n : int, default 5
        Maximum number of recommendations to return.
    titles : sequence, optional
        Movie titles in the same order as the rows of ``cosine_sim``.
        Defaults to the notebook-level ``indices`` series.

    Returns
    -------
    list
        Up to ``top_n`` titles ordered from most to least similar. The query
        row itself is never included.

    Raises
    ------
    IndexError
        If ``title`` does not occur in ``titles``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n!r}")

    if titles is None:
        titles = indices
    if cosine_sim is None:
        # the parameter shadows the notebook-level matrix of the same name,
        # so the module-level object has to be fetched through globals()
        cosine_sim = globals()["cosine_sim"]

    # object dtype keeps the original Python title objects and gives purely
    # positional access, independent of any index labels `titles` may carry
    title_array = np.asarray(titles, dtype=object)

    # getting the row position of the movie that matches the title
    matches = np.flatnonzero(title_array == title)
    if matches.size == 0:
        raise IndexError(f"movie title {title!r} was not found")
    idx = matches[0]

    # ranking every movie by similarity, best first; the stable sort makes
    # tied scores resolve deterministically (original row order)
    scores = np.asarray(cosine_sim[idx], dtype=float).ravel()
    ranked = np.argsort(-scores, kind="stable")

    # dropping the query movie explicitly: it is not guaranteed to rank first
    # (tied scores, or an all-zero topic vector whose self-similarity is 0)
    ranked = ranked[ranked != idx]

    return title_array[ranked[:top_n]].tolist()

In [6]:
# example query: recommended titles for 'Bonnie and Clyde'
recommendations('Bonnie and Clyde')

['The Foreigner',
 'The Interrupters',
 'Small Town Gay Bar',
 'Ivanhoe',
 'The Rosa Parks Story']

In [7]:
# example query: recommended titles for 'Robin Hood'
recommendations('Robin Hood')

['Mulberry Street', 'San Pietro', 'The Speak', "Rustlers' Rhapsody", 'Kill!']

In [8]:
# example query: recommended titles for 'The Notebook'
recommendations('The Notebook')

['The Fountain',
 'Leila',
 'Last Year at Marienbad',
 'The Wildest Dream',
 'I Was a Communist for the FBI']

In [9]:
# example query: recommended titles for 'Zorro'
recommendations('Zorro')

['Spies', 'Guns', 'House of Flying Daggers', 'Shrek', 'Casablanca Express']