In [59]:
import pandas as pd
import matplotlib.pyplot as plt
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


## Read the data

In [60]:
df = pd.read_csv("https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7")

### Getting the relevant features for processing

In [61]:
df = df[['Title','Genre','Director','Actors','Plot']]

In [62]:
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


In [63]:
df.tail()

Unnamed: 0,Title,Genre,Director,Actors,Plot
245,The Lost Weekend,"Drama, Film-Noir",Billy Wilder,"Ray Milland, Jane Wyman, Phillip Terry, Howard...",The desperate life of a chronic alcoholic is f...
246,Short Term 12,Drama,Destin Daniel Cretton,"Brie Larson, John Gallagher Jr., Stephanie Bea...",A 20-something supervising staff member of a r...
247,His Girl Friday,"Comedy, Drama, Romance",Howard Hawks,"Cary Grant, Rosalind Russell, Ralph Bellamy, G...",A newspaper editor uses every trick in the boo...
248,The Straight Story,"Biography, Drama",David Lynch,"Sissy Spacek, Jane Galloway Heitz, Joseph A. C...",An old man makes a long journey by lawn-mover ...
249,Slumdog Millionaire,Drama,"Danny Boyle, Loveleen Tandan","Dev Patel, Saurabh Shukla, Anil Kapoor, Raj Zu...",A Mumbai teen reflects on his upbringing in th...


In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 5 columns):
Title       250 non-null object
Genre       250 non-null object
Director    250 non-null object
Actors      250 non-null object
Plot        250 non-null object
dtypes: object(5)
memory usage: 9.9+ KB


In [65]:
df.isnull().sum()

Title       0
Genre       0
Director    0
Actors      0
Plot        0
dtype: int64

## Data Cleaning

In [66]:
## initialize a new column 
df['KeyWords'] =''
for index,row in df.iterrows():
    plot = row['Plot']
## instantiate Rake
    r=Rake()
## Extract the keywords by passing a text
    r.extract_keywords_from_text(plot)
## get a dictionary with keywords as keys and scores as values
    key_words_dic_scores = r.get_word_degrees()
## Assign the Keywords to the newly created column of respective movies
    row['Key_words']= list(key_words_dic_scores.keys())
## drop the plot column
df.drop(columns='Plot',inplace=True)

In [67]:
df.head()

Unnamed: 0,Title,Genre,Director,Actors,KeyWords
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",


In [68]:
##Convert everything to lowercase to avoid duplicacy
df = df.applymap(lambda s:s.lower() if type(s) == str else s)

In [69]:
## make a BOW to be vectorized
df['b_o_w']=df['Title']+','+df['Genre']+','+df['Director']+','+df['Actors']
df.head()

Unnamed: 0,Title,Genre,Director,Actors,KeyWords,b_o_w
0,the shawshank redemption,"crime, drama",frank darabont,"tim robbins, morgan freeman, bob gunton, willi...",,"the shawshank redemption,crime, drama,frank da..."
1,the godfather,"crime, drama",francis ford coppola,"marlon brando, al pacino, james caan, richard ...",,"the godfather,crime, drama,francis ford coppol..."
2,the godfather: part ii,"crime, drama",francis ford coppola,"al pacino, robert duvall, diane keaton, robert...",,"the godfather: part ii,crime, drama,francis fo..."
3,the dark knight,"action, crime, drama",christopher nolan,"christian bale, heath ledger, aaron eckhart, m...",,"the dark knight,action, crime, drama,christoph..."
4,12 angry men,"crime, drama",sidney lumet,"martin balsam, john fiedler, lee j. cobb, e.g....",,"12 angry men,crime, drama,sidney lumet,martin ..."


In [70]:
## Instantiate the count vectorizer
count=CountVectorizer()
count_matrix =count.fit_transform(df['b_o_w'])


In [71]:
## Generate a similarity matrix
cos_sim = cosine_similarity(count_matrix,count_matrix)

In [80]:
cos_sim[0]


array([1.        , 0.2       , 0.17320508, 0.19364917, 0.13801311,
       0.06666667, 0.20672456, 0.20701967, 0.07161149, 0.21223818,
       0.06454972, 0.05634362, 0.        , 0.1987616 , 0.12524486,
       0.13801311, 0.05773503, 0.        , 0.25048972, 0.12909944,
       0.22019275, 0.18786729, 0.19364917, 0.06262243, 0.06454972,
       0.06666667, 0.12909944, 0.05923489, 0.06900656, 0.31311215,
       0.        , 0.12524486, 0.13333333, 0.        , 0.2       ,
       0.0745356 , 0.        , 0.06085806, 0.06900656, 0.12171612,
       0.12909944, 0.06666667, 0.        , 0.11547005, 0.12909944,
       0.        , 0.05383819, 0.06666667, 0.06900656, 0.13801311,
       0.18786729, 0.06262243, 0.        , 0.06900656, 0.12171612,
       0.        , 0.06900656, 0.        , 0.        , 0.05504819,
       0.06454972, 0.2       , 0.12909944, 0.06900656, 0.06454972,
       0.11846978, 0.12909944, 0.06900656, 0.2       , 0.11268723,
       0.05383819, 0.20701967, 0.18257419, 0.06666667, 0.06454

In [73]:
## Create a series for the movie titles
indices = pd.Series(df.index,index=df['Title']).drop_duplicates()

## Define a function that takes the title of the movie and returns the first 10 similar movies

In [83]:
def movie_recommendations(movie_title,cos_sim=cos_sim,df=df,indices=indices):
    movies_recommended = []
## get the index of the movie matching the title
    idx = indices[movie_title]
## create a series of similarity scores in a descending order
    series_score= list(enumerate(cos_sim[idx]))
    series_score=sorted(series_score,key=lambda x:x[1],reverse=True)
    series_score= series_score[1:11]
##Get the indices of the top 10 similar movies
    movie_indices =[i[0] for i in series_score]
## populate the list with the titles for the first 10
    return df['Title'].iloc[movie_indices]


In [90]:
m= movie_recommendations('amadeus')

In [91]:
m

153                          spotlight
172                             gandhi
5                     schindler's list
60                          braveheart
143                       hotel rwanda
157                   12 years a slave
229                  the king's speech
14     one flew over the cuckoo's nest
23                 saving private ryan
117                      hacksaw ridge
Name: Title, dtype: object