In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


## Read the data

In [2]:
## Load the movie dataset from data.world into a DataFrame
DATA_URL = "https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7"
df = pd.read_csv(DATA_URL)

### Getting the relevant features for processing

In [3]:
## Keep only the text columns that describe each movie
df = df.loc[:, ['Title', 'Genre', 'Director', 'Actors', 'Plot']]

In [4]:
## Preview the first five rows of the selected columns
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


In [5]:
## Preview the last five rows of the selected columns
df.tail()

Unnamed: 0,Title,Genre,Director,Actors,Plot
245,The Lost Weekend,"Drama, Film-Noir",Billy Wilder,"Ray Milland, Jane Wyman, Phillip Terry, Howard...",The desperate life of a chronic alcoholic is f...
246,Short Term 12,Drama,Destin Daniel Cretton,"Brie Larson, John Gallagher Jr., Stephanie Bea...",A 20-something supervising staff member of a r...
247,His Girl Friday,"Comedy, Drama, Romance",Howard Hawks,"Cary Grant, Rosalind Russell, Ralph Bellamy, G...",A newspaper editor uses every trick in the boo...
248,The Straight Story,"Biography, Drama",David Lynch,"Sissy Spacek, Jane Galloway Heitz, Joseph A. C...",An old man makes a long journey by lawn-mover ...
249,Slumdog Millionaire,Drama,"Danny Boyle, Loveleen Tandan","Dev Patel, Saurabh Shukla, Anil Kapoor, Raj Zu...",A Mumbai teen reflects on his upbringing in th...


In [6]:
## Summarize the column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 5 columns):
Title       250 non-null object
Genre       250 non-null object
Director    250 non-null object
Actors      250 non-null object
Plot        250 non-null object
dtypes: object(5)
memory usage: 9.9+ KB


## Data Cleaning

In [27]:
## Check dtypes and count the missing values in each column.
## NOTE(review): the saved output of this cell comes from a later, out-of-order
## run (execution count 27) and already lists the b_o_w column, which is only
## created further down; re-run the notebook top to bottom to refresh it.
df.info()
df.isnull().sum(axis=0)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 6 columns):
Title       250 non-null object
Genre       250 non-null object
Director    250 non-null object
Actors      250 non-null object
Plot        250 non-null object
b_o_w       250 non-null object
dtypes: object(6)
memory usage: 11.8+ KB


Title       0
Genre       0
Director    0
Actors      0
Plot        0
b_o_w       0
dtype: int64

In [9]:
## Look at the first rows again before the text is normalized
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


In [10]:
## Lowercase every string cell so the same word in different cases is not
## treated as two different words; non-string cells are left unchanged
df = df.applymap(lambda cell: cell.lower() if type(cell) is str else cell)

In [11]:
## Build the bag-of-words (BOW) text for each movie by joining its title,
## genre, director, actors and plot with commas; this column is vectorized next
df['b_o_w'] = df['Title'].str.cat(
    [df['Genre'], df['Director'], df['Actors'], df['Plot']],
    sep=',',
)
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot,b_o_w
0,the shawshank redemption,"crime, drama",frank darabont,"tim robbins, morgan freeman, bob gunton, willi...",two imprisoned men bond over a number of years...,"the shawshank redemption,crime, drama,frank da..."
1,the godfather,"crime, drama",francis ford coppola,"marlon brando, al pacino, james caan, richard ...",the aging patriarch of an organized crime dyna...,"the godfather,crime, drama,francis ford coppol..."
2,the godfather: part ii,"crime, drama",francis ford coppola,"al pacino, robert duvall, diane keaton, robert...",the early life and career of vito corleone in ...,"the godfather: part ii,crime, drama,francis fo..."
3,the dark knight,"action, crime, drama",christopher nolan,"christian bale, heath ledger, aaron eckhart, m...",when the menace known as the joker emerges fro...,"the dark knight,action, crime, drama,christoph..."
4,12 angry men,"crime, drama",sidney lumet,"martin balsam, john fiedler, lee j. cobb, e.g....",a jury holdout attempts to prevent a miscarria...,"12 angry men,crime, drama,sidney lumet,martin ..."


In [12]:
## Create the count vectorizer and turn the bag-of-words text into a
## document-term count matrix (one row per movie)
## NOTE(review): the vectorizer is built with default settings, so presumably no
## stop words are removed and words such as "the"/"of" count toward similarity
## -- confirm whether stop-word filtering is wanted
count = CountVectorizer()
count_matrix = count.fit_transform(raw_documents=df['b_o_w'])


In [13]:
## Compute the pairwise cosine similarity between every pair of movies
cos_sim = cosine_similarity(X=count_matrix, Y=count_matrix)

In [28]:
## Similarity scores of the movie in row 249 against every movie
cos_sim[249]


array([0.16610265, 0.27050089, 0.2795085 , 0.37341524, 0.20412415,
       0.11135885, 0.31774445, 0.21516574, 0.06004806, 0.31441407,
       0.04508348, 0.19113821, 0.24743583, 0.32129478, 0.19287919,
       0.28180093, 0.22453656, 0.14322297, 0.12376844, 0.08804509,
       0.28867513, 0.23668232, 0.17376201, 0.16390252, 0.20833333,
       0.15309311, 0.14433757, 0.20801257, 0.07654655, 0.30024029,
       0.19153272, 0.29568851, 0.2773501 , 0.14048787, 0.17817416,
       0.13055824, 0.10416667, 0.19641855, 0.15779219, 0.13266716,
       0.18490007, 0.17407766, 0.048795  , 0.12632279, 0.0949158 ,
       0.15032921, 0.13878403, 0.1898316 , 0.17327582, 0.27216553,
       0.12376844, 0.34096545, 0.21356055, 0.05270463, 0.25301216,
       0.12768848, 0.26837252, 0.16012815, 0.13693064, 0.23448415,
       0.17817416, 0.17078251, 0.04454354, 0.15811388, 0.30065841,
       0.14638501, 0.1753336 , 0.16390252, 0.14048787, 0.23273733,
       0.09901475, 0.17057206, 0.22017621, 0.20134682, 0.13867

In [15]:
## Create a series that maps each movie title to its row position in df
indices = pd.Series(df.index,index=df['Title'])
## Series.drop_duplicates() compares the values (the row positions, which are
## already unique), so it never removed repeated titles. De-duplicate on the
## title index instead, keeping the first occurrence, so that looking up a
## title always returns a single position.
indices = indices[~indices.index.duplicated(keep='first')]

## Define a function that takes the title of a movie and returns the 10 most similar movies

In [16]:
def movie_recommendations(movie_title,cos_sim=cos_sim,df=df,indices=indices,top_n=10):
    """Return the titles of the movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Title to look up in ``indices``. If the exact title is not found, its
        lowercase form is tried, because the titles in this notebook are
        lowercased before ``indices`` is built.
    cos_sim : array-like
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie.
    df : pandas.DataFrame
        Frame with a ``Title`` column. The value found in ``indices`` is used
        as a row position, so this assumes the default 0..n-1 index.
    indices : pandas.Series
        Maps a title to its row position.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles, most similar first, excluding the queried movie.

    Raises
    ------
    KeyError
        If the title is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    # Titles are stored lowercased, so fall back to the lowercase spelling
    if movie_title not in indices and isinstance(movie_title, str):
        movie_title = movie_title.lower()
    # Row position of the movie matching the title
    idx = indices[movie_title]
    # A repeated title yields several positions; use the first one
    if np.ndim(idx) > 0:
        idx = np.asarray(idx)[0]
    scores = np.asarray(cos_sim[idx])
    # Stable sort on the negated scores: best match first, ties in row order
    ranked = np.argsort(-scores, kind='stable')
    # Drop the queried movie by position instead of assuming it sorts first
    ranked = ranked[ranked != idx][:top_n]
    return df['Title'].iloc[ranked]


In [29]:
## Recommendations for one title (passed in lowercase, as stored in df)
m= movie_recommendations('anatomy of a murder')

In [30]:
## Display the recommended titles
m

3                                        the dark knight
15                                            goodfellas
172                                               gandhi
13                 the lord of the rings: the two towers
6          the lord of the rings: the return of the king
229                                    the king's speech
2                                 the godfather: part ii
51                                 the dark knight rises
224                                            rio bravo
9      the lord of the rings: the fellowship of the ring
Name: Title, dtype: object

In [22]:
## Recommendations for a second title
k=movie_recommendations('the dark knight')

In [23]:
## Display the recommended titles
k

51                                 the dark knight rises
9      the lord of the rings: the fellowship of the ring
6          the lord of the rings: the return of the king
229                                    the king's speech
13                 the lord of the rings: the two towers
115                              the wolf of wall street
31                               raiders of the lost ark
49                                           the shining
192                                             the help
153                                            spotlight
Name: Title, dtype: object