In [15]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
#Load the movie data set from 'imdb_top_1000.csv' (path is relative to the working directory)
movies = pd.read_csv('imdb_top_1000.csv')

In [7]:
#Preview the first row to see which columns are available
movies.head(1)

Unnamed: 0,Movie_id,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,Actors,No_of_Votes,Gross
0,0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",2343110,28341469


In [10]:
#Get the number of rows (movies) and the number of columns in the data set
movies.shape

(1000, 18)

In [16]:
#Columns whose text describes each movie for the recommendation engine
columns = [
    'Actors',
    'Director',
    'Genre',
    'Series_Title',
]

In [17]:
#True if at least one cell in the important columns is missing, False otherwise
movies[columns].isna().to_numpy().any()

False

In [18]:
#Create a function to combine the values of the important columns into a single string
def get_important_features(data, feature_columns=None):
  """Combine the text of several columns into one string per row.

  Parameters
  ----------
  data : pandas.DataFrame
    Frame containing every column named in ``feature_columns``. The values
    must be strings; a non-string value (e.g. NaN) raises ``TypeError``.
  feature_columns : sequence of str, optional
    Columns to join, in order. Defaults to
    ``['Actors', 'Director', 'Genre', 'Series_Title']``.

  Returns
  -------
  list of str
    One space-separated string per row of ``data``, in row order.
  """
  if feature_columns is None:
    feature_columns = ['Actors', 'Director', 'Genre', 'Series_Title']

  # Read each column positionally (tolist) rather than by index label, so the
  # function also works on frames whose index is not 0..n-1 (e.g. after filtering).
  column_values = [data[name].tolist() for name in feature_columns]

  return [' '.join(parts) for parts in zip(*column_values)]

In [19]:
#Create a column to hold the combined strings
#NOTE: this adds the column to `movies` in place; re-running the cell overwrites it
movies['important_features'] = get_important_features(movies)

#Show the first three rows, including the new column
movies.head(3)

Unnamed: 0,Movie_id,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,Actors,No_of_Votes,Gross,important_features
0,0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",2343110,28341469,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi..."
1,1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,"Marlon Brando, Al Pacino, James Caan, Diane Ke...",1620367,134966411,"Marlon Brando, Al Pacino, James Caan, Diane Ke..."
2,2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",2303232,534858444,"Christian Bale, Heath Ledger, Aaron Eckhart, M..."


In [21]:
#Convert the combined text of each movie to a matrix of token counts
#(rows follow the row order of `movies`)
cm = CountVectorizer().fit_transform(movies['important_features'])

In [23]:
#Get the cosine similarity matrix from the count matrix
cs = cosine_similarity(cm)
#Print the cosine similarity matrix
print(cs)

[[1.         0.13801311 0.13363062 ... 0.06482037 0.14824986 0.06681531]
 [0.13801311 1.         0.19364917 ... 0.06262243 0.07161149 0.12909944]
 [0.13363062 0.19364917 1.         ... 0.06063391 0.06933752 0.125     ]
 ...
 [0.06482037 0.06262243 0.06063391 ... 1.         0.13453456 0.        ]
 [0.14824986 0.07161149 0.06933752 ... 0.13453456 1.         0.13867505]
 [0.06681531 0.12909944 0.125      ... 0.         0.13867505 1.        ]]


In [24]:
#Get the shape of the cosine similarity matrix (rows, columns)
cs.shape

(1000, 1000)

In [27]:
#Get the title of the movie that the user likes
def recommend(title, n=7, data=None, similarity=None):
    """Print the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact ``Series_Title`` of the movie the user likes.
    n : int, default 7
        Maximum number of recommendations to print.
    data : pandas.DataFrame, optional
        Frame with a ``Series_Title`` column. Defaults to the notebook-level
        ``movies`` frame.
    similarity : 2-D array-like, optional
        Similarity matrix whose i-th row and column correspond to the i-th
        row of ``data``. Defaults to the notebook-level ``cs`` matrix.

    Returns
    -------
    None
        The recommendations are printed, not returned.

    Raises
    ------
    IndexError
        If no row of ``data`` has the requested title.
    """
    if data is None:
        data = movies
    if similarity is None:
        similarity = cs

    #Find the row position of the movie; the similarity matrix is indexed by
    #row position, not by the Movie_id value or the index label
    titles = data['Series_Title'].to_numpy()
    matches = np.flatnonzero(titles == title)
    if matches.size == 0:
        raise IndexError('No movie titled {!r} was found in the data set'.format(title))
    row = int(matches[0])

    #Pair every other movie with its score; the queried movie is excluded by
    #position so a tie at the top score cannot make it recommend itself
    candidates = [(idx, score) for idx, score in enumerate(similarity[row]) if idx != row]
    #Stable sort, highest similarity first
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)

    #Print the n most similar movies
    print('The', n, 'most recommended movies', title, 'are:\n')
    for rank, (idx, _) in enumerate(ranked[:max(n, 0)], start=1):
        print(rank, titles[idx])

In [28]:
#Example: print the recommendations for 'Pulp Fiction'
recommend('Pulp Fiction')

The 7 most recommended movies Pulp Fiction are:

1 Kill Bill: Vol. 1
2 The Hateful Eight
3 Kill Bill: Vol. 2
4 Die Hard: With a Vengeance
5 Reservoir Dogs
6 Enter the Dragon
7 Stagecoach


In [32]:
#Keep only the id, title and overview columns for export
new_movies = movies[['Movie_id', 'Series_Title', 'Overview']]

In [33]:
#Preview the first rows of the reduced frame
new_movies.head()

Unnamed: 0,Movie_id,Series_Title,Overview
0,0,The Shawshank Redemption,Two imprisoned men bond over a number of years...
1,1,The Godfather,An organized crime dynasty's aging patriarch t...
2,2,The Dark Knight,When the menace known as the Joker wreaks havo...
3,3,The Godfather: Part II,The early life and career of Vito Corleone in ...
4,4,12 Angry Men,A jury holdout attempts to prevent a miscarria...


In [29]:
import pickle

In [35]:
#Save the reduced frame to 'movies.pkl'; the context manager guarantees the
#file is flushed and closed even if pickling fails
with open('movies.pkl', 'wb') as pickle_file:
    pickle.dump(new_movies, pickle_file)