In [1]:
import pandas as pd

In [2]:
movie_data = pd.read_csv('dataset.csv')
movie_data.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [3]:
# focus on only the four following features, the rest are not relevant for recommending movies
# update movie_data with only those four features
movie_data = movie_data[['id','title','genre','overview']]


In [4]:
# create a new feature 'tags', which combines the genre along with the overview (this will simplify the recommendation process a bit)
# once 'tags' is added, create a new dataset that drops 'genre' and 'overview'
movie_data['tags'] = movie_data['genre'] + movie_data['overview']
new_data = movie_data.drop(columns=['genre', 'overview'])

In [6]:
# convert the text from 'tags' into a vector for natural language processing
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=10000, stop_words='english')

In [7]:
vector = cv.fit_transform(new_data['tags'].values.astype('U')).toarray()
vector.shape

(10000, 10000)

In [8]:
# use cosine similarity to compare how similar the tags are to each other
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vector)

In [9]:
# the similarity table can be visualized as a 2D table with both the x and y axis as tag1, tag2, tag3...
# for example, the major diagonal value is always 1 because the angle between the same tag (tag1 v. tag1, tag2 v. tag2, ...) is always 0, and cos(0) = 1.
similarity

array([[1.        , 0.06253054, 0.05802589, ..., 0.07963978, 0.07597372,
        0.03798686],
       [0.06253054, 1.        , 0.08980265, ..., 0.        , 0.        ,
        0.        ],
       [0.05802589, 0.08980265, 1.        , ..., 0.02541643, 0.03636965,
        0.        ],
       ...,
       [0.07963978, 0.        , 0.02541643, ..., 1.        , 0.03327792,
        0.03327792],
       [0.07597372, 0.        , 0.03636965, ..., 0.03327792, 1.        ,
        0.04761905],
       [0.03798686, 0.        , 0.        , ..., 0.03327792, 0.04761905,
        1.        ]])

In [10]:
# pick any movie (in this case the 2nd movie "The Godfather"), list out all of the (index,distance) pairs for that movie, and sort them in reverse order by the distance value
distances = sorted(list(enumerate(similarity[2])), reverse=True, key=lambda vector:vector[1])

In [11]:
# match the indices from distances to the title
# for this recommendation system, we will only recommend the 5 most similar movies
for (i,d) in distances[0:5]:
  print(new_data.iloc[i].title)

The Godfather
The Godfather: Part II
Felon
House of Gucci
Gotti


In [12]:
def recommend(movie):
  # find the index of the movie entry that matches the input
  index = new_data[new_data['title']==movie].index[0]

  # calculate the (index,distance) pairs for the movie at the calculated index above
  distances = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda vector:vector[1])

  # note that distances[1:6] is being used instead of distances[0:5], that is because we don't want to recommend the same input movie
  # the input movie will always be the only movie with a similarity value of 1
  for (i,d) in distances[1:6]:
    print(new_data.iloc[i].title)

In [13]:
# use the pickle files for the web recommender application
import pickle

In [14]:
pickle.dump(new_data, open('movies_list.pkl', 'wb'))

In [18]:
similarity = pickle.dump(similarity, open('similarity.pkl', 'wb'))

In [15]:
pickle.load(open('movies_list.pkl', 'rb'))

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,"Drama,CrimeFramed in the 1940s for the double ..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,RomanceRaj is a rich, carefree, h..."
2,238,The Godfather,"Drama,CrimeSpanning the years 1945 to 1955, a ..."
3,424,Schindler's List,"Drama,History,WarThe true story of how busines..."
4,240,The Godfather: Part II,"Drama,CrimeIn the continuing saga of the Corle..."
...,...,...,...
9995,10196,The Last Airbender,"Action,Adventure,FantasyThe story follows the ..."
9996,331446,Sharknado 3: Oh Hell No!,"Action,TV Movie,Science Fiction,Comedy,Adventu..."
9997,13995,Captain America,"Action,Science Fiction,WarDuring World War II,..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,DramaA man named Farm..."
