In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('imdb.csv')
df1 = df[['Title', 'Genre', 'Rating', 'Votes']]
df1.head()

Unnamed: 0,Title,Genre,Rating,Votes
0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",8.1,757074
1,Prometheus,"Adventure,Mystery,Sci-Fi",7.0,485820
2,Split,"Horror,Thriller",7.3,157606
3,Sing,"Animation,Comedy,Family",7.2,60545
4,Suicide Squad,"Action,Adventure,Fantasy",6.2,393727


The raw number of votes isn't very useful for computing distances between movies, so we'll create a new DataFrame that contains the normalized number of ratings. So, a value of 0 means nobody rated it, and a value of 1 will mean it's the most popular movie there is.

In [2]:
popularity = df1[['Votes','Rating']]
movieNormalizedNumRatings = popularity.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
movieNormalizedNumRatings.head()


Unnamed: 0,Votes,Rating
0,0.422474,0.873239
1,0.271093,0.71831
2,0.087923,0.760563
3,0.033755,0.746479
4,0.219697,0.605634


In [3]:
df.Genre = pd.factorize(df.Genre)[0]
print(df)

     Rank                                    Title  Genre  \
0       1                  Guardians of the Galaxy      0   
1       2                               Prometheus      1   
2       3                                    Split      2   
3       4                                     Sing      3   
4       5                            Suicide Squad      4   
5       6                           The Great Wall      4   
6       7                               La La Land      5   
7       8                                 Mindhorn      6   
8       9                       The Lost City of Z      7   
9      10                               Passengers      8   
10     11  Fantastic Beasts and Where to Find Them      9   
11     12                           Hidden Figures     10   
12     13                                Rogue One      0   
13     14                                    Moana     11   
14     15                                 Colossal     12   
15     16               


We want to predict the rating of a movie based on the genres, but to do that we need to know how many genres we have. Let's count how many unique genres are in the original dataframe

In [4]:
movieDict = {}
for index, row in df.iterrows():
    name = row.Title
    genre = row.Genre
    votes = movieNormalizedNumRatings.loc[index].get('Votes')
    avgRating = row.Rating
    movieDict[index] = (name,genre,votes,avgRating)
    print(movieDict)
    

{0: ('Guardians of the Galaxy', 0, 0.4224744747761398, 8.1)}
{0: ('Guardians of the Galaxy', 0, 0.4224744747761398, 8.1), 1: ('Prometheus', 1, 0.2710928060585259, 7.0)}
{0: ('Guardians of the Galaxy', 0, 0.4224744747761398, 8.1), 1: ('Prometheus', 1, 0.2710928060585259, 7.0), 2: ('Split', 2, 0.08792285090032396, 7.3)}
{0: ('Guardians of the Galaxy', 0, 0.4224744747761398, 8.1), 1: ('Prometheus', 1, 0.2710928060585259, 7.0), 2: ('Split', 2, 0.08792285090032396, 7.3), 3: ('Sing', 3, 0.0337549634317509, 7.2)}
{0: ('Guardians of the Galaxy', 0, 0.4224744747761398, 8.1), 1: ('Prometheus', 1, 0.2710928060585259, 7.0), 2: ('Split', 2, 0.08792285090032396, 7.3), 3: ('Sing', 3, 0.0337549634317509, 7.2), 4: ('Suicide Squad', 4, 0.21969746435956033, 6.2)}
{0: ('Guardians of the Galaxy', 0, 0.4224744747761398, 8.1), 1: ('Prometheus', 1, 0.2710928060585259, 7.0), 2: ('Split', 2, 0.08792285090032396, 7.3), 3: ('Sing', 3, 0.0337549634317509, 7.2), 4: ('Suicide Squad', 4, 0.21969746435956033, 6.2), 5:

{0: ('Guardians of the Galaxy', 0, 0.4224744747761398, 8.1), 1: ('Prometheus', 1, 0.2710928060585259, 7.0), 2: ('Split', 2, 0.08792285090032396, 7.3), 3: ('Sing', 3, 0.0337549634317509, 7.2), 4: ('Suicide Squad', 4, 0.21969746435956033, 6.2), 5: ('The Great Wall', 4, 0.031238576782161503, 6.1), 6: ('La La Land', 5, 0.1443314330679659, 8.3), 7: ('Mindhorn', 6, 0.001355578436871287, 6.4), 8: ('The Lost City of Z', 7, 0.003977442371174007, 7.1), 9: ('Passengers', 8, 0.10721626470891897, 7.0), 10: ('Fantastic Beasts and Where to Find Them', 9, 0.12948090107737512, 7.5), 11: ('Hidden Figures', 10, 0.051924960446018235, 7.8), 12: ('Rogue One', 0, 0.18029193210388117, 7.9), 13: ('Moana', 11, 0.06590377011532741, 7.7), 14: ('Colossal', 12, 0.00477214953218871, 6.4), 15: ('The Secret Life of Pets', 11, 0.0670802045924475, 6.6), 16: ('Hacksaw Ridge', 10, 0.11814516241548563, 8.2), 17: ('Jason Bourne', 13, 0.08413738834894564, 6.7), 18: ('Lion', 14, 0.05692424889290707, 8.1), 19: ('Arrival', 15, 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




Now let's define a function that computes the "distance" between two movies based on how similar their genres are, and how similar their popularity is. Just to make sure it works, we'll compute the distance between movie with index 2 and movie with index 4:

The higher the distance, the less similar the movies are.

In [5]:
from scipy import spatial

def ComputeDistance(a, b):
    genresA = a[1]
    genresB = b[1]
    genreDistance = spatial.distance.cosine(genresA, genresB)
    popularityA = a[2]
    popularityB = b[2]
    popularityDistance = abs(popularityA - popularityB)
    return genreDistance + popularityDistance
    
ComputeDistance(movieDict[2], movieDict[4])

0.13177461345923636

In [6]:
print(movieDict[2])
print(movieDict[4])

('Split', 2, 0.08792285090032396, 7.3)
('Suicide Squad', 4, 0.21969746435956033, 6.2)



Now we can choose a movie and use KNN to compute the movies which are most similat to it

In [7]:
import operator

def getNeighbors(movieID, K):
    distances = []
    for movie in movieDict:
        if (movie != movieID):
            dist = ComputeDistance(movieDict[movieID], movieDict[movie])
            distances.append((movie, dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(K):
        neighbors.append(distances[x][0])
    return neighbors

K = 10
avgRating = 0
myMovieIndex = 54
neighbors = getNeighbors(myMovieIndex, K)
for neighbor in neighbors:
    avgRating += movieDict[neighbor][3]
    print (movieDict[neighbor][0] + " " + str(movieDict[neighbor][3]))
    
avgRating /= K

Guardians of the Galaxy 8.1
Interstellar 8.6
Star Wars: Episode VII - The Force Awakens 8.1
Deadpool 8.0
Prometheus 7.0
Pirates of the Caribbean: On Stranger Tides 6.7
Suicide Squad 6.2
Arrival 8.0
John Wick 7.2
Doctor Strange 7.6


  dist = 1.0 - uv / np.sqrt(uu * vv)


In [8]:
avgRating


7.55

In [9]:
movieDict[1]

('Prometheus', 1, 0.2710928060585259, 7.0)