In [None]:
## KNN ##
# Classify new data points based on distance to know data
# Define distance metric between items in dataset and find K closest items
# Then use these items to predict some test item's property by voting on it

In [1]:
# KNN on MovieLens. Predict rating using 10 similar movie's genre + popularity (total number of ratings)
import pandas as pd
import numpy as np

rating_cols = ['user_id','movie_id','rating']
ratings = pd.read_csv(r'C:\Users\MHayden\Desktop\ml-100k\u.data',sep='\t',names=rating_cols,usecols=range(3),encoding="ISO-8859-1")
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,0,50,5
1,0,172,5
2,0,133,1
3,196,242,3
4,186,302,3


In [2]:
# Group by movie ID. Compute each movies popularity and average rating
movieProperties = ratings.groupby('movie_id').agg({'rating':[np.size,np.mean]})
movieProperties.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2
1,452,3.878319
2,131,3.206107
3,90,3.033333
4,209,3.550239
5,86,3.302326


In [3]:
# Normalise no. of ratings to compute distance. 
# 0: no one watched it, 1: most popular
numRatings = pd.DataFrame(movieProperties['rating']['size'])

# Take the max and min number of ratings (most+least popular movie) and normalise that range
# Note: lambda applys function to entire DataFrame
normaliseRatings = numRatings.apply(lambda x: (x-np.min(x)/(np.max(x)-np.min(x))))
normaliseRatings.head()

Unnamed: 0_level_0,size
movie_id,Unnamed: 1_level_1
1,451.998285
2,130.998285
3,89.998285
4,208.998285
5,85.998285


In [4]:
# Get genre info. In file, there's 19 fields each corresponding to a genre
# 0: not in that genre, 1: it's in that genre. 
# Create dict with movie name, genre values, normalised popularity score and average rating
movieDict = {}
with open(r'C:\Users\MHayden\Desktop\ml-100k\u.item') as file:
    for line in file:
        fields = line.rstrip('\n').split('|')
        movieID = int(fields[0])
        name = fields[1]
        genre = fields[5:25]
        genre = map(int,genre)
        movieDict[movieID] = (name, np.array(list(genre)), normaliseRatings.loc[movieID].get('size'), movieProperties.loc[movieID].rating.get('mean'))
        
movieDict[1]        
## LOOK INTO PANDAS ALTERNATIVE ##

('Toy Story (1995)',
 array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 451.9982847341338,
 3.8783185840707963)

In [8]:
# Function to compute distance metric between 2 movies based on:
# 1) How similar their genres are and 2) how similar their popularity is.
# Think of below as graph: one axis is genre, other is popularity
from scipy import spatial

def distanceMetric(x,y):
    genreA = x[1]
    genreB = y[1]
    # Cosine similarity between 2 genre vectors
    genreDistance = spatial.distance.cosine(genreA,genreB)
    popularityA = x[2]
    popularityB = y[2]
    # Compare popularity scores and get difference
    popularityDistance = abs(popularityA - popularityB)
    return genreDistance + popularityDistance

# Further the distance, less similar they are.
# Movies 2 and 4
print(movieDict[2],'\n',movieDict[4])
distanceMetric(movieDict[2],movieDict[4])

('GoldenEye (1995)', array([0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]), 130.9982847341338, 3.2061068702290076) 
 ('Get Shorty (1995)', array([0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 208.9982847341338, 3.550239234449761)


78.66666666666667

In [24]:
# Function to compute distance between a test movie and the dataset.
# Then sort by distance and print KNN
import operator

def getKNN(movieID,K):
    distances = []
    for movie in movieDict:
        # Compute distance score between test movie and dataset
        if(movie != movieID):
            distance = distanceMetric(movieDict[movieID],movieDict[movie])
            distances.append((movie,distance))
    distances.sort(key=operator.itemgetter(1))
    knn = []
    # Get K results defined below
    for i in range(K):
        knn.append(distances[i][0])
    return knn

K = 10 # 10 nearest neighbours
averageRating = 0
knn = getKNN(1,K) # MovieID 1, 10 nearest neighbours
# Iterate through K, compute average rating from each neighbour for prediction
for n in knn:
    # Average rating per movie
    averageRating += movieDict[n][3]
    print(movieDict[n][0] + ' ' + str(movieDict[n][3]))
averageRating /= float(K)

Air Force One (1997) 3.6310904872389793
Independence Day (ID4) (1996) 3.438228438228438
Scream (1996) 3.4414225941422596
English Patient, The (1996) 3.656964656964657
Raiders of the Lost Ark (1981) 4.252380952380952
Liar Liar (1997) 3.156701030927835
Godfather, The (1972) 4.283292978208232
Return of the Jedi (1983) 4.007889546351085
Fargo (1996) 4.155511811023622
Contact (1997) 3.8035363457760316


In [25]:
# Average rating of 10 nearest neighbours of Toy Story
print(averageRating)
movieDict[1]

3.782701884124209


('Toy Story (1995)',
 array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 451.9982847341338,
 3.8783185840707963)