# KNN (K-Nearest-Neighbors) 

In [1]:
import pandas as pd

r_cols=["user_id", "movie_id", "rating"]
ratings = pd.read_csv("ml-100k/u.data", sep="\t", names=r_cols, usecols=range(3))
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,0,50,5
1,0,172,5
2,0,133,1
3,196,242,3
4,186,302,3


In [2]:
import numpy as np

movieProperties = ratings.groupby("movie_id").agg({"rating": [np.size, np.mean]})
movieProperties.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2
1,452,3.878319
2,131,3.206107
3,90,3.033333
4,209,3.550239
5,86,3.302326


In [3]:
movieNumRatings = pd.DataFrame(movieProperties["rating"]["size"])
movieNormalizedNumRatings = movieNumRatings.apply(lambda x: (x-np.min(x)) / (np.max(x) - np.min(x)))
movieNormalizedNumRatings.head()
# The formula for normalize

Unnamed: 0_level_0,size
movie_id,Unnamed: 1_level_1
1,0.773585
2,0.222985
3,0.152659
4,0.356775
5,0.145798


In [4]:
movieNumRatings2 = pd.DataFrame(movieProperties["rating"]["mean"])
movieNumRatings2.head()
# Average rating of movies

Unnamed: 0_level_0,mean
movie_id,Unnamed: 1_level_1
1,3.878319
2,3.206107
3,3.033333
4,3.550239
5,3.302326


In [5]:
df=pd.read_csv("ml-100k/u.item", sep="|", encoding="iso-8859-1", header=None)
# Let's get the movie names.

In [6]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
df.loc[:2, 5:23].values.tolist()

[[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]]

In [8]:
movieNormalizedNumRatings.head(2)

Unnamed: 0_level_0,size
movie_id,Unnamed: 1_level_1
1,0.773585
2,0.222985


In [9]:
# Shows the categories of movies + Percentage + Rating average
movieDict = {}
with open(r'ml-100k/u.item', encoding="iso-8859-1") as f:
    
    for line in f:
        fields = line.rstrip('\n').split('|')
        movieID = int(fields[0])
        name = fields[1]
        genres = fields[5:24]
        genres = list(map(int, genres))
        movieDict[movieID] = (name, genres, movieNormalizedNumRatings.loc[movieID].get('size'), movieProperties.loc[movieID].rating.get('mean'))

In [10]:
genres

[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [11]:
movieDict

{1: ('Toy Story (1995)',
  [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  0.7735849056603774,
  3.8783185840707963),
 2: ('GoldenEye (1995)',
  [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
  0.22298456260720412,
  3.2061068702290076),
 3: ('Four Rooms (1995)',
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
  0.15265866209262435,
  3.033333333333333),
 4: ('Get Shorty (1995)',
  [0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  0.3567753001715266,
  3.550239234449761),
 5: ('Copycat (1995)',
  [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
  0.1457975986277873,
  3.302325581395349),
 6: ('Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
  [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  0.04288164665523156,
  3.576923076923077),
 7: ('Twelve Monkeys (1995)',
  [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
  0.6706689536878216,
  3.798469387755102),
 8: ('Babe (1995)',
  [0, 0, 0, 0, 1, 1, 0, 

In [12]:
movieDict[2][0], movieDict[4][0] 

('GoldenEye (1995)', 'Get Shorty (1995)')

In [13]:
movieDict[2][1], movieDict[4][1]

([0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [14]:
from scipy import spatial

def ComputeDistance(a, b): # Similarity algorithm 
    genresA = a[1]
    genresB = b[1]
    genreDistance = spatial.distance.cosine(genresA, genresB) # Category similarity between the 2 movies
    popularityA = a[2]
    popularityB = b[2]
    popularitiyDistance = abs(popularityA - popularityB) # Popularity similarity
    return genreDistance + popularitiyDistance

$c^2=a^2+b^2-2abcos\alpha$
Remember the higher the distance, the less similar the movies are. Let's check what movies 2 and 4 actually are - and confirm they're not really all that similar:

In [15]:
print (movieDict[2])
print (movieDict[4])

('GoldenEye (1995)', [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], 0.22298456260720412, 3.2061068702290076)
('Get Shorty (1995)', [0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 0.3567753001715266, 3.550239234449761)


In [16]:
ComputeDistance(movieDict[2], movieDict[4])

0.8004574042309892

In [17]:
import operator

def getNeighbors(movieID, K): # Recommendation systems for a movie's similarity to other movies
    distances = []
    for movie in movieDict:
        if (movie != movieID):
            dist = ComputeDistance(movieDict[movieID], movieDict[movie])
            distances.append((movie, dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(K):
        neighbors.append(distances[x][0])
    return neighbors

K = 30
avgRating = 0
neighbors = getNeighbors(1, K)
for neighbor in neighbors:
    avgRating += movieDict[neighbor][3]
    print( movieDict[neighbor][0] + " " + str(movieDict[neighbor][3]))

Liar Liar (1997) 3.156701030927835
Aladdin (1992) 3.8127853881278537
Willy Wonka and the Chocolate Factory (1971) 3.6319018404907975
Monty Python and the Holy Grail (1974) 4.0664556962025316
Full Monty, The (1997) 3.926984126984127
George of the Jungle (1997) 2.685185185185185
Beavis and Butt-head Do America (1996) 2.7884615384615383
Birdcage, The (1996) 3.4436860068259385
Home Alone (1990) 3.0875912408759123
Aladdin and the King of Thieves (1996) 2.8461538461538463
Lion King, The (1994) 3.7818181818181817
Jungle2Jungle (1997) 2.4393939393939394
Babe (1995) 3.9954337899543377
Wrong Trousers, The (1993) 4.466101694915254
Raising Arizona (1987) 3.875
Beauty and the Beast (1991) 3.792079207920792
Back to the Future (1985) 3.834285714285714
101 Dalmatians (1996) 2.908256880733945
Fish Called Wanda, A (1988) 3.785425101214575
Pinocchio (1940) 3.6732673267326734
Matilda (1996) 3.210526315789474
Mary Poppins (1964) 3.7247191011235956
In & Out (1997) 3.3043478260869565
Fantasia (1940) 3.770114

In [18]:
avgRating

104.82306167105551

In [19]:
avgRating /=float(K)
avgRating

3.4941020557018505

In [20]:
movieDict[1]

('Toy Story (1995)',
 [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 0.7735849056603774,
 3.8783185840707963)

## Assignment 

Our choice of 10 for K was arbitrary - what effect do different K values have on the results?

Our distance metric was also somewhat arbitrary - we just took the cosine distance between the genres and added it to the difference between the normalized popularity scores. Can you improve on that?

In [21]:
# We recommend the 30 films closest to each other.
K = 30
avgRating = 0
neighbors = getNeighbors(1, K)
for neighbor in neighbors:
    avgRating += movieDict[neighbor][3]
    print (movieDict[neighbor][0] + " " + str(movieDict[neighbor][3]))

Liar Liar (1997) 3.156701030927835
Aladdin (1992) 3.8127853881278537
Willy Wonka and the Chocolate Factory (1971) 3.6319018404907975
Monty Python and the Holy Grail (1974) 4.0664556962025316
Full Monty, The (1997) 3.926984126984127
George of the Jungle (1997) 2.685185185185185
Beavis and Butt-head Do America (1996) 2.7884615384615383
Birdcage, The (1996) 3.4436860068259385
Home Alone (1990) 3.0875912408759123
Aladdin and the King of Thieves (1996) 2.8461538461538463
Lion King, The (1994) 3.7818181818181817
Jungle2Jungle (1997) 2.4393939393939394
Babe (1995) 3.9954337899543377
Wrong Trousers, The (1993) 4.466101694915254
Raising Arizona (1987) 3.875
Beauty and the Beast (1991) 3.792079207920792
Back to the Future (1985) 3.834285714285714
101 Dalmatians (1996) 2.908256880733945
Fish Called Wanda, A (1988) 3.785425101214575
Pinocchio (1940) 3.6732673267326734
Matilda (1996) 3.210526315789474
Mary Poppins (1964) 3.7247191011235956
In & Out (1997) 3.3043478260869565
Fantasia (1940) 3.770114