# K-Nearest Neighbors (KNN)

## Finalidade do notebook ##

Escrever testes e possíveis anotações importantes sobre o uso de KNN com scikit-learn e pandas.

In [1]:
# Definindo e limpando dataset
import pandas as pd
import numpy as np

# Read only the first three tab-separated columns of the ratings file and
# label them, since the file itself carries no header row.
cols = ['user_id', 'movie_id', 'stars']
df = pd.read_csv(
    "./Dataset-Curso/u.data",
    sep='\t',
    names=cols,
    usecols=range(len(cols)),
)

df.head()

Unnamed: 0,user_id,movie_id,stars
0,0,50,5
1,0,172,5
2,0,133,1
3,196,242,3
4,186,302,3


Fazendo uma análise rápida do dataset, é possível perceber que as notas vão de 0 a 5, o que deixa uma margem pequena para diferenças muito grandes entre os valores calculados.

In [2]:
# Per-movie number of ratings ("size") and average rating ("mean").
# The aggregations are named by string: passing np.size / np.mean to
# groupby.agg emits a pandas FutureWarning, while the string aliases yield
# the same ('stars', 'size') and ('stars', 'mean') columns without it.
notas = df.groupby('movie_id').agg({'stars': ['size', 'mean']})
notas.head()

  notas = df.groupby('movie_id').agg({'stars': [np.size, np.mean]})


Unnamed: 0_level_0,stars,stars
Unnamed: 0_level_1,size,mean
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2
1,452,3.878319
2,131,3.206107
3,90,3.033333
4,209,3.550239
5,86,3.302326


In [3]:
# Number of ratings per movie, kept as a one-column ('size') DataFrame.
movie_ratings = notas['stars']['size'].to_frame()

# Min-max scaling: the least-rated movie maps to 0 and the most-rated to 1.
movie_ratings_normalized = (movie_ratings - movie_ratings.min()) / (
    movie_ratings.max() - movie_ratings.min()
)
movie_ratings_normalized.head()

Unnamed: 0_level_0,size
movie_id,Unnamed: 1_level_1
1,0.773585
2,0.222985
3,0.152659
4,0.356775
5,0.145798


In [19]:
# Maps movie id -> (title, genre flags, normalized rating count, mean rating).
movie_dict = {}

with open(r'./Dataset-Curso/u.item', encoding='ISO-8859-1') as f:
    for line in f:
        # Each line is a '|'-separated record: id, title, ..., genre flags.
        fields = line.rstrip('\n').split('|')
        movieId = int(fields[0])

        # Skip movies that have no rating statistics to attach.
        if movieId not in movie_ratings_normalized.index or movieId not in notas.index:
            continue

        name = fields[1]
        # Fields from index 5 onward hold the per-genre integer flags; build
        # the list eagerly instead of keeping a one-shot map iterator around.
        genre = [int(flag) for flag in fields[5:25]]

        movie_dict[movieId] = (
            name,
            genre,
            movie_ratings_normalized.loc[movieId].get('size'),
            notas.loc[movieId].stars.get('mean'),
        )

In [20]:
movie_dict[1]

('Toy Story (1995)',
 [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 np.float64(0.7735849056603774),
 np.float64(3.8783185840707963))

In [21]:
from scipy import spatial

def distancia(a, b):
    """Return the dissimilarity between two movie records.

    Each record is indexable: position 1 holds the genre vector and
    position 2 holds the normalized popularity. The result is the cosine
    distance between the genre vectors plus the absolute difference of the
    two popularity values.
    """
    genre_gap = spatial.distance.cosine(a[1], b[1])
    popularity_gap = abs(a[2] - b[2])
    return genre_gap + popularity_gap

In [22]:
import operator

def KNN(movieId, K):
    """Return the ids of the K movies closest to ``movieId``.

    Every other entry of the module-level ``movie_dict`` is compared with
    the target using ``distancia`` and the ids are returned ordered from
    nearest to farthest.

    Parameters:
        movieId: key of the reference movie in ``movie_dict``.
        K: number of neighbours wanted.

    Returns:
        A list with at most K movie ids. When K exceeds the number of other
        movies available, all of them are returned instead of raising
        IndexError; a non-positive K yields an empty list.

    Raises:
        KeyError: if ``movieId`` is not a key of ``movie_dict``.
    """
    # Look the target up once rather than on every loop iteration.
    target = movie_dict[movieId]
    distances = [
        (movie, distancia(target, features))
        for movie, features in movie_dict.items()
        if movie != movieId
    ]
    distances.sort(key=operator.itemgetter(1))
    # Slicing clamps to the list length; max() keeps a negative K from being
    # read as "everything but the last |K| items".
    return [movie for movie, _dist in distances[:max(K, 0)]]

# Estimate the rating of movie 1 as the mean rating of its K nearest movies.
K = 5
neighbors = KNN(1, K)

avg_rating = 0
for neighbor in neighbors:
    info = movie_dict[neighbor]  # (title, genres, popularity, mean rating)
    print(info[0] + " " + str(info[3]))
    avg_rating += info[3]

avg_rating /= float(K)

Liar Liar (1997) 3.156701030927835
Aladdin (1992) 3.8127853881278537
Willy Wonka and the Chocolate Factory (1971) 3.6319018404907975
Monty Python and the Holy Grail (1974) 4.0664556962025316
Full Monty, The (1997) 3.926984126984127


In [23]:
avg_rating

np.float64(3.7189656165466287)

## Atividade ##
Melhorar as escolhas arbitrárias feitas pelo tutor. As escolhas arbitrárias são o valor de K e a métrica de distância, que é apenas a soma da distância do cosseno entre os vetores de gêneros com a diferença absoluta entre as popularidades normalizadas (quantidade de avaliações).