In [2]:
import pandas as pd
import numpy as np

r_col = ["user_id", "movie_id", "rating"]
ratings = pd.read_csv("../../datasets/MLCourse/ml-100k/u.data", sep="\t",
                     names=r_col, usecols=range(3))
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,0,50,5
1,0,172,5
2,0,133,1
3,196,242,3
4,186,302,3


In [8]:
movie_properties = ratings.groupby(by="movie_id").agg({"rating": [np.size, np.mean]})

In [9]:
movie_properties.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2
1,452,3.878319
2,131,3.206107
3,90,3.033333
4,209,3.550239
5,86,3.302326


In [14]:
# apply min-max scaling to size
movie_num_ratings = pd.DataFrame(movie_properties["rating"]["size"])
movie_normalized_num_ratings = movie_num_ratings.apply(
    lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
movie_normalized_num_ratings.head()

Unnamed: 0_level_0,size
movie_id,Unnamed: 1_level_1
1,0.773585
2,0.222985
3,0.152659
4,0.356775
5,0.145798


In [25]:
movie_dict = {}

with open("../../datasets/MLCourse/ml-100k/u.item", encoding="latin-1") as f:
    for line in f:
        fields = line.rstrip("\n").split("|")
        movie_id = int(fields[0])
        title = fields[1]
        genres = fields[5:25]
        genres = list(map(int, genres))
        movie_dict[movie_id] = (title, genres, 
                                movie_normalized_num_ratings.loc[movie_id].get("size"),
                                movie_properties.loc[movie_id]["rating"]["mean"])
movie_dict[1]

('Toy Story (1995)',
 [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 0.7735849056603774,
 3.8783185840707963)

In [28]:
# define similarity metric
from scipy import spatial

def compute_distance(a, b):
    genres_A = a[1]
    genres_B = b[1]
    genre_distance = spatial.distance.cosine(genres_A, genres_B)
    popularity_A = a[2]
    popularity_B = b[2]
    popularity_distance = abs(popularity_A - popularity_B)
    return genre_distance + popularity_distance

print(movie_dict[2])
print(movie_dict[4])
print(compute_distance(movie_dict[2], movie_dict[4]))

('GoldenEye (1995)', [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], 0.22298456260720412, 3.2061068702290076)
('Get Shorty (1995)', [0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 0.3567753001715266, 3.550239234449761)
0.8004574042309892


In [36]:
import operator

def get_neighbors(movie_id, K):
    distances = []
    for movie in movie_dict:
        if (movie != movie_id):
            dist = compute_distance(movie_dict[movie], movie_dict[movie_id])
            distances.append((movie, dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for i in range(K):
        neighbors.append(distances[i][0])
    return neighbors

K = 10
avg_rating = 0
neighbors = get_neighbors(1, K)
for neighbor in neighbors:
    avg_rating += movie_dict[neighbor][3]
    print(f"{movie_dict[neighbor][0]} {movie_dict[neighbor][3]}")

avg_rating /= float(K)

Liar Liar (1997) 3.156701030927835
Aladdin (1992) 3.8127853881278537
Willy Wonka and the Chocolate Factory (1971) 3.6319018404907975
Monty Python and the Holy Grail (1974) 4.0664556962025316
Full Monty, The (1997) 3.926984126984127
George of the Jungle (1997) 2.685185185185185
Beavis and Butt-head Do America (1996) 2.7884615384615383
Birdcage, The (1996) 3.4436860068259385
Home Alone (1990) 3.0875912408759123
Aladdin and the King of Thieves (1996) 2.8461538461538463


In [37]:
avg_rating

3.3445905900235564

In [75]:
orig_df = pd.DataFrame(movie_dict).T
genres_df = pd.DataFrame(orig_df[1].tolist(), index=orig_df.index)
orig_df

Unnamed: 0,0,1,2,3
1,Toy Story (1995),"[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.773585,3.87832
2,GoldenEye (1995),"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.222985,3.20611
3,Four Rooms (1995),"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.152659,3.03333
4,Get Shorty (1995),"[0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",0.356775,3.55024
5,Copycat (1995),"[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...",0.145798,3.30233
...,...,...,...,...
1678,Mat' i syn (1997),"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",0,1
1679,B. Monkey (1998),"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",0,3
1680,Sliding Doors (1998),"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, ...",0,2
1681,You So Crazy (1994),"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,3


In [87]:
new_df = orig_df[[0, 2, 3]].join(genres_df, lsuffix="orig")
new_df

Unnamed: 0,0orig,2orig,3orig,0,1,2,3,4,5,6,...,9,10,11,12,13,14,15,16,17,18
1,Toy Story (1995),0.773585,3.87832,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),0.222985,3.20611,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),0.152659,3.03333,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),0.356775,3.55024,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),0.145798,3.30233,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,Mat' i syn (1997),0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1679,B. Monkey (1998),0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1680,Sliding Doors (1998),0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1681,You So Crazy (1994),0,3,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [104]:
from sklearn.neighbors import KNeighborsRegressor

X = new_df.drop(["0orig", "3orig"], axis=1).values
y = new_df["3orig"].values

knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X, y)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [105]:
knn.predict([X[0, :]])

array([3.2021809083998436], dtype=object)

In [108]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        "n_neighbors": range(1, 15, 2)
    }
]

gs = GridSearchCV(estimator=KNeighborsRegressor(), param_grid=param_grid, cv=10, n_jobs=-1,
                 scoring="neg_mean_squared_error")
gs.fit(X, y)

GridSearchCV(cv=10, error_score=nan,
             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                           metric='minkowski',
                                           metric_params=None, n_jobs=None,
                                           n_neighbors=5, p=2,
                                           weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid=[{'n_neighbors': range(1, 15, 2)}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [115]:
pd.DataFrame(gs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.006927,0.000782,0.006724,0.000452,1,{'n_neighbors': 1},-0.399842,-0.291936,-0.488318,-0.418994,-0.576773,-0.569327,-0.544838,-0.874622,-1.108546,-2.328512,-0.760171,0.570085,7
1,0.007658,0.001544,0.007005,0.001033,3,{'n_neighbors': 3},-0.253042,-0.255201,-0.332117,-0.266021,-0.364795,-0.398593,-0.380998,-0.656584,-0.964841,-1.44652,-0.531871,0.370528,6
2,0.007776,0.001528,0.008066,0.000769,5,{'n_neighbors': 5},-0.247669,-0.246892,-0.269998,-0.293317,-0.31532,-0.344929,-0.362434,-0.568327,-0.889331,-1.385633,-0.492385,0.352199,5
3,0.007545,0.001637,0.008377,0.000985,7,{'n_neighbors': 7},-0.258923,-0.255321,-0.284565,-0.284666,-0.295885,-0.340495,-0.332748,-0.578017,-0.814223,-1.387096,-0.483194,0.345656,1
4,0.00756,0.001378,0.008765,0.000869,9,{'n_neighbors': 9},-0.264625,-0.256595,-0.316246,-0.272545,-0.28079,-0.33923,-0.3428,-0.562638,-0.808102,-1.401151,-0.484472,0.347458,3
5,0.007381,0.001494,0.008972,0.000978,11,{'n_neighbors': 11},-0.261446,-0.258246,-0.315214,-0.267373,-0.283163,-0.341623,-0.352816,-0.593811,-0.764916,-1.450949,-0.488956,0.357693,4
6,0.007568,0.000794,0.009776,0.000788,13,{'n_neighbors': 13},-0.258642,-0.257328,-0.307653,-0.266091,-0.285492,-0.335602,-0.348783,-0.559872,-0.78178,-1.439788,-0.484103,0.356149,2
