In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

rating_header = ["user_id", "item_id", "rating", "timestamp"]

rating_a_train = pd.read_csv("ua.base", sep = '\t', header = None, names=rating_header)
rating_b_train = pd.read_csv("ub.base", sep = '\t', header = None, names=rating_header)
rating_a_test = pd.read_csv("ua.test", sep = '\t', header = None, names=rating_header)
rating_b_test = pd.read_csv("ub.test", sep = '\t', header = None, names=rating_header)
# users = pd.read_csv("u.user", sep = '|', header = None, names=user_header)
# movies = pd.read_csv("u.item", sep = '|', header = None, encoding = 'latin1', names = movie_header)

# print('Rating: ')
# rating.info()
# print('\nUsers: ')
# users.info()
# print('\nMovies: ')
# movies.info()

In [6]:
df_a_train = rating_a_train.pivot(index='item_id', columns='user_id', values='rating').fillna(0)
df_b_train = rating_b_train.pivot(index='item_id', columns='user_id', values='rating').fillna(0)

In [7]:
from sklearn.neighbors import NearestNeighbors

# df = df_a_train.copy()
df = df_b_train.copy()

# knn的k值
number_neighbors = 100

knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(df.values)

# 找出k每個movie的前k個最相似的其他movie
# distances => 他們的距離
# indices => 他們的index
distances, indices = knn.kneighbors(df.values, n_neighbors=number_neighbors)

def movie_recommender(user, movie):

    user_index = df.columns.tolist().index(user)
    movie_index = df.index.tolist().index(movie)
    
    # 如果沒有被評分過
    if df.iloc[movie_index, user_index] == 0:
        sim_movies = indices[movie_index].tolist()
        movie_distances = distances[movie_index].tolist()

        if movie_index in sim_movies:
            id_movie = sim_movies.index(movie_index)
            sim_movies.remove(movie_index)
            movie_distances.pop(id_movie) 

        else:
            # 未被評分的movie
            sim_movies = sim_movies[:number_neighbors-1]
            movie_distances = movie_distances[:number_neighbors-1]

        # similarity = 1 - distance
        movie_similarity = [1 - x for x in movie_distances]
        movie_similarity_copy = movie_similarity.copy()
        nominator = 0

        # 用前k個相似的movie預測rating
        for s in range(0, len(movie_similarity)):
            if df.iloc[sim_movies[s], user_index] == 0:
                if len(movie_similarity_copy) == (number_neighbors - 1):
                    movie_similarity_copy.pop(s)
                else:
                    movie_similarity_copy.pop(s-(len(movie_similarity)-len(movie_similarity_copy)))
            else:
                # weighted average of ratings for similar movies
                # 這是其中一部份
                nominator = nominator + movie_similarity[s]*df.iloc[sim_movies[s],user_index]

        if len(movie_similarity_copy) > 0:
            if sum(movie_similarity_copy) > 0:
                # 這是weighted average的另一部份
                predicted_r = nominator/sum(movie_similarity_copy)

            else:
                predicted_r = 0
        else:
            predicted_r = 0

        if predicted_r > 5:
            predicted_r = 5
#         print('Predicted Rating: ', predicted_r)
        return predicted_r
    else:
        # 回傳已填過的rating
        return df.iloc[movie_index, user_index]

In [8]:
from IPython.display import clear_output

# RMSE
def error(test):
    nominator = 0
    length = len(test.index)
    new_items = []
    for i in range(length):
        user_id = test.loc[i]['user_id']
        item_id = test.loc[i]['item_id']
        if item_id not in df.index or user_id not in df.columns:
            new_items.append((user_id, item_id))
            continue
        predicted_rating = movie_recommender(user_id, item_id)
        nominator += (predicted_rating - test.loc[i]['rating']) ** 2
        if i % 100 == 0 or i == length - 1:
            clear_output(wait=True)
            print(f'Progress: {i + 1} / {length}')

    nominator = (nominator / length) ** (1 / 2)
    print(f'RMSE: {nominator}')
    print(f'new data: {new_items}')

# error(rating_a_test)
error(rating_b_test)

Progress: 9430 / 9430
RMSE: 1.0605788164205228
new data: [(100, 1236), (167, 1309), (381, 1533), (399, 1543), (587, 1624), (676, 1654), (751, 1661)]


data: ua

1. k = len(df.values) 

    RMSE: 1.0192876612718813

    new data: [(405, 1582), (675, 1653)]


2. 
    k = 10

    RMSE: 1.657360773012494
    
    new data: [(405, 1582), (675, 1653)]

data: ub

1.  k = len(df.values)

    RMSE: 1.0357045494559085

    new data: [(100, 1236), (167, 1309), (381, 1533), (399, 1543), (587, 1624), (676, 1654), (751, 1661)]


2.  k = 10
    
    RMSE: 1.678075268019329

    new data: [(100, 1236), (167, 1309), (381, 1533), (399, 1543), (587, 1624), (676, 1654), (751, 1661)]