In [100]:
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sortedcontainers import SortedList
import math


Tách thành tập train và test


In [101]:
df = pd.read_csv('../dataset/rating_final.csv')
df = shuffle(df)

cut_off_length = int(0.9 * len(df))
df_train = df[:cut_off_length]
df_test = df[cut_off_length:]


In [102]:
user_to_movie = {}
movie_to_user = {}
movie_user_rating = {}

for row in df.values:
    movie_id, user_id, rate = row
    if movie_id not in user_to_movie:
        user_to_movie[user_id] = [movie_id]
    else:
        user_to_movie[user_id].append(movie_id)

    if movie_id not in movie_to_user:
        movie_to_user[movie_id] = [user_id]
    else:
        movie_to_user[movie_id].append(user_id)

    movie_user_rating[(movie_id, user_id)] = rate

In [103]:
movie_user_rating_test = {}
for row in df_test.values:
    movie_id, user_id, rate = row

    movie_user_rating_test[(movie_id, user_id)] = rate


In [104]:
number_of_movie = int(len(movie_to_user.keys()))
number_of_user = int(len(user_to_movie.keys()))

print(f'Number of movie: {number_of_movie}')
print(f'Number of user: {number_of_user}')


Number of movie: 490
Number of user: 1002


In [105]:
k_neighbor = 20
limit_neighbor = 5
neighbors = {}
averages = {}
deviations = {}
similarities = {}

for movie_id_i in movie_to_user.keys():
    similarities[movie_id_i] = []

    # get all user who rated for movie_id_i
    user_rated_movie_i = movie_to_user[movie_id_i]

    # get all rating of users gave for movie_id_i
    rating_user_for_movie_i = [
        movie_user_rating[(movie_id_i, user_id)] for user_id in user_rated_movie_i]

    # calculate avarage rating of all user gave for movie_id_i
    avg_rating_movie_i = np.mean(rating_user_for_movie_i)

    averages[movie_id_i] = avg_rating_movie_i
    
    sorted_list = SortedList()
    for movie_id_j in movie_to_user.keys():
        if movie_id_j == movie_id_i:
            continue

        # get all user who rated for movie_id_j
        user_rated_movie_j = movie_to_user[movie_id_j]

        common_users = list(set(user_rated_movie_i).intersection(set(user_rated_movie_j)))

        if len(common_users) < limit_neighbor:
            continue

        # get all rating of users gave for movie_id_j
        rating_user_for_movie_j = [movie_user_rating[(movie_id_j, user_id)] for user_id in user_rated_movie_j]

        # calculate avarage rating of all user gave for movie_id_j
        avg_rating_movie_j = np.mean(rating_user_for_movie_j)
        
        deviation = 0
        sigmoid_i = 0
        sigmoid_j = 0

        for user_id in common_users:
            dev_i = (movie_user_rating[(movie_id_i, user_id)] - avg_rating_movie_i)
            dev_j = (movie_user_rating[(movie_id_j, user_id)] - avg_rating_movie_j)
            deviation += dev_i * dev_j
            sigmoid_i += dev_i * dev_i
            sigmoid_j += dev_j * dev_j

        s_ij = deviation / (math.sqrt(sigmoid_i) * math.sqrt(sigmoid_j))
        sorted_list.add((s_ij, movie_id_j))
        similarities[movie_id_i].append((s_ij, movie_id_j))

    similarities[movie_id_i] = sorted(similarities[movie_id_i], key = lambda x: x[1])

similarities

  s_ij = deviation / (math.sqrt(sigmoid_i) * math.sqrt(sigmoid_j))


{'tt2571774': [(0.5463627318770473, 'tt0015864'),
  (0.4425714639105998, 'tt0017136'),
  (0.01629743939617004, 'tt0019254'),
  (0.012954850089290528, 'tt0022100'),
  (0.2199120474511256, 'tt0025316'),
  (0.8062265411721607, 'tt0031381'),
  (0.16622213166280683, 'tt0032551'),
  (0.4205442069659832, 'tt0032553'),
  (0.2320070224428967, 'tt0032976'),
  (0.3518469490722213, 'tt0034583'),
  (0.41772771195869673, 'tt0036775'),
  (0.5458074356511479, 'tt0036868'),
  (0.3264576107242181, 'tt0040522'),
  (0.35437164468684007, 'tt0040897'),
  (0.17259990087129093, 'tt0042876'),
  (0.009506684435847483, 'tt0044741'),
  (-0.03268879060631447, 'tt0045152'),
  (-0.16364362367153504, 'tt0046268'),
  (0.15819266321344144, 'tt0046438'),
  (-0.22820136034829638, 'tt0047478'),
  (0.5177865617768453, 'tt0047708'),
  (0.5435517997359849, 'tt0048473'),
  (-0.37652580638744176, 'tt0050083'),
  (-0.23922102912825555, 'tt0050212'),
  (0.13966062183948783, 'tt0052357'),
  (0.886497749651301, 'tt0052520'),
  (0.

In [106]:
def predict(movie_id_i, user_id):
    if movie_id_i not in movie_to_user or user_id not in user_to_movie:
        return None

    # calculate average rating of all users who rated movie_id_i
    avg_rating_movie_i = averages[movie_id_i]

    # calculate the predicted rating for movie_id_i by user_id
    predicted_rating = avg_rating_movie_i

    similarity_sum = 0
    weighted_rating_sum = 0
    limit_neighbors = 20
    count = 0

    for similarity, neighbor_movie_id in similarities[movie_id_i]:
        if count > limit_neighbors:
            break
        
        if (neighbor_movie_id, user_id) in movie_user_rating:
            weighted_rating_sum += similarity * (movie_user_rating[(neighbor_movie_id, user_id)] - averages[neighbor_movie_id])
            similarity_sum += abs(similarity)
            count += 1

    if (similarity_sum == 0):
        return predicted_rating
    return predicted_rating + (weighted_rating_sum / similarity_sum)

In [107]:
rmse = 0
for movie_id, user_id in movie_user_rating_test:
    # calculate the prediction for this movie
    predict_rating = predict(movie_id, user_id)
    # save the prediction and target
    error = movie_user_rating_test[(movie_id, user_id)] - predict_rating
    
    rmse += error ** 2

rmse = math.sqrt(rmse / len(movie_user_rating_test))

In [108]:
rmse

nan