In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Đọc dữ liệu và tiền xử lý

In [None]:
# Import library
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Define column names based on the provided information
column_names = ["item_id", "Title", "Release Date", "Video Release Date", "IMDb URL",
                "Unknown", "Action", "Adventure", "Animation", "Children's",
                "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
                "Film-Noir", "Horror", "Musical", "Mystery", "Romance",
                "Sci-Fi", "Thriller", "War", "Western"]

# Read the file with '|' delimiter and specifying column names
item_df = pd.read_csv("/content/drive/MyDrive/Recommendation-System/data/ml-100k/u.item", sep='|', names=column_names, encoding='latin-1')


# Define column names based on the provided information
column_names = ["User ID", "Age", "Gender", "Occupation", "ZIP Code"]

# Read the file with '|' delimiter and specifying column names
user_df = pd.read_csv("/content/drive/MyDrive/Recommendation-System/data/ml-100k/u.user", sep='|', names=column_names, encoding='latin-1')


# Đọc dữ liệu từ file "ua.base" vào DataFrame
ratings_df = pd.read_csv("/content/drive/MyDrive/Recommendation-System/data/ml-100k/ua.base", sep='\t', header=None)

# Đặt tên cho các cột
ratings_df.columns = ['user_id', 'item_id', 'rating', 'timestamp']

In [None]:
ratings_df.drop('timestamp', axis=1, inplace=True)

# Tổ chức CF

In [None]:
class CF(object):
    """docstring for CF"""
    def __init__(self, Y_data, k, dist_func = cosine_similarity, uuCF = 1):
        self.uuCF = uuCF # user-user (1) or item-item (0) CF
        self.Y_data = Y_data if uuCF else Y_data[:, [1, 0, 2]]
        self.k = k
        self.dist_func = dist_func
        self.Ybar_data = None
        # number of users and items. Remember to add 1 since id starts from 0
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1
        self.predict = None

    def add(self, new_data):
        """
        Update Y_data matrix when new ratings come.
        For simplicity, suppose that there is no new user or item.
        """
        self.Y_data = np.concatenate((self.Y_data, new_data), axis = 0)

    def normalize_Y(self):
        users = self.Y_data[:, 0] # all users - first col of the Y_data
        self.Ybar_data = self.Y_data.copy()
        self.mu = np.zeros((self.n_users,))
        for n in range(self.n_users):
            # row indices of rating done by user n
            # since indices need to be integers, we need to convert
            ids = np.where(users == n)[0].astype(np.int32)
            # indices of all ratings associated with user n
            item_ids = self.Y_data[ids, 1]
            # and the corresponding ratings
            ratings = self.Y_data[ids, 2]
            # take mean
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0 # to avoid empty array and nan value
            self.mu[n] = m
            # normalize
            self.Ybar_data[ids, 2] = ratings - self.mu[n]

        ################################################
        # form the rating matrix as a sparse matrix. Sparsity is important
        # for both memory and computing efficiency. For example, if #user = 1M,
        # #item = 100k, then shape of the rating matrix would be (100k, 1M),
        # you may not have enough memory to store this. Then, instead, we store
        # nonzeros only, and, of course, their locations.
        self.Ybar = sparse.coo_matrix((self.Ybar_data[:, 2],
            (self.Ybar_data[:, 1], self.Ybar_data[:, 0])), (self.n_items, self.n_users))
        self.Ybar = self.Ybar.tocsr()

    def similarity(self):
        eps = 1e-6
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)


    def refresh(self):
        """
        Normalize data and calculate similarity matrix again (after
        some few ratings added)
        """
        self.normalize_Y()
        self.similarity()

    def fit(self):
        self.refresh()


    def __pred(self, u, i, normalized = 1):
        """
        predict the rating of user u for item i (normalized)
        if you need the un
        """
        # Step 1: find all users who rated i
        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32)
        # Step 2:
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)
        # Step 3: find similarity btw the current user and others
        # who already rated i
        sim = self.S[u, users_rated_i]
        # Step 4: find the k most similarity users
        a = np.argsort(sim)[-self.k:]
        # and the corresponding similarity levels
        nearest_s = sim[a]
        # How did each of 'near' users rated item i
        r = self.Ybar[i, users_rated_i[a]]
        if normalized:
            # add a small number, for instance, 1e-8, to avoid dividing by 0
            return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)

        return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8) + self.mu[u]

    def pred(self, u, i, normalized = 1):
        """
        predict the rating of user u for item i (normalized)
        if you need the un
        """
        if self.uuCF:
            pred_rating = self.__pred(u, i, normalized)
        else:
            pred_rating = self.__pred(i, u, normalized)

        # Nếu xếp hạng dự đoán lớn hơn 5, trả về 5; ngược lại, trả về xếp hạng dự đoán
        return min(pred_rating, 5)


    def recommend(self, u):
        """
        Determine all items should be recommended for user u.
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which
        have not been rated by u yet.
        """
        ids = np.where(self.Y_data[:, 0] == u)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist()
        recommended_items = []
        for i in range(self.n_items):
            if i not in items_rated_by_u:
                rating = self.__pred(u, i)
                if rating > 0:
                    recommended_items.append(i)

        return recommended_items

    def recommend2(self, u):
        """
        Determine all items should be recommended for user u.
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which
        have not been rated by u yet.
        """
        ids = np.where(self.Y_data[:, 0] == u)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist()
        recommended_items = []

        for i in range(self.n_items):
            if i not in items_rated_by_u:
                rating = self.__pred(u, i)
                if rating > 0:
                    recommended_items.append(i)

        return recommended_items

    def recommend_top_k(self, u, top_k):
        """
        Recommend top 5 films with the highest predicted ratings for user u.
        """
        ids = np.where(self.Y_data[:, 0] == u)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist()
        recommended_items = []

        for i in range(self.n_items):
            if i not in items_rated_by_u:
                rating = self.pred(u, i, 0)
                if rating > 0:
                    recommended_items.append((i, rating))

        recommended_items.sort(key=lambda x: x[1], reverse=True)

        top_k_recommendations = recommended_items[:top_k]

        return top_k_recommendations

    def get_ratings_predict(self):
        """
        Recommend predicted ratings for all users and items.
        """
        all_recommendations = []

        for u in range(self.n_users):
            for i in range(self.n_items):
                rating = self.pred(u, i, 0)
                if rating > 0:
                    print("user_id:", u, "item_id:", i, "rating:", rating)
                    all_recommendations.append((u, i, rating))

        if all_recommendations:
            self.predict = pd.DataFrame(all_recommendations, columns=['user_id', 'item_id', 'predicted_rating'])
        else:
            self.predict = None

        return self.predict

    def print_recommendation(self):
        """
        print all items which should be recommended for each user
        """
        print('Recommendation: ')
        for u in range(self.n_users):
            recommended_items = self.recommend(u)
            if self.uuCF:
                print('    Recommend item(s):', recommended_items, 'for user', u)
            else:
                print('    Recommend item', u, 'for user(s) : ', recommended_items)

## Tập dữ liệu

In [None]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('/content/drive/MyDrive/Recommendation-System/data/ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('/content/drive/MyDrive/Recommendation-System/data/ml-100k/ub.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.values
rate_test = ratings_test.values

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

In [None]:
rs = CF(rate_train, k=30, uuCF=1)
rs.fit()

n_tests = rate_test.shape[0]
SE = 0  # squared error
for n in range(n_tests):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1], normalized=0)
    SE += (pred - rate_test[n, 2]) ** 2

RMSE = np.sqrt(SE / n_tests)
print('User-user CF, RMSE =', RMSE)

User-user CF, RMSE = 0.6952000543773251


In [None]:
from sklearn.metrics import r2_score

# Các giá trị thực tế
y_true = rate_test[:, 2]

# Các giá trị dự đoán
y_pred = np.array([rs.pred(rate_test[n, 0], rate_test[n, 1], normalized=0) for n in range(n_tests)])

# Tính R-squared
r_squared = r2_score(y_true, y_pred)

print('R-squared =', r_squared)

R-squared = 0.6172745239283569


In [None]:
ratings_predict = rs.get_ratings_predict()
ratings_predict

[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
user_id: 940 item_id: 46 rating: 3.816021044135729
user_id: 940 item_id: 47 rating: 4.0748054009126085
user_id: 940 item_id: 48 rating: 3.7056125305658165
user_id: 940 item_id: 49 rating: 4.195459404372217
user_id: 940 item_id: 50 rating: 3.6646798061418333
user_id: 940 item_id: 51 rating: 3.9971107476796903
user_id: 940 item_id: 52 rating: 3.7664910240196554
user_id: 940 item_id: 53 rating: 3.7455106453271543
user_id: 940 item_id: 54 rating: 3.9280722792086102
user_id: 940 item_id: 55 rating: 3.9685119312902652
user_id: 940 item_id: 56 rating: 4.036504532235128
user_id: 940 item_id: 57 rating: 3.960226596681343
user_id: 940 item_id: 58 rating: 4.211240652227531
user_id: 940 item_id: 59 rating: 4.106970722814214
user_id: 940 item_id: 60 rating: 4.163496917229642
user_id: 940 item_id: 61 rating: 3.6750348057486697
user_id: 940 item_id: 62 rating: 3.6515551195704425
user_id: 940 item_id: 63 rating: 4.30301396785076
us

Unnamed: 0,user_id,item_id,predicted_rating
0,0,0,4.048754
1,0,1,3.612426
2,0,2,3.401359
3,0,3,3.717114
4,0,4,3.465786
...,...,...,...
1586087,942,1677,3.392405
1586088,942,1678,3.392405
1586089,942,1679,3.392405
1586090,942,1680,3.392405


In [None]:
ratings_pivot = ratings_predict.pivot(index='item_id', columns='user_id', values='predicted_rating')
ratings_pivot

user_id,0,1,2,3,4,5,6,7,8,9,...,933,934,935,936,937,938,939,940,941,942
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4.048754,3.713398,3.166526,4.589606,3.390088,3.740628,4.335887,4.063631,3.556423,4.110697,...,3.432569,4.123193,3.703624,3.302537,3.411483,4.642177,3.575000,4.706250,4.331609,3.487957
1,3.612426,3.478433,2.762915,3.793576,2.842411,3.288998,3.846055,3.837736,3.999173,4.022395,...,3.735400,3.936213,3.391353,3.014445,3.112762,4.243597,3.370709,3.672772,4.060193,3.587609
2,3.401359,3.615732,2.702551,3.642919,2.647279,3.308597,3.796036,3.729247,3.166667,3.787438,...,3.496386,3.738482,3.475934,2.844094,2.994189,4.038683,3.426049,3.578102,3.638851,3.026260
3,3.717114,3.696342,3.022883,4.069061,3.069831,3.707334,4.463244,3.927591,3.683975,4.011475,...,4.187489,3.872257,3.785101,3.089386,3.250595,4.349099,2.919212,3.730435,4.228549,3.479864
4,3.465786,3.750673,2.836971,4.113240,2.843313,3.331556,3.918864,3.713959,4.408915,4.226266,...,3.534925,3.939929,3.608942,3.146740,3.221646,4.060898,3.348898,3.708581,4.121265,3.164883
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1.603054,1.750001,4.909090,5.000000,0.909092,1.631841,3.959288,1.857143,4.166667,2.201150,...,3.707317,1.965518,1.742424,5.000000,1.306123,4.307692,1.463919,3.833333,2.304348,3.392405
1678,3.603053,3.750000,2.909091,4.357143,2.909091,3.631841,3.959288,3.857143,4.166667,4.201149,...,3.707317,3.965517,3.742424,3.233333,3.306122,4.307692,3.463918,3.833333,4.304348,3.392405
1679,2.603054,2.750001,3.909091,5.000000,1.909092,2.631841,3.959288,2.857143,4.166667,3.201150,...,3.707317,2.965517,2.742424,4.233333,2.306123,4.307692,2.463918,3.833333,3.304348,3.392405
1680,3.603053,3.750000,2.909091,4.357143,2.909091,3.631841,3.959288,3.857143,4.166667,4.201149,...,3.707317,3.965517,3.742424,3.233333,3.306122,4.307692,3.463918,3.833333,4.304348,3.392405


In [None]:
ratings_pivot_filled = ratings_pivot.fillna(0)
ratings_pivot_filled

user_id,0,1,2,3,4,5,6,7,8,9,...,933,934,935,936,937,938,939,940,941,942
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4.048754,3.713398,3.166526,4.589606,3.390088,3.740628,4.335887,4.063631,3.556423,4.110697,...,3.432569,4.123193,3.703624,3.302537,3.411483,4.642177,3.575000,4.706250,4.331609,3.487957
1,3.612426,3.478433,2.762915,3.793576,2.842411,3.288998,3.846055,3.837736,3.999173,4.022395,...,3.735400,3.936213,3.391353,3.014445,3.112762,4.243597,3.370709,3.672772,4.060193,3.587609
2,3.401359,3.615732,2.702551,3.642919,2.647279,3.308597,3.796036,3.729247,3.166667,3.787438,...,3.496386,3.738482,3.475934,2.844094,2.994189,4.038683,3.426049,3.578102,3.638851,3.026260
3,3.717114,3.696342,3.022883,4.069061,3.069831,3.707334,4.463244,3.927591,3.683975,4.011475,...,4.187489,3.872257,3.785101,3.089386,3.250595,4.349099,2.919212,3.730435,4.228549,3.479864
4,3.465786,3.750673,2.836971,4.113240,2.843313,3.331556,3.918864,3.713959,4.408915,4.226266,...,3.534925,3.939929,3.608942,3.146740,3.221646,4.060898,3.348898,3.708581,4.121265,3.164883
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1.603054,1.750001,4.909090,5.000000,0.909092,1.631841,3.959288,1.857143,4.166667,2.201150,...,3.707317,1.965518,1.742424,5.000000,1.306123,4.307692,1.463919,3.833333,2.304348,3.392405
1678,3.603053,3.750000,2.909091,4.357143,2.909091,3.631841,3.959288,3.857143,4.166667,4.201149,...,3.707317,3.965517,3.742424,3.233333,3.306122,4.307692,3.463918,3.833333,4.304348,3.392405
1679,2.603054,2.750001,3.909091,5.000000,1.909092,2.631841,3.959288,2.857143,4.166667,3.201150,...,3.707317,2.965517,2.742424,4.233333,2.306123,4.307692,2.463918,3.833333,3.304348,3.392405
1680,3.603053,3.750000,2.909091,4.357143,2.909091,3.631841,3.959288,3.857143,4.166667,4.201149,...,3.707317,3.965517,3.742424,3.233333,3.306122,4.307692,3.463918,3.833333,4.304348,3.392405


In [None]:
ratings_pivot_filled.to_csv('/content/drive/MyDrive/Recommendation-System/ratings_pivot_filled.csv', index=True)

In [None]:
recommendations = rs.recommend_top_k(942, top_k = 10)
recommendations

[(829, 5),
 (851, 5),
 (890, 5),
 (896, 5),
 (1308, 5),
 (1490, 4.811674850528234),
 (1307, 4.504718145959886),
 (912, 4.456514951946335),
 (852, 4.408897802980359),
 (1466, 4.392404990152209)]

In [None]:
recommendation_ids = [pair[0] for pair in recommendations]
print(recommendation_ids)

[829, 851, 890, 896, 1308, 1490, 1307, 912, 852, 1466]


In [None]:
rs.print_recommendation()

In [None]:
rs = CF(rate_train, k = 30, uuCF = 0)
rs.fit()

n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in range(n_tests):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1], normalized = 0)
    SE += (pred - rate_test[n, 2])**2

RMSE = np.sqrt(SE/n_tests)
print('Item-item CF, RMSE =', RMSE)