## 数据预处理

+ 导入数据，将txt转化为csv的dataframe
+ 将用户从0开始编号
+ 通过数据透视函数构建 用户*电影 矩阵
+ 测试集缺失部分电影，将其补全为10000 * 10000的矩阵

In [2]:
# 导入包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 导入数据
user = pd.read_csv("users.txt", names = ['userid'])
netflix_train = pd.read_csv("netflix_train.txt", sep = ' ', names = ['user_id', 'film_id', 'rating', 'date'])
netflix_test = pd.read_csv("netflix_test.txt", sep = ' ', names = ['user_id', 'film_id', 'rating', 'date'])

# 在user的df中建立新的一列
user['id'] = range(len(user))
netflix_train = netflix_train.merge(user, left_on='user_id', right_on='userid')
netflix_test = netflix_test.merge(user, left_on='user_id', right_on='userid')

X_train = netflix_train.pivot(index='id', columns='film_id', values='rating')
X_test = netflix_test.pivot(index='id', columns='film_id', values='rating')

# 将测试集补全为10000 * 10000的矩阵
for i in range(1, 10001):
    if i not in X_test.columns:
        X_test[i] = np.nan
X_test = X_test.sort_index(axis=1)


print(X_train.head())  # head() 默认是前5行数据
print(X_test.head())

film_id  1      2      3      4      5      6      7      8      9      10     \
id                                                                              
0          1.0    NaN    2.0    1.0    1.0    1.0    2.0    1.0    NaN    1.0   
1          1.0    NaN    2.0    NaN    1.0    2.0    1.0    NaN    NaN    NaN   
2          NaN    1.0    NaN    1.0    NaN    1.0    NaN    1.0    NaN    1.0   
3          5.0    NaN    NaN    3.0    5.0    4.0    3.0    4.0    3.0    4.0   
4          5.0    NaN    4.0    3.0    5.0    4.0    3.0    NaN    NaN    3.0   

film_id  ...  9991   9992   9993   9994   9995   9996   9997   9998   9999   \
id       ...                                                                  
0        ...    2.0    1.0    NaN    1.0    1.0    2.0    1.0    1.0    1.0   
1        ...    NaN    1.0    1.0    1.0    NaN    2.0    1.0    1.0    2.0   
2        ...    1.0    1.0    NaN    NaN    NaN    1.0    1.0    1.0    1.0   
3        ...    5.0    3.0    NaN    

In [5]:
# Collaborate Filtering
# Compute the overall mean and mean by row and column
mu = np.mean(np.mean(X_train))
bx = np.array(np.mean(X_train, axis=1) - mu)
by = np.array(np.mean(X_train, axis=0) - mu)

# Compute the similarity matrix
X = X_train.sub(bx+mu, axis=0)   # Demean
X = X.div(np.sqrt(np.sum(np.square(X), axis=1)), axis=0)

# fill the 0
X.fillna(0, inplace=True)
similarity_matrix = np.dot(X, X.T)

# Compute the point matrix using CF
X_train = np.array(X_train.fillna(0))
for i in range(X_train.shape[0]):
    indexs = np.argsort(similarity_matrix[i, :])[::-1]
    for j in range(X_train.shape[1]):
        if X_train[i, j] == 0:
            sum = 0
            num = 0
            simi = 0
            k = 0
            while num < 3 & k < X_train.shape[1]:    # top 3
                if X_train[indexs[k], j] > 0:
                    sum = sum + similarity_matrix[i, indexs[k]] * (X_train[indexs[k], j] - mu - bx[indexs[k]] - by[j])
                    simi = simi + similarity_matrix[i, indexs[k]]
                    k = k+1
                    num = num + 1
                else:
                    k = k+1
            if simi != 0:
                X_train[i, j] = mu + bx[i] + by[j] + sum/simi
            else:
                X_train[i, j] = mu + bx[i] + by[j]
        else:
            continue
            
# Compute RMSE for the algorithm
sum = 0
for index, rows in netflix_test.iterrows():
    sum = sum + np.square(X_train[rows['id'], rows['film_id']-1] - rows['rating'])

RMSE = np.sqrt(sum/netflix_test.shape[0])
print(RMSE)


1.013374013560779
