In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

## 载入MovieLens数据集

In [2]:
ratings = pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [3]:
full_matrix = (ratings.pivot_table('rating', index='userId', columns='movieId') >= 0).astype(np.int)
full_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 分割数据集

In [4]:
# 随机从观影数量至少40部的用户中抽取5部影片作为测试集，选择200名用户作为测试
least_item = 40
test_sample_num = 5
test_user_num = 200

# 生成测试集
test_matrix = full_matrix[full_matrix.sum(axis=1) >= least_item].sample(test_user_num, random_state=47)
test_matrix = test_matrix.apply(lambda x: x[x == 1].sample(test_sample_num, random_state=47), axis=1).fillna(0)

# 生成训练集
train_matrix = full_matrix - test_matrix.reindex_like(full_matrix).fillna(0)

In [5]:
full_matrix.sum().sum()

100004

In [6]:
train_matrix.sum().sum()

99004.0

In [7]:
test_matrix.sum().sum()

1000.0

## 测试集效果验证准备

In [8]:
# 计算准确率和召回率
def compute_score(test, pred):
    hit = (test * pred).sum().sum()
    precision = hit / pred.sum().sum()
    recall = hit / test.sum().sum()
    f1_score = 2. / (1. / precision + 1. / recall)
    return f1_score, precision, recall

In [9]:
selected_train_matrix = train_matrix.loc[test_matrix.index]
selected_train_matrix.shape

(200, 9066)

In [10]:
# topN推荐，推荐5部电影
topN = 5

In [11]:
# 用户活跃度
user_activity = train_matrix.sum(axis=1)

# 影片热门度
movie_popularity = train_matrix.sum(axis=0)

## 计算热门推荐的准确率和召回率

In [12]:
def compute_popular(row):
    index = row.sort_values(ascending=False)[:topN].index
    row[:] = 0
    row[index] = 1
    return row

# popularity_matrix = selected_train_matrix.copy()
popularity_matrix = pd.DataFrame([movie_popularity], index=selected_train_matrix.index, columns=selected_train_matrix.columns)
popularity_matrix[selected_train_matrix == 1] = 0
pred_matrix = popularity_matrix.apply(compute_popular, axis=1)

In [13]:
compute_score(test_matrix, pred_matrix)

(0.062, 0.062, 0.062)

## 计算ItemCF的准确率的召回率

In [14]:
for i in range(movie_num):
    if i % 1000 == 0:
        print('movie',i)
    if models[i] > 0:
        for j in range(i + 1, movie_num):
            deno = models[i] * models[j]
            if deno > 0:
                item1 = uif_train_matrix[:,i]
                item2 = uif_train_matrix[:,j]
                similarity = item1 @ item2 / deno # 计算余弦相似性
                similarity_matrix[i, j] = similarity
                similarity_matrix[j, i] = similarity

NameError: name 'movie_num' is not defined

In [44]:
# 计算各影片被观看的次数的平方根，用于计算余弦相似性
movie_popularity_root = np.sqrt(movie_popularity)

# 计算用户活跃度
user_activity_log = np.log(1 + user_activity)
# 计算带有活跃度惩罚的训练矩阵
uif_train_matrix = train_matrix.div(user_activity_log, axis=0)

# 计算影片之间的余弦相似性矩阵
similarity_matrix = pd.DataFrame(0, index=selected_train_matrix.columns, columns=selected_train_matrix.columns)
count = 0
for i, row in similarity_matrix.iterrows():
    if count % 100 == 0:
        print('count', count)
    count += 1
    if movie_popularity_root[i] > 0:
        for j, value in row.items():
            if i == j:
                continue
            if similarity_matrix.loc[i, j] > 0 or similarity_matrix.loc[j, i] > 0:
                continue
            deno = movie_popularity_root[i] * movie_popularity_root[j]
            if deno > 0:
                item1 = uif_train_matrix.loc[:,i].values
                item2 = uif_train_matrix.loc[:,j].values
                similarity = item1 @ item2 / deno # 计算余弦相似性
                similarity_matrix.loc[i, j] = similarity
                similarity_matrix.loc[j, i] = similarity

# for i in range(similarity_matrix.shape[0]):
#     if i % 1000 == 0:
#         print('movie',i)
#     iname = similarity_matrix.iloc[i].name
#     if movie_popularity_root[iname] > 0:
#         for j in range(i + 1, similarity_matrix.shape[1]):
#             jname = similarity_matrix.iloc[:,j].name
#             deno = movie_popularity_root[iname] * movie_popularity_root[jname]
#             if deno > 0:
#                 similarity = (uif_train_matrix.loc[:,iname] * uif_train_matrix.loc[:,jname]).sum() / deno # 计算余弦相似性
#                 print(iname, jname, similarity)
#                 similarity_matrix.loc[iname, jname] = similarity
#                 similarity_matrix.loc[jname, iname] = similarity

count 0


KeyboardInterrupt: 

In [18]:
sum(uif_train_matrix.loc[:,1] * uif_train_matrix.loc[:,2])

2.1931473803011112

In [20]:
df = pd.DataFrame([[13,4,5,6,7]], index=[1,2,3], columns=[3,4,5,6,7])

In [21]:
df

Unnamed: 0,3,4,5,6,7
1,13,4,5,6,7
2,13,4,5,6,7
3,13,4,5,6,7


In [31]:
for i, row in df.iterrows():
    for j, value in row.iteritems():
        print(i, j, value)

1 3 13
1 4 4
1 5 5
1 6 6
1 7 7
2 3 13
2 4 4
2 5 5
2 6 6
2 7 7
3 3 13
3 4 4
3 5 5
3 6 6
3 7 7


In [41]:
uif_train_matrix[:,1].values

TypeError: unhashable type: 'slice'