# 第一步 收集数据

https://grouplens.org/datasets/movielens/

# 第二步 准备数据

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
ratings_df = pd.read_csv('ml-latest-small/ratings.csv')
ratings_df.tail()

Unnamed: 0,userId,movieId,rating,timestamp
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352
100835,610,170875,3.0,1493846415


In [3]:
movies_df = pd.read_csv('ml-latest-small/movies.csv')
movies_df.tail()

Unnamed: 0,movieId,title,genres
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation
9741,193609,Andrew Dice Clay: Dice Rules (1991),Comedy


In [4]:
movies_df['movieRow'] = movies_df.index
movies_df.tail()

Unnamed: 0,movieId,title,genres,movieRow
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,9737
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,9738
9739,193585,Flint (2017),Drama,9739
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,9740
9741,193609,Andrew Dice Clay: Dice Rules (1991),Comedy,9741


## 筛选 movies_df 中的特征

In [5]:
movies_df = movies_df[['movieRow', 'movieId', 'title']]
movies_df.to_csv('ml-latest-small/moviesProcessed.csv', index=False, header=True, encoding='utf-8')

In [6]:
movies_df

Unnamed: 0,movieRow,movieId,title
0,0,1,Toy Story (1995)
1,1,2,Jumanji (1995)
2,2,3,Grumpier Old Men (1995)
3,3,4,Waiting to Exhale (1995)
4,4,5,Father of the Bride Part II (1995)
...,...,...,...
9737,9737,193581,Black Butler: Book of the Atlantic (2017)
9738,9738,193583,No Game No Life: Zero (2017)
9739,9739,193585,Flint (2017)
9740,9740,193587,Bungo Stray Dogs: Dead Apple (2018)


# 将 ratings_df 中的 movieId 替换为行号

In [7]:
ratings_df = pd.merge(ratings_df, movies_df, on='movieId')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,movieRow,title
0,1,1,4.0,964982703,0,Toy Story (1995)
1,5,1,4.0,847434962,0,Toy Story (1995)
2,7,1,4.5,1106635946,0,Toy Story (1995)
3,15,1,2.5,1510577970,0,Toy Story (1995)
4,17,1,4.5,1305696483,0,Toy Story (1995)
5,18,1,3.5,1455209816,0,Toy Story (1995)
6,19,1,4.0,965705637,0,Toy Story (1995)
7,21,1,3.5,1407618878,0,Toy Story (1995)
8,27,1,3.0,962685262,0,Toy Story (1995)
9,31,1,5.0,850466616,0,Toy Story (1995)


In [8]:
ratings_df = ratings_df[['userId', 'movieRow', 'rating']]
ratings_df.to_csv('ml-latest-small/ratingsProcessed.csv', index=False, header=True, encoding='utf-8')

In [10]:
ratings_df.tail()

Unnamed: 0,userId,movieRow,rating
100831,610,9325,2.5
100832,610,9330,4.5
100833,610,9342,3.0
100834,610,9389,3.5
100835,610,9390,3.5


## 创建电影评分矩阵 rating 和 评分记录矩阵 record

In [13]:
userNo = ratings_df['userId'].max() + 1
movieNo = ratings_df['movieRow'].max() + 1
print(userNo, movieNo)

611 9742


In [14]:
rating = np.zeros((movieNo, userNo))

In [18]:
flag = 0 # 记录处理进度
ratings_df_length = np.shape(ratings_df)[0] # ratings_df 样本个数
for index, row in ratings_df.iterrows():
    rating[int(row['movieRow']), int(row['userId'])] = row['rating']
    flag += 1
    if flag % 100 == 0 or flag == ratings_df_length:
        print('processed {}, {} left'.format(flag, ratings_df_length-flag))

processed 100, 100736 left
processed 200, 100636 left
processed 300, 100536 left
processed 400, 100436 left
processed 500, 100336 left
processed 600, 100236 left
processed 700, 100136 left
processed 800, 100036 left
processed 900, 99936 left
processed 1000, 99836 left
processed 1100, 99736 left
processed 1200, 99636 left
processed 1300, 99536 left
processed 1400, 99436 left
processed 1500, 99336 left
processed 1600, 99236 left
processed 1700, 99136 left
processed 1800, 99036 left
processed 1900, 98936 left
processed 2000, 98836 left
processed 2100, 98736 left
processed 2200, 98636 left
processed 2300, 98536 left
processed 2400, 98436 left
processed 2500, 98336 left
processed 2600, 98236 left
processed 2700, 98136 left
processed 2800, 98036 left
processed 2900, 97936 left
processed 3000, 97836 left
processed 3100, 97736 left
processed 3200, 97636 left
processed 3300, 97536 left
processed 3400, 97436 left
processed 3500, 97336 left
processed 3600, 97236 left
processed 3700, 97136 left
pr

processed 30200, 70636 left
processed 30300, 70536 left
processed 30400, 70436 left
processed 30500, 70336 left
processed 30600, 70236 left
processed 30700, 70136 left
processed 30800, 70036 left
processed 30900, 69936 left
processed 31000, 69836 left
processed 31100, 69736 left
processed 31200, 69636 left
processed 31300, 69536 left
processed 31400, 69436 left
processed 31500, 69336 left
processed 31600, 69236 left
processed 31700, 69136 left
processed 31800, 69036 left
processed 31900, 68936 left
processed 32000, 68836 left
processed 32100, 68736 left
processed 32200, 68636 left
processed 32300, 68536 left
processed 32400, 68436 left
processed 32500, 68336 left
processed 32600, 68236 left
processed 32700, 68136 left
processed 32800, 68036 left
processed 32900, 67936 left
processed 33000, 67836 left
processed 33100, 67736 left
processed 33200, 67636 left
processed 33300, 67536 left
processed 33400, 67436 left
processed 33500, 67336 left
processed 33600, 67236 left
processed 33700, 671

processed 60400, 40436 left
processed 60500, 40336 left
processed 60600, 40236 left
processed 60700, 40136 left
processed 60800, 40036 left
processed 60900, 39936 left
processed 61000, 39836 left
processed 61100, 39736 left
processed 61200, 39636 left
processed 61300, 39536 left
processed 61400, 39436 left
processed 61500, 39336 left
processed 61600, 39236 left
processed 61700, 39136 left
processed 61800, 39036 left
processed 61900, 38936 left
processed 62000, 38836 left
processed 62100, 38736 left
processed 62200, 38636 left
processed 62300, 38536 left
processed 62400, 38436 left
processed 62500, 38336 left
processed 62600, 38236 left
processed 62700, 38136 left
processed 62800, 38036 left
processed 62900, 37936 left
processed 63000, 37836 left
processed 63100, 37736 left
processed 63200, 37636 left
processed 63300, 37536 left
processed 63400, 37436 left
processed 63500, 37336 left
processed 63600, 37236 left
processed 63700, 37136 left
processed 63800, 37036 left
processed 63900, 369

processed 90100, 10736 left
processed 90200, 10636 left
processed 90300, 10536 left
processed 90400, 10436 left
processed 90500, 10336 left
processed 90600, 10236 left
processed 90700, 10136 left
processed 90800, 10036 left
processed 90900, 9936 left
processed 91000, 9836 left
processed 91100, 9736 left
processed 91200, 9636 left
processed 91300, 9536 left
processed 91400, 9436 left
processed 91500, 9336 left
processed 91600, 9236 left
processed 91700, 9136 left
processed 91800, 9036 left
processed 91900, 8936 left
processed 92000, 8836 left
processed 92100, 8736 left
processed 92200, 8636 left
processed 92300, 8536 left
processed 92400, 8436 left
processed 92500, 8336 left
processed 92600, 8236 left
processed 92700, 8136 left
processed 92800, 8036 left
processed 92900, 7936 left
processed 93000, 7836 left
processed 93100, 7736 left
processed 93200, 7636 left
processed 93300, 7536 left
processed 93400, 7436 left
processed 93500, 7336 left
processed 93600, 7236 left
processed 93700, 713

In [20]:
record = rating > 0
record = np.array(record, dtype=int)
record

array([[0, 1, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# 第三步 构建模型

In [24]:
def normalizeRatings(rating, record):
    m, n = rating.shape
    rating_mean = np.zeros((m, 1))
    rating_norm = np.zeros((m, n))
    # 对每部电影进行迭代
    for i in range(m):
        idx = record[i, :] != 0
        rating_mean[i] = np.mean(rating[i, idx])
        rating_norm[i, idx] -= rating_mean[i]
    return rating_norm, rating_mean

In [25]:
rating_norm, rating_mean = normalizeRatings(rating, record)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [26]:
rating_norm = np.nan_to_num(rating_norm)
rating_norm

array([[ 0.        , -3.92093023,  0.        , ..., -3.92093023,
        -3.92093023, -3.92093023],
       [ 0.        ,  0.        ,  0.        , ..., -3.43181818,
         0.        ,  0.        ],
       [ 0.        , -3.25961538,  0.        , ..., -3.25961538,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [27]:
rating_mean = np.nan_to_num(rating_mean)
rating_mean

array([[3.92093023],
       [3.43181818],
       [3.25961538],
       ...,
       [3.5       ],
       [3.5       ],
       [4.        ]])

In [28]:
num_features = 10

In [30]:
X_parameters = tf.Variable(tf.random_normal([movieNo, num_features], stddev=0.35))

In [31]:
Theta_parameters = tf.Variable(tf.random_normal([userNo, num_features], stddev=0.35))

### 公式：

$$
J(X,\theta)=\frac{1}{2}\sum_{(i,j):r(i,j)=1}\left(\left(\theta^{(j)}\right)^Tx^{(i)}-y^{(i,j)}\right)^2+\frac{\lambda}{2}\sum_{i=1}^{m}\sum_{k=1}^n\left(x_k^{(i)}\right)^2+\frac{\lambda}{2}\sum_{i=1}^{m}\sum_{k=1}^n\left(\theta_k^{(j)}\right)^2
$$

In [35]:
# $\lambda$ = 1
loss = 1/2 * tf.reduce_sum(((tf.matmul(X_parameters, Theta_parameters, transpose_b=True) - rating_norm) * record) ** 2) + \
       1/2 * tf.reduce_sum(X_parameters ** 2) + \
       1/2 * tf.reduce_sum(Theta_parameters ** 2)

In [36]:
optimizer = tf.train.AdamOptimizer(1e-4)
train = optimizer.minimize(loss)

# 第四步 训练模型

In [37]:
tf.summary.scalar('loss', loss)

<tf.Tensor 'loss:0' shape=() dtype=string>

In [38]:
# 将所有信息汇总
summaryMerged = tf.summary.merge_all()

In [39]:
filename = './movie_tensorboard'

In [40]:
writer = tf.summary.FileWriter(filename)

In [42]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

In [44]:
for i in range(5000):
    tmp, movie_summary = sess.run([train, summaryMerged])
    writer.add_summary(movie_summary, i)

# 第五步 评估模型

In [45]:
Current_X_parameters, Current_Theta_parameters = sess.run([X_parameters, Theta_parameters])

In [46]:
predicts = np.dot(Current_X_parameters, Current_Theta_parameters.T) + rating_mean
errors = np.sqrt(np.sum((predicts - rating) ** 2))

In [47]:
print(errors)

4023.788973404913


# 第六步 构建完整的电影推荐系统

In [53]:
user_id = input('请输入您的用户编号：')

sortedResult = predicts[:, int(user_id)].argsort()[::-1] # 从大到小排序

idx = 0
print('为您推荐20部电影：'.center(80, '='))
for i in sortedResult:
    print('%2d. 评分：%.2f，电影名：%s' % (idx + 1, predicts[i, int(user_id)], movies_df.iloc[i]['title']))
    idx += 1
    if idx == 20:
        break

请输入您的用户编号：435
 1. 评分：4.83，电影名：Asphyx, The (1973)
 2. 评分：4.67，电影名：Evil Aliens (2005)
 3. 评分：4.62，电影名：A Perfect Day (2015)
 4. 评分：4.54，电影名：Saving Santa (2013)
 5. 评分：4.51，电影名：Sapphire Blue (2014)
 6. 评分：4.48，电影名：Sonatine (Sonachine) (1993)
 7. 评分：4.48，电影名：Family Guy Presents: Something, Something, Something, Dark Side (2009)
 8. 评分：4.47，电影名：00 Schneider - Jagd auf Nihil Baxter (1994)
 9. 评分：4.44，电影名：Asterix and the Vikings (Astérix et les Vikings) (2006)
10. 评分：4.43，电影名：Angry Red Planet, The (1959)
11. 评分：4.40，电影名：Little Dieter Needs to Fly (1997)
12. 评分：4.39，电影名：The Road Within (2014)
13. 评分：4.39，电影名：Dead Meat (2004)
14. 评分：4.32，电影名：Scooby-Doo! and the Samurai Sword (2009)
15. 评分：4.32，电影名：Tales of Manhattan (1942)
16. 评分：4.31，电影名：Man with the Golden Arm, The (1955)
17. 评分：4.28，电影名：The Jinx: The Life and Deaths of Robert Durst (2015)
18. 评分：4.23，电影名：Thin Line Between Love and Hate, A (1996)
19. 评分：4.19，电影名：Alpha (2018)
20. 评分：4.19，电影名：Don't Look Now: We're Being Shot At (La grande vadrou