### 1.用协同过滤构建模型并进行预测

#### 1.1 movielens的例子

In [2]:
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import evaluate, print_perf

# load movielens dataset
data = Dataset.load_builtin('ml-100k')
# k fold cross validation
data.split(n_folds=3)
# SVD matrix decomposition
algo = KNNWithMeans()
# Sample test
perf = evaluate(algo, data, measures=['RMSE', 'MAE']) # MAE for mean absolute error
# output
print_perf(perf)



Evaluating RMSE, MAE of algorithm KNNWithMeans.

------------
Fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9560
MAE:  0.7515
------------
Fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9554
MAE:  0.7514
------------
Fold 3
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9597
MAE:  0.7575
------------
------------
Mean RMSE: 0.9570
Mean MAE : 0.7535
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
RMSE    0.9560  0.9554  0.9597  0.9570  
MAE     0.7515  0.7514  0.7575  0.7535  


### Load local dataset

In [1]:
import os
import io

from surprise import KNNBaseline, Reader
from surprise import Dataset

import pickle

# build the mapping info from song id to song name
id_name_dic = pickle.load(open("../data/popular_playlist.pkl", "rb"))
print("id to name dic built done")
name_id_dic = {}
for playlist_id in id_name_dic:
    name_id_dic[id_name_dic[playlist_id]] = playlist_id
print("name to id dic built done")

id to name dic built done
name to id dic built done


In [2]:
file_path = os.path.expanduser('../data/popular_music_suprise_format.txt')
# set format
reader = Reader(line_format='user item rating timestamp', sep=',')
# read data
music_data = Dataset.load_from_file(file_path=file_path, reader=reader)
# calcuate the similarity of songs
print('building dataset..')
train_set = music_data.build_full_trainset()

building dataset..


In [6]:
train_set.n_items

130573

In [7]:
train_set.n_users

3771

### Build model

In [18]:
algo = KNNBaseline()
algo.train(train_set)



Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7f1a665d96d8>

### Test model

In [20]:
current_playlist = list(name_id_dic.keys())[89]
print("Current playlist: ", current_playlist)

# get neighbors
playlist_id = name_id_dic[current_playlist]
print("Id of playlist: ", playlist_id)
playlist_inner_id = algo.trainset.to_inner_uid(playlist_id)
print("Inner id of playlist: ", playlist_inner_id)

playlist_neighbors = algo.get_neighbors(playlist_inner_id, 10)

# convert song id to song name
playlist_neighbors = (algo.trainset.to_raw_uid(inner_id)
                     for inner_id in playlist_neighbors)
playlist_neighbors = (id_name_dic[playlist_id]
                     for playlist_id in playlist_neighbors)

print()
print('the nearest 10 playlist of ', current_playlist, 'are:')
for playlist in playlist_neighbors:
    print(playlist, algo.trainset.to_inner_uid(name_id_dic[playlist]))

Current playlist:  李宗盛 理性与感性 作品音乐会
Id of playlist:  74920010
Inner id of playlist:  89

the nearest 10 playlist of  李宗盛 理性与感性 作品音乐会 are:
没有吉他我就唱不出歌·续 1
听了五年还不舍得删的华语歌 24
华语经典怀旧歌曲(女人篇) 43
至少有十首歌给你安慰 63
老人老歌 71
换个版本，再听一次！（华语篇） 91
别走，你没有来错地方 93
™ 网易• 典藏• 华语篇• 总有一首❤ 97
评论过万的中文歌与潜力股 101
程一电台音乐歌单-华语 110


### Make prediction which targeting to user

In [21]:
song_id_name_dic = pickle.load(open("../data/popular_song.pkl", "rb"))
song_name_id_dic = {}
for song_id in song_id_name_dic:
    song_name_id_dic[song_id_name_dic[song_id]] = song_id

In [26]:
user_inner_id = 4
user_rating = train_set.ur[user_inner_id]

items = map(lambda x:x[0], user_rating)
for song in items:
    print(algo.predict(user_inner_id, song, r_ui=1), song_id_name_dic[algo.trainset.to_raw_iid(song)])

user: 4          item: 361        r_ui = 1.00   est = 1.00   {'was_impossible': False} 家	许巍
user: 4          item: 362        r_ui = 1.00   est = 1.00   {'was_impossible': False} 老街	李荣浩
user: 4          item: 363        r_ui = 1.00   est = 1.00   {'was_impossible': False} 滴答	侃侃
user: 4          item: 364        r_ui = 1.00   est = 1.00   {'was_impossible': False} 彩虹	周杰伦
user: 4          item: 365        r_ui = 1.00   est = 1.00   {'was_impossible': False} 米店	张玮玮和郭龙
user: 4          item: 366        r_ui = 1.00   est = 1.00   {'was_impossible': False} 情人	Beyond
user: 4          item: 367        r_ui = 1.00   est = 1.00   {'was_impossible': False} 喜欢你	Beyond
user: 4          item: 220        r_ui = 1.00   est = 1.00   {'was_impossible': False} 灰姑娘	郑钧
user: 4          item: 235        r_ui = 1.00   est = 1.00   {'was_impossible': False} 安和桥	宋冬野
user: 4          item: 240        r_ui = 1.00   est = 1.00   {'was_impossible': False} 去大理	郝云
user: 4          item: 368        r_ui = 1.00   est 

### Use matrix decomposition to make the prediction

In [20]:
from surprise import NMF, evaluate

file_path = os.path.expanduser("../data/popular_music_suprise_format.txt")
reader = Reader(line_format="user item rating timestamp", sep=',')
music_data = Dataset.load_from_file(file_path=file_path, reader=reader)

algo = NMF()
trainset = music_data.build_full_trainset()
algo.train(trainset=trainset)



<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7f880924c940>

In [21]:
user_inner_id = 4
user_rating = train_set.ur[user_inner_id]

items = map(lambda x:x[0], user_rating)
for song in items:
    print(algo.predict(user_inner_id, song, r_ui=1), song_id_name_dic[algo.trainset.to_raw_iid(song)])

user: 4          item: 361        r_ui = 1.00   est = 1.00   {'was_impossible': True, 'reason': 'User and item are unkown.'} 家	许巍
user: 4          item: 362        r_ui = 1.00   est = 1.00   {'was_impossible': True, 'reason': 'User and item are unkown.'} 老街	李荣浩
user: 4          item: 363        r_ui = 1.00   est = 1.00   {'was_impossible': True, 'reason': 'User and item are unkown.'} 滴答	侃侃
user: 4          item: 364        r_ui = 1.00   est = 1.00   {'was_impossible': True, 'reason': 'User and item are unkown.'} 彩虹	周杰伦
user: 4          item: 365        r_ui = 1.00   est = 1.00   {'was_impossible': True, 'reason': 'User and item are unkown.'} 米店	张玮玮和郭龙
user: 4          item: 366        r_ui = 1.00   est = 1.00   {'was_impossible': True, 'reason': 'User and item are unkown.'} 情人	Beyond
user: 4          item: 367        r_ui = 1.00   est = 1.00   {'was_impossible': True, 'reason': 'User and item are unkown.'} 喜欢你	Beyond
user: 4          item: 220        r_ui = 1.00   est = 1.00   {'was_im

### Save model

In [22]:
import surprise

surprise.dump.dump('./recommendation.model', algo=algo)

# Load model
algo = surprise.dump.load('./recommendation.model')

### Test for other recommendation algorithms

In [23]:
music_data.split(n_folds=5)

In [24]:
music_data

<surprise.dataset.DatasetAutoFolds at 0x7f87fdf02e80>

In [25]:
music_data.raw_ratings[:20]

[('62659400', '29572804', 1.0, '1300000'),
 ('323848458', '29950533', 1.0, '1300000'),
 ('31177392', '25702070', 1.0, '1300000'),
 ('380843121', '3313653', 1.0, '1300000'),
 ('38052169', '321603', 1.0, '1300000'),
 ('30666384', '28288202', 1.0, '1300000'),
 ('41564652', '32619064', 1.0, '1300000'),
 ('415491808', '405599470', 1.0, '1300000'),
 ('534970837', '28095367', 1.0, '1300000'),
 ('76675156', '32897022', 1.0, '1300000'),
 ('66735851', '86279', 1.0, '1300000'),
 ('17270407', '132313', 1.0, '1300000'),
 ('328064266', '29775856', 1.0, '1300000'),
 ('91631821', '31165551', 1.0, '1300000'),
 ('36663696', '34364763', 1.0, '1300000'),
 ('124284945', '234014', 1.0, '1300000'),
 ('170747938', '5324813', 1.0, '1300000'),
 ('26558065', '185664', 1.0, '1300000'),
 ('87396960', '28406472', 1.0, '1300000'),
 ('406356708', '28987151', 1.0, '1300000')]

#### NormalPredictor

In [26]:
from surprise import NormalPredictor
algo = NormalPredictor()
perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'])
print(perf)



Evaluating RMSE, MAE of algorithm NormalPredictor.

------------
Fold 1
RMSE: 0.0000
MAE:  0.0000
------------
Fold 2
RMSE: 0.0000
MAE:  0.0000
------------
Fold 3
RMSE: 0.0000
MAE:  0.0000
------------
Fold 4
RMSE: 0.0000
MAE:  0.0000
------------
Fold 5
RMSE: 0.0000
MAE:  0.0000
------------
------------
Mean RMSE: 0.0000
Mean MAE : 0.0000
------------
------------
defaultdict(<class 'list'>, {'rmse': [0.0, 0.0, 0.0, 0.0, 0.0], 'mae': [0.0, 0.0, 0.0, 0.0, 0.0]})
