In [1]:
import io 
import pandas as pd
from collections import defaultdict
from surprise import Dataset
from surprise import NormalPredictor
from surprise import SVD
from surprise import KNNBasic
from surprise import get_dataset_dir
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

In [2]:
data = Dataset.load_builtin('ml-100k')
K = 30
trainset, testset = train_test_split(data, test_size=.25)

In [3]:
# ассоциативный массив (названия алгоритмов - ключи, функиции - значения)
algorithms = {
    "NP": NormalPredictor(),
    "KNN_cos": KNNBasic(k = K, sim_options = { 'name': 'cosine' }),
    "KNN_MSD": KNNBasic(k = K),
    "KNN_Pearson": KNNBasic(k = K, sim_options = { 'name': 'pearson' }),
    "SVD" : SVD()
}
RSMA = {}

In [4]:
# для каждого алгоритма с помошбю функции cross_validate оцениваем метрику RMSE (среднеквадратичную ошибку модели)
for [name, algo] in algorithms.items():
    crv = cross_validate(algo, data, measures=['RMSE'], verbose=False)
    RSMA[name] = round(crv['test_rmse'].mean(), 3)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson si

In [9]:
# выберем среди всех алгоритмов лучший
print(RSMA)
bestAlgoName = min(RSMA.items(), key=lambda x: x[1])[0]
print(bestAlgoName)
bestAlgo = algorithms[bestAlgoName]
# тренируем алгоритм на тренировочном наборе данных, даём предсказание для тестового
bestAlgo.fit(trainset)
predictions = bestAlgo.test(testset)
for prediction in predictions[:10]:
    print(prediction)

{'NP': 1.522, 'KNN_cos': 1.019, 'KNN_MSD': 0.977, 'KNN_Pearson': 1.013, 'SVD': 0.936}
SVD
user: 648        item: 367        r_ui = 3.00   est = 3.18   {'was_impossible': False}
user: 286        item: 288        r_ui = 5.00   est = 3.75   {'was_impossible': False}
user: 13         item: 869        r_ui = 3.00   est = 2.51   {'was_impossible': False}
user: 405        item: 1228       r_ui = 1.00   est = 1.27   {'was_impossible': False}
user: 300        item: 881        r_ui = 5.00   est = 3.58   {'was_impossible': False}
user: 927        item: 158        r_ui = 2.00   est = 2.90   {'was_impossible': False}
user: 561        item: 229        r_ui = 3.00   est = 2.03   {'was_impossible': False}
user: 385        item: 1159       r_ui = 4.00   est = 3.16   {'was_impossible': False}
user: 846        item: 41         r_ui = 3.00   est = 3.24   {'was_impossible': False}
user: 62         item: 401        r_ui = 3.00   est = 2.38   {'was_impossible': False}


In [16]:
# посчитаем метрики precision@k and recall@k для k=5 и порога отсечения 3.52
# https://github.com/NicolasHug/Surprise/blob/master/examples/precision_recall_at_k.py - готовая функция
# precision@k - отношение релевантных и рекомендованных фильмов, к количеству рекомендованных фильмов
# recall@k - отношение релевантных и рекомендованных фильмов, к количеству релевантных (тех, которые человек захочет посмотреть) фильмов
def precision_recall_at_k(predictions, k = 5, threshold = 3.52):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=3.52)

# усредняем по всем пользователям
precision_at_k = sum(prec for prec in precisions.values()) / len(precisions)
recall_at_k = sum(rec for rec in recalls.values()) / len(recalls)
print(precision_at_k)
print(recall_at_k)

0.7433828733191795
0.370768337830131


In [6]:
# получим рекомендации для человека 14
USER_INDEX = '14'
N = 5
userPredictions = list(filter(lambda x: x.uid == USER_INDEX, predictions))
top = sorted(userPredictions, key=lambda x: x.est, reverse=True)[0:N]
top

[Prediction(uid='14', iid='50', r_ui=5.0, est=4.774747708506592, details={'was_impossible': False}),
 Prediction(uid='14', iid='172', r_ui=5.0, est=4.724461503712341, details={'was_impossible': False}),
 Prediction(uid='14', iid='176', r_ui=1.0, est=4.6507253803560165, details={'was_impossible': False}),
 Prediction(uid='14', iid='603', r_ui=4.0, est=4.54496916723333, details={'was_impossible': False}),
 Prediction(uid='14', iid='127', r_ui=2.0, est=4.533140042757135, details={'was_impossible': False})]

In [7]:
# получим информацию о рекомендованных фильмах по их id
def getInfoForList(l):
    print(l)
    data_path = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    info = {}
    data = pd.read_csv(data_path, sep='|',encoding='ISO-8859-1', header = None) 
    for id in l:
        row = data.iloc[int(id)]
        info[id] = (row[1], row[2])

    return info
info = getInfoForList(list(map(lambda x: x.iid, top)))
print(info)

['50', '172', '176', '603', '127']
{'50': ('Legends of the Fall (1994)', '01-Jan-1994'), '172': ('Princess Bride, The (1987)', '01-Jan-1987'), '176': ('Good, The Bad and The Ugly, The (1966)', '01-Jan-1966'), '603': ('It Happened One Night (1934)', '01-Jan-1934'), '127': ('Supercop (1992)', '26-Jul-1996')}


In [8]:
# выведем результат
print('User {}'.format(USER_INDEX))
for pred in top:
    print('{}, {}, {}'.format(pred.iid, info[pred.iid], round(pred.est, 3)))

User 14
50, ('Legends of the Fall (1994)', '01-Jan-1994'), 4.775
172, ('Princess Bride, The (1987)', '01-Jan-1987'), 4.724
176, ('Good, The Bad and The Ugly, The (1966)', '01-Jan-1966'), 4.651
603, ('It Happened One Night (1934)', '01-Jan-1934'), 4.545
127, ('Supercop (1992)', '26-Jul-1996'), 4.533
