# Laboratorium 5 - rekomendacje grupowe

## Przygotowanie

 * pobierz i wypakuj dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * więcej możesz poczytać tutaj: https://grouplens.org/datasets/movielens/
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab5`
 * zainstaluj potrzebne biblioteki:
 `pip install numpy pandas matplotlib`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety
from tabulate import tabulate
import math
import numpy as np
import pandas

from random import choice, sample
from statistics import mean, stdev

from reco_utils import *

In [2]:
# wczytujemy oceny uytkownikow i obliczamy (za pomocą collaborative filtering) wszystkie przewidywane oceny filmow

raw_ratings = pandas.read_csv('../ml-latest-small/ratings.csv').drop(columns=['timestamp'])
movies = list(raw_ratings['movieId'].unique())
users = list(raw_ratings['userId'].unique())
ratings = get_predicted_ratings(raw_ratings, max_iterations=10)
ratings

215575.2462699197
208804.9560670217
202498.51798099163
196610.63134484066
191101.51644274403
185936.11967165925
181083.4510345969
176516.02881615108
172209.41166448637
168141.8024090963
164293.71110586412


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,10,6,2,9,4,3,10,4,10,6,...,2,10,0,10,6,10,9,5,7,5
2,10,10,7,0,9,5,10,4,10,3,...,10,10,10,10,7,5,6,4,10,7
3,3,4,10,0,10,0,2,9,10,6,...,5,0,1,0,0,7,8,3,3,0
4,10,4,0,10,4,5,5,3,0,0,...,10,8,3,10,10,0,0,0,8,9
5,9,0,3,5,0,6,4,4,5,5,...,4,4,10,2,10,9,10,6,10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,9,4,5,5,0,9,10,1,6,3,...,0,6,6,10,9,5,4,10,7,9
607,7,10,0,5,6,0,6,9,10,10,...,4,7,0,5,4,6,9,8,0,5
608,4,10,10,2,10,8,1,9,6,3,...,10,9,6,4,0,9,10,3,1,5
609,10,0,2,0,0,10,4,0,0,0,...,2,10,6,10,10,9,9,6,10,2


In [3]:
movies

[1,
 3,
 6,
 47,
 50,
 70,
 101,
 110,
 151,
 157,
 163,
 216,
 223,
 231,
 235,
 260,
 296,
 316,
 333,
 349,
 356,
 362,
 367,
 423,
 441,
 457,
 480,
 500,
 527,
 543,
 552,
 553,
 590,
 592,
 593,
 596,
 608,
 648,
 661,
 673,
 733,
 736,
 780,
 804,
 919,
 923,
 940,
 943,
 954,
 1009,
 1023,
 1024,
 1025,
 1029,
 1030,
 1031,
 1032,
 1042,
 1049,
 1060,
 1073,
 1080,
 1089,
 1090,
 1092,
 1097,
 1127,
 1136,
 1196,
 1197,
 1198,
 1206,
 1208,
 1210,
 1213,
 1214,
 1219,
 1220,
 1222,
 1224,
 1226,
 1240,
 1256,
 1258,
 1265,
 1270,
 1275,
 1278,
 1282,
 1291,
 1298,
 1348,
 1377,
 1396,
 1408,
 1445,
 1473,
 1500,
 1517,
 1552,
 1573,
 1580,
 1587,
 1617,
 1620,
 1625,
 1644,
 1676,
 1732,
 1777,
 1793,
 1804,
 1805,
 1920,
 1927,
 1954,
 1967,
 2000,
 2005,
 2012,
 2018,
 2028,
 2033,
 2046,
 2048,
 2054,
 2058,
 2078,
 2090,
 2093,
 2094,
 2096,
 2099,
 2105,
 2115,
 2116,
 2137,
 2139,
 2141,
 2143,
 2161,
 2174,
 2193,
 2253,
 2268,
 2273,
 2291,
 2329,
 2338,
 2353,
 2366,
 

In [4]:
# wczytujemy nazwy filmow i kategorie

movies_metadata = pandas.read_csv('../ml-latest-small/movies.csv').set_index('movieId')
movies_metadata

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [5]:
# definiujemy testowe grupy uzytkownikow, dla ktorych bedziemy generowac rekomendacje

groups_no = 50
group_size = 5
groups = [sample(users, group_size) for i in range(groups_no)]
groups

[[217, 478, 495, 560, 216],
 [60, 477, 12, 446, 482],
 [420, 218, 589, 7, 539],
 [522, 578, 563, 204, 360],
 [542, 557, 313, 4, 406],
 [127, 61, 505, 417, 566],
 [290, 401, 575, 266, 256],
 [494, 557, 534, 199, 538],
 [469, 375, 563, 540, 321],
 [97, 164, 208, 601, 327],
 [233, 197, 196, 251, 520],
 [559, 181, 136, 56, 291],
 [149, 23, 255, 314, 36],
 [354, 87, 101, 334, 378],
 [313, 412, 42, 238, 6],
 [203, 544, 366, 426, 188],
 [374, 174, 19, 317, 181],
 [423, 392, 589, 128, 585],
 [105, 572, 407, 149, 476],
 [352, 588, 385, 369, 268],
 [309, 349, 418, 195, 112],
 [185, 365, 174, 510, 210],
 [46, 117, 331, 378, 292],
 [362, 328, 601, 43, 98],
 [595, 282, 35, 157, 34],
 [3, 217, 14, 269, 446],
 [169, 66, 556, 561, 153],
 [98, 48, 498, 61, 194],
 [295, 310, 333, 182, 495],
 [320, 275, 485, 15, 143],
 [265, 275, 219, 217, 413],
 [536, 248, 567, 468, 398],
 [449, 151, 562, 68, 474],
 [135, 257, 301, 490, 246],
 [260, 288, 558, 291, 522],
 [256, 127, 219, 267, 348],
 [30, 63, 589, 263, 30

In [6]:
# przygotowujemy funkcje pomocnicza

def describe_group(group, N=10):
    print(f'\n\nUser ids: {group}')
    group_size = len(group)

    mean_stdev = ratings.iloc[group].std(axis=0).mean()
    median_stdev = ratings.iloc[group].std(axis=0).median()
    std_stdev = ratings.iloc[group].std(axis=0).std()
    print(f'\nMean ratings deviation: {mean_stdev}')
    print(f'Median ratings deviation: {median_stdev}')
    print(f'Standard deviation of ratings deviation: {std_stdev}')

    average_scores = ratings.iloc[group].mean(axis=0)
    average_scores = average_scores.sort_values()
    best_movies = [(movies_metadata['title'][movie_id], average_scores[movie_id]) for movie_id in
                   list(average_scores[-N:].index)]
    worst_movies = [(movies_metadata['title'][movie_id], average_scores[movie_id]) for movie_id in
                    list(average_scores[:N].index)]

    print('\nBest movies:')
    for movie, score in best_movies[::-1]:
        print(f'{movie}, {score}*')
    print('\nWorst movies:')
    for movie, score in worst_movies:
        print(f'{movie}, {score}*')


describe_group(groups[1])



User ids: [60, 477, 12, 446, 482]

Mean ratings deviation: 3.8770500141539115
Median ratings deviation: 3.9749213828703582
Standard deviation of ratings deviation: 0.8233099037052277

Best movies:
It Runs in the Family (2003), 9.2*
Mansfield Park (1999), 9.2*
Josie and the Pussycats (2001), 9.2*
8MM (1999), 9.0*
Big Stan (2007), 9.0*
Some Mother's Son (1996), 8.8*
The Golden Voyage of Sinbad (1973), 8.8*
Anne of the Thousand Days (1969), 8.8*
Paper Chase, The (1973), 8.6*
Legally Blonde (2001), 8.6*

Worst movies:
Lions For Lambs (2007), 2.2*
Davy Crockett, King of the Wild Frontier (1955), 2.4*
Newsies (1992), 2.4*
Northmen - A Viking Saga (2014), 2.4*
Halloween 5: The Revenge of Michael Myers (1989), 2.6*
Girl with a Pearl Earring (2003), 2.6*
Above the Rim (1994), 2.6*
Men in Black III (M.III.B.) (M.I.B.³) (2012), 2.6*
Canadian Bacon (1995), 2.6*
Miracle (2004), 2.6*


In [7]:
'a   b'.split()

['a', 'b']

## Część 2. - algorytmy proste

In [8]:
# zdefiniujmy interfejs dla wszystkich algorytmow rekomendacyjnych

class Recommender:
    def recommend(self, movies, ratings, group, size):
        pass


# jako pierwszy zaimplementujemy algorytm losowy - dla porownania

class RandomRecommender(Recommender):
    def __init__(self):
        self.name = 'random'

    def recommend(self, movies, ratings, group, size):
        return sample(movies, size)

In [9]:
# algorytm rekomendujacy filmy o najwyzszej sredniej ocen

class AverageRecommender(Recommender):
    def __init__(self):
        self.name = 'average'

    def recommend(self, movies, ratings, group, size):
        group_ratings = ratings.filter(items=group, axis=0)
        return list(group_ratings.mean().sort_values(ascending=False).iloc[:size].index)

In [10]:
# algorytm rekomendujacy filmy o najwyzszej sredniej ocen,
#   ale rownoczesnie wykluczajacy te filmy, ktore otrzymaly choc jedna ocene ponizej thresholdu

class AverageWithoutMiseryRecommender(Recommender):
    def __init__(self, score_threshold):
        self.name = 'average_without_misery'
        self.score_threshold = score_threshold

    def recommend(self, movies, ratings, group, size):
        group_ratings = ratings.filter(items=group, axis=0)
        excluded_movies_filter = group_ratings < self.score_threshold
        included_movies_filter = excluded_movies_filter.sum() == 0
        included_movies = included_movies_filter[included_movies_filter].index
        ratings_included_movies = ratings[included_movies]
        return list(ratings_included_movies.mean().sort_values(ascending=False).iloc[:size].index)

In [11]:
# algorytm uwzgledniajacy preferencje tylko jednego uzytkownika w kazdej iteracji

class FairnessRecommender(Recommender):
    def __init__(self):
        self.name = 'fairness'

    def recommend(self, movies, ratings, group, size):
        selected_movies = set()
        best_users_movies = [ratings.loc[user].sort_values(ascending=False) for user in group]
        round = -1
        step = 0
        while len(selected_movies) < size:
            user_id = step % len(group)
            if user_id == 0:
                round += 1
            movie = best_users_movies[user_id].index[round]
            if movie not in selected_movies:
                selected_movies.add(movie)
            step += 1
        return list(selected_movies)

In [12]:
# wybrany algorytm wyborczy

class VotingRecommender(Recommender):
    def __init__(self, treshold):
        self.name = "simple voting"  # nazwa wybranego algorytmu
        self.score_threshold = treshold

    def recommend(self, movies, ratings, group, size):
        group_ratings = ratings.filter(items=group, axis=0)
        group_ratings = group_ratings > self.score_threshold
        return list(group_ratings.sum().sort_values(ascending=False).iloc[:size].index)

In [13]:
# algorytm zachlanny, aproksymujacy metode Proportional Approval Voting
#   w kazdej iteracji wybieramy ten film, ktory najbardziej zwieksza zadowolenie zgodnie z punktacja PAV

class PAVRecommender(Recommender):
    def __init__(self, threshold):
        self.threshold = threshold
        self.name = 'PAV'

    def recommend(self, movies, ratings, group, size):
        group_ratings = ratings.filter(items=group, axis=0)
        group_ratings = group_ratings > self.threshold
        group_ratings = group_ratings.astype("float")
        recommender_weight_divisor = {user_id: 1 for user_id in group}
        selected_movies = set()
        for i in range(size):
            new_movie = group_ratings.sum().idxmax()
            recommenders = group_ratings[new_movie] > 0
            recommenders = list(recommenders[recommenders].index)

            group_ratings[new_movie] = 0.0
            for recommender in recommenders:
                group_ratings.loc[recommender] = group_ratings.loc[recommender] * recommender_weight_divisor[
                    recommender] / (recommender_weight_divisor[recommender] + 1)
                recommender_weight_divisor[recommender] += 1

            selected_movies.add(new_movie)
        return list(selected_movies)


## Część 3. - funkcje celu

In [14]:
# dwie funkcje pomocnicze:
#  - znajdujaca ulubione filmy danego uzytkownika
#  - obliczajaca sume ocen wystawionych przez uzytkownika wszystkim filmom w rekomendacji

def top_n_movies_for_user(ratings, movies, user_id, n):
    return ratings.loc[user_id].sort_values(ascending=False).iloc[:n].index


def total_score(recommendation, user_id, ratings):
    return ratings.loc[user_id][recommendation].sum()

In [15]:
# funkcja obliczajaca zadowolenie pojedynczego uzytkownika
#  - iloraz zadowolenia z wygenerowanej rekomendacji oraz zadowolenia z hipotetycznej rekomendacji idealnej
def overall_user_satisfaction(recommendation, user_id, movies, ratings):
    group_list_sat = total_score(recommendation, user_id, ratings)
    user_list_sat = total_score(list(top_n_movies_for_user(ratings, movies, user_id, len(recommendation))), user_id,
                                ratings)
    return group_list_sat / user_list_sat


# funkcja celu - srednia z zadowolenia wszystkich uzytkownikow w grupie
def overall_group_satisfaction(recommendation, group, movies, ratings):
    return 1.0 * sum([overall_user_satisfaction(recommendation, user_id, movies, ratings) for user_id in group]) / len(
        group)


# funkcja celu - roznica miedzy maksymalnym i minimalnym zadowolenie w grupie
def group_disagreement(recommendation, group, movies, ratings):
    sat_scores = [overall_user_satisfaction(recommendation, user_id, movies, ratings) for user_id in group]
    return max(sat_scores) - min(sat_scores)

## Część 4. - Sequential Hybrid Aggregation

In [16]:
# algorytm balansujacy pomiedzy wyborem elementow o najwyzszej sredniej ocen
#   i o najwyzszej minimalnej ocenie
#   wyliczajacy w kazdej iteracji parametr alfa - jak na wykladzie
class SequentialHybridAggregationRecommender(Recommender):
    def __init__(self):
        self.name = 'sequential_hybrid_aggregation'

    def recommend(self, movies, ratings, group, size):
        selected_movies = set()
        group_ratings = ratings.filter(items=group, axis=0)
        alpha = 0.5
        avg_scores = group_ratings.mean()
        least_scores = ratings.min()
        for i in range(size):
            scores = (1 - alpha) * avg_scores + alpha * least_scores
            new_movie = scores.idxmax()
            selected_movies.add(new_movie)
            avg_scores[new_movie] = 0
            least_scores[new_movie] = 0
            alpha = group_disagreement([movies[i - 1]], group, movies, ratings)
        return list(selected_movies)



## Część 5. - porównanie algorytmów

In [18]:
recommenders = [
    RandomRecommender(),
    AverageRecommender(),
    AverageWithoutMiseryRecommender(5),
    FairnessRecommender(),
    VotingRecommender(5),
    PAVRecommender(5),
    SequentialHybridAggregationRecommender()
]

recommendation_size = 10

# dla kazdego algorytmu:
#  - wygenerujmy jedna rekomendacje dla kazdej grupy
#  - obliczmy wartosci funkcji celu dla kazdej rekomendacji
#  - obliczmy srednia i odchylenie standardowe wartosci funkcji celu
#  - wypiszmy wyniki na konsole

table = []
for recommender in recommenders:
    satisfactions = []
    disagreements = []
    for group in groups:
        recommendation = recommender.recommend(movies, ratings, group, recommendation_size)
        satisfactions.append(overall_group_satisfaction(recommendation, group, movies, ratings))
        disagreements.append(group_disagreement(recommendation, group, movies, ratings))
    table.append(
        [recommender.name, mean(satisfactions), stdev(satisfactions), mean(disagreements), stdev(disagreements)])
print(tabulate(table, headers=["recommender", "satisfaction-mean", "satisfaction-std", "disagreement-mean",
                               "disagreement-mean"]))

recommender                      satisfaction-mean    satisfaction-std    disagreement-mean    disagreement-mean
-----------------------------  -------------------  ------------------  -------------------  -------------------
random                                    0.56108            0.0546547             0.2692              0.105072
average                                   0.98784            0.0203313             0.0222              0.0340042
average_without_misery                    0.80412            0.0413909             0.223               0.0750306
fairness                                  0.64748            0.0516405             0.2478              0.0852437
simple voting                             0.86764            0.0269442             0.1228              0.0533008
PAV                                       0.86816            0.0256056             0.1184              0.0468763
sequential_hybrid_aggregation             0.927984           0.0322147             0.131625      