# Laboratorium 5 - rekomendacje grupowe

## Przygotowanie

 * pobierz i wypakuj dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * więcej możesz poczytać tutaj: https://grouplens.org/datasets/movielens/
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab5`
 * zainstaluj potrzebne biblioteki:
 `pip install numpy pandas matplotlib`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety

import math
import numpy as np
import pandas

from random import choice, sample
from statistics import mean, stdev

from reco_utils import *

In [2]:
# wczytujemy oceny uytkownikow i obliczamy (za pomocą collaborative filtering) wszystkie przewidywane oceny filmow

raw_ratings = pandas.read_csv('ml-latest-small/ratings.csv').drop(columns=['timestamp'])
movies = list(raw_ratings['movieId'].unique())
users = list(raw_ratings['userId'].unique())
ratings = get_predicted_ratings(raw_ratings)
ratings

Total error: 210450.31194982934
Total error: 203933.5168490659
Total error: 197869.69332520894
Total error: 192213.62368390427
Total error: 186925.70258635204
Total error: 181971.1195454668
Total error: 177319.17838890987
Total error: 172942.72756423932
Total error: 168817.68076130332
Total error: 164922.61160032873
Total error: 161238.40942244575
Total error: 157747.985769986
Total error: 154436.02313728447
Total error: 151288.7591420358
Total error: 148293.80051127815
Total error: 145439.96226875373
Total error: 142717.12830756154
Total error: 140116.13017599165
Total error: 137628.64142767552
Total error: 135247.0853146161
Total error: 132964.55395265378
Total error: 130774.73737853089
Total error: 128671.86115780174
Total error: 126650.63140271202
Total error: 124706.18622627103
Total error: 122834.05279896769
Total error: 121030.10929269688
Total error: 119290.55109627111
Total error: 117611.86077152689
Total error: 115990.7812909962
Total error: 114424.29215948137
Total error: 11

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,4,8,5,4,5,9,6,2,9,5,...,2,6,10,1,5,6,9,6,10,5
2,0,10,4,2,10,6,2,1,7,7,...,8,2,10,4,10,1,7,7,2,3
3,5,10,3,0,9,9,3,6,10,8,...,0,5,10,1,10,9,10,1,4,3
4,5,8,8,5,7,1,4,10,6,8,...,10,8,10,4,7,5,7,7,9,7
5,7,4,0,0,7,10,5,4,10,4,...,10,5,0,8,10,8,10,0,6,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,6,6,6,6,6,6,5,6,6,6,...,7,6,7,6,6,7,6,7,7,6
607,5,8,2,6,8,10,6,8,6,10,...,8,3,4,10,2,6,10,4,6,4
608,6,6,5,5,7,6,5,6,6,7,...,6,6,5,7,6,7,6,5,6,6
609,3,6,0,10,10,10,10,8,3,6,...,10,5,7,1,0,0,10,5,0,2


In [3]:
# wczytujemy nazwy filmow i kategorie

movies_metadata = pandas.read_csv('ml-latest-small/movies.csv').set_index('movieId')
movies_metadata

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
# wczytujemy przykladowe grupy uzytkownikow
groups = pandas.read_csv('groups.csv', header=None).values.tolist()
groups

[[606, 274, 474, 599, 448],
 [111, 307, 474, 599, 414],
 [469, 182, 232, 448, 600],
 [508, 581, 497, 402, 566],
 [300, 515, 245, 568, 507],
 [2, 371, 252, 518, 37],
 [269, 360, 469, 287, 308],
 [243, 527, 418, 118, 370],
 [186, 559, 327, 553, 314]]

In [5]:
# przygotowujemy funkcje pomocnicza

def describe_group(group, N=10):
    print(f'\n\nUser ids: {group}')
    group_size = len(group)
    
    mean_stdev = ratings.loc[group].std(axis=0).mean()
    median_stdev = ratings.loc[group].std(axis=0).median()
    std_stdev = ratings.loc[group].std(axis=0).std()
    print(f'\nMean ratings deviation: {mean_stdev}')
    print(f'Median ratings deviation: {median_stdev}')
    print(f'Standard deviation of ratings deviation: {std_stdev}')
    
    average_scores = ratings.iloc[group].mean(axis=0)
    average_scores = average_scores.sort_values()
    best_movies = [(movies_metadata['title'][movie_id], average_scores[movie_id]) for movie_id in list(average_scores[-N:].index)]
    worst_movies = [(movies_metadata['title'][movie_id], average_scores[movie_id]) for movie_id in list(average_scores[:N].index)]
    
    print('\nBest movies:')
    for movie, score in best_movies[::-1]:
        print(f'{movie}, {score}*')
    print('\nWorst movies:')
    for movie, score in worst_movies:
        print(f'{movie}, {score}*')

describe_group(groups[2])



User ids: [469, 182, 232, 448, 600]

Mean ratings deviation: 0.7079227400840691
Median ratings deviation: 0.5477225575051662
Standard deviation of ratings deviation: 0.3999990501833321

Best movies:
Uncle Nino (2003), 10.0*
Kentucky Fried Movie, The (1977), 10.0*
Knockaround Guys (2002), 10.0*
Whistleblower, The (2010), 10.0*
Head Above Water (1996), 9.8*
Greedy (1994), 9.8*
Benji (1974), 9.8*
Piglet's Big Movie (2003), 9.8*
Cabin Boy (1994), 9.8*
Hard Way, The (1991), 9.8*

Worst movies:
Hatchet (2006), 0.6*
Double Dragon (1994), 0.6*
Pollyanna (1960), 0.6*
Lakeview Terrace (2008), 0.8*
Glory Road (2006), 0.8*
Bolt (2008), 1.0*
Swimming with Sharks (1995), 1.0*
Safe (2012), 1.0*
Tetro (2009), 1.2*
Submarine (2010), 1.2*


## Część 2. - algorytmy proste

In [6]:
# zdefiniujmy interfejs dla wszystkich algorytmow rekomendacyjnych

class Recommender:
    def recommend(self, movies, ratings, group, size):
        pass


# jako pierwszy zaimplementujemy algorytm losowy - dla porownania
    
class RandomRecommender(Recommender):
    def __init__(self):
        self.name = 'random'
        
    def recommend(self, movies, ratings, group, size):
        return sample(movies, size)

In [7]:
# algorytm rekomendujacy filmy o najwyzszej sredniej ocen

class AverageRecommender(Recommender):
    def __init__(self):
        self.name = 'average'
    
    def recommend(self, movies, ratings, group, size):
        return ratings.loc[group].mean(axis=0).nlargest(size).index.tolist()

AverageRecommender().recommend(movies, ratings, groups[0], 5)

[7328, 22, 179, 184, 291]

In [8]:
# algorytm rekomendujacy filmy o najwyzszej sredniej ocen,
#   ale rownoczesnie wykluczajacy te filmy, ktore otrzymaly choc jedna ocene ponizej thresholdu

class AverageWithoutMiseryRecommender(Recommender):
    def __init__(self, score_threshold):
        self.name = 'average_without_misery'
        self.score_threshold = score_threshold
        
    def recommend(self, movies, ratings, group, size):
        g_ratings = ratings.loc[group]
        return g_ratings.loc[:,(g_ratings >= self.score_threshold).all()].mean(axis=0).nlargest(size).index.tolist()

AverageWithoutMiseryRecommender(6).recommend(movies, ratings, groups[0], 5)

[7328, 22, 179, 184, 291]

In [9]:
# algorytm uwzgledniajacy preferencje tylko jednego uzytkownika w kazdej iteracji

class FairnessRecommender(Recommender):
    def __init__(self):
        self.name = 'fairness'
        
    def recommend(self, movies, ratings, group, size):
        recommendations = []
        g_ratings = [ratings.loc[i].nlargest(size).index.tolist() for i in group]
        i = 0
        while len(recommendations) < size:
            movie = g_ratings[i].pop(0)
            if movie not in recommendations:
                recommendations.append(movie)
                i = (i + 1) % len(group)
        return recommendations

FairnessRecommender().recommend(movies, ratings, groups[0], 5)

[22, 1606, 184, 23, 54]

In [10]:
# wybrany algorytm wyborczy (dyktatura, glosowanie proste, Borda, Copeland)

class VotingRecommender(Recommender):
    def __init__(self, score_threshold):
        self.name = "simple voting"
        self.score_threshold = score_threshold
    
    def recommend(self, movies, ratings, group, size):
        voted = (ratings.loc[group] >= self.score_threshold).sum().nlargest(size, keep="all").index.tolist()
        new_ratings = ratings.loc[:, voted]
        return AverageRecommender().recommend(movies, new_ratings, group, size)

VotingRecommender(6).recommend(movies, ratings, groups[0], 5)

[7328, 22, 179, 184, 291]

In [11]:
# algorytm zachlanny, aproksymujacy metode Proportional Approval Voting
#   w kazdej iteracji wybieramy ten film, ktory najbardziej zwieksza zadowolenie zgodnie z punktacja PAV

class ProportionalApprovalVotingRecommender(Recommender):
    def __init__(self, threshold):
        self.threshold = threshold
        self.name = 'PAV'
        
    def recommend(self, movies, ratings, group, size):
        recommendations = []
        point_tracker = {i: 1 for i in group}
        points = np.zeros(len(ratings.columns))
        while len(recommendations) < size:
            for movie_ix in range(len(ratings.columns)):
                for user in group:
                    movie = ratings.columns[movie_ix]
                    if ratings.loc[user, movie] >= self.threshold and movie not in recommendations:
                        points[movie_ix] += 1 / point_tracker[user]

            best_movie = ratings.columns[np.argmax(points)]
            recommendations.append(best_movie)
            points = np.zeros(len(ratings.columns))

            for user in group:
                if ratings.loc[user, best_movie] >= self.threshold:
                    point_tracker[user] += 1

        return recommendations

ProportionalApprovalVotingRecommender(7).recommend(movies, ratings, groups[0], 3)        

[7328, 8375, 22]

## Część 3. - funkcje celu

In [12]:
# dwie funkcje pomocnicze:
#  - znajdujaca ulubione filmy danego uzytkownika
#  - obliczajaca sume ocen wystawionych przez uzytkownika wszystkim filmom w rekomendacji

def top_n_movies_for_user(ratings, movies, user_id, n):
    return ratings.loc[user_id].nlargest(n).index.tolist()

def total_score(recommendation, user_id, ratings):
    return ratings.loc[user_id, recommendation].sum()

In [13]:
# funkcja obliczajaca zadowolenie pojedynczego uzytkownika
#  - iloraz zadowolenia z wygenerowanej rekomendacji oraz zadowolenia z hipotetycznej rekomendacji idealnej
def overall_user_satisfaction(recommendation, user_id, movies, ratings):
    score = total_score(recommendation, user_id, ratings)
    ideal_recommendation = top_n_movies_for_user(ratings, movies, user_id, len(recommendation))
    ideal_score = total_score(ideal_recommendation, user_id, ratings)
    return score / ideal_score

# funkcja celu - srednia z zadowolenia wszystkich uzytkownikow w grupie
def overall_group_satisfaction(recommendation, group, movies, ratings):
    satisfaction = 0
    for user in group:
        satisfaction += overall_user_satisfaction(recommendation, user, movies, ratings)
    return satisfaction / len(group)

# funkcja celu - roznica miedzy maksymalnym i minimalnym zadowolenie w grupie
def group_disagreement(recommendation, group, movies, ratings):
    min_sat = float("inf")
    max_sat = 0
    for user in group:
        sat = overall_user_satisfaction(recommendation, user, movies, ratings)
        if sat > max_sat: max_sat = sat
        elif sat < min_sat: min_sat = sat
            
    return max_sat - min_sat

## Część 4. - Sequential Hybrid Aggregation

In [None]:
# algorytm balansujacy pomiedzy wyborem elementow o najwyzszej sredniej ocen
#   i o najwyzszej minimalnej ocenie
#   wyliczajacy w kazdej iteracji parametr alfa - jak na wykladzie
class SequentialHybridAggregationRecommender(Recommender):
     def __init__(self):
        self.name = 'sequential_hybrid_aggregation'
    
    def recommend(self, movies, ratings, group, size):
        pass

SequentialHybridAggregationRecommender().recommend(movies, retings, groups[0], 5)

## Część 5. - porównanie algorytmów

In [14]:
recommenders = [
    RandomRecommender(),
    AverageRecommender(),
    AverageWithoutMiseryRecommender(5),
    FairnessRecommender(),
    VotingRecommender(5),
    ProportionalApprovalVotingRecommender(5),
    # SequentialHybridAggregationRecommender()
]

recommendation_size = 10

# dla kazdego algorytmu:
#  - wygenerujmy jedna rekomendacje dla kazdej grupy
#  - obliczmy wartosci obu funkcji celu dla kazdej rekomendacji
#  - obliczmy srednia i odchylenie standardowe dla obu funkcji celu
#  - wypiszmy wyniki na konsole

for recommender in recommenders:
    print(f"Recommender: {recommender.name}")
    sat = []
    dis = []

    for group in groups:
        recommendation = recommender.recommend(movies, ratings, group, recommendation_size)
        sat.append(overall_group_satisfaction(recommendation, group, movies, ratings))
        dis.append(group_disagreement(recommendation, group, movies, ratings))

    np_sat = np.array(sat)
    np_dis = np.array(dis)

    print(f"Satisfaction: {np.mean(np_sat)} +/- {np.std(np_sat)}")
    print(f"Disagreement: {np.mean(np_dis)} +/- {np.std(np_dis)}")
    print()

Recommender: random
Satisfaction: 0.6281317001105733 +/- 0.14506453810454767
Disagreement: 0.16095238095238099 +/- 0.12209248938292161

Recommender: average
Satisfaction: 0.938170757276391 +/- 0.04305162945487477
Disagreement: 0.09940476190476191 +/- 0.09127405097261969

Recommender: average_without_misery
Satisfaction: 0.938170757276391 +/- 0.04305162945487477
Disagreement: 0.09496031746031747 +/- 0.08188961038643194

Recommender: fairness
Satisfaction: 0.7099940906701471 +/- 0.10977561159096678
Disagreement: 0.15857142857142856 +/- 0.09654813281526152

Recommender: simple voting
Satisfaction: 0.938170757276391 +/- 0.04305162945487477
Disagreement: 0.09496031746031747 +/- 0.08188961038643194

Recommender: PAV
Satisfaction: 0.7881678026114645 +/- 0.0575590353974651
Disagreement: 0.08579365079365081 +/- 0.078459114852589

