# Laboratorium 5 - rekomendacje grupowe

## Przygotowanie

 * pobierz i wypakuj dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * więcej możesz poczytać tutaj: https://grouplens.org/datasets/movielens/
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab5`
 * zainstaluj potrzebne biblioteki:
 `pip install numpy pandas matplotlib`

## Część 1. - przygotowanie danych

In [1]:
import math
import numpy as np
import pandas

from random import choice, sample
from statistics import mean, stdev

from reco_utils import *

In [2]:
raw_ratings = pandas.read_csv('ml-latest-small/ratings.csv').drop(columns=['timestamp'])
movies = list(raw_ratings['movieId'].unique())
users = list(raw_ratings['userId'].unique())
ratings = get_predicted_ratings(raw_ratings)

Total error: 217539.92466722557
Total error: 210472.90062742485
Total error: 203925.28524757296
Total error: 197842.00411433706
Total error: 192175.24582551786
Total error: 186883.33932286987
Total error: 181929.83043726775
Total error: 177282.71749567086
Total error: 172913.81491473972
Total error: 168798.2205172809
Total error: 164913.867468471
Total error: 161241.14567438522
Total error: 157762.58052772656
Total error: 154462.55925062092
Total error: 151327.09693699362
Total error: 148343.63585897628
Total error: 145500.87276340433
Total error: 142788.6098136063
Total error: 140197.62557940566
Total error: 137719.56308364612
Total error: 135346.83240630187
Total error: 133072.5257505156
Total error: 130890.34320650002
Total error: 128794.52772319799
Total error: 126779.80802491285
Total error: 124841.34839947258
Total error: 122974.70444287205
Total error: 121175.78397822773
Total error: 119440.81247880624
Total error: 117766.30241941275
Total error: 116149.02606051492
Total error: 

In [3]:
ratings

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,6,5,7,8,8,8,5,3,5,7,...,6,6,6,8,6,6,7,9,4,10
2,10,6,2,0,0,0,7,10,3,6,...,2,10,5,8,2,9,10,10,10,7
3,8,5,1,10,10,6,10,5,9,1,...,4,7,10,0,2,4,10,10,3,9
4,8,8,8,6,10,3,7,4,4,7,...,5,7,6,6,5,3,10,8,6,6
5,6,10,7,10,10,10,10,0,10,2,...,3,1,2,5,0,0,10,9,2,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,6,6,7,6,6,6,6,6,5,6,...,6,7,6,6,6,7,5,6,6,6
607,3,6,3,2,4,8,10,2,6,9,...,5,8,9,4,5,7,0,0,8,2
608,6,7,7,6,6,6,6,7,5,6,...,6,6,6,7,7,6,6,6,6,5
609,7,5,10,6,4,5,10,4,3,5,...,4,10,8,8,0,10,0,4,3,0


In [4]:
movies_metadata = pandas.read_csv('ml-latest-small/movies.csv').set_index('movieId')
movies_metadata

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [5]:
groups = pandas.read_csv('groups.csv', header=None).values.tolist()
groups

[[606, 274, 474, 599, 448],
 [111, 307, 474, 599, 414],
 [469, 182, 232, 448, 600],
 [508, 581, 497, 402, 566],
 [300, 515, 245, 568, 507],
 [2, 371, 252, 518, 37],
 [269, 360, 469, 287, 308],
 [243, 527, 418, 118, 370],
 [186, 559, 327, 553, 314]]

In [6]:
def describe_group(group, N=10):
    print(f'\n\nUser ids: {group}')
    group_size = len(group)
    
    mean_stdev = ratings.loc[group].std(axis=0).mean()
    median_stdev = ratings.loc[group].std(axis=0).median()
    std_stdev = ratings.loc[group].std(axis=0).std()
    print(f'\nMean ratings deviation: {mean_stdev}')
    print(f'Median ratings deviation: {median_stdev}')
    print(f'Standard deviation of ratings deviation: {std_stdev}')
    
    average_scores = ratings.iloc[group].mean(axis=0)
    average_scores = average_scores.sort_values()
    best_movies = [(movies_metadata['title'][movie_id], average_scores[movie_id]) for movie_id in list(average_scores[-N:].index)]
    worst_movies = [(movies_metadata['title'][movie_id], average_scores[movie_id]) for movie_id in list(average_scores[:N].index)]
    
    print('\nBest movies:')
    for movie, score in best_movies[::-1]:
        print(f'{movie}, {score}*')
    print('\nWorst movies:')
    for movie, score in worst_movies:
        print(f'{movie}, {score}*')

describe_group(groups[2])



User ids: [469, 182, 232, 448, 600]

Mean ratings deviation: 0.7667201185789386
Median ratings deviation: 0.7071067811865476
Standard deviation of ratings deviation: 0.45660745931085245

Best movies:
Sherlock: The Abominable Bride (2016), 10.0*
Hatari! (1962), 9.8*
Dragon Ball: Mystical Adventure (Doragon bôru: Makafushigi dai bôken) (1988), 9.8*
...And Justice for All (1979), 9.8*
Soul Food (1997), 9.6*
Europa (Zentropa) (1991), 9.6*
Before Sunrise (1995), 9.6*
Cyborg (1989), 9.6*
Flipped (2010), 9.6*
Man Who Planted Trees, The (Homme qui plantait des arbres, L') (1987), 9.6*

Worst movies:
Being Elmo: A Puppeteer's Journey (2011), 0.8*
The African Doctor (2016), 0.8*
Shooter (2007), 0.8*
Embalmer, The (Imbalsamatore, L') (2002), 1.0*
They Came Together (2014), 1.0*
Devil and Max Devlin, The (1981), 1.2*
Bakuman (2015), 1.2*
Horse Feathers (1932), 1.2*
Pocahontas (1995), 1.2*
Door in the Floor, The (2004), 1.4*


## Część 2. - algorytmy proste

In [7]:
class Recommender:
    def recommend(self, movies, ratings, group, size):
        pass

### Algorytm rekomendujący losowo

In [8]:
class RandomRecommender(Recommender):
    def __init__(self):
        self.name = 'random'
        
    def recommend(self, movies, ratings, group, size):
        return sample(movies, size)

In [9]:
RandomRecommender().recommend(movies, ratings, groups[0], 10)

[5601, 5785, 101577, 68932, 78836, 100843, 1633, 2442, 6058, 3598]

### Algorytm rekomendujący filmy o najwyższej średniej ocen

In [10]:
class AverageRecommender(Recommender):
    def __init__(self):
        self.name = 'average'
    
    def get_avg_score(self, movie_id, ratings, group):
        return sum(ratings[movie_id][user] for user in group) / len(group)
    
    def recommend(self, movies, ratings, group, size):
        return sorted(movies, key=lambda movie: self.get_avg_score(movie, ratings, group), reverse=True)[:size]

In [11]:
AverageRecommender().recommend(movies, ratings, groups[0], 10)

[282, 8970, 42004, 3074, 44788, 8363, 4458, 4478, 42740, 109941]

### Algorytm rekomendujący filmy o najwyzszej sredniej ocen, ale wykluczajacy te filmy, ktore otrzymaly choc jedna ocene ponizej wartości `score_threshold`

In [12]:
class AverageWithoutMiseryRecommender(Recommender):
    def __init__(self, score_threshold):
        self.name = 'average_without_misery'
        self.score_threshold = score_threshold
    
    def get_avg_score(self, movie_id, ratings, group):
        return sum(ratings[movie_id][user] for user in group) / len(group)
        
    def min_movie_rating_in_group(self, movie_id, ratings, group):
        return min([ratings[movie_id][user] for user in group])        
            
    def recommend(self, movies, ratings, group, size):
        movies_without_misery = [m for m in movies if self.min_movie_rating_in_group(m, ratings, group) >= self.score_threshold]
        return sorted(movies, key=lambda movie: self.get_avg_score(movie, ratings, group), reverse=True)[:size]

In [13]:
AverageWithoutMiseryRecommender(6).recommend(movies, ratings, groups[0], 10)

[282, 8970, 42004, 3074, 44788, 8363, 4458, 4478, 42740, 109941]

### Algorytm, ktory w kazdej iteracji uwzglednia preferencje tylko jednego, kolejnego uzytkownika

In [14]:
class FairnessRecommender(Recommender):
    def __init__(self):
        self.name = 'fairness'
        
    def recommend(self, movies, ratings, group, size):
        recommendations = []
        for i in range(size):
            ith_user_rating_movies_sorted = sorted(movies, key = lambda movie: ratings[movie][group[i%len(group)]], reverse=True)
            for movie in ith_user_rating_movies_sorted:
                if movie not in recommendations:
                    recommendations.append(movie)
                    break
        return recommendations

In [15]:
FairnessRecommender().recommend(movies, ratings, groups[0], 10)

[971, 235, 2033, 1719, 3024, 3, 1032, 2644, 534, 2763]

### Algorytm uwzgledniajacy preferencje tylko jednego uzytkownika - dyktatora

In [16]:
class DictatorshipRecommender(Recommender):
    def __init__(self, dictator_id):
        self.name = 'dictatorship'
        self.dictator_id = dictator_id
        
    def recommend(self, movies, ratings, group, size):
        return sorted(movies, key = lambda movie: ratings[movie][self.dictator_id], reverse=True)[:size]

In [17]:
DictatorshipRecommender(groups[0][1]).recommend(movies, ratings, groups[0], 10)

[235, 1032, 2116, 2596, 3386, 86345, 2851, 190, 1733, 4121]

### Algorytm zachlanny, aproksymujacy metode Proportional Approval Voting. W kazdej iteracji wybieramy ten film, ktory najbardziej zwieksza zadowolenie zgodnie z punktacja PAV
### 1. Każdy użytkownik głosuje, które filmy może zaakcpetować (ocena większa niż treshold)
### 2. Bierzemy film o największe sumie wartości głosów
### 3. Po wybraniu danego filmu zmiejszamy wagę głosu użytkowników którzy na niego zagłosowali przed przejściem do kolejnej rundy głosowania z 1/2 -> 1/3 -> 1/4 itd.

In [18]:
class ProportionalApprovalVotingRecommender(Recommender):
    def __init__(self, threshold):
        self.threshold = threshold
        self.name = 'PAV'

    def count_score(self, ratings, movie_id, group, group_movies_counter):
        score = 0
        for user in group:
            if ratings[movie_id][user] >= self.threshold:
                score += 1 / (group_movies_counter[user] + 1)
        return score
        
    def recommend(self, movies, ratings, group, size):
        from collections import defaultdict
        group_movies_counter = defaultdict(int)
        recommendations = []
        movie_set = set(movies)
        for i in range(size):
            best_movie = sorted(list(movie_set), key=lambda movie: self.count_score(ratings, movie, group, group_movies_counter),reverse=True)[0]
            movie_set.remove(best_movie)
            for user in group:
                if ratings[best_movie][user] >= self.threshold:
                    group_movies_counter[user] += 1
            recommendations.append(best_movie)
        return recommendations

In [19]:
ProportionalApprovalVotingRecommender(6).recommend(movies, ratings, groups[0], 10)

[1, 2, 3, 4, 5, 6, 7, 8, 10, 11]

## Część 3. - funkcje celu

### Funkcje pomocnicze

In [20]:
def top_n_movies_for_user(ratings, movies, user_id, n):
    return sorted(movies, key=lambda movie : ratings[movie][user_id], reverse=True)[:n]

def total_score(recommendation, user_id, ratings):
    return sum(ratings[movie][user_id] for movie in recommendation)

In [21]:
def overall_user_satisfaction(recommendation, user_id, movies, ratings):
    return total_score(recommendation, user_id, ratings)/  \
total_score(top_n_movies_for_user(ratings, movies, user_id, len(recommendation)), user_id, ratings)

def overall_group_satisfaction(recommendation, group, movies, ratings):
    return 1.0 * sum([overall_user_satisfaction(recommendation, user_id, movies, ratings) for user_id in group]) / len(group)

def group_disagreement(recommendation, group, movies, ratings):
    satisfaction_scores = [overall_user_satisfaction(recommendation, user_id, movies, ratings) for user_id in group]
    return max(satisfaction_scores) - min(satisfaction_scores)

## Część 4. - Sequential Hybrid Aggregation

### Algorytm optymalizujący roznice pomiedzy srednie oraz minimalne oceny
### W kazdej iteracji aktualizujacy parametr alfa = max - min (rozstrzal pomiedzy najwiekszym a najmniejszym)

In [22]:
class SequentialHybridAggregationRecommender(Recommender):
    def __init__(self):
        self.name = 'sequential_hybrid_aggregation'
        self.alpha = 0.5
        
    def update_alpha(self, group, movie, ratings):
        self.alpha = (max([ratings[movie][user] for user in group]) - min([ratings[movie][user] for user in group]))
        
    def get_average_score(self, movie_id, ratings, group):
        return sum(ratings[movie_id][user] for user in group) / len(group)
    
    def get_score(self, ratings, movie, group):
        # (1 - alfa) * srednia z ocen dla danego filmu + alfa * najmniejsza ocena dla danego filmu
        score = (1.0 - self.alpha) * self.get_average_score(movie, ratings, group) + self.alpha * min(ratings[movie][user] for user in group)
        self.update_alpha(group, movie, ratings)
        return score       
    
    def recommend(self, movies, ratings, group, size):
        return sorted(movies, key = lambda movie: self.get_score(ratings, movie, group))[:size]

In [23]:
SequentialHybridAggregationRecommender().recommend(movies, ratings, groups[0], 10)

[987, 1687, 2278, 933, 1047, 8951, 136447, 139855, 42730, 25773]

## Część 5. - porównanie algorytmów

In [24]:
from statistics import mean, stdev

recommenders = [
    RandomRecommender(),
    AverageRecommender(),
    AverageWithoutMiseryRecommender(5),
    FairnessRecommender(),
    DictatorshipRecommender(1),
    ProportionalApprovalVotingRecommender(5),
    SequentialHybridAggregationRecommender()
]

recommendation_size = 10

for recommender in recommenders:
    print(recommender.name)
    satisfaction = np.zeros(len(groups))
    disagreement = np.zeros(len(groups))

    for i, group in enumerate(groups):
        recommendation = recommender.recommend(movies, ratings, group, recommendation_size)
        satisfaction[i] = overall_group_satisfaction(recommendation, group, movies, ratings)
        disagreement[i] = group_disagreement(recommendation, group, movies, ratings)
        
    avg_satisfaction = np.sum(satisfaction)/len(groups)
    sdev_satisfaction = np.sqrt(np.sum(pow(satisfaction - avg_satisfaction, 2)) / len(groups))
    avg_disagreement = np.sum(disagreement)/len(groups)
    sdev_disagreement = np.sqrt(np.sum(pow(disagreement - avg_satisfaction, 2)) / len(groups))
    
    print('satisfaction:', avg_satisfaction, '+/-', sdev_satisfaction)
    print('disagreement:', avg_disagreement, '+/-', sdev_disagreement)
    print()

random
satisfaction: 0.6538479869113671 +/- 0.13538246713900468
disagreement: 0.21484848484848484 +/- 0.4562863416131002

average
satisfaction: 0.951358291161108 +/- 0.03417966690594917
disagreement: 0.10954616588419407 +/- 0.8479044368139858

average_without_misery
satisfaction: 0.951358291161108 +/- 0.03417966690594917
disagreement: 0.10954616588419407 +/- 0.8479044368139858

fairness
satisfaction: 0.7219397902566917 +/- 0.10925264877826439
disagreement: 0.20820905229355932 +/- 0.5235767957566942

dictatorship
satisfaction: 0.6786557730219703 +/- 0.11506764895008234
disagreement: 0.3165512265512265 +/- 0.4009742981302035

PAV
satisfaction: 0.7807823303457107 +/- 0.05113539829353232
disagreement: 0.16368615734812916 +/- 0.622855234414809

sequential_hybrid_aggregation
satisfaction: 0.7733785643151841 +/- 0.04147579257897248
disagreement: 0.5836711176147796 +/- 0.35210857178438426

