# Laboratorium 5 - rekomendacje grupowe

## Przygotowanie

 * pobierz i wypakuj dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * więcej możesz poczytać tutaj: https://grouplens.org/datasets/movielens/
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab5`
 * zainstaluj potrzebne biblioteki:
 `pip install numpy pandas matplotlib`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety

import math
import numpy as np
import pandas

from random import choice, sample
from statistics import mean, stdev

from reco_utils import *

In [2]:
# wczytujemy oceny uytkownikow i obliczamy (za pomocą collaborative filtering) wszystkie przewidywane oceny filmow

raw_ratings = pandas.read_csv('ml-latest-small/ratings.csv').drop(columns=['timestamp'])
movies = list(raw_ratings['movieId'].unique())
users = list(raw_ratings['userId'].unique())
ratings = get_predicted_ratings(raw_ratings)
ratings

Total error: 211076.65696503568
Total error: 204631.37191814365
Total error: 198618.4721780347
Total error: 192996.2967085731
Total error: 187728.2175060557
Total error: 182781.92286930652
Total error: 178128.81798804083
Total error: 173743.52103017282
Total error: 169603.43747069302
Total error: 165688.39890918252
Total error: 161980.3553376033
Total error: 158463.11193945678
Total error: 155122.10316863962
Total error: 151944.19817720252
Total error: 148917.53271480682
Total error: 146031.36346838312
Total error: 143275.9414935377
Total error: 140642.40194400738
Total error: 138122.6677585166
Total error: 135709.36533621725
Total error: 133395.75053858856
Total error: 131175.64360971327
Total error: 129043.37181822225
Total error: 126993.7188007521
Total error: 125021.87973477347
Total error: 123123.42159317997
Total error: 121294.24783815946
Total error: 119530.56700086799
Total error: 117828.86466901859
Total error: 116185.87846887397
Total error: 114598.57568307106
Total error: 11

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,6,3,6,3,8,8,7,4,9,6,...,5,5,3,9,7,10,5,6,5,4
2,10,6,9,10,7,3,4,10,3,6,...,5,10,1,5,5,8,3,0,1,5
3,8,5,3,10,10,4,7,10,1,7,...,2,6,9,0,4,6,0,3,0,10
4,7,10,4,4,9,8,6,6,6,7,...,9,7,5,6,9,6,4,4,8,5
5,9,10,5,10,1,8,6,10,10,5,...,3,6,5,7,10,5,0,1,10,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,6,6,5,6,6,6,6,7,6,5,...,6,6,6,6,6,6,6,7,6,7
607,7,3,6,10,6,9,3,8,6,1,...,6,10,6,7,10,6,0,9,7,5
608,6,5,6,6,6,6,6,5,6,6,...,6,6,5,6,6,6,6,6,6,5
609,7,10,6,10,0,2,6,5,9,7,...,0,0,2,1,9,0,8,0,10,0


In [3]:
# wczytujemy nazwy filmow i kategorie

movies_metadata = pandas.read_csv('ml-latest-small/movies.csv').set_index('movieId')
movies_metadata

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
# wczytujemy przykladowe grupy uzytkownikow
groups = pandas.read_csv('groups.csv').values.tolist()
groups

[[111, 307, 474, 599, 414],
 [469, 182, 232, 448, 600],
 [508, 581, 497, 402, 566],
 [300, 515, 245, 568, 507],
 [2, 371, 252, 518, 37],
 [269, 360, 469, 287, 308],
 [243, 527, 418, 118, 370],
 [186, 559, 327, 553, 314]]

In [5]:
# przygotowujemy funkcje pomocnicza

def describe_group(group, N=10):
    print(f'\n\nUser ids: {group}')
    group_size = len(group)
    
    mean_stdev = ratings.loc[group].std(axis=0).mean()
    median_stdev = ratings.loc[group].std(axis=0).median()
    std_stdev = ratings.loc[group].std(axis=0).std()
    print(f'\nMean ratings deviation: {mean_stdev}')
    print(f'Median ratings deviation: {median_stdev}')
    print(f'Standard deviation of ratings deviation: {std_stdev}')
    
    average_scores = ratings.iloc[group].mean(axis=0)
    average_scores = average_scores.sort_values()
    best_movies = [(movies_metadata['title'][movie_id], average_scores[movie_id]) for movie_id in list(average_scores[-N:].index)]
    worst_movies = [(movies_metadata['title'][movie_id], average_scores[movie_id]) for movie_id in list(average_scores[:N].index)]
    
    print('\nBest movies:')
    for movie, score in best_movies[::-1]:
        print(f'{movie}, {score}*')
    print('\nWorst movies:')
    for movie, score in worst_movies:
        print(f'{movie}, {score}*')

describe_group(groups[1])



User ids: [469, 182, 232, 448, 600]

Mean ratings deviation: 0.670626799584098
Median ratings deviation: 0.5477225575051662
Standard deviation of ratings deviation: 0.3628491368618455

Best movies:
Antichrist (2009), 9.6*
Alien: Covenant (2017), 9.6*
Beverly Hillbillies, The (1993), 9.6*
Riddick (2013), 9.6*
Godzilla vs. Mothra (Mosura tai Gojira) (1964), 9.4*
Traitor (2008), 9.4*
Road House (1989), 9.4*
New Police Story (Xin jing cha gu shi) (2004), 9.4*
Escape Plan (2013), 9.4*
Dungeons & Dragons (2000), 9.4*

Worst movies:
Whatever (1998), 1.6*
Hitchcock/Truffaut (2015), 1.6*
Die Frauen von Ravensbrück (2005), 1.6*
Clockwise (1986), 1.8*
Ip Man 2 (2010), 1.8*
Keeping the Faith (2000), 1.8*
Alice Through the Looking Glass (2016), 2.0*
Yes Men, The (2003), 2.0*
Spaceballs (1987), 2.0*
Leprechaun 4: In Space (1997), 2.0*


## Część 2. - algorytmy proste

In [6]:
# zdefiniujmy interfejs dla wszystkich algorytmow rekomendacyjnych

class Recommender:
    def recommend(self, movies, ratings, group, size):
        pass

# jako pierwszy zaimplementujemy algorytm losowy - dla porownania
    
class RandomRecommender(Recommender):
    def __init__(self):
        self.name = 'random'
        
    def recommend(self, movies, ratings, group, size):
        return sample(movies, size)

In [7]:
# algorytm rekomendujacy filmy o najwyzszej sredniej ocen

class AverageRecommender(Recommender):
    def __init__(self):
        self.name = 'average'
    
    def recommend(self, movies, ratings, group, size):
        average_scores = ratings.iloc[group].mean(axis=0)
        average_scores = average_scores.sort_values()
        return list(average_scores[-size:].index)
    
AverageRecommender().recommend(movies, ratings, groups[0], len(groups[0]))

[6754, 7017, 4828, 5767, 41712]

In [8]:
# algorytm rekomendujacy filmy o najwyzszej sredniej ocen,
#   ale rownoczesnie wykluczajacy te filmy, ktore otrzymaly choc jedna ocene ponizej thresholdu

class AverageWithoutMiseryRecommender(Recommender):
    def __init__(self, score_threshold):
        self.name = 'average_without_misery'
        self.score_threshold = score_threshold
        
    def recommend(self, movies, ratings, group, size):
        average_scores = ratings.iloc[group].mean(axis=0)
        average_scores = average_scores.sort_values()
        average_scores = average_scores[average_scores >= self.score_threshold]
        return list(average_scores[-size:].index)
    
AverageWithoutMiseryRecommender(8).recommend(movies, ratings, groups[0], len(groups[0]))

[6754, 7017, 4828, 5767, 41712]

In [9]:
# algorytm uwzgledniajacy preferencje tylko jednego uzytkownika w kazdej iteracji

class FairnessRecommender(Recommender):
    def __init__(self):
        self.name = 'fairness'
        self.user_index = 0
        
    def recommend(self, movies, ratings, group, size):
        self.user_index = (self.user_index + 1) % len(group)
        user_id = group[self.user_index]
        user_ratings = ratings.loc[user_id]
        user_ratings = user_ratings.sort_values()
        return list(user_ratings[-size:].index)
    
FairnessRecommender().recommend(movies, ratings, groups[0], len(groups[0]))

[85788, 27830, 2865, 55232, 85025]

In [10]:
# wybrany algorytm wyborczy (dyktatura, glosowanie proste, Borda, Copeland)

class VotingRecommender(Recommender):
    def __init__(self):
        self.name = 'borda'
    
    def recommend(self, movies, ratings, group, size):
        user_points = ratings.loc[group].copy()
        
        for user_id in group:
            sorted_user_ratings = user_points.loc[user_id].sort_values(ascending=False)
            sorted_user_ratings = sorted_user_ratings.rank(method='dense')
            user_points.loc[user_id] = sorted_user_ratings
            
        user_points = user_points.sum(axis=0)
        return list(user_points.sort_values()[-size:].index)

VotingRecommender().recommend(movies, ratings, groups[0], len(groups[0]))

[3089, 171765, 927, 2010, 46664]

In [29]:
# algorytm zachlanny, aproksymujacy metode Proportional Approval Voting
#   w kazdej iteracji wybieramy ten film, ktory najbardziej zwieksza zadowolenie zgodnie z punktacja PAV

import pandas as pd

class PAVRecommender(Recommender):
    def __init__(self, threshold):
        self.threshold = threshold
        self.name = 'PAV'

    @staticmethod
    def _calculate_satisfaction(user_satisfactions):
        return {user: 1/(satisfaction + 1) for user, satisfaction in user_satisfactions.items()}

    def recommend(self, movies, ratings, group, size):
        user_preferences = ratings.loc[group] >= self.threshold
        user_satisfactions = pd.Series(0, index=group)
        recommendations = []

        for _ in range(size):
            user_satisfaction_scores = pd.Series(self._calculate_satisfaction(user_satisfactions))
            user_satisfaction = user_preferences.mul(user_satisfaction_scores, axis=0)

            # Find the movie with the highest satisfaction sum that is not yet recommended
            recommended_movie = user_satisfaction.loc[:, ~user_satisfaction.columns.isin(recommendations)].sum().idxmax()
            recommendations.append(recommended_movie)

            # Update satisfaction for users satisfied by the recommended movie
            user_satisfactions[user_preferences[recommended_movie]] += 1

        return recommendations
        
PAVRecommender(7).recommend(movies, ratings, groups[0], len(groups[0]))

[2010, 46664, 342, 7034, 445]

## Część 3. - funkcje celu

In [30]:
# dwie funkcje pomocnicze:
#  - znajdujaca ulubione filmy danego uzytkownika
#  - obliczajaca sume ocen wystawionych przez uzytkownika wszystkim filmom w rekomendacji

def top_n_movies_for_user(ratings, movies, user_id, n):
    user_ratings = ratings.loc[user_id]
    user_ratings = user_ratings.sort_values(ascending=False)
    return list(user_ratings[:n].index)

def total_score(recommendation, user_id, ratings):
    user_ratings = ratings.loc[user_id]
    return user_ratings[recommendation].sum()

In [37]:
# funkcja obliczajaca zadowolenie pojedynczego uzytkownika
#  - iloraz zadowolenia z wygenerowanej rekomendacji oraz zadowolenia z hipotetycznej rekomendacji idealnej
def overall_user_satisfaction(recommendation, user_id, movies, ratings):
    n = len(recommendation)
    top_n = top_n_movies_for_user(ratings, movies, user_id, n)
    return total_score(recommendation, user_id, ratings) / total_score(top_n, user_id, ratings)

# funkcja celu - srednia z zadowolenia wszystkich uzytkownikow w grupie
def overall_group_satisfaction(recommendation, group, movies, ratings):
    return mean([overall_user_satisfaction(recommendation, user_id, movies, ratings) for user_id in group])

# funkcja celu - roznica miedzy maksymalnym i minimalnym zadowolenie w grupie
def group_disagreement(recommendation, group, movies, ratings):
    if len(group) <= 1:
        return 0
    
    min_satisfaction = float('inf')
    max_satisfaction = -float('inf')
    
    for user_id in group:
        satisfaction = overall_user_satisfaction(recommendation, user_id, movies, ratings)
        min_satisfaction = min(min_satisfaction, satisfaction)
        max_satisfaction = max(max_satisfaction, satisfaction)
        
    return max_satisfaction - min_satisfaction

## Część 4. - Sequential Hybrid Aggregation

In [38]:
# algorytm balansujacy pomiedzy wyborem elementow o najwyzszej sredniej ocen
#   i o najwyzszej minimalnej ocenie
#   wyliczajacy w kazdej iteracji parametr alfa - jak na wykladzie
class SequentialHybridAggregationRecommender(Recommender):
    def __init__(self):
        self.name = 'sequential_hybrid_aggregation'
    
    def recommend(self, movies, ratings, group, size):
        avg_scores = ratings.loc[group].mean(axis=0)
        least_scores = ratings.loc[group].min()
        alpha = 0.5
        
        recommendation = []
        
        for _ in range(size):
            movie_scores = (1 - alpha) * avg_scores + alpha * least_scores
            recommended_movie = movie_scores.sort_values(ascending=False).index[0]
            recommendation.append(recommended_movie)
            
            avg_scores[recommended_movie] = 0
            least_scores[recommended_movie] = 0
            
            alpha = group_disagreement(recommendation, group, movies, ratings)
            
        return recommendation
    
SequentialHybridAggregationRecommender().recommend(movies, ratings, groups[0], len(groups[0]))

[2010, 46664, 3565, 4866, 535]

## Część 5. - porównanie algorytmów

In [42]:
recommenders = [
    RandomRecommender(),
    AverageRecommender(),
    AverageWithoutMiseryRecommender(5),
    FairnessRecommender(),
    VotingRecommender(),
    PAVRecommender(5),
    SequentialHybridAggregationRecommender()
]

recommendation_size = 10

# dla kazdego algorytmu:
#  - wygenerujmy jedna rekomendacje dla kazdej grupy
#  - obliczmy wartosci obu funkcji celu dla kazdej rekomendacji
#  - obliczmy srednia i odchylenie standardowe dla obu funkcji celu
#  - wypiszmy wyniki na konsole

for recommender in recommenders:
    recommendations = [recommender.recommend(movies, ratings, group, recommendation_size) for group in groups]
    satisfaction = [overall_group_satisfaction(recommendation, group, movies, ratings) for recommendation, group in zip(recommendations, groups)]
    disagreement = [group_disagreement(recommendation, group, movies, ratings) for recommendation, group in zip(recommendations, groups)]
    print(f'\n{recommender.name}')
    print(f'Satisfaction: {mean(satisfaction)} +- {stdev(satisfaction)}')
    print(f'Disagreement: {mean(disagreement)} +- {stdev(disagreement)}')



random
Satisfaction: 0.6300058108507677 +- 0.11489259165522603
Disagreement: 0.30645385304659495 +- 0.06685907055815507

average
Satisfaction: 0.6339360869518185 +- 0.13463406567871836
Disagreement: 0.46949767151294863 +- 0.23820428217387496

average_without_misery
Satisfaction: 0.6339360869518185 +- 0.13463406567871836
Disagreement: 0.46949767151294863 +- 0.23820428217387496

fairness
Satisfaction: 0.7008493582462517 +- 0.10121269337408421
Disagreement: 0.549375 +- 0.17558956810861923

borda
Satisfaction: 0.9443674389527127 +- 0.02677782493547146
Disagreement: 0.10660288117594457 +- 0.046570107332709935

PAV
Satisfaction: 0.7800950267014272 +- 0.0480315617986343
Disagreement: 0.19507488479262672 +- 0.025991442167355154

sequential_hybrid_aggregation
Satisfaction: 0.94404111992745 +- 0.02730118052301141
Disagreement: 0.09390907947686117 +- 0.04637645449562613
