# Laboratorium 5 - rekomendacje grupowe

## Przygotowanie

 * pobierz i wypakuj dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
   * więcej możesz poczytać tutaj: https://grouplens.org/datasets/movielens/
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab5`
 * zainstaluj potrzebne biblioteki:
 `pip install numpy pandas matplotlib`

## Część 1. - przygotowanie danych

In [42]:
# importujemy wszystkie potrzebne pakiety

import math
import numpy as np
import pandas

from random import choice, sample
from statistics import mean, stdev

from reco_utils import *

In [43]:
# wczytujemy oceny uytkownikow i obliczamy (za pomocą collaborative filtering) wszystkie przewidywane oceny filmow

raw_ratings = pandas.read_csv('ml-latest-small/ratings.csv').drop(columns=['timestamp'])
movies = list(raw_ratings['movieId'].unique())
users = list(raw_ratings['userId'].unique())
ratings = get_predicted_ratings(raw_ratings)
ratings

Total error: 212249.06330650047
Total error: 205682.6341426574
Total error: 199568.4655967327
Total error: 193861.7903331736
Total error: 188523.366701796
Total error: 183518.67613367102
Total error: 178817.25502165395
Total error: 174392.13534482854
Total error: 170219.3738511813
Total error: 166277.65383948802
Total error: 162547.94682568708
Total error: 159013.22388995255
Total error: 155658.2084595337
Total error: 152469.16382216537
Total error: 149433.70988390275
Total error: 146540.66465702569
Total error: 143779.90674345088
Total error: 141142.2557086967
Total error: 138619.36775284397
Total error: 136203.64450260662
Total error: 133888.15309161326
Total error: 131666.55597902436
Total error: 129533.04919130102
Total error: 127482.30786738124
Total error: 125509.43815099234
Total error: 123609.93461106133
Total error: 121779.64248684516
Total error: 120014.72415219188
Total error: 118311.62927631136
Total error: 116667.06822901641
Total error: 115077.98833862557
Total error: 113

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,5,5,5,5,10,6,5,7,3,9,...,0,4,3,4,7,6,6,5,7,5
2,7,7,5,10,0,8,9,1,7,2,...,10,8,10,10,10,10,2,2,0,6
3,8,10,2,1,9,10,0,10,0,9,...,6,2,3,0,1,0,10,2,10,3
4,6,9,10,2,6,6,9,4,7,4,...,10,6,5,10,8,4,3,8,1,8
5,3,7,6,2,4,6,4,4,6,2,...,10,8,10,10,1,10,8,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,5,6,5,6
607,5,4,0,10,1,6,10,7,10,4,...,7,4,3,4,5,9,2,6,5,2
608,7,6,6,6,5,6,7,6,7,6,...,7,6,6,6,6,6,6,6,5,6
609,8,5,10,1,1,10,3,8,3,10,...,10,10,10,10,4,8,10,4,4,10


In [44]:
# wczytujemy nazwy filmow i kategorie

movies_metadata = pandas.read_csv('ml-latest-small/movies.csv').set_index('movieId')
movies_metadata

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [45]:
# wczytujemy przykladowe grupy uzytkownikow
groups = pandas.read_csv('groups.csv').values.tolist()
groups

[[111, 307, 474, 599, 414],
 [469, 182, 232, 448, 600],
 [508, 581, 497, 402, 566],
 [300, 515, 245, 568, 507],
 [2, 371, 252, 518, 37],
 [269, 360, 469, 287, 308],
 [243, 527, 418, 118, 370],
 [186, 559, 327, 553, 314]]

In [46]:
# przygotowujemy funkcje pomocnicza

def describe_group(group, N=10):
    print(f'\n\nUser ids: {group}')
    group_size = len(group)
    
    mean_stdev = ratings.loc[group].std(axis=0).mean()
    median_stdev = ratings.loc[group].std(axis=0).median()
    std_stdev = ratings.loc[group].std(axis=0).std()
    print(f'\nMean ratings deviation: {mean_stdev}')
    print(f'Median ratings deviation: {median_stdev}')
    print(f'Standard deviation of ratings deviation: {std_stdev}')
    
    average_scores = ratings.iloc[group].mean(axis=0)
    average_scores = average_scores.sort_values()
    best_movies = [(movies_metadata['title'][movie_id], average_scores[movie_id]) for movie_id in list(average_scores[-N:].index)]
    worst_movies = [(movies_metadata['title'][movie_id], average_scores[movie_id]) for movie_id in list(average_scores[:N].index)]
    
    print('\nBest movies:')
    for movie, score in best_movies[::-1]:
        print(f'{movie}, {score}*')
    print('\nWorst movies:')
    for movie, score in worst_movies:
        print(f'{movie}, {score}*')

describe_group(groups[1])



User ids: [469, 182, 232, 448, 600]

Mean ratings deviation: 0.6397204115358627
Median ratings deviation: 0.5477225575051662
Standard deviation of ratings deviation: 0.3779411793404649

Best movies:
Evil Aliens (2005), 9.6*
Fountain, The (2006), 9.6*
Cure, The (1995), 9.6*
Soul Surfer (2011), 9.4*
Teenage Mutant Ninja Turtles (2014), 9.4*
Made of Honor (2008), 9.2*
Adaptation (2002), 9.2*
Remains of the Day, The (1993), 9.2*
Four Brothers (2005), 9.2*
Tender Mercies (1983), 9.2*

Worst movies:
Counselor, The (2013), 1.2*
World War Z (2013), 1.8*
Class of Nuke 'Em High (1986), 1.8*
Revenge for Jolly! (2012), 2.0*
Bless the Child (2000), 2.0*
Big Hero 6 (2014), 2.0*
Son's Room, The (Stanza del figlio, La) (2001), 2.0*
Airport (1970), 2.0*
Magnificent Seven, The (1960), 2.0*
Gamer (2009), 2.2*


## Część 2. - algorytmy proste

In [47]:
# zdefiniujmy interfejs dla wszystkich algorytmow rekomendacyjnych

class Recommender:
    def recommend(self, movies, ratings, group, size):
        pass

# jako pierwszy zaimplementujemy algorytm losowy - dla porownania
    
class RandomRecommender(Recommender):
    def __init__(self):
        self.name = 'random'
        
    def recommend(self, movies, ratings, group, size):
        return sample(movies, size)

In [48]:
# algorytm rekomendujacy filmy o najwyzszej sredniej ocen

class AverageRecommender(Recommender):
    def __init__(self):
        self.name = 'average'
    
    def recommend(self, movies, ratings, group, size):
        average_scores = ratings.iloc[group].mean(axis=0)
        average_scores = average_scores.sort_values()
        return list(average_scores[-size:].index)
    
AverageRecommender().recommend(movies, ratings, groups[0], len(groups[0]))

[7060, 1623, 41769, 26564, 122898]

In [49]:
# algorytm rekomendujacy filmy o najwyzszej sredniej ocen,
#   ale rownoczesnie wykluczajacy te filmy, ktore otrzymaly choc jedna ocene ponizej thresholdu

class AverageWithoutMiseryRecommender(Recommender):
    def __init__(self, score_threshold):
        self.name = 'average_without_misery'
        self.score_threshold = score_threshold
        
    def recommend(self, movies, ratings, group, size):
        average_scores = ratings.iloc[group].mean(axis=0)
        average_scores = average_scores.sort_values()
        average_scores = average_scores[average_scores >= self.score_threshold]
        return list(average_scores[-size:].index)
    
AverageWithoutMiseryRecommender(8).recommend(movies, ratings, groups[0], len(groups[0]))

[7060, 1623, 41769, 26564, 122898]

In [50]:
# algorytm uwzgledniajacy preferencje tylko jednego uzytkownika w kazdej iteracji

class FairnessRecommender(Recommender):
    def __init__(self):
        self.name = 'fairness'
        self.user_index = 0
        
    def recommend(self, movies, ratings, group, size):
        self.user_index = (self.user_index + 1) % len(group)
        user_id = group[self.user_index]
        user_ratings = ratings.loc[user_id]
        user_ratings = user_ratings.sort_values()
        return list(user_ratings[-size:].index)
    
FairnessRecommender().recommend(movies, ratings, groups[0], len(groups[0]))

[84950, 97785, 2879, 32456, 26350]

In [51]:
# wybrany algorytm wyborczy (dyktatura, glosowanie proste, Borda, Copeland)

class VotingRecommender(Recommender):
    def __init__(self):
        self.name = 'borda'
    
    def recommend(self, movies, ratings, group, size):
        user_points = ratings.loc[group].copy()
        
        for user_id in group:
            sorted_user_ratings = user_points.loc[user_id].sort_values(ascending=False)
            sorted_user_ratings = sorted_user_ratings.rank(method='dense')
            user_points.loc[user_id] = sorted_user_ratings
            
        user_points = user_points.sum(axis=0)
        return list(user_points.sort_values()[-size:].index)

VotingRecommender().recommend(movies, ratings, groups[0], len(groups[0]))

[95873, 946, 72142, 5628, 121035]

In [52]:
# algorytm zachlanny, aproksymujacy metode Proportional Approval Voting
#   w kazdej iteracji wybieramy ten film, ktory najbardziej zwieksza zadowolenie zgodnie z punktacja PAV

import pandas as pd

class PAVRecommender(Recommender):
    def __init__(self, threshold):
        self.threshold = threshold
        self.name = 'PAV'

    @staticmethod
    def _calculate_satisfaction(user_satisfactions):
        return {user: 1/(satisfaction + 1) for user, satisfaction in user_satisfactions.items()}

    def recommend(self, movies, ratings, group, size):
        user_preferences = ratings.loc[group] >= self.threshold
        user_satisfactions = pd.Series(0, index=group)
        recommendations = []

        for _ in range(size):
            user_satisfaction_scores = pd.Series(self._calculate_satisfaction(user_satisfactions))
            user_satisfaction = user_preferences.mul(user_satisfaction_scores, axis=0)

            # Find the movie with the highest satisfaction sum that is not yet recommended
            recommended_movie = user_satisfaction.loc[:, ~user_satisfaction.columns.isin(recommendations)].sum().idxmax()
            recommendations.append(recommended_movie)

            # Update satisfaction for users satisfied by the recommended movie
            user_satisfactions[user_preferences[recommended_movie]] += 1

        return recommendations
        
PAVRecommender(7).recommend(movies, ratings, groups[0], len(groups[0]))

[946, 1734, 25996, 5379, 6978]

## Część 3. - funkcje celu

In [53]:
# dwie funkcje pomocnicze:
#  - znajdujaca ulubione filmy danego uzytkownika
#  - obliczajaca sume ocen wystawionych przez uzytkownika wszystkim filmom w rekomendacji

def top_n_movies_for_user(ratings, movies, user_id, n):
    user_ratings = ratings.loc[user_id]
    user_ratings = user_ratings.sort_values(ascending=False)
    return list(user_ratings[:n].index)

def total_score(recommendation, user_id, ratings):
    user_ratings = ratings.loc[user_id]
    return user_ratings[recommendation].sum()

In [54]:
# funkcja obliczajaca zadowolenie pojedynczego uzytkownika
#  - iloraz zadowolenia z wygenerowanej rekomendacji oraz zadowolenia z hipotetycznej rekomendacji idealnej
def overall_user_satisfaction(recommendation, user_id, movies, ratings):
    n = len(recommendation)
    top_n = top_n_movies_for_user(ratings, movies, user_id, n)
    return total_score(recommendation, user_id, ratings) / total_score(top_n, user_id, ratings)

# funkcja celu - srednia z zadowolenia wszystkich uzytkownikow w grupie
def overall_group_satisfaction(recommendation, group, movies, ratings):
    return mean([overall_user_satisfaction(recommendation, user_id, movies, ratings) for user_id in group])

# funkcja celu - roznica miedzy maksymalnym i minimalnym zadowolenie w grupie
def group_disagreement(recommendation, group, movies, ratings):
    if len(group) <= 1:
        return 0
    
    min_satisfaction = float('inf')
    max_satisfaction = -float('inf')
    
    for user_id in group:
        satisfaction = overall_user_satisfaction(recommendation, user_id, movies, ratings)
        min_satisfaction = min(min_satisfaction, satisfaction)
        max_satisfaction = max(max_satisfaction, satisfaction)
        
    return max_satisfaction - min_satisfaction

## Część 4. - Sequential Hybrid Aggregation

In [67]:
# algorytm balansujacy pomiedzy wyborem elementow o najwyzszej sredniej ocen
#   i o najwyzszej minimalnej ocenie
#   wyliczajacy w kazdej iteracji parametr alfa - jak na wykladzie
class SequentialHybridAggregationRecommender(Recommender):
    def __init__(self):
        self.name = 'sequential_hybrid_aggregation'
    
    def recommend(self, movies, ratings, group, size):
        all_scores = ratings.loc[group].copy()  # copy of original scores 
        recommendation = []

        for _ in range(size):
            all_scores.loc[:, recommendation] = np.nan
            avg_scores = all_scores.mean(axis=0)
            least_scores = all_scores.min(axis=0)
            
            if recommendation: 
                alpha = group_disagreement(recommendation, group, movies, ratings)
            else:
                alpha = 0
            
            score_for_recommendation = (1 - alpha) * avg_scores + alpha * least_scores
            
            recommended_movie = score_for_recommendation.idxmax()
            recommendation.append(recommended_movie)

        return recommendation

## Część 5. - porównanie algorytmów

In [68]:
recommenders = [
    RandomRecommender(),
    AverageRecommender(),
    AverageWithoutMiseryRecommender(5),
    FairnessRecommender(),
    VotingRecommender(),
    PAVRecommender(5),
    SequentialHybridAggregationRecommender()
]

recommendation_size = 10

# dla kazdego algorytmu:
#  - wygenerujmy jedna rekomendacje dla kazdej grupy
#  - obliczmy wartosci obu funkcji celu dla kazdej rekomendacji
#  - obliczmy srednia i odchylenie standardowe dla obu funkcji celu
#  - wypiszmy wyniki na konsole

for recommender in recommenders:
    recommendations = [recommender.recommend(movies, ratings, group, recommendation_size) for group in groups]
    satisfaction = [overall_group_satisfaction(recommendation, group, movies, ratings) for recommendation, group in zip(recommendations, groups)]
    disagreement = [group_disagreement(recommendation, group, movies, ratings) for recommendation, group in zip(recommendations, groups)]
    print(f'\n{recommender.name}')
    print(f'Satisfaction: {mean(satisfaction)} +- {stdev(satisfaction)}')
    print(f'Disagreement: {mean(disagreement)} +- {stdev(disagreement)}')



random
Satisfaction: 0.6296519398323196 +- 0.1139871646364673
Disagreement: 0.27343128390596744 +- 0.1369500034698051

average
Satisfaction: 0.6436011014302153 +- 0.14389457254057428
Disagreement: 0.44468128390596745 +- 0.1724645053713391

average_without_misery
Satisfaction: 0.6436011014302153 +- 0.14389457254057428
Disagreement: 0.44468128390596745 +- 0.1724645053713391

fairness
Satisfaction: 0.6910519891500905 +- 0.10519521537615634
Disagreement: 0.5581487341772152 +- 0.17572974620646248

borda
Satisfaction: 0.9542018329771494 +- 0.04850826875300344
Disagreement: 0.0842857142857143 +- 0.07009364831086834

PAV
Satisfaction: 0.7789519562715765 +- 0.04889989666190108
Disagreement: 0.1868490054249548 +- 0.08643764314951664

sequential_hybrid_aggregation
Satisfaction: 0.953617211902022 +- 0.04963865874995003
Disagreement: 0.0677435064935065 +- 0.0514628049655131
