In [2]:
import pandas as pd
import random
import numpy as np
from sklearn.model_selection import train_test_split
from lenskit.algorithms import Recommender
from lenskit.algorithms.user_knn import UserUser
import warnings

In [3]:
# Import the ratings and movies csv files, drop the timestamp column (unused)
ratings = pd.read_csv("preprocessed_dataset/ratings.csv")
ratings = ratings.drop('timestamp', axis=1)
movies = pd.read_csv("preprocessed_dataset/movies.csv", index_col="item")

In [4]:
# Method for generating random groups based on the ratings file
def getGroups(ratings):
    user_ids = ratings['user'].unique() # Sort by unique user IDs
    random.shuffle(user_ids)
    group_size = 4 # Group size can be determined here
    random_groups = [user_ids[i:i + group_size] for i in range(0, len(user_ids), group_size)]

    return random_groups

In [5]:
# Method for training the UserUser recommender system using the train_data because of the Hold-Out validation strategy
def trainModel(train_data):
    user_user = UserUser(15, min_nbrs=3)
    recsys = Recommender.adapt(user_user)
    recsys.fit(train_data)
    
    return recsys

In [6]:
# Method for returning the ratings of a specific user
def getUserRatings(ratings,user):
    user_ratings = ratings[ratings['user'] == user]
    user_ratings_series = user_ratings.set_index('item')['rating']
    
    return user_ratings_series

In [7]:
# Method for returning all items in the user_item_matrix for which the user does not have a rating yet (NaN)
def getNaNList(user_item_matrix,user_id):
    cols = user_item_matrix.loc[user_id]
    nan_columns = cols[cols.isna()].index.tolist()
    
    return nan_columns

In [8]:
# Method for returning the recommendations for a specific user using the passed trained recommender model
def getRecommendation(recsys,user_item_matrix,user_id):
    user_ratings = getUserRatings(ratings,user_id) # Get all existing ratings
    items = getNaNList(user_item_matrix,user_id) # Get all unrated items
    predicted_scores = recsys.predict_for_user(user_id, items, user_ratings)
    
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore",category=FutureWarning)
        # Add predicted ratings to the matrix, clipped to a 0-5 interval
        user_item_matrix.loc[user_id, items] = np.clip(predicted_scores,0,5)
        
    return user_item_matrix

In [9]:
# Method for returning the top x recommendations for a given group using the additive aggregation strategy
def getAdditiveOrder(user_item_matrix,group):
    group_ratings = {}
    group_uim = user_item_matrix.loc[group]
    for item in group_uim: # For each item
        total_rating = group_uim[item].sum() # Calculate the added scores of all group members
        if item not in group_ratings: # Add item to array if not already present
            group_ratings[item] = []
        group_ratings[item].append(total_rating) # Add cumulative additive score to ratings array
    result = pd.DataFrame(group_ratings) # Transform into DataFrame
    ordered_items = result.max().sort_values(ascending=False) # Order items by size
    
    return ordered_items.head(5) # Return highest 5 predicted scores

In [10]:
# Method for returning the top x recommendations for a given group using the least misery aggregation strategy
def getLeastMiseryOrder(user_item_matrix,group):
    group_ratings = {}
    group_uim = user_item_matrix.loc[group]
    for item in group_uim: # For each item
        min_rating = group_uim[item].min() # Calculate the lowest score from all group members
        if item not in group_ratings: # Add item to array if not already present
            group_ratings[item] = []
        group_ratings[item].append(min_rating)  # Add lowest score to ratings array
    result = pd.DataFrame(group_ratings) # Transform into DataFrame
    ordered_items = result.max().sort_values(ascending=False) # Order items by size
    
    return ordered_items.head(5) # Return highest 5 predicted scores

In [11]:
# Method for returning the hits on the passed top 5 items (order) for a given group
def getHits(group,order,ratings):
    threshold = 3 # Minimum threshold for a hit can be determined here
    hits = pd.Series(index=order.index, dtype=int) # Instantiate Series
    for item in order.index: # For each passed item to be recommended
        item_ratings = ratings.loc[ratings['item'] == item] # Find ratings for said item
        relevance = sum( # Calculate relevance by calculating how many group members have rated the item above the threshold
            item_ratings.loc[item_ratings['user'] == user, 'rating'].values[0] > threshold
            for user in group if any(item_ratings['user'] == user)
        )
        hits[item] = relevance # Append the item's relevance

    return hits # Return hits Series

In [12]:
# Method for calculating the DCG
def GetDCG(hits):
    dcg = hits.iloc[0] # Start value is the relevance of the first item in the hits Series (rel_1)
    for i, hit_value in enumerate(hits, 1): # For every other item in the hits Series
        if i != 1: # Skip the first to avoid dividing by 0
            value = (hit_value)/np.log2(i) # Calculate relevance / log2(rank)
            dcg += value # Add to DCG

    return dcg

In [13]:
# Method for calclating the IDCG
def GetIDCG(hits):
    hits = hits.sort_values(ascending=False) # Rank the hits Series based on relevance
    ndcg = hits.iloc[0] # Start value is the relevance of the first item in the hits Series (rel_1)
    for i, hit_value in enumerate(hits, 1): # For every other item in the hits Series
        if i != 1: # Skip the first to avoid dividing by 0
            value = (hit_value)/np.log2(i) # Calculate relevance / log2(rank)
            ndcg += value # Add to DCG

    return ndcg

In [14]:
# Method for calculating the nDCG
def GetnDCG(group,order,ratings):
    hits = getHits(group,order,ratings) # Get hits Series
    dcg = GetDCG(hits) # Calculate DCG
    if (dcg == 0): # If DCG is 0, IDCG is also 0 so return 0 as nDCG value
        ndcg = 0
    else: # Else, calculate IDCG and nDCG
        idcg = GetIDCG(hits)
        ndcg = dcg/idcg 

    return ndcg

In [15]:
# Hold-out Validation using 20% test data, stratified on user so there are no missing users/items in any set
train_data, test_data = train_test_split(ratings, test_size=0.2, stratify=ratings['user'])
recsys = trainModel(train_data)

In [16]:
# Pivot the test data to form a user_item_matrix
user_item_matrix = test_data.pivot(index='user', columns='item', values='rating')
user_item_matrix = user_item_matrix.reset_index()
user_item_matrix = user_item_matrix.set_index('user')

In [17]:
# Display initial user_item_matrix with only the existing ratings
print(user_item_matrix)

item  1       2       3       4       5       6       7       8       9       \
user                                                                           
1        4.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
2        NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
3        NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
4        NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
5        NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
...      ...     ...     ...     ...     ...     ...     ...     ...     ...   
606      2.5     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
607      NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
608      NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
609      NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
610      5.0     NaN     NaN     NaN    

In [18]:
# Generate a recommendation for every item for every user
for i in range(1,610):
    user_item_matrix = getRecommendation(recsys,user_item_matrix,i)

In [19]:
# Generate the random groups
random_groups = getGroups(ratings)

In [20]:
# Display groups and their members
for i, group in enumerate(random_groups, 1):
    print(f"Group {i}: {group}")

Group 1: [ 77 217 440 147]
Group 2: [313 480 160 192]
Group 3: [280 277 418 582]
Group 4: [478  95 155 245]
Group 5: [284  44 482 298]
Group 6: [345 292  51  57]
Group 7: [397 393 257 424]
Group 8: [358 419 604 142]
Group 9: [ 29  64 514 372]
Group 10: [ 80 378 224 271]
Group 11: [226 598 454 567]
Group 12: [ 37 135 118  75]
Group 13: [ 72 534 608 337]
Group 14: [ 69 201 475 568]
Group 15: [ 12 371 110 377]
Group 16: [253 265 606 243]
Group 17: [578 434  93  38]
Group 18: [400 603 490 402]
Group 19: [141 595 395 322]
Group 20: [202  81 254 252]
Group 21: [396 457 183 501]
Group 22: [ 10 465   6 494]
Group 23: [290 343 375 443]
Group 24: [599 517  19 374]
Group 25: [596 126 524 175]
Group 26: [301 177 489 198]
Group 27: [261 529 244 460]
Group 28: [481 296 416 591]
Group 29: [452 111 544  96]
Group 30: [587 528 468 409]
Group 31: [122 600 573  89]
Group 32: [139 522  97 540]
Group 33: [136 467 557   7]
Group 34: [519 228 300 116]
Group 35: [ 45 390  83 229]
Group 36: [194 450 459 321]
G

In [21]:
# Display updated user_item_matrix with the predicted ratings added
print(user_item_matrix)

item    1         2         3         4         5         6         7       \
user                                                                         
1     4.000000  4.101612  4.185755       NaN  3.921791  4.407018  4.102773   
2     4.525722  3.925315  2.965833       NaN  3.703662  4.713489  2.270573   
3     2.699072  2.252441  1.622512  1.442291  1.808286  2.545599  2.140871   
4     4.106570  3.369898  3.606832  2.601330  3.195004  3.920057  3.282139   
5     4.219506  3.460374  3.368864  2.182583  2.779606  3.943327  3.307207   
...        ...       ...       ...       ...       ...       ...       ...   
606   2.500000  3.297478  3.398741  2.665930  2.738567  3.884096  2.845800   
607   4.117969  3.569588  3.670373  3.068866  3.212065  4.139601  3.505072   
608   3.466747  2.885521  2.819855  1.959387  2.204608  3.784093  2.515480   
609   3.418139  3.627447  3.139198       NaN  2.511035  3.471011  2.816998   
610   5.000000       NaN       NaN       NaN       NaN       NaN

In [22]:
# Generate results DataFrame
results = pd.DataFrame(columns=['GroupID', 'Additive nDCG','Least Misery nDCG'])

In [23]:
# Loop to generate nDCG values for each group, for both aggregation strategies
for i, group in enumerate(random_groups, 1):
    order = getAdditiveOrder(user_item_matrix,group) # Generate Additive recommendations
    order2 = getLeastMiseryOrder(user_item_matrix,group) # Generate Least Misery recommendations
    # Display groups and their additive and least misery recommendations
    print(f"Group {i}, {group}")
    print(f"{order}")
    print(f"{order2}")
    ndcg1 = GetnDCG(group,order,ratings) # Calculate additive nDCG
    ndcg2 = GetnDCG(group,order2,ratings) # Calculate least misery nDCG
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore",category=FutureWarning)
        # Append both calculated values to the result DataFrame
        results = results.append({'GroupID': f'Group {i}', 'Additive nDCG': ndcg1, 'Least Misery nDCG': ndcg2}, ignore_index=True)

Group 1, [ 77 217 440 147]
3494    18.198019
1204    18.159512
750     17.998023
3451    17.928981
3435    17.861627
dtype: float64
3201      5.000000
1755      5.000000
1178      4.819551
168326    4.798186
4745      4.718053
dtype: float64
Group 2, [313 480 160 192]
3451    18.094034
2959    17.917563
608     17.755503
2858    17.511666
750     17.478330
dtype: float64
3024    5.000000
4021    4.602986
1942    4.191114
903     4.000000
2858    4.000000
dtype: float64
Group 3, [280 277 418 582]
2936    19.449560
3451    19.291906
3358    19.184934
3030    19.144535
7028    18.816033
dtype: float64
4021     5.000000
85       5.000000
3091     4.878815
4419     4.841086
98154    4.837462
dtype: float64
Group 4, [478  95 155 245]
1178    18.104787
3201    17.378534
5747    17.300283
2202    17.221272
1204    16.777214
dtype: float64
85      5.000000
4021    5.000000
4419    5.000000
3871    4.929453
4745    4.896649
dtype: float64
Group 5, [284  44 482 298]
3814    18.444837
3030    17.9

Group 36, [194 450 459 321]
1178      19.386218
3451      19.375999
1204      19.373510
28        19.175147
171763    19.099745
dtype: float64
4021      5.000000
86781     5.000000
2290      4.967873
4419      4.922179
100507    4.920475
dtype: float64
Group 37, [ 85  36  54 500]
3451    17.465627
3814    16.861920
1178    16.638605
1248    16.578166
2202    16.546132
dtype: float64
85       4.681751
51931    4.393544
6808     4.133481
4021     4.124347
8254     4.092220
dtype: float64
Group 38, [256 407  13 222]
1411    19.252713
1204    19.053731
1103    18.778254
1217    18.762154
3328    18.659389
dtype: float64
85      5.000000
3091    4.833438
3494    4.826835
4021    4.706490
3030    4.701153
dtype: float64
Group 39, [ 87 359 231 289]
3201    19.186311
1411    19.132376
1178    19.101057
3030    18.722102
1248    18.654986
dtype: float64
4745      5.000000
167746    4.891017
4021      4.833404
100507    4.584979
3925      4.566748
dtype: float64
Group 40, [ 60 191 415 278]
527  

Group 71, [ 70 574 403 398]
5500    19.543555
3451    19.493333
5747    19.425154
2202    19.218216
1283    19.218119
dtype: float64
4021     5.000000
85       5.000000
90717    5.000000
4419     4.984261
3201     4.939379
dtype: float64
Group 72, [ 18 334 131 536]
4021    19.018448
1755    18.696969
1178    18.603217
3201    18.511451
1411    18.481538
dtype: float64
85      5.000000
4021    4.558810
1755    4.536663
3201    4.497330
1178    4.426973
dtype: float64
Group 73, [520 492 119 272]
38061    19.020304
3037     18.857235
866      18.780623
1273     18.730617
4878     18.715526
dtype: float64
160271    5.0
148956    5.0
100507    5.0
51931     5.0
98154     5.0
dtype: float64
Group 74, [276 542 152 583]
951      19.455985
51931    19.441723
3334     19.265172
7028     19.109455
3451     19.080831
dtype: float64
85      5.000000
4745    5.000000
3814    4.907937
3091    4.873898
2132    4.802798
dtype: float64
Group 75, [379  43 266 602]
1411    18.409929
589     18.320401
527 

Group 106, [408 195 526 204]
1204    19.955633
1411    19.953586
3814    19.800780
951     19.646020
608     19.558680
dtype: float64
4745     5.0
3201     5.0
90430    5.0
4021     5.0
85       5.0
dtype: float64
Group 107, [385 516 584 205]
3814    18.880658
1248    18.706751
541     18.587447
1103    18.371016
6820    18.331163
dtype: float64
1178     4.744173
2202     4.457315
3201     4.432385
3494     4.427813
73023    4.420569
dtype: float64
Group 108, [  1 572 189 149]
1178     19.339220
71899    19.146107
3201     18.790442
750      18.783252
3451     18.626464
dtype: float64
68952    5.0
51931    5.0
3729     5.0
4021     5.0
85       5.0
dtype: float64
Group 109, [323 333 132 199]
51931    17.178546
3030     16.945865
3814     16.885891
5747     16.795408
3451     16.738712
dtype: float64
4419     4.248120
3451     4.086056
51931    4.021879
3201     4.007386
4745     3.980071
dtype: float64
Group 110, [422 120 287 176]
2936    17.719249
608     17.699772
1178    17.545518
5

Group 141, [137 127  40 446]
3451    18.379855
3201    18.163494
28      18.149396
2202    18.029948
5747    17.806257
dtype: float64
51931    4.682139
4021     4.609986
64620    4.601981
2204     4.555195
53129    4.545569
dtype: float64
Group 142, [439 282 179 187]
2936    19.955204
1178    19.793427
3201    19.781924
4021    19.622757
1411    19.237065
dtype: float64
85       5.000000
2936     4.955204
1178     4.889696
98154    4.853779
3201     4.849961
dtype: float64
Group 143, [ 26 368 438 430]
1178    17.819850
3030    17.816684
1755    17.502109
1204    17.411160
2202    17.335662
dtype: float64
85        4.615889
4021      4.492869
4976      4.161327
104218    4.074663
2202      4.069134
dtype: float64
Group 144, [ 15  17 211 382]
85      20.000000
4450    19.394501
3494    19.235614
1411    19.225511
3201    19.142965
dtype: float64
85        5.000000
158872    4.907502
1178      4.621117
3925      4.618605
4021      4.602298
dtype: float64
Group 145, [275  11 609 545]
51931

In [24]:
# Display results
print(results)

       GroupID Additive nDCG Least Misery nDCG
0      Group 1             0                 0
1      Group 2      0.813015          0.602909
2      Group 3             0                 0
3      Group 4             0                 0
4      Group 5             0                 0
..         ...           ...               ...
148  Group 149           1.0                 0
149  Group 150             0                 0
150  Group 151             0                 0
151  Group 152             0                 0
152  Group 153      0.430677                 0

[153 rows x 3 columns]


In [25]:
# Calculate average nDCG values for both aggregation strategies
average_additive = results['Additive nDCG'].mean()
average_leastmisery = results['Least Misery nDCG'].mean()
print("Average nDCG for Additive Aggregation:", average_additive)
print("Average nDCG for Least Misery Aggregation:", average_leastmisery)

Average nDCG for Additive Aggregation: 0.3271962334504739
Average nDCG for Least Misery Aggregation: 0.32754519155755124


In [26]:
# Conclude which strategy was more effective
if average_additive > average_leastmisery:
    print("This means that for this random group generation, Additive aggregation provides better recommendations")
else:
    print("This means that for this random group generation, Least Misery aggregation provides better recommendations")

This means that for this random group generation, Least Misery aggregation provides better recommendations
