**Loading dataset from Drive.**

In [None]:
from google.colab import drive
# Mount Google Drive so the dataset files stored under MyDrive are reachable.
drive.mount("/content/drive")

# Copy the four MovieLens100K files into /content, which is where the
# read_csv calls further down expect to find them.
!cp "/content/drive/MyDrive/datasets/MovieLens100K/u.genre" /content
!cp "/content/drive/MyDrive/datasets/MovieLens100K/u.item" /content
!cp "/content/drive/MyDrive/datasets/MovieLens100K/movies.csv" /content
!cp "/content/drive/MyDrive/datasets/MovieLens100K/ratings.csv" /content

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

**List of genres**

In [None]:
# Load the pipe-delimited genre table, keep only its first column (the genre
# name) and expose the names as a plain Python list.
genre = pd.read_csv('/content/u.genre', sep="|", encoding='latin-1', header=None)
genre = genre.drop(columns=genre.columns[1])
genre.columns = ['Genres']
Genre = genre['Genres'].tolist()
Genre

['unknown',
 'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

**Ratings dataset**

In [None]:
ratings=pd.read_csv('/content/ratings.csv')
movies=pd.read_csv('/content/movies.csv', encoding='ISO-8859-1')
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


**Forming a Rating Matrix**

In [None]:
# Reshape the long ratings table into a user x movie matrix; (user, movie)
# pairs without a rating are filled with 0.
R = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)
R

movieId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Splitting the rating matrix**

In [None]:
def holdout_split(R, test_ratio=0.2, shuffle=False, random_state=None):
  """Split a user x item rating matrix into train and test matrices.

  For every user (row), a fraction ``test_ratio`` of the observed (non-zero)
  ratings is moved into the test matrix and zeroed in the train matrix, so
  ``train + test`` reproduces ``R`` and no rating appears in both.

  Args:
    R: 2-D array-like of ratings in which 0 means "not rated". It is not
      modified.
    test_ratio: Fraction of each user's observed ratings to hold out. The
      per-user count is truncated with ``int()``, so a user with fewer than
      ``1 / test_ratio`` ratings contributes nothing to the test matrix.
    shuffle: If true, permute each user's observed indices before sampling.
      Sampling is already uniform without replacement, so this only changes
      which random numbers are consumed.
    random_state: Seed (or any value accepted by ``np.random.default_rng``)
      that drives *all* random draws; the same value gives the same split.

  Returns:
    ``(train, test)``: two NumPy arrays with the same shape as ``R``.

  Raises:
    ValueError: if ``test_ratio`` asks for more ratings than a user has.
  """
  R = np.asarray(R)
  # One generator for the whole split. Creating it inside the loop would
  # restart the random stream for every user, and drawing from the global
  # np.random state would make random_state ineffective.
  rng = np.random.default_rng(random_state)
  test = np.zeros(R.shape)
  train = R.copy()
  for user in range(R.shape[0]):
    obs = np.flatnonzero(R[user])
    if shuffle:
      rng.shuffle(obs)
    indices = rng.choice(obs, size=int(len(obs) * test_ratio), replace=False)
    test[user, indices] = R[user, indices]
    train[user, indices] = 0
  return train, test

In [None]:
# Convert the rating DataFrame to a plain NumPy array for positional indexing.
# np.asarray (instead of R.to_numpy()) keeps this cell safe to re-run: once R
# is already an ndarray, running it again is a no-op rather than an
# AttributeError.
R = np.asarray(R)

In [None]:
train_R, test_R = holdout_split(R)

In [None]:
train_R.shape

(943, 1682)

In [None]:
test_R.shape

(943, 1682)

**Genre dataset**

In [None]:
movies.head()
movies.shape

(1681, 3)

In [None]:
# Read the pipe-separated movie table, name its 24 columns (five metadata
# fields followed by one flag column per genre), then keep only the id, the
# title and the genre flags.
item = pd.read_csv('/content/u.item', sep="|", encoding='latin-1', header=None)
item.columns = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
item = item.drop(columns=['release date', 'video release date', 'IMDb URL'])
item.head()

Unnamed: 0,movie id,movie title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [None]:
data=item.loc[:, ~item.columns.isin(['movie title'])]

**Clustering based on genres**

In [None]:
import collections
# genre name -> {movie id -> running position of that movie within the genre}
genre_map = collections.defaultdict(dict)      
for i, row in item.iterrows():
  # Columns whose value is 1 are this movie's genres. 'movie id' is removed
  # explicitly because the id column itself holds 1 for the first movie.
  row_genres = set(row[row == 1].index) - { 'movie id', }
  # NOTE: the loop variable `genre` rebinds the name that earlier held the
  # genre DataFrame.
  for genre in row_genres:
    # The right-hand side is evaluated before the insertion, so positions run
    # 0, 1, 2, ... in the order the genre's movies are encountered.
    genre_map[genre][row['movie id']] = len(genre_map[genre])

In [None]:
genre_map = dict(genre_map)
genre_map

{'Comedy': {1: 0,
  4: 1,
  8: 2,
  13: 3,
  16: 4,
  17: 5,
  21: 6,
  25: 7,
  26: 8,
  29: 9,
  34: 10,
  40: 11,
  41: 12,
  42: 13,
  45: 14,
  47: 15,
  49: 16,
  63: 17,
  65: 18,
  66: 19,
  67: 20,
  69: 21,
  70: 22,
  72: 23,
  73: 24,
  74: 25,
  80: 26,
  81: 27,
  83: 28,
  85: 29,
  88: 30,
  90: 31,
  91: 32,
  93: 33,
  94: 34,
  95: 35,
  104: 36,
  105: 37,
  108: 38,
  109: 39,
  110: 40,
  111: 41,
  116: 42,
  120: 43,
  122: 44,
  123: 45,
  138: 46,
  139: 47,
  150: 48,
  151: 49,
  152: 50,
  153: 51,
  154: 52,
  158: 53,
  163: 54,
  167: 55,
  168: 56,
  169: 57,
  170: 58,
  171: 59,
  173: 60,
  184: 61,
  186: 62,
  189: 63,
  194: 64,
  201: 65,
  202: 66,
  204: 67,
  208: 68,
  209: 69,
  211: 70,
  216: 71,
  220: 72,
  225: 73,
  231: 74,
  232: 75,
  235: 76,
  236: 77,
  238: 78,
  240: 79,
  242: 80,
  243: 81,
  248: 82,
  249: 83,
  251: 84,
  255: 85,
  256: 86,
  257: 87,
  259: 88,
  261: 89,
  269: 90,
  274: 91,
  284: 92,
  290: 93,
  294

In [None]:
train_R.shape

(943, 1682)

In [None]:
# R_final=[]
# for genre in genre_map:
#   R=[]
#   for mid in ratings.iterrows():
#     if mid[1]["movieId"] in genre_map[genre]:
#       R.append(mid[1])
#   R_final.append(R)

In [None]:
# R_final = [ pd.concat(R, axis=1).T for R in R_final ]

In [None]:
# One training sub-matrix per genre: all users x the movies of that genre, with
# columns in the order the movie ids appear in genre_map[genre].
# Movie ids are turned into column positions with "- 1", which assumes column j
# of train_R holds movie id j + 1 — TODO confirm the ids are contiguous from 1.
R_final = { genre: train_R[:,np.array(list(genre_map[genre].keys()))-1] for genre in genre_map }

In [None]:
for genre in R_final:
  print(genre, ":\n",(R_final[genre].shape))

Comedy :
 (943, 505)
Animation :
 (943, 42)
Children's :
 (943, 122)
Action :
 (943, 251)
Thriller :
 (943, 251)
Adventure :
 (943, 135)
Drama :
 (943, 725)
Crime :
 (943, 109)
Sci-Fi :
 (943, 101)
War :
 (943, 71)
Romance :
 (943, 247)
Horror :
 (943, 92)
Musical :
 (943, 56)
Documentary :
 (943, 50)
Western :
 (943, 27)
Fantasy :
 (943, 22)
Film-Noir :
 (943, 24)
Mystery :
 (943, 61)
unknown :
 (943, 2)


In [None]:
# Per-genre slices of the held-out ratings. These must come from test_R:
# slicing the full matrix R would also place every training rating in the
# "test" data, so the models would be scored on ratings they were fitted on.
R_final_test = { genre: test_R[:,np.array(list(genre_map[genre].keys()))-1] for genre in genre_map }

In [None]:
for genre in R_final_test:
  print(genre, ":", R_final_test[genre].shape)

Comedy : (943, 505)
Animation : (943, 42)
Children's : (943, 122)
Action : (943, 251)
Thriller : (943, 251)
Adventure : (943, 135)
Drama : (943, 725)
Crime : (943, 109)
Sci-Fi : (943, 101)
War : (943, 71)
Romance : (943, 247)
Horror : (943, 92)
Musical : (943, 56)
Documentary : (943, 50)
Western : (943, 27)
Fantasy : (943, 22)
Film-Noir : (943, 24)
Mystery : (943, 61)
unknown : (943, 2)


In [None]:
# R_final = { genre: R for genre, R in zip(genre_map, R_final) }

In [None]:
def melt(R_final, include_unknown=False):
  """Convert every matrix stored in ``R_final`` to long format, in place.

  Each value of the dict is replaced by a DataFrame with the columns
  ``userId`` (row position in the matrix), ``movieId`` (column position in
  the matrix) and ``rating``. Unless ``include_unknown`` is true, rows whose
  rating equals 0 are dropped. The dict is mutated and nothing is returned.
  """
  for genre, matrix in list(R_final.items()):
    wide = pd.DataFrame(matrix)
    wide.index.name = "userId"
    long_form = pd.melt(
        wide.reset_index(), id_vars=['userId'], var_name='movieId', value_name='rating'
    )
    R_final[genre] = long_form if include_unknown else long_form[long_form["rating"] != 0]

In [None]:
melt(R_final)

In [None]:
melt(R_final_test)

In [None]:
R_final["Comedy"]

Unnamed: 0,userId,movieId,rating
0,0,0,5.0
1,1,0,4.0
4,4,0,4.0
5,5,0,4.0
9,9,0,4.0
...,...,...,...
473224,781,501,4.0
473281,838,501,1.0
473312,869,501,4.0
473322,879,501,4.0


In [None]:
R_final_test["Comedy"]

Unnamed: 0,userId,movieId,rating
0,0,0,5.0
1,1,0,4.0
4,4,0,4.0
5,5,0,4.0
9,9,0,4.0
...,...,...,...
473312,869,501,4.0
473322,879,501,4.0
474167,781,502,3.0
475110,781,503,3.0


Installing the scikit-surprise package

In [None]:
%pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[K     |████████████████████████████████| 771 kB 5.3 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp38-cp38-linux_x86_64.whl size=2626481 sha256=f6cf4d406843a2fb86c2e53cc74c3baa94880541b0cfb695e41cb80bb9581852
  Stored in directory: /root/.cache/pip/wheels/af/db/86/2c18183a80ba05da35bf0fb7417aac5cddbd93bcb1b92fd3ea
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [None]:
from surprise import Dataset, Reader
reader = Reader(rating_scale=(1, 5))
data = { genre: Dataset.load_from_df(R[[ 'userId', 'movieId', 'rating' ]], reader) for genre, R in R_final.items() }
test_data = { genre: Dataset.load_from_df(R[[ 'userId', 'movieId', 'rating' ]], reader) for genre, R in R_final_test.items() }
# data = Dataset.load_from_df(R_final[['userId', 'movieId', 'rating'] ], reader)

In [None]:
import tqdm

from surprise import NMF
from surprise.model_selection import cross_validate

# Fit one NMF model per genre on that genre's training ratings.
models={}
for genre, d in data.items():
  # Fixed seed: without random_state the factor initialisation differs on
  # every run, so the fitted models and all downstream scores would change
  # each time this cell is executed.
  algo = NMF(random_state=42)
  print(genre)
  algo.fit(d.build_full_trainset())
  models[genre]=algo

Comedy
Animation
Children's
Action
Thriller
Adventure
Drama
Crime
Sci-Fi
War
Romance
Horror
Musical
Documentary
Western
Fantasy
Film-Noir
Mystery
unknown


In [None]:
# For every genre, run that genre's fitted model over the genre's test data
# and keep the resulting list of predictions.
pred_data={}
for genre in data:
  # The test Dataset is passed through build_full_trainset().build_testset()
  # before being handed to .test() — presumably to obtain the list of
  # (user, item, rating) tuples that .test() expects; confirm against the
  # Surprise documentation.
  pred=models[genre].test(test_data[genre].build_full_trainset().build_testset())
  pred_data[genre]=pred

In [None]:
genre_keys = { genre: list(genre_map[genre].keys()) for genre in genre_map }

In [None]:
genre_keys

{'Comedy': [1,
  4,
  8,
  13,
  16,
  17,
  21,
  25,
  26,
  29,
  34,
  40,
  41,
  42,
  45,
  47,
  49,
  63,
  65,
  66,
  67,
  69,
  70,
  72,
  73,
  74,
  80,
  81,
  83,
  85,
  88,
  90,
  91,
  93,
  94,
  95,
  104,
  105,
  108,
  109,
  110,
  111,
  116,
  120,
  122,
  123,
  138,
  139,
  150,
  151,
  152,
  153,
  154,
  158,
  163,
  167,
  168,
  169,
  170,
  171,
  173,
  184,
  186,
  189,
  194,
  201,
  202,
  204,
  208,
  209,
  211,
  216,
  220,
  225,
  231,
  232,
  235,
  236,
  238,
  240,
  242,
  243,
  248,
  249,
  251,
  255,
  256,
  257,
  259,
  261,
  269,
  274,
  284,
  290,
  294,
  301,
  312,
  316,
  319,
  321,
  335,
  337,
  338,
  341,
  342,
  345,
  347,
  352,
  354,
  362,
  364,
  367,
  368,
  369,
  372,
  376,
  377,
  381,
  382,
  383,
  384,
  385,
  386,
  388,
  390,
  391,
  393,
  394,
  395,
  396,
  399,
  400,
  401,
  402,
  407,
  408,
  409,
  410,
  411,
  412,
  414,
  415,
  419,
  422,
  425,
  428,
  430,


In [None]:
# Turn each genre's prediction list into a DataFrame, translating the
# positional ids that melt() produced back into dataset ids:
#   - pred.uid is a 0-based row position of the rating matrix, so "+ 1" yields
#     the userId (assumes user ids are contiguous from 1 — TODO confirm);
#   - pred.iid is a column position inside the genre sub-matrix, and
#     genre_keys[genre] lists that genre's movie ids in the same column order.
pred_dfs = {
    genre: pd.DataFrame(
        [ (pred.uid + 1, genre_keys[genre][pred.iid], pred.est) for pred in pred_data[genre] ],
        columns=[ 'userId', 'movieId', 'rating' ]
    )
    for genre in pred_data
}

In [None]:
print("Predicted dataset for Children's genre: \n   ", pred_dfs["Children's"])

Predicted dataset for Children's genre: 
          userId  movieId    rating
0          1        1  4.185367
1          1        8  1.851296
2          1       35  1.052374
3          1       63  2.133849
4          1       71  3.414600
...      ...      ...       ...
7177     736      993  3.357044
7178     939      993  4.015358
7179     208      996  2.975464
7180     799     1063  3.357044
7181     146     1293  5.000000

[7182 rows x 3 columns]


## Frequent Itemset Generation

In [None]:
%pip install mlxtend

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from mlxtend.frequent_patterns import apriori

item_data = item.drop([ 'movie id', 'movie title' ], axis=1)

itemsets = apriori(
    item_data, use_colnames=True, min_support=0.01
)

In [None]:
# For each genre, count how many of the frequent itemsets contain it.
genre_freq = {
    genre: sum(genre in members for members in itemsets['itemsets'])
    for genre in genre_map
}

In [None]:
itemsets

Unnamed: 0,support,itemsets
0,0.149227,(Action)
1,0.080262,(Adventure)
2,0.02497,(Animation)
3,0.072533,(Children's)
4,0.300238,(Comedy)
5,0.064804,(Crime)
6,0.029727,(Documentary)
7,0.431034,(Drama)
8,0.01308,(Fantasy)
9,0.014269,(Film-Noir)


In [None]:
genre_freq

{'Comedy': 7,
 'Animation': 4,
 "Children's": 7,
 'Action': 12,
 'Thriller': 11,
 'Adventure': 8,
 'Drama': 8,
 'Crime': 4,
 'Sci-Fi': 6,
 'War': 3,
 'Romance': 5,
 'Horror': 2,
 'Musical': 5,
 'Documentary': 1,
 'Western': 1,
 'Fantasy': 1,
 'Film-Noir': 1,
 'Mystery': 2,
 'unknown': 0}

In [None]:
ratings['predictedRating'] = np.zeros(len(ratings))

import tqdm

# Both lookups are built once, up front. Filtering `item` and every per-genre
# prediction frame again for each rating row makes the loop so slow that it
# is impractical to run to completion.

# movie id -> names of the genre columns flagged 1 for that movie
movie_genres = {
    movie_id: list(flags[flags == 1].index)
    for movie_id, flags in item.drop_duplicates('movie id')
                               .drop(columns='movie title')
                               .set_index('movie id')
                               .iterrows()
}

# genre -> {(userId, movieId): predicted rating}; the first prediction wins
# if a pair appears more than once.
pred_lookup = {}
for genre, pred_df in pred_dfs.items():
  unique_preds = pred_df.drop_duplicates(['userId', 'movieId'])
  pred_lookup[genre] = dict(
      zip(zip(unique_preds['userId'], unique_preds['movieId']), unique_preds['rating'])
  )

new_ratings = []
for user_id, movie_id in tqdm.tqdm(zip(ratings['userId'], ratings['movieId']), total=len(ratings)):
  genres = movie_genres[movie_id]
  # The denominator counts every genre of the movie, whether or not a
  # prediction exists for this (user, movie) pair in that genre.
  total_weight = sum(genre_freq[genre] for genre in genres)
  aggregate_rating = 0
  for genre in genres:
    # A pair without a prediction contributes 0 to the weighted sum.
    aggregate_rating += genre_freq[genre] * pred_lookup[genre].get((user_id, movie_id), 0)
  # Avoid dividing by zero when all of the movie's genres have weight 0.
  if total_weight == 0: total_weight = 1
  new_ratings.append(aggregate_rating / total_weight)

ratings['predictedRating'] = new_ratings

 53%|█████▎    | 52739/100000 [02:31<02:15, 348.06it/s]


KeyboardInterrupt: ignored

In [None]:
# for i in range(10):
#   row = ratings.iloc[i]
#   rowitem = item[item['movie id'] == row['movieId']].iloc[0]
#   genres = set(rowitem[rowitem == 1].index) - { 'movie id', }
#   total_weight = sum(genre_freq[genre] for genre in genres)
#   aggregate_rating = 0
#   aggregate_eqn = ''
#   for genre in genres:
#     pred_df = pred_dfs[genre]
#     rows = pred_df[
#         (pred_df['userId']==row['userId']) & (pred_df['movieId']==row['movieId'])
#     ]
#     if len(rows) > 0:
#       aggregate_rating += genre_freq[genre] * rows.iloc[0,2]
#       aggregate_eqn += f"+ ({genre_freq[genre]} * {rows.iloc[0,2]})"
#     else:
#       aggregate_eqn += f"+ ({genre_freq[genre]} * 0)"
#     aggregate_eqn += f" ({genre}) "
#   if total_weight == 0: total_weight = 1
#   aggregate_eqn += f"/ {total_weight} = {aggregate_rating / total_weight}"
#   print(row['userId'], row['movieId'], aggregate_eqn)

In [None]:
ratings

In [None]:
pd.DataFrame(R)

In [None]:
predR = pd.pivot(ratings, index = 'userId', columns ='movieId', values = 'predictedRating')
predR.fillna(0,inplace=True)
predR

## Missing Predictions:

In [None]:
R_final_complete = { genre: R[:,np.array(list(genre_map[genre].keys()))-1] for genre in genre_map }

In [None]:
melt(R_final_complete, include_unknown=True)

In [None]:
reader = Reader(rating_scale=(1, 5))
complete_data = { genre: Dataset.load_from_df(R[[ 'userId', 'movieId', 'rating' ]], reader) for genre, R in R_final_complete.items() }

In [None]:
pred_data_complete={}
for genre in data:
  pred=models[genre].test(complete_data[genre].build_full_trainset().build_testset())
  pred_data_complete[genre]=pred

In [None]:
pred_dfs_complete = {
    genre: pd.DataFrame(
        [ (pred.uid + 1, genre_keys[genre][pred.iid], pred.est) for pred in preds ],
        columns=[ 'userId', 'movieId', 'rating' ]
    )
    for genre, preds in pred_data_complete.items()
}

In [None]:
import itertools

userCount = ratings['userId'].max()
movieCount = ratings['movieId'].max()

all_ratings = pd.DataFrame(
    ((userId, movieId, 0) for userId, movieId in
      itertools.product(range(1, userCount+1), range(1, movieCount+1))),
    columns=['userId', 'movieId', 'rating']
)

In [None]:
def aggregate_ratings(ratings, item, pred_df, mode='weighted', genre_freq=None):
  """Combine per-genre predictions into one predicted rating per row of ``ratings``.

  Args:
    ratings: DataFrame with ``userId`` and ``movieId`` columns. One value is
      produced per row, in row order. (userId, movieId) pairs must be unique.
    item: Unused; kept so that existing calls keep working.
    pred_df: Mapping ``genre -> DataFrame`` whose frames have ``userId``,
      ``movieId`` and ``rating`` columns holding that genre's predictions.
    mode: ``'weighted'`` weights each genre's prediction by
      ``genre_freq[genre]``; any other value gives every genre weight 1.
    genre_freq: Mapping ``genre -> weight``; needed when ``mode == 'weighted'``.

  Returns:
    1-D NumPy array of length ``len(ratings)`` with the weighted mean of the
    available genre predictions. Rows with no prediction in any genre, or
    whose accumulated weight is 0, get 0.
  """
  pair_cols = ['userId', 'movieId']
  rating_pairs = pd.MultiIndex.from_frame(ratings[pair_cols].astype(int))
  rating_sum = np.zeros(len(ratings))
  weight_sum = np.zeros(len(ratings))
  # Iterate over the `pred_df` argument, not the notebook-global `pred_dfs`,
  # so the caller decides which set of predictions is aggregated.
  for genre, genre_preds in tqdm.tqdm(pred_df.items()):
    genre_pairs = pd.MultiIndex.from_frame(genre_preds[pair_cols].astype(int))
    # Row position in `ratings` of every predicted pair; -1 when the pair is
    # not part of `ratings`.
    target = rating_pairs.get_indexer(genre_pairs)
    found = target >= 0

    genre_weight = (genre_freq[genre] if mode=='weighted' else 1)
    # np.add.at accumulates correctly even if a position occurs repeatedly.
    np.add.at(rating_sum, target[found], genre_weight * genre_preds['rating'].to_numpy()[found])
    np.add.at(weight_sum, target[found], genre_weight)
  # Rows that received no weight keep a prediction of 0 instead of dividing by 0.
  weight_sum[weight_sum == 0] = 1
  return rating_sum / weight_sum

In [None]:
pred_ratings_complete = aggregate_ratings(
    all_ratings, item, pred_dfs_complete, 'non-weighted', genre_freq
)

100%|██████████| 19/19 [00:14<00:00,  1.32it/s]


In [None]:
pred_ratings_weighted_complete = aggregate_ratings(
    all_ratings, item, pred_dfs_complete, 'weighted', genre_freq
)

100%|██████████| 19/19 [00:14<00:00,  1.31it/s]


In [None]:
new_ratings=pd.DataFrame(all_ratings, columns=['userId', 'movieId', 'predictedRating'])
new_ratings['predictedRating'] = pred_ratings_weighted_complete

In [None]:
new_ratings

Unnamed: 0,userId,movieId,predictedRating
0,1,1,4.421518
1,1,2,3.116191
2,1,3,3.745187
3,1,4,3.741981
4,1,5,3.134112
...,...,...,...
1586121,943,1678,0.000000
1586122,943,1679,0.000000
1586123,943,1680,0.000000
1586124,943,1681,0.000000


In [None]:
predRat = pd.pivot(new_ratings, index = 'userId', columns ='movieId', values = 'predictedRating')
predRat.fillna(0,inplace=True)
predRat

movieId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.421518,3.116191,3.745187,3.741981,3.134112,3.231765,4.144088,3.326106,4.033765,3.728796,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.024028,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.037581,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,3.851607,2.898722,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,4.795335,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.000000,0.000000,0.000000,3.160136,0.000000,0.000000,3.250632,4.131614,3.661644,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,4.879999,0.000000,0.000000,0.000000,0.000000,0.000000,3.917776,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Evaluation

In [None]:
# def aggregate_ratings_old(ratings, item, pred_df, mode='weighted', genre_freq=None):
#   new_ratings = []
#   for _, row in tqdm.tqdm(ratings.iterrows(), total=len(ratings)):
#     rowitem = item[item['movie id'] == row['movieId']].iloc[0]
#     genres = set(rowitem[rowitem == 1].index) - { 'movie id', }
#     if mode=='weighted':
#       total_weight = sum(genre_freq[genre] for genre in genres)
#     else:
#       total_weight = len(genres)
#     aggregate_rating = 0
#     for genre in genres:
#       pred_df = pred_dfs[genre]
#       rows = pred_df[
#           (pred_df['userId']==row['userId']) & (pred_df['movieId']==row['movieId'])
#       ]
#       if len(rows) > 0:
#         aggregate_rating += (genre_freq[genre] if mode=='weighted' else 1) * rows.iloc[0,2]
#     if total_weight == 0: total_weight = 1
#     if aggregate_rating<1: aggregate_rating=1
#     new_ratings.append(aggregate_rating / total_weight)

#   return np.array(new_ratings)

In [None]:
def aggregate_ratings(ratings, item, pred_df, mode='weighted', genre_freq=None):
  """Combine per-genre predictions into one predicted rating per row of ``ratings``.

  Args:
    ratings: DataFrame with ``userId`` and ``movieId`` columns. One value is
      produced per row, in row order. (userId, movieId) pairs must be unique.
    item: Unused; kept so that existing calls keep working.
    pred_df: Mapping ``genre -> DataFrame`` whose frames have ``userId``,
      ``movieId`` and ``rating`` columns holding that genre's predictions.
    mode: ``'weighted'`` weights each genre's prediction by
      ``genre_freq[genre]``; any other value gives every genre weight 1.
    genre_freq: Mapping ``genre -> weight``; needed when ``mode == 'weighted'``.

  Returns:
    1-D NumPy array of length ``len(ratings)`` with the weighted mean of the
    available genre predictions. Rows with no prediction in any genre, or
    whose accumulated weight is 0, get 0.
  """
  pair_cols = ['userId', 'movieId']
  rating_pairs = pd.MultiIndex.from_frame(ratings[pair_cols].astype(int))
  rating_sum = np.zeros(len(ratings))
  weight_sum = np.zeros(len(ratings))
  # Iterate over the `pred_df` argument, not the notebook-global `pred_dfs`,
  # so the caller decides which set of predictions is aggregated.
  for genre, genre_preds in tqdm.tqdm(pred_df.items()):
    genre_pairs = pd.MultiIndex.from_frame(genre_preds[pair_cols].astype(int))
    # Row position in `ratings` of every predicted pair; -1 when the pair is
    # not part of `ratings`.
    target = rating_pairs.get_indexer(genre_pairs)
    found = target >= 0

    genre_weight = (genre_freq[genre] if mode=='weighted' else 1)
    # np.add.at accumulates correctly even if a position occurs repeatedly.
    np.add.at(rating_sum, target[found], genre_weight * genre_preds['rating'].to_numpy()[found])
    np.add.at(weight_sum, target[found], genre_weight)
  # Rows that received no weight keep a prediction of 0 instead of dividing by 0.
  weight_sum[weight_sum == 0] = 1
  return rating_sum / weight_sum

In [None]:
def rmse(prediction, true):
    """Root-mean-square error over the entries where ``true`` is non-zero.

    Entries with ``true == 0`` are treated as unobserved: their error is
    zeroed out and they are not counted in the mean.
    """
    observed = (true != 0)
    masked_error = (prediction - true) * observed
    return np.sqrt(np.sum(np.square(masked_error)) / np.count_nonzero(true))

In [None]:
def mae(prediction, true):
    """Mean absolute error over the entries where ``true`` is non-zero.

    Entries with ``true == 0`` are treated as unobserved: their error is
    zeroed out and they are not counted in the mean.
    """
    observed_count = np.count_nonzero(true)
    total_abs_error = np.sum(np.abs(prediction - true) * (true != 0))
    return total_abs_error / observed_count

In [None]:
# pred_ratings = aggregate_ratings_old(ratings, item, pred_dfs, 'non-weighted', genre_freq)
pred_ratings = aggregate_ratings(ratings, item, pred_dfs, 'non-weighted', genre_freq)

100%|██████████| 19/19 [00:03<00:00,  5.40it/s]


In [None]:
# pred_ratings_weighted = aggregate_ratings_old(ratings, item, pred_dfs, 'weighted', genre_freq)
pred_ratings_weighted = aggregate_ratings(ratings, item, pred_dfs, 'weighted', genre_freq)

100%|██████████| 19/19 [00:03<00:00,  5.35it/s]


In [None]:
# NOTE(review): rmse/mae are declared as (prediction, true) but are called
# here with the actual ratings first. The error magnitude does not depend on
# the order, but the "true != 0" mask is consequently applied to
# pred_ratings, so only rows that received a non-zero prediction are scored.
# Both scores are multiplied by 100 before printing.
print("Rating scores (baseline aggregation):")
print("RMSE:{:.4f}".format(rmse(ratings['rating'], pred_ratings)*100))
print("MAE:{:.4f}".format(mae(ratings['rating'], pred_ratings)*100))

Rating scores (non-weighted aggregation):
RMSE:67.8697
MAE:50.6456


In [None]:
# NOTE(review): rmse/mae are declared as (prediction, true) but are called
# here with the actual ratings first. The error magnitude does not depend on
# the order, but the "true != 0" mask is consequently applied to
# pred_ratings_weighted, so only rows that received a non-zero prediction are
# scored. Both scores are multiplied by 100 before printing.
print("Rating scores (proposed aggregation):")
print("RMSE:{:.4f}".format(rmse(ratings['rating'], pred_ratings_weighted)*100))
print("MAE:{:.4f}".format(mae(ratings['rating'], pred_ratings_weighted)*100))

Rating scores (weighted aggregation):
RMSE:69.2746
MAE:52.1123
