$$ ITI \space AI-Pro: \space Intake \space 45 $$
$$ Recommender \space Systems $$
$$ Lab \space no. \space 2 $$

# `01` Import Necessary Libraries

In [1]:
#!pip install --force-reinstall numpy
#!pip install --force-reinstall scikit-surprise


In [2]:
#!pip uninstall -y numpy scikit-surprise
#!pip install numpy==1.24.4
#!pip install scikit-surprise --no-binary :all:


In [3]:
#!pip install scikit-surprise


## `i` Default Libraries

In [1]:
import numpy as np
import pandas as pd
from surprise.reader import Reader
from surprise.dataset import Dataset
from surprise.model_selection import train_test_split
from surprise.prediction_algorithms.knns import KNNWithMeans

## `ii` Additional Libraries
Add imports for additional libraries you used throughout the notebook

In [2]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity


----------------------------

# `02` Load Data

 The dataset will have the following columns :
   - song_id (String) : Unique identifier for the song
   - user_id (String) : Unique identifier for the user
   - song_genre (Integer) : An integer representing a genre for the song, value is between 1 and 5, indicating that there are 5 unique genres. Each song can only have 1 genre
   - artist_id (String) : Unique identifier for the author of the song
   - n_listen (Integer) : The number of times this user has heard the song (0 -> 15)
   - publish_year (Integer) : The year of song publishing

In [3]:
# Location of the song/user interaction table (Colab upload directory).
songs_csv_path = "/content/songs_data.csv"
data = pd.read_csv(songs_csv_path)
data.head()

Unnamed: 0,song_id,artist_id,song_genre,user_id,n_listen,publish_year
0,537,368,4,2066,13,2002
1,921,107,1,1179,5,2006
2,352,188,1,1468,11,2013
3,853,370,4,460,9,2020
4,479,408,2,1125,3,2020


--------------------------

# `03` Content-based Filtering

Practice for content-based filtering on dummy data

## `i` Feature Engineering/Selection
Construct the item vector representation matrix from the `data` above

In [4]:
# One-hot encode `artist_id` and `song_genre`; the integer cast applies to
# every column, so the dummy indicators are stored as plain 0/1 integers.
encoded_data = pd.get_dummies(data, columns=['artist_id', 'song_genre']).astype(int)

# Min-max normalise `publish_year` into the [0, 1] range.
min_year = encoded_data['publish_year'].min()
max_year = encoded_data['publish_year'].max()
year_span = max_year - min_year
encoded_data['publish_year'] = (encoded_data['publish_year'] - min_year) / year_span

# Remove the per-interaction columns, collapse the repeated rows, and use
# `song_id` as a sorted index so each row is a song's feature vector.
feature_selection = (
    encoded_data
    .drop(columns=["n_listen", "user_id"])
    .drop_duplicates()
    .set_index('song_id')
    .sort_index()
)
feature_selection

Unnamed: 0_level_0,publish_year,artist_id_1,artist_id_2,artist_id_3,artist_id_4,artist_id_5,artist_id_6,artist_id_8,artist_id_9,artist_id_10,...,artist_id_496,artist_id_497,artist_id_498,artist_id_499,artist_id_500,song_genre_1,song_genre_2,song_genre_3,song_genre_4,song_genre_5
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.095238,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0.761905,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.714286,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0.190476,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,0.190476,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,0.761905,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
997,0.666667,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
998,0.904762,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
999,0.761905,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## `ii` Utility Matrix
Construct utility matrix for the loaded dataframe `data`

In [5]:
# Songs as rows, users as columns, listen counts as values.
# (song, user) pairs with no recorded interaction are filled with 0.
utility_matrix = (
    data
    .pivot(index='song_id', columns='user_id', values='n_listen')
    .fillna(0)
)
utility_matrix

user_id,1,2,3,4,5,6,7,8,9,10,...,2991,2992,2993,2994,2995,2996,2997,2998,2999,3000
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,15.0,6.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,0.0,0.0,9.0,8.0,5.0,0.0
3,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,2.0,0.0,0.0,9.0,1.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
998,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0
999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Sanity check: (number of songs, number of users) in the pivoted matrix.
utility_matrix.shape

(1000, 3000)

## `iii` Item-Item Similarity Matrix

Construct item-item (Cosine/Adjusted Cosine) similarity matrix.

In [8]:
def cosine_sim(vec_a, vec_b, drop_last=True):
    """Return the cosine similarity between two 1-D numeric vectors.

    Parameters
    ----------
    vec_a, vec_b : np.ndarray or pd.Series
        Vectors of equal length that support slicing and the ``@`` operator.
    drop_last : bool, default True
        When True (the original behaviour of this helper) the last element of
        each vector is ignored before the similarity is computed.
        NOTE(review): presumably the trailing element is a non-feature value
        (e.g. a rating) -- confirm against the caller; pass ``False`` to use
        the full vectors.

    Returns
    -------
    float
        ``dot(a, b) / (||a|| * ||b||)``, or ``0.0`` when either vector has
        zero norm (instead of the ``nan`` a 0/0 division would produce).
    """
    a = vec_a[:-1] if drop_last else vec_a
    b = vec_b[:-1] if drop_last else vec_b

    norm_product = np.sqrt(np.square(a).sum()) * np.sqrt(np.square(b).sum())
    # An all-zero vector has no direction; treat it as dissimilar to everything
    # rather than dividing by zero.
    if norm_product == 0:
        return 0.0
    return (a @ b) / norm_product

In [11]:
# Pairwise cosine similarity between the rows (songs) of `feature_selection`.
cosine_sim_matrix = cosine_similarity(feature_selection)

# Label both axes with the song ids and keep 7 decimal places.
song_ids = feature_selection.index
similarity_matrix = pd.DataFrame(cosine_sim_matrix, index=song_ids, columns=song_ids).round(7)
similarity_matrix


song_id,1,2,3,4,5,6,7,8,9,10,...,991,992,993,994,995,996,997,998,999,1000
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.031869,0.030292,0.503375,0.008969,0.000000,0.023338,0.004515,0.502257,0.002261,...,0.456439,0.023338,0.446119,0.456439,0.034827,0.031869,0.028650,0.036210,0.031869,0.036210
2,0.031869,1.000000,0.213829,0.063310,0.063310,0.000000,0.164737,0.471057,0.031869,0.015961,...,0.255604,0.164737,0.273834,0.255604,0.622276,0.224956,0.202240,0.626397,0.224956,0.255604
3,0.030292,0.213829,1.000000,0.060178,0.060178,0.000000,0.156588,0.030292,0.030292,0.461223,...,0.242960,0.156588,0.260290,0.242960,0.233677,0.213829,0.192237,0.242960,0.213829,0.618910
4,0.503375,0.063310,0.060178,1.000000,0.017817,0.000000,0.046362,0.008969,0.503375,0.004492,...,0.489347,0.046362,0.481661,0.489347,0.069186,0.063310,0.056917,0.071935,0.063310,0.071935
5,0.008969,0.063310,0.060178,0.017817,1.000000,0.000000,0.046362,0.008969,0.008969,0.004492,...,0.071935,0.511038,0.077066,0.071935,0.069186,0.063310,0.056917,0.071935,0.499554,0.071935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,0.031869,0.224956,0.213829,0.063310,0.063310,0.440183,0.577515,0.031869,0.031869,0.015961,...,0.255604,0.164737,0.273834,0.255604,0.245837,1.000000,0.600401,0.255604,0.224956,0.255604
997,0.028650,0.202240,0.192237,0.056917,0.056917,0.452267,0.572212,0.028650,0.028650,0.014350,...,0.229793,0.148102,0.246183,0.229793,0.221013,0.600401,1.000000,0.229793,0.202240,0.229793
998,0.036210,0.626397,0.242960,0.071935,0.071935,0.000000,0.187180,0.456439,0.036210,0.018136,...,0.290426,0.187180,0.311141,0.290426,0.639518,0.255604,0.229793,1.000000,0.255604,0.290426
999,0.031869,0.224956,0.213829,0.063310,0.499554,0.000000,0.164737,0.031869,0.031869,0.015961,...,0.255604,0.577515,0.273834,0.255604,0.245837,0.224956,0.202240,0.255604,1.000000,0.255604


## `iv` Top-K Candidate Generation

Select the top-K (a K of your choice) most similar items for each item that a user of your choice has rated, using the similarity matrix above.

In [12]:
# Listening history of the chosen user (user_id 3000): keep only the songs
# with a non-zero listen count, together with those counts.
prev_items_rating = utility_matrix.loc[utility_matrix[3000] != 0, 3000]
prev_items = prev_items_rating.index

# One similarity row per previously listened song, against every song.
potential_items = similarity_matrix.loc[prev_items]
potential_items

song_id,1,2,3,4,5,6,7,8,9,10,...,991,992,993,994,995,996,997,998,999,1000
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.503375,0.063310,0.060178,1.000000,0.017817,0.000000,0.046362,0.008969,0.503375,0.004492,...,0.489347,0.046362,0.481661,0.489347,0.069186,0.063310,0.056917,0.071935,0.063310,0.071935
9,0.502257,0.031869,0.030292,0.503375,0.008969,0.000000,0.023338,0.004515,1.000000,0.002261,...,0.456439,0.023338,0.446119,0.456439,0.034827,0.031869,0.028650,0.036210,0.031869,0.036210
10,0.002261,0.015961,0.461223,0.004492,0.004492,0.000000,0.011689,0.002261,0.002261,1.000000,...,0.018136,0.011689,0.019429,0.018136,0.017443,0.015961,0.014350,0.018136,0.015961,0.439078
11,0.008969,0.499554,0.060178,0.017817,0.017817,0.495526,0.511038,0.008969,0.008969,0.004492,...,0.071935,0.046362,0.077066,0.071935,0.069186,0.499554,0.505137,0.071935,0.063310,0.071935
15,0.025172,0.177687,0.168898,0.050007,0.509445,0.000000,0.130122,0.025172,0.025172,0.012607,...,0.201895,0.564847,0.216295,0.201895,0.194181,0.177687,0.159745,0.201895,0.585813,0.201895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
978,0.033380,0.235624,0.611304,0.066312,0.066312,0.000000,0.172549,0.033380,0.033380,0.450409,...,0.267725,0.172549,0.286820,0.267725,0.257495,0.235624,0.211831,0.267725,0.235624,0.633256
985,0.017477,0.123365,0.117263,0.034719,0.513189,0.000000,0.090341,0.017477,0.017477,0.008753,...,0.140172,0.543075,0.150170,0.140172,0.134817,0.123365,0.110908,0.140172,0.548398,0.140172
987,0.002261,0.015961,0.015172,0.004492,0.499737,0.000000,0.011689,0.002261,0.002261,0.001133,...,0.018136,0.480295,0.019429,0.018136,0.017443,0.015961,0.014350,0.018136,0.455895,0.018136
993,0.446119,0.273834,0.260290,0.481661,0.077066,0.000000,0.200531,0.038793,0.446119,0.019429,...,0.655034,0.200531,1.000000,0.655034,0.299253,0.273834,0.246183,0.311141,0.273834,0.311141


## `v` Candidate Filtering

Filter out the items your chosen user has already rated from the candidates above.

In [13]:
# Candidate columns are restricted to songs the user has not listened to yet.
filtered_items = potential_items.drop(columns=prev_items)
filtered_items


song_id,1,2,3,5,6,7,8,12,13,14,...,989,990,991,992,995,996,997,998,999,1000
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.503375,0.063310,0.060178,0.017817,0.000000,0.046362,0.008969,0.000000,0.026433,0.042596,...,0.013415,0.013415,0.489347,0.046362,0.069186,0.063310,0.056917,0.071935,0.063310,0.071935
9,0.502257,0.031869,0.030292,0.008969,0.000000,0.023338,0.004515,0.000000,0.013306,0.021442,...,0.006753,0.006753,0.456439,0.023338,0.034827,0.031869,0.028650,0.036210,0.031869,0.036210
10,0.002261,0.015961,0.461223,0.004492,0.000000,0.011689,0.002261,0.499717,0.006664,0.010739,...,0.003382,0.003382,0.018136,0.011689,0.017443,0.015961,0.014350,0.018136,0.015961,0.439078
11,0.008969,0.499554,0.060178,0.017817,0.495526,0.511038,0.008969,0.000000,0.026433,0.512214,...,0.506432,0.013415,0.071935,0.046362,0.069186,0.499554,0.505137,0.071935,0.063310,0.071935
15,0.025172,0.177687,0.168898,0.509445,0.000000,0.130122,0.025172,0.000000,0.528594,0.119551,...,0.037652,0.037652,0.201895,0.564847,0.194181,0.177687,0.159745,0.201895,0.585813,0.201895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
978,0.033380,0.235624,0.611304,0.066312,0.000000,0.172549,0.033380,0.433936,0.098378,0.158531,...,0.049929,0.049929,0.267725,0.172549,0.257495,0.235624,0.211831,0.267725,0.235624,0.633256
985,0.017477,0.123365,0.117263,0.513189,0.000000,0.090341,0.017477,0.000000,0.524737,0.083002,...,0.026141,0.026141,0.140172,0.543075,0.134817,0.123365,0.110908,0.140172,0.548398,0.140172
987,0.002261,0.015961,0.015172,0.499737,0.000000,0.011689,0.002261,0.000000,0.496485,0.010739,...,0.003382,0.003382,0.018136,0.480295,0.017443,0.015961,0.014350,0.018136,0.455895,0.018136
993,0.446119,0.273834,0.260290,0.077066,0.000000,0.200531,0.038793,0.000000,0.114332,0.184240,...,0.058026,0.058026,0.655034,0.200531,0.299253,0.273834,0.246183,0.311141,0.273834,0.311141


## `vi` Candidate Rating Prediction

Calculate the predicted rating for each of the candidate items.

In [16]:
# For every song the user listened to, the ids of its 3 most similar candidates.
top_n_songs = filtered_items.apply(
    lambda similarities: similarities.nlargest(3).index.tolist(),
    axis=1,
)
print(top_n_songs)


song_id
4       [974, 49, 208]
9      [187, 264, 355]
10      [897, 158, 45]
11       [33, 97, 271]
15     [133, 742, 790]
            ...       
978    [336, 387, 510]
985    [101, 261, 459]
987     [50, 247, 343]
993    [255, 212, 358]
994    [711, 498, 212]
Length: 151, dtype: object


In [17]:
# {listened song id: {candidate song id: similarity}} for the 3 most similar
# candidates of every song the user listened to.
top_n_dict = {
    listened_song: similarities.nlargest(3).to_dict()
    for listened_song, similarities in filtered_items.iterrows()
}
print(top_n_dict)

{4: {974: 0.9916592, 49: 0.5131891, 208: 0.5131891}, 9: {187: 0.5033746, 264: 0.5033746, 355: 0.5033746}, 10: {897: 0.9977554, 158: 0.9183712, 45: 0.5008487}, 11: {33: 0.5131891, 97: 0.5131891, 271: 0.5131891}, 15: {133: 0.5948119, 742: 0.5948119, 790: 0.5948119}, 21: {30: 0.5, 142: 0.5, 204: 0.5}, 24: {27: 0.5595392, 34: 0.5595392, 112: 0.5595392}, 25: {529: 0.6411277, 715: 0.6411277, 745: 0.6411277}, 29: {336: 0.6332424, 387: 0.6332424, 510: 0.6332424}, 32: {212: 0.6332424, 255: 0.6332424, 358: 0.6332424}, 44: {160: 0.5285941, 277: 0.5285941, 305: 0.5285941}, 47: {133: 0.6666667, 742: 0.6666667, 790: 0.6666667}, 48: {336: 0.6332424, 387: 0.6332424, 510: 0.6332424}, 55: {529: 0.6055039, 715: 0.6055039, 745: 0.6055039}, 59: {35: 0.5033746, 96: 0.5033746, 173: 0.5033746}, 64: {405: 0.9550684, 175: 0.520262, 184: 0.520262}, 73: {2: 0.5483976, 63: 0.5483976, 265: 0.5483976}, 80: {133: 0.6055039, 742: 0.6055039, 790: 0.6055039}, 81: {169: 0.5, 418: 0.5, 419: 0.5}, 82: {16: 0.5380276, 74: 0

In [18]:

# {listened song id: listen count} for the chosen user; `prev_items` is the
# index of `prev_items_rating`, so the Series maps directly to the dict.
ratings = prev_items_rating.to_dict()
print(ratings)

{4: 1.0, 9: 10.0, 10: 6.0, 11: 15.0, 15: 9.0, 21: 6.0, 24: 6.0, 25: 6.0, 29: 14.0, 32: 6.0, 44: 2.0, 47: 9.0, 48: 13.0, 55: 5.0, 59: 3.0, 64: 10.0, 73: 10.0, 80: 8.0, 81: 5.0, 82: 7.0, 100: 15.0, 123: 8.0, 124: 3.0, 125: 3.0, 132: 13.0, 137: 7.0, 139: 2.0, 152: 5.0, 154: 14.0, 155: 9.0, 157: 7.0, 161: 12.0, 181: 8.0, 186: 2.0, 195: 14.0, 196: 9.0, 206: 4.0, 207: 12.0, 218: 10.0, 226: 11.0, 237: 3.0, 242: 6.0, 249: 6.0, 253: 4.0, 256: 5.0, 257: 15.0, 258: 4.0, 269: 6.0, 286: 6.0, 289: 5.0, 291: 7.0, 295: 5.0, 304: 3.0, 311: 15.0, 313: 7.0, 331: 13.0, 335: 14.0, 341: 14.0, 342: 3.0, 350: 11.0, 359: 8.0, 368: 13.0, 372: 14.0, 374: 4.0, 385: 13.0, 398: 2.0, 414: 15.0, 424: 9.0, 431: 10.0, 438: 13.0, 443: 5.0, 448: 9.0, 471: 14.0, 484: 7.0, 488: 11.0, 494: 15.0, 500: 12.0, 521: 2.0, 522: 6.0, 535: 9.0, 541: 3.0, 542: 7.0, 543: 8.0, 545: 9.0, 556: 4.0, 561: 9.0, 568: 8.0, 573: 7.0, 576: 10.0, 578: 8.0, 588: 7.0, 594: 7.0, 601: 13.0, 611: 13.0, 619: 12.0, 626: 2.0, 639: 10.0, 643: 1.0, 652: 1

In [19]:
# Row-wise top-3 yields a frame whose columns are the union of all selected
# candidates; those column labels are the distinct candidate song ids.
top_k_per_row = filtered_items.apply(pd.Series.nlargest, n=3, axis=1)
top_n_songs_cleaned = top_k_per_row.columns.tolist()
print(top_n_songs_cleaned)

[1, 2, 5, 8, 12, 14, 16, 17, 22, 23, 27, 28, 30, 33, 34, 35, 36, 38, 39, 40, 41, 42, 45, 49, 50, 51, 53, 54, 60, 63, 66, 69, 70, 72, 74, 76, 77, 78, 84, 86, 88, 93, 96, 97, 98, 101, 104, 108, 112, 118, 119, 120, 121, 126, 129, 130, 133, 134, 135, 140, 142, 143, 158, 160, 163, 168, 169, 172, 173, 174, 175, 177, 180, 182, 183, 184, 185, 187, 189, 197, 199, 200, 204, 205, 208, 212, 214, 231, 234, 236, 241, 245, 247, 254, 255, 261, 262, 264, 265, 268, 270, 271, 272, 277, 278, 303, 305, 307, 319, 326, 328, 329, 332, 333, 336, 337, 343, 347, 355, 358, 365, 371, 381, 384, 387, 389, 405, 415, 418, 419, 425, 427, 446, 459, 464, 476, 478, 491, 498, 505, 510, 529, 533, 548, 553, 559, 607, 620, 622, 623, 624, 659, 660, 706, 707, 711, 712, 715, 741, 742, 745, 781, 790, 799, 815, 817, 830, 833, 835, 840, 846, 848, 852, 867, 897, 910, 943, 945, 970, 974, 982, 986, 992]


In [20]:
# Predicted listen count of a candidate = similarity-weighted average of the
# user's listen counts over the listened songs that have this candidate among
# their top-3 neighbours.
predicted_ratings = {}
for candidate in top_n_songs_cleaned:
    weighted_sum = 0
    similarity_sum = 0
    for listened in prev_items:
        similarity = top_n_dict[listened].get(candidate)
        if similarity is None:
            # This listened song does not list the candidate in its top-3.
            continue
        weighted_sum += similarity * ratings[listened]
        similarity_sum += similarity
    if similarity_sum == 0:
        # No usable neighbour weight: leave the candidate without a prediction.
        continue
    predicted_ratings[candidate] = round(weighted_sum / similarity_sum, 2)

print(predicted_ratings)


{1: 8.5, 2: 10.0, 5: 4.0, 8: 10.0, 12: 4.0, 14: 6.0, 16: 8.75, 17: 10.0, 22: 7.0, 23: 11.0, 27: 5.25, 28: 5.29, 30: 6.0, 33: 15.0, 34: 6.5, 35: 5.5, 36: 5.0, 38: 7.0, 39: 2.0, 40: 6.5, 41: 6.0, 42: 9.0, 45: 5.67, 49: 6.33, 50: 10.5, 51: 2.65, 53: 14.0, 54: 14.0, 60: 13.0, 63: 10.0, 66: 7.0, 69: 6.0, 70: 4.0, 72: 10.0, 74: 9.0, 76: 1.0, 77: 10.0, 78: 10.0, 84: 14.0, 86: 15.0, 88: 4.0, 93: 9.0, 96: 5.5, 97: 15.0, 98: 7.0, 101: 9.0, 104: 9.0, 108: 9.67, 112: 5.67, 118: 4.0, 119: 5.5, 120: 9.72, 121: 5.0, 126: 12.0, 129: 9.67, 130: 15.0, 133: 9.87, 134: 4.66, 135: 8.5, 140: 8.95, 142: 6.0, 143: 7.0, 158: 6.0, 160: 5.0, 163: 5.5, 168: 4.0, 169: 9.5, 172: 9.0, 173: 3.67, 174: 6.5, 175: 8.0, 177: 9.0, 180: 10.0, 182: 9.0, 183: 5.0, 184: 8.0, 185: 8.0, 187: 12.5, 189: 3.0, 197: 8.5, 199: 9.0, 200: 4.0, 204: 4.67, 205: 7.0, 208: 2.5, 212: 7.36, 214: 6.0, 231: 4.0, 234: 15.0, 236: 14.0, 241: 6.36, 245: 7.0, 247: 10.5, 254: 14.0, 255: 7.45, 261: 9.0, 262: 9.0, 264: 12.5, 265: 10.0, 268: 11.0, 270

--------------------------

# `04` KNN Item-based Collaborative Filtering

Practice for Using Scikit Surprise Library

## `i` Data Loading

Load `songsDataset.csv` file into a dataframe

In [21]:
# Location of the explicit-ratings table (Colab upload directory).
ratings_csv_path = '/content/songsDataset.csv'
df = pd.read_csv(ratings_csv_path)
df.head()

Unnamed: 0,userID,songID,rating
0,0,90409,5
1,4,91266,1
2,5,8063,2
3,5,24427,4
4,5,105433,4


In [22]:
# (number of rating records, number of columns)
df.shape

(72046, 3)

## `ii` Prepare Data

Procedures to Follow:
- Instantiate the Reader Object (see, [Documentation](https://surprise.readthedocs.io/en/stable/reader.html))
- Load the Data into `surprise.dataset.Dataset` (see, [Documentation](https://surprise.readthedocs.io/en/stable/dataset.html))
- Build the full (i.e. without folds) `surprise.Trainset` (see, [Documentation](https://surprise.readthedocs.io/en/stable/trainset.html#:~:text=It%20is%20used%20by%20the%20fit()%20method%20of%20every%20prediction%20algorithm.%20You%20should%20not%20try%20to%20build%20such%20an%20object%20on%20your%20own%20but%20rather%20use%20the%20Dataset.folds()%20method%20or%20the%20DatasetAutoFolds.build_full_trainset()%20method.))

In [23]:
# Spell out the rating scale instead of relying on Reader's default of (1, 5).
reader = Reader(rating_scale=(1, 5))

In [24]:
# Surprise reads the frame positionally as (user id, item id, rating),
# so the column order is made explicit here.
ratings_columns = df[["userID", "songID", "rating"]]
data = Dataset.load_from_df(ratings_columns, reader)
data

<surprise.dataset.DatasetAutoFolds at 0x7e00ab2e1b90>

## `iii` Initialize the `KNNWithMeans` Model

**Note**: `KNNWithMeans` uses the normalized ratings instead of the raw ones. (See [Documentation](https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithMeans))

**Hint**: Use $k=10$ and configure `sim_options` to be:
- item_based
- pearson

In [25]:
# Item-based neighbourhoods (user_based=False) compared with Pearson
# correlation, using up to k=10 neighbours per prediction.
knn_model = KNNWithMeans(
    k=10,
    sim_options={"name": "pearson", "user_based": False},
)

## `iv` Fit the Model on Data

In [26]:
# Train on every available rating (no hold-out folds).
trainset = data.build_full_trainset()
n_users, n_items = trainset.n_users, trainset.n_items
print(n_users, n_items)  # Number of users and items
knn_model.fit(trainset)

53963 56
Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7e00af8de9d0>

## `v` Calculate Predicted Rating $\hat{r}$ for User $199988$

**Hint**: You can use the `.predict()` method of the model (see [Documentation](https://surprise.readthedocs.io/en/stable/getting_started.html?highlight=.predict#train-on-a-whole-trainset-and-the-predict-method:~:text=pred%20%3D%20algo.predict(uid%2C%20iid%2C%20r_ui%3D4%2C%20verbose%3DTrue)))

In [34]:
# Check if '199988' exists as a column in the utility matrix
#if 199988 in utility_matrix.columns:
    # Filter out items that have not been rated by this user
#    potential_items = utility_matrix[utility_matrix[199988] == 0].index
 #   print(f"Potential items for user 199988: {list(potential_items)}")
#else:
 #   print("User 199988 not found in the utility matrix.")


In [28]:
# Songs as rows, users as columns, ratings as values; missing pairs become 0.
# NOTE: this rebinds `utility_matrix`, replacing the listen-count matrix built
# in the content-based section.
utility_matrix = (
    df
    .pivot(index='songID', columns='userID', values='rating')
    .fillna(0)
)

In [31]:
# Songs user 199988 has no rating for (stored as 0 in the utility matrix).
user_ratings = utility_matrix[199988]
potential_items = user_ratings[user_ratings == 0].index
potential_items


Index([  2263,   3785,   8063,  12709,  13859,  16548,  17029,  19670,  22763,
        24427,  25182,  28985,  36561,  40712,  42781,  42906,  43827,  45026,
        45934,  48731,  52611,  54042,  55240,  55622,  60465,  60888,  62954,
        68572,  71582,  72017,  72309,  74640,  79622,  86341,  90409,  91266,
        92881,  94535,  94604, 105421, 105433, 112023, 113954, 119103, 120147,
       122065, 123176, 125557, 126757, 131048, 132189, 134732],
      dtype='int64', name='songID')

In [32]:
# Predict user 199988's rating for every song they have not rated yet.
# One record per candidate is collected and the frame is built once. Local
# names are chosen so that the `ratings` dict from the content-based section
# is not overwritten, which keeps the earlier prediction cell re-runnable.
prediction_rows = []
for song_id in potential_items:
    prediction = knn_model.predict(199988, song_id)
    prediction_rows.append(
        {
            "recommended_song": prediction.iid,
            "predicted_rating": round(prediction.est, 4),
        }
    )

# `columns=` keeps both columns present even when there are no candidates.
song_predictions = pd.DataFrame(
    prediction_rows, columns=["recommended_song", "predicted_rating"]
).set_index("recommended_song")
song_predictions.head()


Unnamed: 0_level_0,predicted_rating
recommended_song,Unnamed: 1_level_1
2263,4.2618
3785,3.9266
8063,4.2398
12709,4.0844
13859,4.4139


## `vi` Recommend Top 10 Songs

In [33]:
# Rank the candidates from highest to lowest predicted rating and show the
# first ten rows.
song_predictions_sorted = song_predictions.sort_values(by="predicted_rating", ascending=False)
song_predictions_sorted.iloc[:10]

Unnamed: 0_level_0,predicted_rating
recommended_song,Unnamed: 1_level_1
62954,5.0
122065,5.0
71582,5.0
52611,5.0
40712,5.0
60888,5.0
132189,5.0
112023,4.9996
126757,4.9836
92881,4.9411


----------------------------------------------

$$ Wish \space you \space all \space the \space best \space ♡ $$
$$ Mahmoud \space Shawqi $$