$$ ITI \space AI-Pro: \space Intake \space 45 $$
$$ Recommender \space Systems $$
$$ Lab \space no. \space 2 $$

# `01` Import Necessary Libraries

## `i` Default Libraries

In [1]:
import numpy as np
import pandas as pd
from surprise.reader import Reader
from surprise.dataset import Dataset
from surprise.model_selection import train_test_split
from surprise.prediction_algorithms.knns import KNNWithMeans

## `ii` Additional Libraries
Add imports for additional libraries you used throughout the notebook

In [2]:
from sklearn.metrics.pairwise import cosine_similarity


## `iii` Global Variables 

In [3]:
# K = 5

----------------------------

# `02` Load Data

 The dataset will have the following columns :
   - song_id (String) : Unique identified for the song
   - user_id (String) : Unique identifier for the user
   - song_genre (Integer) : An integer representing a genre for the song, value is between 1 and 5, indicating that there are 5 unique genres. Each song can only have 1 genre
   - artist_id (String) : Unique identifier for the author of the song
   - n_listen (Integer) : The number of times this user has heard the song (0 -> 15)
   - publish_year (Integer) : The year of song publishing

In [4]:
data = pd.read_csv("Data/songs_data.csv")
data.head()

Unnamed: 0,song_id,artist_id,song_genre,user_id,n_listen,publish_year
0,537,368,4,2066,13,2002
1,921,107,1,1179,5,2006
2,352,188,1,1468,11,2013
3,853,370,4,460,9,2020
4,479,408,2,1125,3,2020


--------------------------

# `03` Content-based Filtering

Practice for content-based filtering on dummy data

## `i` Feature Engineering/Selection
Construct the item vector representation matrix from the `data` above

In [5]:
# Group by song_id and calculate the average number of listens
item_vectors = data.groupby('song_id').agg({
    'artist_id': 'first',  # Assuming each song has one artist
    'song_genre': 'first',  # Assuming each song has one genre
    'publish_year': 'first',  # Assuming each song has one publish year
    'n_listen': 'mean'  # Average number of listens
}).reset_index()

# Set the index to song_id
item_vectors.set_index('song_id', inplace=True)

item_vectors.head()

Unnamed: 0_level_0,artist_id,song_genre,publish_year,n_listen
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,156,2,2002,7.963362
2,328,3,2016,7.801782
3,219,4,2015,7.405462
4,361,2,2004,7.598272
5,297,5,2004,7.53373


## `ii` Utility Matrix
Construct utility matrix for the loaded dataframe `data`

In [6]:
# Construct the utility matrix
utility_matrix = data.pivot_table(index='user_id', columns='song_id', values='n_listen', fill_value=0)
utility_matrix.head()

song_id,1,2,3,4,5,6,7,8,9,10,...,991,992,993,994,995,996,997,998,999,1000
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
2,15.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,8.0,0.0,0.0,0.0,0.0,11.0,0.0,6.0
3,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,11.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0


## `iii` Item-Item Similarity Matrix

Construct item-item (Cosine/Adjusted Cosine) similarity matrix.

In [11]:
# Calculate the cosine similarity between songs
song_similarity = cosine_similarity(utility_matrix.T)
song_similarity_df = pd.DataFrame(song_similarity, index=utility_matrix.columns, columns=utility_matrix.columns)
song_similarity_df.head()
# song_similarity.shape


song_id,1,2,3,4,5,6,7,8,9,10,...,991,992,993,994,995,996,997,998,999,1000
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.103492,0.125318,0.104358,0.115531,0.113679,0.09839,0.08458,0.155508,0.116555,...,0.120627,0.090102,0.116106,0.133108,0.136135,0.09914,0.122903,0.08955,0.121334,0.107726
2,0.103492,1.0,0.129135,0.105922,0.127303,0.098625,0.104379,0.095586,0.095924,0.128374,...,0.143357,0.126049,0.120713,0.095244,0.102231,0.12141,0.106678,0.12798,0.107115,0.095038
3,0.125318,0.129135,1.0,0.092726,0.143701,0.116769,0.124291,0.095891,0.10067,0.099052,...,0.079593,0.118746,0.106723,0.129556,0.108218,0.10312,0.126577,0.136246,0.0737,0.097494
4,0.104358,0.105922,0.092726,1.0,0.110849,0.127045,0.096961,0.093863,0.085224,0.11343,...,0.1036,0.132431,0.148915,0.093695,0.095771,0.11945,0.101084,0.100977,0.137077,0.114204
5,0.115531,0.127303,0.143701,0.110849,1.0,0.097707,0.096947,0.137108,0.110604,0.112007,...,0.09927,0.12393,0.120146,0.168898,0.135543,0.088356,0.099578,0.099027,0.101542,0.116841


In [9]:
# Normalize the utility matrix by subtracting the mean rating of each user
user_mean = utility_matrix.mean(axis=1)
normalized_utility_matrix = utility_matrix.sub(user_mean, axis=0)

# Calculate the adjusted cosine similarity
adjusted_cosine_similarity = cosine_similarity(normalized_utility_matrix.T)
adjusted_cosine_similarity_df = pd.DataFrame(
    adjusted_cosine_similarity, 
    index=utility_matrix.columns, 
    columns=utility_matrix.columns
)

adjusted_cosine_similarity_df.head()

song_id,1,2,3,4,5,6,7,8,9,10,...,991,992,993,994,995,996,997,998,999,1000
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,-0.012372,0.010704,-0.012589,-0.006207,-0.00277,-0.017892,-0.032921,0.047479,0.002743,...,0.008055,-0.025983,-0.001513,0.018727,0.024555,-0.018381,0.011152,-0.030406,0.008451,-0.009266
2,-0.012372,1.0,0.018636,-0.007126,0.010482,-0.015778,-0.007366,-0.016555,-0.015606,0.020205,...,0.037449,0.018345,0.007328,-0.020485,-0.009829,0.010431,-0.002963,0.016567,-0.0037,-0.019831
3,0.010704,0.018636,1.0,-0.023619,0.027498,0.003042,0.013467,-0.017837,-0.011899,-0.014399,...,-0.035842,0.00857,-0.010093,0.01665,-0.004701,-0.01178,0.017783,0.02434,-0.042923,-0.018694
4,-0.012589,-0.007126,-0.023619,1.0,-0.00938,0.015033,-0.016922,-0.019722,-0.028876,0.002171,...,-0.008435,0.024342,0.038006,-0.023451,-0.018324,0.007037,-0.010485,-0.015089,0.028807,0.000566
5,-0.006207,0.010482,0.027498,-0.00938,1.0,-0.025188,-0.023717,0.022243,-0.007393,-0.006755,...,-0.020273,0.008029,-0.001061,0.055397,0.019805,-0.034804,-0.019486,-0.02382,-0.018109,-0.003093


## `iv` Top-K Candidate Generation

Selet top-K (a k of your choice) similar items for each item (a user of your choice) rated from the similarity matrix above.

In [10]:
K = 5

## `v` Candidate Filtering

Filter out items (your user) has rated from the candidates above.

In [11]:
# Select a user of your choice
user_id = 1

# Get the songs rated by the user
rated_songs = utility_matrix.loc[user_id]
rated_songs = rated_songs[rated_songs > 0].index

# Generate top-K similar items for each rated song
top_k_candidates = {}
for song in rated_songs:
    # Get similarity scores for the current song
    similar_songs = adjusted_cosine_similarity_df[song].sort_values(ascending=False)
    
    # Exclude the current song and select top-K similar songs
    top_k_similar = similar_songs.drop(song).head(K)
    top_k_candidates[song] = top_k_similar

# Display the top-K candidates for each rated song
top_k_candidates

{7: song_id
 931    0.051350
 262    0.050578
 544    0.049723
 345    0.049164
 598    0.045788
 Name: 7, dtype: float64,
 16: song_id
 993    0.059600
 363    0.053453
 972    0.047728
 121    0.047685
 46     0.047419
 Name: 16, dtype: float64,
 20: song_id
 638    0.068839
 42     0.058616
 878    0.052660
 907    0.052349
 11     0.051654
 Name: 20, dtype: float64,
 30: song_id
 516    0.056848
 304    0.055502
 504    0.054436
 71     0.052198
 171    0.051685
 Name: 30, dtype: float64,
 53: song_id
 676    0.065003
 798    0.056601
 754    0.055554
 600    0.052371
 840    0.049217
 Name: 53, dtype: float64,
 59: song_id
 369    0.065950
 843    0.059185
 965    0.057854
 518    0.052365
 974    0.047367
 Name: 59, dtype: float64,
 61: song_id
 148    0.049108
 983    0.047219
 631    0.046670
 970    0.045101
 502    0.044960
 Name: 61, dtype: float64,
 69: song_id
 676    0.066216
 995    0.061620
 759    0.060857
 669    0.059394
 491    0.054683
 Name: 69, dtype: float64,
 7

## `vi` Candidate Rating Prediction

Calculate the predicted rating for each of the candidate items.

In [12]:
# Calculate the predicted rating for each candidate item
predicted_ratings = {}

for song, similar_songs in top_k_candidates.items():
    # Calculate the weighted sum of ratings for the top-K similar songs
    weighted_sum = 0
    similarity_sum = 0
    
    for similar_song, similarity in similar_songs.items():
        if similar_song in rated_songs:
            weighted_sum += similarity * utility_matrix.loc[user_id, similar_song]
            similarity_sum += similarity
    
    # Avoid division by zero
    if similarity_sum > 0:
        predicted_ratings[song] = weighted_sum / similarity_sum
    else:
        predicted_ratings[song] = 0
# Convert the predicted ratings dictionary into a DataFrame for better readability
predicted_ratings_df = pd.DataFrame(list(predicted_ratings.items()), columns=['Song ID', 'Predicted Rating'])

# Sort the DataFrame by predicted rating in descending order
predicted_ratings_df = predicted_ratings_df.sort_values(by='Predicted Rating', ascending=False)

# Display the top 10 predicted ratings
print("Top 10 Predicted Ratings:")
print(predicted_ratings_df.head(10))

Top 10 Predicted Ratings:
     Song ID  Predicted Rating
21       205         15.000000
99       692         14.499777
4         53         14.000000
22       208         14.000000
134      967         14.000000
45       376         14.000000
73       547         14.000000
12       108         14.000000
136      981         14.000000
24       221         14.000000


--------------------------

# `04` KNN Item-based Colaborative Filtering

Practice for Using Scikit Surprise Library

## `i` Data Loading

Load `songsDataset.csv` file into a dataframe

In [13]:
df = pd.read_csv('Data/songsDataset.csv')
df.head()

Unnamed: 0,userID,songID,rating
0,0,90409,5
1,4,91266,1
2,5,8063,2
3,5,24427,4
4,5,105433,4


## `ii` Prepare Data

Procedures to Follow:
- Instantiate the Reader Object (see, [Documentation](https://surprise.readthedocs.io/en/stable/reader.html))
- Load the Data into `surprise.dataset.Dataset` (see, [Documentation](https://surprise.readthedocs.io/en/stable/dataset.html))
- Build the full (i.e. without folds) `surprise.Trainset` (see, [Documentation](https://surprise.readthedocs.io/en/stable/trainset.html#:~:text=It%20is%20used%20by%20the%20fit()%20method%20of%20every%20prediction%20algorithm.%20You%20should%20not%20try%20to%20build%20such%20an%20object%20on%20your%20own%20but%20rather%20use%20the%20Dataset.folds()%20method%20or%20the%20DatasetAutoFolds.build_full_trainset()%20method.))

In [14]:
reader = Reader(rating_scale=(1, 15))

In [15]:
data = Dataset.load_from_df(df[['userID', 'songID', 'rating']], reader)
data

<surprise.dataset.DatasetAutoFolds at 0x7dfcd0cd5ae0>

## `iii` Initialize the `KNNWithMeans` Model

**Note**: `KNNWithMeans` uses the normalized ratings instead of the raw ones. (See [Documentation](https://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithMeans))

**Hint**: Use $k=10$ and configure `sim_options` to be:
- item_based
- pearson

In [16]:
K = 10

## `iv` Fit the Model on Data

In [19]:
trainset = data.build_full_trainset()
testset = trainset.build_anti_testset()

knn = KNNWithMeans(k=K, sim_options={'name': 'cosine', 'user_based': False})

knn.fit(trainset)

predictions = knn.test(testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [20]:
knn

<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7dfd44b03c10>

## `v` Calculate Predicted Rating $\hat{r}$ for User $199988$

**Hine**: you can use `.predict()` method of the model (see [Documentaion](https://surprise.readthedocs.io/en/stable/getting_started.html?highlight=.predict#train-on-a-whole-trainset-and-the-predict-method:~:text=pred%20%3D%20algo.predict(uid%2C%20iid%2C%20r_ui%3D4%2C%20verbose%3DTrue)))

In [42]:
# Extract the top 5 songs from the predictions
song_predictions = [{'Song ID': pred.iid, 'Predicted Rating': pred.est} for pred in predictions[:5]]

# Create a DataFrame from the list of dictionaries
song_predictions = pd.DataFrame(song_predictions)

# Display the top 5 songs as a table
print(song_predictions.to_string(index=False))


 Song ID  Predicted Rating
   91266          4.519379
    8063          5.092055
   24427          5.054134
  105433          5.179365
  134732          5.048668


## `vi` Recommend Top 10 Songs

In [43]:
song_predictions = [{'Song ID': pred.iid, 'Predicted Rating': pred.est} for pred in predictions[:10]]

# Create a DataFrame from the list of dictionaries
song_predictions = pd.DataFrame(song_predictions)

# Display the top 5 songs as a table
print(song_predictions.to_string(index=False))

 Song ID  Predicted Rating
   91266          4.519379
    8063          5.092055
   24427          5.054134
  105433          5.179365
  134732          5.048668
  105421          4.473427
   19670          4.425180
   79622          4.802118
   86341          4.719136
  131048          4.908481


----------------------------------------------

$$ Wish \space you \space all \space the \space best \space ♡ $$
$$ Abdelrahman \space Eid $$