# `01` Import Necessary Libraries

## `i` Default Libraries

In [87]:
import numpy as np
import pandas as pd

----------------------------

# `02` Load Data

In [88]:
ratings = pd.read_csv("/content/songsDataset.csv", names=['userID', 'songID', 'rating'], skiprows=[0])
ratings.head(5)

Unnamed: 0,userID,songID,rating
0,0,90409,5
1,4,91266,1
2,5,8063,2
3,5,24427,4
4,5,105433,4


---------------------------------

# `03` Similarity Metrics

## `0` Utility Matrix


In [89]:
ratings.columns

Index(['userID', 'songID', 'rating'], dtype='object')

In [90]:
utility_matrix = ratings.pivot(index='userID', columns='songID', values='rating')

In [91]:
utility_matrix.fillna(0, inplace=True)

In [92]:
utility_matrix

songID,2263,2726,3785,8063,12709,13859,16548,17029,19299,19670,...,113954,119103,120147,122065,123176,125557,126757,131048,132189,134732
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
199980,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199988,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199990,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## `i` Cosine Similarity

In [93]:
def cosine_sim(vec_a, vec_b):
    """
    Returns the raw cosine similarity score between two vectors.

            Parameters:
                vec_a (pandas.Series): Vector A
                vec_b (pandas.Series): Vector B

            Returns:
                sim_score (float): Similarity score between vectors vec_a and vec_b
    """
    product = np.sum(vec_a * vec_b)

    magnitude_a = np.sqrt(np.sum(vec_a ** 2))
    magnitude_b = np.sqrt(np.sum(vec_b ** 2))

    sim_score = product / (magnitude_a * magnitude_b)

    return sim_score

In [94]:
print(f'Cosine Similarity between userID 56 and userID 227 is: {cosine_sim(utility_matrix.iloc[56].copy(), utility_matrix.iloc[227].copy())}')

Cosine Similarity between userID 56 and userID 227 is: 0.7808688094430304


## `ii` Adjusted Cosine Similarity

In [95]:
def adjusted_cosine_sim(vec_a, vec_b):
    """
    Returns the adjusted cosine similarity score between two vectors.

            Parameters:
                vec_a (pandas.Series): Vector A
                vec_b (pandas.Series): Vector B

            Returns:
                sim_score (float): Similarity score between vectors vec_a and vec_b
    """
    mean_a = np.mean(vec_a)
    mean_b = np.mean(vec_b)

    product = np.sum((vec_a - mean_a) * (vec_b - mean_b))

    magnitude_a = np.sqrt(np.sum((vec_a - mean_a) ** 2))
    magnitude_b = np.sqrt(np.sum((vec_b - mean_b) ** 2))

    sim_score = product / (magnitude_a * magnitude_b)

    return sim_score

In [96]:
print(f'Adjusted Cosine Similarity between userID 56 and userID 227 is: {adjusted_cosine_sim(utility_matrix.iloc[56].copy(), utility_matrix.iloc[227].copy())}')

Adjusted Cosine Similarity between userID 56 and userID 227 is: 0.7764278070396684


## `iii` Pearson Correlation Coefficient

In [97]:
def pearson_sim(vec_a, vec_b):
    """
    Returns the pearson similarity score between two vectors.

            Parameters:
                vec_a (pandas.Series): Vector A
                vec_b (pandas.Series): Vector B

            Returns:
                sim_score (float): Similarity score between vectors vec_a and vec_b
    """
    mean_a = np.mean(vec_a)
    mean_b = np.mean(vec_b)

    product = np.sum((vec_a - mean_a) * (vec_b - mean_b))

    magnitude_a = np.sqrt(np.sum((vec_a - mean_a) ** 2))
    magnitude_b = np.sqrt(np.sum((vec_b - mean_b) ** 2))

    sim_score = product / (magnitude_a * magnitude_b)

    return sim_score

In [98]:
print(f'Pearson Similarity between songID 3785 and songID 17029 is: {pearson_sim(utility_matrix[3785].copy(), utility_matrix[17029].copy())}')

Pearson Similarity between songID 3785 and songID 17029 is: -0.015085785303531213


## `iv` Mean Squared Difference

In [99]:
def msd_sim(vec_a, vec_b):
    """
    Returns the mean squared difference similarity score between two vectors.
    Note: Only consider common items between the two vectors

            Parameters:
                vec_a (pandas.Series): Vector A
                vec_b (pandas.Series): Vector B

            Returns:
                sim_score (float): Similarity score between vectors vec_a and vec_b
    """
    common_indices_non_zero = (vec_a != 0) & (vec_b != 0)

    vec_a_common = vec_a[common_indices_non_zero]
    vec_b_common = vec_b[common_indices_non_zero]

    if len(vec_a_common) == 0 or len(vec_b_common) == 0:
        return 0.0

    diff_squared = (vec_a_common - vec_b_common) ** 2
    msd = diff_squared.mean()

    sim_score = 1 / (msd + 1)

    return sim_score

In [100]:
print(f'MSD Similarity between userID 56 and userID 227 is: {msd_sim(utility_matrix.iloc[56].copy(), utility_matrix.iloc[227].copy())}')
print(f'MSD Similarity between songID 3785 and songID 17029 is: {msd_sim(utility_matrix[3785].copy(), utility_matrix[17029].copy())}')

MSD Similarity between userID 56 and userID 227 is: 1.0
MSD Similarity between songID 3785 and songID 17029 is: 0.6363636363636364


--------------------------

# `04` Collaborative Filtering

Practice for item-based collaborative filtering

## `0` Utility Matrix
Construct utility matrix for the loaded data `ratings`
- Songs as Index
- Users as Columns

In [101]:
utility_matrix = ratings.pivot(index='songID', columns='userID', values='rating')

In [102]:
utility_matrix.fillna(0, inplace=True)

In [103]:
utility_matrix.head()

userID,0,4,5,7,14,20,31,33,40,46,...,199956,199969,199973,199974,199975,199976,199980,199988,199990,199996
songID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
3785,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8063,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## `i` Item-Item Similarity Matrix

Construct item-item (Cosine/Adjusted Cosine) similarity matrix from the utility matrix  above.

In [104]:
number_of_items = utility_matrix.shape[0]
sim_mat = np.zeros((number_of_items,number_of_items))
for i in range(number_of_items):
  for j in range(number_of_items):
    sim_mat[i,j] = adjusted_cosine_sim(utility_matrix.iloc[i] , utility_matrix.iloc[j])

In [105]:
sim_df = pd.DataFrame(sim_mat, index=utility_matrix.index, columns=utility_matrix.index)
sim_df.head()

songID,2263,2726,3785,8063,12709,13859,16548,17029,19299,19670,...,113954,119103,120147,122065,123176,125557,126757,131048,132189,134732
songID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2263,1.0,-0.0065,-0.017511,-0.016326,-0.01752,-0.013347,-0.022847,-0.007725,-0.017581,-0.017882,...,-0.007331,-0.008085,-0.009286,-0.004556,-0.020674,-0.014522,-0.011948,-0.013081,-0.017415,-0.012329
2726,-0.0065,1.0,-0.016699,-0.01094,-0.016806,-0.011452,-0.023635,0.01324,-0.019354,-0.020125,...,-0.000729,0.00947,0.013797,-0.016811,-0.018107,-0.009166,-0.011642,-0.012274,-0.02302,-0.007772
3785,-0.017511,-0.016699,1.0,0.001511,-0.002429,-0.007363,-0.010149,-0.015086,-0.013344,-0.014637,...,-0.015709,-0.00797,-0.015821,-0.015284,-0.005766,-0.010261,-0.0133,-0.007578,-0.00749,-0.003461
8063,-0.016326,-0.01094,0.001511,1.0,-0.003506,-0.001862,-0.013025,-0.005731,0.007944,-0.016066,...,-0.01948,-0.001559,-0.014644,-0.015865,-0.004209,-0.006944,-0.011152,-0.006553,-0.013862,0.005777
12709,-0.01752,-0.016806,-0.002429,-0.003506,1.0,-0.011653,-0.014726,-0.004692,-0.002641,-0.006035,...,-0.014878,-0.011811,-0.006868,-0.007521,-0.013235,-0.011558,-0.016553,-0.009346,0.000393,-0.005


## `ii` Candidate Generation and Filtering

Filter out items (user 199988) has rated from the similarity matrix above.

In [106]:
user_id = 199988
potential_items = utility_matrix.loc[:, 199988][utility_matrix.loc[:, 199988] != 0].index
potential_items

Index([2726, 19299, 43267, 56660], dtype='int64', name='songID')

In [107]:
filtered_sim_df = sim_df.loc[potential_items].drop(columns=potential_items)

In [108]:
filtered_sim_df

songID,2263,3785,8063,12709,13859,16548,17029,19670,22763,24427,...,113954,119103,120147,122065,123176,125557,126757,131048,132189,134732
songID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2726,-0.0065,-0.016699,-0.01094,-0.016806,-0.011452,-0.023635,0.01324,-0.020125,-0.023036,-0.017468,...,-0.000729,0.00947,0.013797,-0.016811,-0.018107,-0.009166,-0.011642,-0.012274,-0.02302,-0.007772
19299,-0.017581,-0.013344,0.007944,-0.002641,-0.003426,-0.001161,-0.012135,-0.004975,0.005505,0.001551,...,-0.018252,-0.005171,-0.015623,-0.002624,0.014827,-0.013707,-0.017614,-0.000595,-0.011755,0.00685
43267,-0.009534,-0.008429,-0.010259,-0.013956,-0.012019,-0.016373,0.007456,-0.018029,-0.01754,-0.007722,...,-0.00747,0.016524,0.018883,-0.015632,-0.01338,-0.006118,-0.003468,-0.010407,-0.018965,0.004009
56660,-0.016032,-0.007015,-0.009887,0.004105,-0.014507,-0.020362,-0.007994,-0.00904,-0.008463,-0.00702,...,-0.019435,-0.007092,-0.011,-0.010886,-0.003124,-0.009896,-0.014101,-0.009387,0.002315,0.000836


## `iii` Top-K Candidate Selection

Select top-K (a k of your choice) similar items for each item (user 199988) rated from the filtered similarity matrix above.

In [109]:
def select_top_k_similar_items(similarity_matrix, k, potential_items):
    top_k_similar_items = {}

    for songID in potential_items:
        similarities = similarity_matrix.loc[songID]
        top_similar_items = similarities.nlargest(k)
        top_k_similar_items[songID] = top_similar_items

    return top_k_similar_items

k = 6
top_k_similar_items = select_top_k_similar_items(filtered_sim_df, k, potential_items)
top_k_similar_items = pd.DataFrame(top_k_similar_items)
top_k_similar_items

Unnamed: 0_level_0,2726,19299,43267,56660
songID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8063,,0.007944,,
12709,,,,0.004105
17029,0.01324,,0.007456,
22763,,0.005505,,
25182,,,,0.002357
40712,0.012574,,,
42906,,,0.011221,
43827,,0.00763,,
45026,0.007124,,0.010135,
48731,,,,0.015657


## `iv` Candidate Rating Prediction

Calculate the predicted rating for each of the candidate items.

In [110]:
rows_to_append = []

can = top_k_similar_items.index
for c in can:
    similar_items = top_k_similar_items.loc[c].dropna()

    values = {
        'candidate': c,
        'ref_1': np.nan,
        'ref_1_similarity': np.nan,
        'ref_1_rating': np.nan,
        'ref_2': np.nan,
        'ref_2_similarity': np.nan,
        'ref_2_rating': np.nan,
        'predicted_rating': np.nan
    }

    if len(similar_items) >= 1:
        values['ref_1'] = similar_items.index[0]
        values['ref_1_similarity'] = similar_items.values[0]
        values['ref_1_rating'] = utility_matrix[user_id].loc[values['ref_1']]
        values['predicted_rating'] = values['ref_1_rating']


    if len(similar_items) >= 2:
        values['ref_2'] = similar_items.index[1]
        values['ref_2_similarity'] = similar_items.values[1]
        values['ref_2_rating'] = utility_matrix[user_id].loc[values['ref_2']]
        values['predicted_rating'] = (values['ref_1_rating'] *  values['ref_1_similarity'] + values['ref_2_rating'] *  values['ref_2_similarity']) / (values['ref_1_similarity'] + values['ref_2_similarity'])

    rows_to_append.append(values)

df_songs = pd.DataFrame(rows_to_append)


In [111]:
df_songs

Unnamed: 0,candidate,ref_1,ref_1_similarity,ref_1_rating,ref_2,ref_2_similarity,ref_2_rating,predicted_rating
0,8063,19299,0.007944,5.0,,,,5.0
1,12709,56660,0.004105,5.0,,,,5.0
2,17029,2726,0.01324,5.0,43267.0,0.007456,3.0,4.279497
3,22763,19299,0.005505,5.0,,,,5.0
4,25182,56660,0.002357,5.0,,,,5.0
5,40712,2726,0.012574,5.0,,,,5.0
6,42906,43267,0.011221,3.0,,,,3.0
7,43827,19299,0.00763,5.0,,,,5.0
8,45026,2726,0.007124,5.0,43267.0,0.010135,3.0,3.825542
9,48731,56660,0.015657,5.0,,,,5.0
