In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

from scipy.spatial.distance import cosine
import math
from collections import Counter

In [2]:
# Load the listening matrix from CSV, then report its dimensions
# and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='radio_songs.csv')
print(data.shape)
data.head(5)

(100, 285)


Unnamed: 0,user,abba,ac/dc,adam green,aerosmith,afi,air,alanis morissette,alexisonfire,alicia keys,...,timbaland,tom waits,tool,tori amos,travis,trivium,u2,underoath,volbeat,yann tiersen
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,33,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,42,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,51,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,62,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 1. Collaborative Filtering

#### A. Use this user-item matrix to recommend 10 songs to users who have listened to 'u2' and 'pink floyd'. Use item-item collaborative filtering to find songs that are similar using spatial distance with cosine. 

#### If based on users listening to both 'u2' and 'pink floyd', the ranking will be: 

In [3]:
# Every column after the first ('user') is treated as an item column.
song_columns = data.iloc[:, 1:]
song_column_names = song_columns.columns
cosine_list = {}

u2 = data[['u2']]
pink_floyd = data[['pink floyd']]

# scipy's `cosine` expects 1-D vectors: passing (n, 1) arrays was deprecated in
# SciPy 1.7 and raises "Input vector should be 1-D." from SciPy 1.9 onwards,
# so flatten the two seed columns once, outside the loop.
u2_vector = u2['u2'].to_numpy()
pink_floyd_vector = pink_floyd['pink floyd'].to_numpy()

for name in song_column_names:
    candidate_vector = song_columns[name].to_numpy()
    # Combined score = similarity to 'u2' + similarity to 'pink floyd'
    # (similarity = 1 - cosine distance).
    similarity = 1 - cosine(u2_vector, candidate_vector) + 1 - cosine(pink_floyd_vector, candidate_vector)
    # Columns with no listeners give an undefined (NaN) similarity; skip them.
    if not math.isnan(similarity):
        cosine_list[name] = similarity

In [4]:
# Rank every scored song from most to least similar to the two seeds.
sorted_list = sorted(cosine_list.items(), key = lambda x: x[1], reverse = True)

# Drop the seeds by name rather than by slicing off the first two positions:
# a song heard by every listener of both seeds can outscore the seeds
# themselves, in which case a positional skip would discard a real
# recommendation and keep a seed in the list.
seed_songs = {'u2', 'pink floyd'}
[song for song, _ in sorted_list if song not in seed_songs][:10]

['robbie williams',
 'genesis',
 'johnny cash',
 'misfits',
 'audioslave',
 'foo fighters',
 'pearl jam',
 'green day',
 'david bowie',
 'depeche mode']

#### If based on users listening to only 'u2', the ranking will be: 

In [5]:
# Score only the item columns: the 'user' id column is not a song and must not
# compete for a place in the ranking.
song_names = data.columns.drop('user')
rec_u2 = {}
for i in song_names:
    rec_u2[i] = 1 - cosine(data['u2'], data[i])

df_u2 = pd.DataFrame(list(rec_u2.items()), columns = ['song','score'])
sorted_u2 = df_u2.sort_values(['score'], ascending = False)
# Remove the seed by name instead of assuming it is always the first row,
# then show the ten most similar songs.
sorted_u2[sorted_u2['song'] != 'u2'].head(10)

Unnamed: 0,song,score
221,robbie williams,0.5
177,misfits,0.5
112,green day,0.433013
72,depeche mode,0.408248
202,peter fox,0.377964
78,dire straits,0.353553
149,kelly clarkson,0.353553
165,madonna,0.353553
91,enter shikari,0.353553
137,johnny cash,0.353553


#### If based on users listening to only 'pink floyd', the ranking will be: 

In [6]:
# Score only the item columns: the 'user' id column is not a song and must not
# compete for a place in the ranking.
song_names = data.columns.drop('user')
rec_pink_floyd = {}
for i in song_names:
    rec_pink_floyd[i] = 1 - cosine(data['pink floyd'], data[i])

df_pink_floyd = pd.DataFrame(list(rec_pink_floyd.items()), columns = ['song','score'])
sorted_pink_floyd = df_pink_floyd.sort_values(['score'], ascending = False)
# Remove the seed by name instead of assuming it is always the first row,
# then show the ten most similar songs.
sorted_pink_floyd[sorted_pink_floyd['song'] != 'pink floyd'].head(10)

Unnamed: 0,song,score
106,genesis,0.57735
117,hans zimmer,0.408248
66,david bowie,0.408248
208,queen,0.408248
157,led zeppelin,0.408248
234,sonic syndicate,0.408248
105,funeral for a friend,0.408248
60,coldplay,0.348155
262,the rolling stones,0.333333
169,maria mena,0.333333


#### B. Find user most similar to user 1606. Use user-user collaborative filtering with cosine similarity. List the recommended songs for user 1606 (Hint: find the songs listened to by the most similar user).

In [7]:
# Transpose so that each column is one user's listening vector: the first
# transposed row holds the user ids and becomes the column header.
new_header = data.T.iloc[0]
df_trans = data.T[1:]
df_trans.columns = new_header

target_user = 1606
target_vector = df_trans[target_user]

# Cosine similarity (1 - cosine distance) between the target user and every user.
rec_1606 = {}
for i in df_trans.columns:
    rec_1606[i] = 1 - cosine(target_vector, df_trans[i])

df_1606 = pd.DataFrame(list(rec_1606.items()), columns = ['user','score'])
sorted_1606 = df_1606.sort_values(['score'], ascending = False)
# Exclude the target user by id rather than by position: another user with an
# identical history would tie at 1.0 and could sort ahead of the target.
sorted_1606[sorted_1606['user'] != target_user].head(5)

Unnamed: 0,user,score
65,1144,0.27735
8,144,0.223607
79,1334,0.150756
89,1509,0.144338
45,890,0.129099


In [8]:
# Listening row(s) of user 1144, the top match in the similarity table above.
# Filter and drop the id column in one chained expression: calling
# drop(..., inplace=True) on a filtered slice mutates a possible copy and
# triggers pandas' SettingWithCopyWarning.
user1144 = data[data['user'] == 1144].drop(columns='user')
# True for every song with a non-zero entry for this user.
song_1144 = (user1144 != 0).any()

rec_song_1144 = song_1144.index[song_1144]
rec_song_1144

Index(['beastie boys', 'bob dylan', 'bob marley & the wailers', 'david bowie',
       'elvis presley', 'eric clapton', 'johnny cash', 'pearl jam',
       'pink floyd', 'the beatles', 'the doors', 'the rolling stones',
       'tom waits'],
      dtype='object')

#### C. How many of the recommended songs have already been listened to by user 1606?

In [9]:
# Listening row(s) of user 1606 without the id column. Chained filter + drop
# avoids mutating a filtered slice in place (SettingWithCopyWarning).
user1606 = data[data['user'] == 1606].drop(columns='user')
# True for every song with a non-zero entry for this user.
song_1606 = (user1606 != 0).any()

# Songs listened to by both user 1144 (the recommendation source) and user 1606.
song_11441606 = song_1606.index[song_1144 & song_1606]
song_11441606

Index(['elvis presley', 'the beatles'], dtype='object')

Thus, 2 of the recommended songs ('elvis presley' and 'the beatles') have already been listened to by user 1606.

#### D. Use a combined user-item approach to build a recommendation score for each song for each user, using the following steps for each user:

1. For each song for the user row, get the top 10 similar songs and their similarity score.
2. For each of the top 10 similar songs, get a list of the user purchases
3. Calculate a recommendation score as follows: $\text{score} = \dfrac{\sum (\text{purchaseHistory} \times \text{similarityScore})}{\sum \text{similarityScore}}$
4. What are the top 5 song recommendations for user 1606?

In [10]:
# get the top 10 similar songs and their similarity score
def get_10_similar_songs(name, columns, top_n=10):
    """Return the songs most similar to ``name`` by cosine similarity.

    Parameters
    ----------
    name : str
        Column label of the reference song; must be a column of ``columns``.
    columns : pandas.DataFrame
        Frame with one column per song. Only this frame is read; no
        notebook-level variable is consulted.
    top_n : int, default 10
        Maximum number of similar songs to return.

    Returns
    -------
    list of (str, float)
        ``(song, similarity)`` pairs sorted by decreasing similarity, where
        similarity is ``1 - cosine distance``. The reference song itself is
        never included. Songs whose column is all zeros are skipped because
        their cosine similarity is undefined; if the reference column is all
        zeros the result is an empty list.
    """
    # scipy's `cosine` requires 1-D input (2-D input raises from SciPy 1.9).
    reference = columns[name].to_numpy(dtype=float)
    if not reference.any():
        # Zero vector: similarity to every other song is undefined.
        return []

    similarities = {}
    for other in columns.columns:
        # Exclude the reference by label. Dropping the first sorted entry
        # instead would remove the wrong song whenever another column ties
        # with the reference at similarity 1.0.
        if other == name:
            continue
        candidate = columns[other].to_numpy(dtype=float)
        if not candidate.any():
            continue
        similarity = 1 - cosine(reference, candidate)
        if not math.isnan(similarity):
            similarities[other] = similarity

    ranked = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_n]

In [11]:
# calculate a recommendation score
def recommended_system(user, data):
    """Score every song for one user from the user's history on similar songs.

    For each song, the similar songs returned by ``get_10_similar_songs`` are
    looked up in the user's row and combined as

        sum(history * similarity) / sum(similarity)

    Parameters
    ----------
    user : scalar
        Value to match against ``data['user']``. If several rows match, the
        first one is used.
    data : pandas.DataFrame
        Frame whose first column is ``'user'`` and whose remaining columns
        are songs.

    Returns
    -------
    list of (str, float)
        ``(song, score)`` pairs sorted by decreasing score. Songs with no
        usable similar songs (none found, or similarities summing to zero)
        are omitted. Songs the user has already listened to are not filtered
        out.

    Raises
    ------
    IndexError
        If no row of ``data`` has ``data['user'] == user``.
    """
    user_rows = data[data['user'] == user]
    if user_rows.empty:
        raise IndexError("user {!r} not found in data['user']".format(user))
    # One row as a Series, so each song lookup yields a scalar rather than a
    # 1-element array (float() on such arrays is deprecated in recent NumPy).
    history = user_rows.iloc[0]

    columns = data.iloc[:, 1:]
    song_scores = {}
    for name in columns.columns:
        similar_songs = get_10_similar_songs(name, columns)
        score_sum = sum(score for _, score in similar_songs)
        # A zero denominator would produce a NaN score, and NaN keys make the
        # final sort order unreliable; skip such songs.
        if not similar_songs or score_sum == 0:
            continue
        weighted_sum = sum(float(history[song]) * score for song, score in similar_songs)
        song_scores[name] = weighted_sum / score_sum

    return sorted(song_scores.items(), key=lambda item: item[1], reverse=True)

In [12]:
# Score every song for user 1606, then display the five highest-scoring ones
recommended_1606 = recommended_system(user=1606, data=data)
recommended_1606[0:5]

[('elvis presley', 0.28932783543098956),
 ('abba', 0.23902308185961818),
 ('eric clapton', 0.20274011674755033),
 ('frank sinatra', 0.20113933811458254),
 ('howard shore', 0.17174865637166115)]

### 5. Conceptual Questions

1. Name 2 other similarity measures that you can use instead of cosine similarity above.

---- Euclidean distance / Pearson correlation similarity / other pairwise distance metrics

2. What is needed to build a Content-Based Recommender system?

---- Content-based recommender systems recommend items to a user based on the similarity between items. They rely on each item's description or features, and identify similar items by comparing those descriptions. Popular methods include similarity-based methods, one-class SVMs, matrix factorisation, and supervised learning. In addition, Term Frequency (TF) and Inverse Document Frequency (IDF) are commonly used in content-based filtering to turn text descriptions into comparable features.

3. Name 2 methods to evaluate your recommender system.

---- Decision support accuracy metrics: Reversal rate, Weighted errors, ROC and PRC, Precision, Recall and F-measure. 

---- Statistical accuracy metrics: Mean Absolute Error (MAE) , Root Mean Square Error (RMSE) and Correlation. 

---- Ranking metrics: Normalized Discounted Cumulative Gain (nDCG).