Data Importing


In [54]:
# import libraries
import pandas as pd
import numpy as np
from typing import List
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder

In [55]:
# when opened using vscode
#data1=pd.read_csv('../data/interim/user-listen-count.csv')

In [56]:
# when opened using g-drive
from google.colab import drive
drive.mount('/content/drive')
url="/content/drive/MyDrive/uml/user-listen-count.csv"
data1 = pd.read_csv(url)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Data Cleaning

In [57]:
df1 = data1.copy()
df1.drop(df1.columns[0], axis=1, inplace=True)

In [58]:
df1.head()

Unnamed: 0,userid,artist_track,listen_count
0,user_000001,Amon Tobin_Bloodstone,9
1,user_000001,Bonobo_Nightlite,6
2,user_000001,Cadence Weapon_Getting Dumb,6
3,user_000001,Clark_Ache Of The North,2
4,user_000001,Clark_Beg,2


In [59]:
# assign user_id and track_id for the dataset
df1= df1.assign(user_id=(df1['userid']).astype('category').cat.codes)
df1= df1.assign(track_id=(df1['artist_track']).astype('category').cat.codes)

In [60]:
# splitting datasets  
# df_track with track_id and artist_track information
# df_listen_track with user_id, track_id, listen_count information
df_track= pd.DataFrame().assign(artist_track=df1['artist_track'], track_id=df1['track_id'])
df_listen_track=pd.DataFrame().assign(user_id=df1['user_id'], track_id=df1['track_id'],listen_count=df1['listen_count'])

In [61]:
df_track.head()

Unnamed: 0,artist_track,track_id
0,Amon Tobin_Bloodstone,6984
1,Bonobo_Nightlite,21868
2,Cadence Weapon_Getting Dumb,25286
3,Clark_Ache Of The North,30316
4,Clark_Beg,30319


In [62]:
df_listen_track.head()

Unnamed: 0,user_id,track_id,listen_count
0,0,6984,9
1,0,21868,6
2,0,25286,6
3,0,30316,2
4,0,30319,2


Setting up the Dataset for Modelling


In [63]:
# # To gain a better interpretation of the data, a pivot dataframe is built by filling the null values with 0.0
df_track_features = df_listen_track.pivot(
    index='user_id',
    columns='track_id',
    values='listen_count'
).fillna(0)

In [64]:
df_track_features.head()

track_id,0,1,2,3,4,5,6,7,8,9,...,178413,178414,178415,178416,178417,178418,178419,178420,178421,178422
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# de-mean the data (normalize by each users mean) and convert it from a dataframe to a numpy array.
R = df_track_features.values
user_listen_mean = np.mean(R, axis = 1)
R_demeaned = R - user_listen_mean.reshape(-1, 1)

Singular Value Decomposition

In [15]:
# Singular value decomposition is implemented by  properly formatting and normalizing the mean of listen_count 
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(R_demeaned,k=50)
#Scipy function (svds) lets to choose required no of latent factors that can approximate the original listen_count matrix 

In [16]:
# Sigma$ returns the values instead of a diagonal matrix so convert it into diagonal matrix form.
sigma = np.diag(sigma)

Making Predictions from the Decomposed Matrices


In [17]:
all_user_pred_listen_count = np.dot(np.dot(U, sigma), Vt) + user_listen_mean.reshape(-1, 1)

In [18]:
preds_df = pd.DataFrame(all_user_pred_listen_count , columns = df_track_features.columns)
preds_df.head()

track_id,0,1,2,3,4,5,6,7,8,9,...,178413,178414,178415,178416,178417,178418,178419,178420,178421,178422
0,0.003245,0.003245,0.003245,0.002982,0.003245,0.003245,0.006831,0.004071,0.002821,0.003606,...,0.00373,0.003306,0.003428,0.00318,0.003306,0.003838,0.001866,0.003482,0.003471,0.003511
1,0.005128,0.005128,0.005128,0.005555,0.005128,0.005128,0.007122,0.007473,0.004474,0.004442,...,0.004875,0.004836,0.004598,0.004283,0.004836,0.004562,0.002611,0.00467,0.004992,0.004625
2,1.8e-05,1.8e-05,1.8e-05,1.7e-05,1.8e-05,1.8e-05,1.4e-05,1.2e-05,1.7e-05,1.8e-05,...,1.8e-05,1.8e-05,1.8e-05,1.7e-05,1.8e-05,1.8e-05,1.1e-05,1.8e-05,1.8e-05,1.8e-05
3,-0.000573,-0.000573,-0.000573,-0.001284,-0.000573,-0.000573,1.003841,1.001299,1.00595,0.000177,...,0.001305,0.000154,-0.000494,0.000116,0.000154,-0.001003,2.6e-05,-9.7e-05,-8.3e-05,-0.000669
4,0.00269,0.00269,0.00269,0.002027,0.00269,0.00269,0.002736,0.005086,0.005661,0.003324,...,0.004378,0.003191,0.002936,0.002964,0.003191,0.004642,0.001404,0.003221,0.003504,0.003105


# 1. Making Music and User Recommendations by predicting the listen count of the track by the user and then recommending the tracks that user has not listened using **SVD approach**


In [19]:
def recommend_music(preds_df, user_id, df_track, df_listen_track, num_recommendations=5):
    
    # Get and sort the user's predictions
    user_row_number = user_id  # UserID starts at 0
    sorted_user_predictions = preds_df.iloc[user_row_number].sort_values(ascending=False) 

    # Get the user's data and merge in the music information.
    user_data = df_listen_track[df_listen_track.user_id == (user_id)]
    user_full = (user_data.merge(df_track, how = 'left', left_on = 'track_id', right_on = 'track_id').
                     sort_values(['listen_count'], ascending=False)
                 )

    # Recommend the highest predicted listen count music that the user hasn't listened yet.
    recommendations = (df_track[~df_track['track_id'].isin(user_full['track_id'])]).merge(pd.DataFrame(sorted_user_predictions).reset_index(), 
                        how = 'left', 
                        left_on = 'track_id',
                        right_on = 'track_id').rename(columns = {user_row_number: 'Predictions'}).sort_values('Predictions', ascending = False).iloc[:num_recommendations, :-1]
                      

    return user_full, recommendations

In [None]:
already_listened, predictions = recommend_music(preds_df, 330,df_track, df_listen_track, 10)

In [None]:
# tracks the user has already listened
print("Tracks that are already listened by the user",already_listened.head())

Tracks that are already listened by the user      user_id  track_id  listen_count  \
636      330     75407            30   
647      330     75418            27   
706      330    108752            21   
707      330    108752            21   
664      330     80385            20   

                                       artist_track  
636                        Johnny Hollow_Nova Heart  
647                      Johnny Hollow_Worse Things  
706  Opeth_Face Of Melinda (Live At The Roundhouse)  
707  Opeth_Face Of Melinda (Live At The Roundhouse)  
664             Kings Of Convenience_Parallel Lines  


Music Recommendation


In [None]:
# recommending the tracks based on the given user_id
print("Recommended Track and Artist Information",predictions)

Recommended Track and Artist Information                                      artist_track  track_id
14665              Judas Priest_Diamonds And Rust     76536
128211             Judas Priest_Diamonds And Rust     76536
262999  Emilie Autumn_Liar (Manic Depressive Mix)     48419
11473   Emilie Autumn_Liar (Manic Depressive Mix)     48419
70790                      The Long Blondes_Guilt    153486
57301                      The Long Blondes_Guilt    153486
28950                      The Long Blondes_Guilt    153486
39810                      The Long Blondes_Guilt    153486
38048                      The Long Blondes_Guilt    153486
192850                     The Long Blondes_Guilt    153486


In [None]:
# print the recommended unique track id's to the user
k=predictions.track_id.unique()
print("recommended track id's",k)

recommended track id's [ 76536  48419 153486]


User Recommendation

In [None]:
# Recommend the users by using the recommended tracks, i.e retrieving the user information from the recommended songs and then recommending the users to the given user
df2=df_listen_track.loc[df_listen_track['track_id'].isin(k),'user_id'].unique()
print("User Recommendation",df2)

User Recommendation [ 32  39  58  81  84 125 134 163 230 297 410 563]


In [None]:
# verifying if the recommended users listen to the songs which they recommend or not
df_listen_track.loc[df_listen_track['track_id']==76536]

Unnamed: 0,user_id,track_id,listen_count
14700,39,76536,1
128458,297,76536,83


In [None]:
df_listen_track["listen_count"].max()

281

# 2. Making Music and User Recommendation by implementing surprise package and binning concept with Cross-Validation Technique by using RMSE as a metric



In [20]:
pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 5.5 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1630136 sha256=e137efe11616e6050717019949c941535a4d19bb55d33885066b21d7e1edbd7b
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [65]:
# import libraries
from surprise import Dataset, Reader
#Import SVD
from surprise import SVD
# Optimization and evaluation functions
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
from surprise import accuracy

In [66]:
df_listen_track. sample(n=10000)
reader = Reader(rating_scale=(1, 10))

# now we apply the binning
data = Dataset.load_from_df(df_listen_track[['user_id', 'track_id', 'listen_count']], reader)

# We'll split into the trainset and testset
trainset, testset = train_test_split(data, test_size=.25)

Implemention of Grid Search

In [67]:
param_grid = {'n_factors': [120, 160], 'n_epochs': [100, 110], 'lr_all': [0.001, 0.005], 'reg_all': [0.08, 0.12]}
              
grid_search_svd = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3, joblib_verbose=4, n_jobs=2)
grid_search_svd.fit(data)
find_algo = grid_search_svd.best_estimator['rmse']
print(grid_search_svd.best_score['rmse'])
print(grid_search_svd.best_params['rmse'])

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed: 24.3min


2.6787954362250077
{'n_factors': 160, 'n_epochs': 110, 'lr_all': 0.001, 'reg_all': 0.08}


[Parallel(n_jobs=2)]: Done  48 out of  48 | elapsed: 58.4min finished


Without cross-validation- RMSE values

In [76]:
intial_algorithm = SVD(n_factors=160, n_epochs=110, lr_all=0.001, reg_all=0.08)
intial_algorithm.fit(trainset)
test_predictions = intial_algorithm.test(testset)
print(f"The RMSE without cross-validation is {accuracy.rmse(test_predictions, verbose=True)}")

RMSE: 3.0175
The RMSE without cross-validation is 3.017504861006738


With cross-validation

In [80]:
cross_validate(find_algo, data, measures=['RMSE'], cv=3, verbose=True)

Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    2.5297  2.8285  2.6716  2.6766  0.1221  
Fit time          104.02  102.59  102.88  103.17  0.62    
Test time         0.83    1.54    0.83    1.06    0.34    


{'fit_time': (104.02432751655579, 102.58983159065247, 102.88114285469055),
 'test_rmse': array([2.52967613, 2.82852946, 2.67162498]),
 'test_time': (0.8261687755584717, 1.5418295860290527, 0.8257262706756592)}

In [81]:
trainset=data.build_full_trainset()

In [82]:
final_algorithm = SVD(n_factors=120, n_epochs=110, lr_all=0.001, reg_all=0.08)
final_algorithm.fit(trainset)
test_predictions = final_algorithm.test(testset)
print(f"The RMSE with cross-validation is {accuracy.rmse(test_predictions, verbose=True)}")

RMSE: 2.4806
The RMSE with cross-validation is 2.4805911263269866


Predict listen_count when user_id and track_id are given


In [83]:
final_algorithm.predict(uid=10, iid=100)


Prediction(uid=10, iid=100, r_ui=None, est=3.255975963130763, details={'was_impossible': False})

Recommendation Algorithm

In [84]:
def merged_info(x, y):
    z = x.copy()   # start with keys and values of x
    z.update(y)    # modifies z with keys and values of y
    return z

In [85]:
import difflib
import random

def get_track_id(artist_track, df_track):
    
    """
    Gets the track ID for an artist_track based on the closest match in the df_track dataframe.
    """
    
    existing_tracks = list(df_track['artist_track'].values) # list of existing track titles
    closest_tracks = difflib.get_close_matches(artist_track, existing_tracks) # compare the given and existing track titles and retrieves the existing track titles that are similar to given track titles.
    track_id = df_track[df_track['artist_track'] == closest_tracks[0]]['track_id'].values[0] # retrieves the track_id of the given track title
    return track_id

def get_track_info(ptrack_id, df_track,df_listen_track):
    
    """
    Returns some basic information about a track given the track id and the df_listen_track dataframe.
    """
    
    track_info = df_track[df_track['track_id'] == ptrack_id][['track_id', 'artist_track']] # retrives the track title when a track_id is given

    user_id_info=df_listen_track[df_listen_track['track_id'] == ptrack_id][['user_id']] # retrieves the user_id when a track_id is given, i.e lists the users who have listened to the given track_id

    merged=merged_info(track_info.to_dict(), user_id_info.to_dict()) # stores the track_id, user_id
                                                                                             
    return merged

def predict_listen(user_id, artist_track, model, df_track):
    
    """
    Predicts the listen_count value that a user would assign to a specific artist_track. 
    """
    
    track_id = get_track_id(artist_track, df_track) # retrieves track_id when track_title is given
    review_prediction = model.predict(uid=user_id, iid=track_id) # predicts listen_count for the given track_id for a user_id
    return review_prediction.est # returns predictions of listen_count of the track_id for a specific user_id

def generate_recommendation(user_id, model, df_track, df_listen_count,thresh=4):
    
    """
    Generates a music recommendation for a user based on a listen_count threshold. Only
    tracks with a predicted listen_count at or above the threshold will be recommended
    """
    
    artist_tracks = list(df_track['artist_track'].values) # retrives a list of track_titles
    random.shuffle(artist_tracks) # shuffles the track_titles
    
    for artist_track in artist_tracks: # for every track
        listen_count_preds = predict_listen(user_id, artist_track, model, df_track)  # predicts the listen_count for a track when a user_id is given
        if listen_count_preds >= thresh: # recommends the track only when the predicted listen count is greater than the threshold listen_count
            track_id = get_track_id(artist_track, df_track) # retrieves the track_id to recommend the track
            return get_track_info(track_id, df_track,df_listen_track) # retrieves the recommended track_title, track_id and user_id


Music and User Recommendations with given threshold value

In [87]:
generate_recommendation(330, final_algorithm, df_track,df_listen_track,thresh=8)

{'artist_track': {22606: 'Death Cab For Cutie_Transatlanticism',
  37223: 'Death Cab For Cutie_Transatlanticism',
  69274: 'Death Cab For Cutie_Transatlanticism',
  70048: 'Death Cab For Cutie_Transatlanticism',
  71831: 'Death Cab For Cutie_Transatlanticism',
  77852: 'Death Cab For Cutie_Transatlanticism',
  79501: 'Death Cab For Cutie_Transatlanticism',
  107924: 'Death Cab For Cutie_Transatlanticism',
  121443: 'Death Cab For Cutie_Transatlanticism',
  128897: 'Death Cab For Cutie_Transatlanticism',
  133645: 'Death Cab For Cutie_Transatlanticism',
  148256: 'Death Cab For Cutie_Transatlanticism',
  158811: 'Death Cab For Cutie_Transatlanticism',
  167745: 'Death Cab For Cutie_Transatlanticism',
  199243: 'Death Cab For Cutie_Transatlanticism',
  235043: 'Death Cab For Cutie_Transatlanticism',
  255947: 'Death Cab For Cutie_Transatlanticism'},
 'track_id': {22606: 38053,
  37223: 38053,
  69274: 38053,
  70048: 38053,
  71831: 38053,
  77852: 38053,
  79501: 38053,
  107924: 38053,

T-SNE

In [None]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, n_iter=500, verbose=3, random_state=1)
users_embedding = tsne.fit_transform(final_algorithm.pu)
projection = pd.DataFrame(columns=['x', 'y'], data=users_embedding)
projection['user_info'] = df_listen_track['user_id']



[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 584 samples in 0.000s...
[t-SNE] Computed neighbors for 584 samples in 0.046s...
[t-SNE] Computed conditional probabilities for sample 584 / 584
[t-SNE] Mean sigma: 0.179626
[t-SNE] Computed conditional probabilities in 0.061s
[t-SNE] Iteration 50: error = 105.7358093, gradient norm = 0.2914197 (50 iterations in 0.405s)
[t-SNE] Iteration 100: error = 106.4726257, gradient norm = 0.3020097 (50 iterations in 0.317s)
[t-SNE] Iteration 150: error = 115.4475555, gradient norm = 0.2264133 (50 iterations in 0.281s)
[t-SNE] Iteration 200: error = 113.1772079, gradient norm = 0.1785383 (50 iterations in 0.306s)
[t-SNE] Iteration 250: error = 109.1842041, gradient norm = 0.2450173 (50 iterations in 0.290s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 109.184204
[t-SNE] Iteration 300: error = 2.6887240, gradient norm = 0.0041871 (50 iterations in 0.228s)
[t-SNE] Iteration 350: error = 2.2652702, gradient norm = 0.002

In [None]:
import plotly.express as px
fig = px.scatter(
    projection, x='x', y='y'
)
fig.show()