# Unsupervised Tasks

This notebook explores recommendation engines built from unsupervised projections and DBSCAN clustering of Spotify track data.

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to the
# project modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np
from data_prep import DataPrep

from unsupervised import *
from data_loading import *
from sklearn.metrics import silhouette_score

### All features

In [None]:
# Load the raw Spotify track export; each DataPrep instance in this notebook
# is built from this frame.
df = pd.read_csv('spotify.csv')

In [None]:
# Columns passed to prepare_data as drop_columns (URIs/URLs, identifiers,
# playlist bookkeeping fields, copyrights and album genres).
columns_to_drop = [
    'Artist URI(s)',
    'Album URI',
    'Album Artist URI(s)',
    'Album Image URL',
    'Disc Number',
    'Track Preview URL',
    'ISRC',
    'Added By',
    'Added At',
    'Copyrights',
    'Album Genres',
]

# Columns passed to prepare_data as categorical features.
# NOTE(review): 'Decade' is presumably the column created by add_decade=True
# below — confirm in DataPrep.
cat_columns = ['Artist Name(s)', 'Label', 'Decade', 'Key']

# Prepare the full feature set: categorical columns plus track/album names
# as text columns.
# NOTE(review): n_components_text=2 presumably sets the size of the text
# representation per text column — confirm in DataPrep.prepare_data.
dp = DataPrep(df)
dp.prepare_data(
    drop_columns=columns_to_drop,
    cat_columns=cat_columns,
    add_decade=True,
    text_columns=['Track Name', 'Album Name'],
    n_components_text=2,
)

In [None]:
# Build the model input from the prepared frame with normalisation enabled.
data = get_unsupervised_data(dp.df, normalise=True)

In [None]:
# Fit the projectors on the prepared data. The call returns two vector sets:
# vec_proj (used below for similarity search and clustering) and vec_viz
# (used below for plotting).
# NOTE(review): the meaning of the positional 20 is defined by
# train_projectors — confirm before changing it.
vec_proj, vec_viz = train_projectors(data, 20)

In [None]:
# Query the tracks most similar to row 1003 in the projected space, looking
# the results up in the DataPrep's original frame. The return value is the
# cell's displayed output.
get_most_similar(
    idx=1003,
    vector=vec_proj,
    data=dp.orig_df,
)

In [None]:
# Run the project's get_eps helper on the projected vectors.
# NOTE(review): presumably this helps choose the eps value passed to DBSCAN
# in the next cell — confirm against get_eps's definition.
get_eps(vec_proj)

In [None]:
# Cluster the projected vectors with eps=0.5, then draw the resulting labels
# on the visualisation projection.
labels = train_dbscan_and_get_labels(vec_proj, eps=0.5)
plot_clusters(vec_viz, labels)

In [None]:
# Attach the cluster labels to the unprocessed rows so the cluster containing
# track 1003 can be inspected through human-readable columns.
uris = dp.df['Track URI']

# Keep only the raw rows whose 'Track URI' is present in the prepared frame.
# NOTE(review): assigning `labels` below assumes these rows match the
# clustered rows in both number and order — confirm.
orig = df.copy()
orig = orig.loc[orig['Track URI'].isin(uris)].drop_duplicates()
orig['labels'] = labels

# Show up to three random tracks that share a cluster with track 1003.
# n is capped at the cluster size because DataFrame.sample raises a
# ValueError when asked for more rows than exist (without replacement).
same_cluster = orig[orig.labels == labels[1003]]
same_cluster.sample(n=min(3, len(same_cluster)))[['Artist Name(s)', 'Track Name', 'Artist Genres']]

In [None]:
# Silhouette score of the cluster labels in the projected space.
# NOTE(review): if `labels` contains a DBSCAN noise label, those points are
# scored as if they formed a cluster — confirm this is intended.
clustering_score = silhouette_score(vec_proj, labels)
print('Clustering score', clustering_score)

### Continuous only

In [None]:
# Rebuild the normalised model input from the current `dp`, then restrict it
# to the track-attribute columns listed below.
data = get_unsupervised_data(dp.df, normalise=True)
data = data[[
    'Explicit',
    'Popularity',
    'Danceability',
    'Energy',
    'Key',
    'Loudness',
    'Mode',
    'Speechiness',
    'Acousticness',
    'Instrumentalness',
    'Liveness',
    'Valence',
    'Tempo',
    'Time Signature',
]]

In [None]:
# Refit the projectors on the reduced column set, replacing vec_proj and
# vec_viz from the previous section.
# NOTE(review): the meaning of the positional 20 is defined by
# train_projectors — confirm before changing it.
vec_proj, vec_viz = train_projectors(data, 20)

In [None]:
# Query the tracks most similar to row 1003 using the projection fitted on
# the reduced column set. The return value is the cell's displayed output.
get_most_similar(
    idx=1003,
    vector=vec_proj,
    data=dp.orig_df,
)

In [None]:
# Run the project's get_eps helper on the projected vectors.
# NOTE(review): presumably this helps choose the eps value passed to DBSCAN
# in the next cell — confirm against get_eps's definition.
get_eps(vec_proj)

In [None]:
# Cluster the projected vectors with eps=0.4, then draw the resulting labels
# on the visualisation projection.
labels = train_dbscan_and_get_labels(vec_proj, eps=0.4)
plot_clusters(vec_viz, labels)

In [None]:
# Attach the cluster labels to the unprocessed rows so the cluster containing
# track 1003 can be inspected through human-readable columns.
uris = dp.df['Track URI']

# Keep only the raw rows whose 'Track URI' is present in the prepared frame.
# NOTE(review): assigning `labels` below assumes these rows match the
# clustered rows in both number and order — confirm.
orig = df.copy()
orig = orig.loc[orig['Track URI'].isin(uris)].drop_duplicates()
orig['labels'] = labels

# Show up to three random tracks that share a cluster with track 1003.
# n is capped at the cluster size because DataFrame.sample raises a
# ValueError when asked for more rows than exist (without replacement).
same_cluster = orig[orig.labels == labels[1003]]
same_cluster.sample(n=min(3, len(same_cluster)))[['Artist Name(s)', 'Track Name', 'Artist Genres']]

In [None]:
# Silhouette score of the cluster labels in the projected space.
# NOTE(review): if `labels` contains a DBSCAN noise label, those points are
# scored as if they formed a cluster — confirm this is intended.
clustering_score = silhouette_score(vec_proj, labels)
print('Clustering score', clustering_score)

### Free-text only

In [None]:
# Columns passed to prepare_data as drop_columns for the free-text
# experiment: the audio attributes, metadata and identifiers listed here are
# removed, while 'Track Name' and 'Album Name' are kept as text columns.
# NOTE(review): 'Album Release Date' is dropped while add_decade=True is
# still requested — confirm DataPrep derives the decade before dropping.
columns_to_drop = [
    'Artist URI(s)',
    'Artist Name(s)',
    'Album URI',
    'Album Artist URI(s)',
    'Album Artist Name(s)',
    'Album Release Date',
    'Album Image URL',
    'Disc Number',
    'Track Number',
    'Track Duration (ms)',
    'Track Preview URL',
    'Explicit',
    'Popularity',
    'ISRC',
    'Added By',
    'Added At',
    'Artist Genres',
    'Danceability',
    'Energy',
    'Key',
    'Loudness',
    'Mode',
    'Speechiness',
    'Acousticness',
    'Instrumentalness',
    'Liveness',
    'Valence',
    'Tempo',
    'Time Signature',
    'Album Genres',
    'Label',
    'Copyrights',
]

# Fresh DataPrep on the raw frame: no categorical columns, and a larger text
# setting (n_components_text=64) than the first section's 2.
dp = DataPrep(df)
dp.prepare_data(
    drop_columns=columns_to_drop,
    add_decade=True,
    cat_columns=None,
    text_columns=['Track Name', 'Album Name'],
    n_components_text=64,
)

data = dp.df

In [None]:
# Convert the prepared free-text frame into the normalised model input.
data = get_unsupervised_data(data, normalise=True)

In [None]:
# Refit the projectors on the free-text features, replacing vec_proj and
# vec_viz from the previous section.
# NOTE(review): the meaning of the positional 20 is defined by
# train_projectors — confirm before changing it.
vec_proj, vec_viz = train_projectors(data, 20)

In [None]:
# Query the 20 entries (n=20) most similar to row 1003. The return value is
# the cell's displayed output.
# NOTE(review): this call searches vec_viz, whereas the two earlier sections
# search vec_proj — confirm the visualisation projection is intended here.
get_most_similar(
    idx=1003,
    vector=vec_viz,
    data=dp.orig_df,
    n=20,
)

In [None]:
# Run the project's get_eps helper on the projected vectors.
# NOTE(review): presumably this helps choose the eps value passed to DBSCAN
# in the next cell — confirm against get_eps's definition.
get_eps(vec_proj)

In [None]:
# Cluster the projected vectors with eps=0.4, then draw the resulting labels
# on the visualisation projection.
labels = train_dbscan_and_get_labels(vec_proj, eps=0.4)
plot_clusters(vec_viz, labels)

In [None]:
# Attach the cluster labels to the unprocessed rows so the cluster containing
# track 1003 can be inspected through human-readable columns.
uris = dp.df['Track URI']

# Keep only the raw rows whose 'Track URI' is present in the prepared frame.
# NOTE(review): assigning `labels` below assumes these rows match the
# clustered rows in both number and order — confirm.
orig = df.copy()
orig = orig.loc[orig['Track URI'].isin(uris)].drop_duplicates()
orig['labels'] = labels

# Show up to three random tracks that share a cluster with track 1003.
# n is capped at the cluster size because DataFrame.sample raises a
# ValueError when asked for more rows than exist (without replacement).
same_cluster = orig[orig.labels == labels[1003]]
same_cluster.sample(n=min(3, len(same_cluster)))[['Artist Name(s)', 'Track Name', 'Artist Genres']]

In [None]:
# Silhouette score of the cluster labels in the projected space.
# NOTE(review): if `labels` contains a DBSCAN noise label, those points are
# scored as if they formed a cluster — confirm this is intended.
clustering_score = silhouette_score(vec_proj, labels)
print('Clustering score', clustering_score)