# Set Up

In [None]:
import pandas as pd
import numpy as np
from langdetect import detect
import matplotlib.pyplot as plt
import seaborn as sbn
plt.style.use("seaborn")

In [None]:
import spotiscience as sps

# Downloader

In [None]:
# Build the credentials dictionary that is handed to the SpotiScience downloader.
# NOTE(review): client_id, client_secret, redirect_url, userid and genius_access_token
# are not defined anywhere in the visible notebook; they must already exist in the kernel
# before this cell runs. Never commit their actual values.
CREDENTIALS = {}
CREDENTIALS['client_id'] = client_id
CREDENTIALS['client_secret'] = client_secret
CREDENTIALS['redirect_url'] = redirect_url
CREDENTIALS['user_id'] = userid
CREDENTIALS['genius_access_token'] = genius_access_token # Genius token; the original note marks it as optional

"""You also can set your credentials id on credentials.py and import from spotiscience"""

# Instantiate the downloader with these credentials.
sd = sps.downloader.SpotiScienceDownloader(credentials=CREDENTIALS)

# Basic features

## Playlist download

In [None]:
# Download the song features of a playlist.
len_playlist=50  # number of songs requested from the playlist
playlist = "" # playlist id -- left empty here, fill it in before running
# NOTE(review): later cells index the result as playlist_data['fusion'], so it is presumably
# a dict keyed by playlist name -- confirm against the downloader.
playlist_data = sd.get_playlist_song_features(playlist_id=playlist,n_songs=len_playlist)

In [None]:
song = playlist_data['fusion'][2]

## Artist info

In [None]:
# Artist information can be fetched directly from the downloader.
# Caveat: the call asks for authorization every time, which makes it awkward to automate.
artist = 'metallica'
sd.get_artist_information(artist=artist)

## Prediction : mood and topics, examples on one song

In [None]:
# returns 'predicter class'
sp = sps.SpotiSciencePredicter()

### Mood

In [None]:
#returns the tag of mood 
mood = sp.predict_song_mood(song=song)
song['mood'] = mood

### Get Song Genre

In [None]:
genre = sd.get_song_music_genre(song_id=song['id'])
song['genre'] = genre

### Lyrics topics prediction

#### Get Lyrics

In [None]:
lyrics = sd.get_song_lyrics(songname=song['name'],artistname=song['artist'])

The topic prediction of song lyrics uses any of the algorithms Latent Dirichlet Allocation Model (LDA), Non Negative Matrix Factorization Model (NMF) or Latent Semantic Indexing Model (LSI). To do this, I based my code on the following article which you can read here.

To predict the topic of lyrics you must configure the following parameters:

lyric = the lyric of the song

model = the model to use [options are “lsi”,”lda” or “nmf”]

lang = language of the song lyric [options are “english” or “spanish”]

n_grams = range of n-gram sizes, i.e. how many consecutive words are grouped into one term (e.g. (1,1) for single words only)

n_topics = number of returned topics

top_n = number of words per returned topic

For more information about the parameter n_grams, you can read the official documentation about vectorization with sklearn by clicking here

#### Recognize language and topics

Error handling is needed for topic prediction: not every song has lyrics available, and not all lyrics are in French, English, Spanish or German.

In [None]:
languages = {
    'en' : 'english',
    'fr' : 'french',
    'es': 'spanish',
    'de': 'german'
}

In [None]:
# Fetch the lyrics of the example song.
lyrics = sd.get_song_lyrics(songname=song['name'],artistname=song['artist'])
model = 'lda' # topic model to use: 'lda', 'lsi' or 'nmf'
# Detect the language with langdetect and map its code to the language name passed to the predicter.
# NOTE: this lookup raises KeyError when the detected code is not one of the keys of
# `languages`; the playlist-wide loop further down guards against that case.
lang = languages[detect(lyrics)] # one of 'english', 'spanish', 'french', 'german'
# NOTE(review): the original note asks to double-check that 'german' is really supported and
# mentions that the matching language resources may have to be installed -- confirm.
n_grams = (1,1)  # n-gram range handed to the predicter
n_topics = 1     # number of topics to return
top_n = 5        # number of words per returned topic
topics = sp.predict_topic_lyric(lyrics,model,lang,n_grams,n_topics,top_n)
# The raw prediction result is stored on the song as-is.
song['topics'] = topics

In [None]:
# Flatten the prediction result: for every topic key, keep the first element of each
# entry found in that topic's first item.
list_topic = [entry[0] for key in topics for entry in topics[key][0]]

In [None]:
print(list_topic)

In [None]:
sp = sps.SpotiSciencePredicter()

In [None]:
languages = {
    'en' : 'english',
    'fr' : 'french',
    'es': 'spanish',
    'de': 'german'
}
name = 'fusion' # name of the playlist u want to extract the predictions from
len_playlist = 654 #828

In [None]:
#Return song features of playlist
playlist_data = sd.get_playlist_song_features(playlist_id=playlist,n_songs=len_playlist)
playlist_data

In [None]:
for song in playlist_data[name]:

    mood = sp.predict_song_mood(song=song)
    song['mood'] = str(mood)

    genre = sd.get_song_music_genre(song_id=song['id'])
    song['genre'] = genre
    
    

In [None]:
sd = sps.downloader.SpotiScienceDownloader(credentials=CREDENTIALS)

In [None]:
from langdetect.lang_detect_exception import LangDetectException

# Topic-model parameters shared by every song (defined once instead of once per iteration).
model = 'lda' # (available type 'lda', 'lsi', 'nmf')
n_grams = (1,1)
n_topics = 1
top_n = 5

# For every song of the playlist: fetch the lyrics, detect their language and predict topics.
# Each song dict receives 'has_lyrics' (bool) and 'topics' (list of words, possibly empty).
# `i` is kept as the loop index because the next cell displays it to show where the run stopped.
for i, song in enumerate(playlist_data[name]):
    try:
        lyrics = sd.get_song_lyrics(songname=song['name'],artistname=song['artist'])
    except Exception as e:
        # Best effort: any download failure (timeout, connection error, ...) means "no lyrics".
        print(e, song['name']," we set it has no lyrics")
        lyrics = ''

    # `not lyrics` also covers a None result, which would otherwise crash detect().
    if not lyrics:
        song['has_lyrics'] = False
        song['topics'] = []
        continue
    song['has_lyrics'] = True

    try:
        lang = detect(lyrics)
    except LangDetectException as e:
        # langdetect raises when the text has no usable features (e.g. only symbols);
        # previously this aborted the whole loop.
        print(e, song['name'], "language could not be detected, set to no topics")
        song['topics'] = []
        continue

    if lang not in languages:
        # language of the song not yet supported, feel free to add it
        song['topics'] = []
        continue
    lang = languages[lang] # (available type 'english','spanish', 'french', 'german')

    try:
        topics = sp.predict_topic_lyric(lyrics,model,lang,n_grams,n_topics,top_n)
        # Keep only the words: first element of each entry of each topic's first item.
        list_topic = [topic[0] for nb in topics for topic in topics[nb][0]]
        song['topics'] = list_topic
    except AttributeError as e:
        print(e, "set to no topics")
        song['topics'] = []
            

In [None]:
i

In [None]:
df_playlist = pd.DataFrame.from_records(playlist_data[name])

In [None]:
# Report how many tracks there are and how many of them ended up with an empty topic list.
n_tracks = len(df_playlist['topics'])
n_without_topics = int(df_playlist['topics'].map(len).eq(0).sum())
print("nb tracks : ", n_tracks, "Number of tracks missing topics due to timeout :", n_without_topics)


#### Genre summarization

In [None]:
collected_genre = df_playlist['genre'].sum()

In [None]:
fig = plt.figure()
fig.set_size_inches(10,60)
pd.Series(collected_genre).value_counts().plot.barh()

On retient en genre les suivants :
- jazz ou funk
- pop
- electro ou techno ou club ou tronica
- synth
- rock
- prog ou exp
- alt
- dance, rave
- rap
- ind for indie ou indé (in french)
- franc (français, france), french
- hip hop


In [None]:
# Keyword groups used to tag each song: every entry is a comma-separated list of substrings
# that are searched for in the song's genre labels; the entry itself becomes a column name.
# NOTE(review): 'franc, french' has a space after the comma, so splitting it on ','
# yields ' french' with a leading space.
selfdefined_genre = ['jazz,funk','pop','electro,tronica','techno,club','house','synth,dream',
'rock','prog,exp','alt,modern','dance,rave','rap','franc, french','ind','hip,hop','metal']
# Create one placeholder column per keyword group (filled with booleans further down).
for genre in selfdefined_genre:
    df_playlist[genre] = ''

In [None]:
# identification of the song 49 (50th) to the chosen genres
song = df_playlist.iloc[49]
for genre in selfdefined_genre:
    subgenre = genre.split(',')
    b = False
    for sub in subgenre:
        for song_genre in song['genre']:
            b = b or (sub in song_genre) #we test if in the str of the genre of the song we find an occurence of our subgenre

In [None]:
# identification for every song
for genre in selfdefined_genre:
    rows_in_genre = []
    for song_genre in df_playlist['genre']:
        subgenre = genre.split(',')
        b = False
        for sub in subgenre:
            for song_subgenre in song_genre:
                b = b or (sub in song_subgenre) #we test if in the str of the genre of the song we find an occurence of our subgenre
        rows_in_genre.append(b)
    df_playlist[genre] = rows_in_genre


In [None]:
df_playlist.sample(3)

In [None]:
df_playlist.to_pickle("fusion_with_mood_genre_lyrics.pkl")

In [None]:
from mpl_toolkits.mplot3d import Axes3D

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

In [None]:
df_playlist.keys()

In [None]:
# Columns used for clustering, grouped by the kind of distance they need.
numvar = ['popularity', 'length',
       'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'valence', 'loudness', 'speechiness', 'tempo']
categorical_var = ['mood','artist']+selfdefined_genre
list_var = ['genre','topics']
# Explicit copy: a 'cluster' column is assigned to this frame later on, and writing into a
# plain column slice of df_playlist triggers pandas' SettingWithCopyWarning.
df_clustering = df_playlist[numvar+categorical_var+list_var].copy()
# a question can be ask about popularity


## Numeric Only Clustering

In [None]:
df_clustering[numvar].sample(3)

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer

# K-Means works on Euclidean distances, so the numeric features are rescaled to [0, 1] first;
# otherwise the columns with the widest value ranges dominate the clustering.
numeric_scaled = MinMaxScaler().fit_transform(df_clustering[numvar])

fig, ax = plt.subplots(2, 2, figsize=(15,8))
# One silhouette plot per candidate number of clusters, laid out left-to-right, top-to-bottom.
# `j` is a 1-based panel counter and `i` the number of clusters.
for j, i in enumerate([5,6,7,8], start=1):#[3, 4, 6, 10]:
    # Create KMeans instance for this number of clusters
    km = KMeans(n_clusters=i, init='k-means++', n_init=10, max_iter=100, random_state=42)
    panel = ax.flat[j-1]
    # Create SilhouetteVisualizer instance with KMeans instance, then fit the visualizer
    visualizer = SilhouetteVisualizer(km, colors='yellowbrick', ax=panel)
    panel.set_title("k = " + str(i))
    visualizer.fit(numeric_scaled)


## Numeric and Categorical clustering

Since we put a lot of work into predicting a mood and some genres, we should try to use those categorical variables, which K-Means cannot handle.
So first I based the following code on [Jorge Martin Lasaosa post](https://towardsdatascience.com/clustering-on-numerical-and-categorical-features-6e0ebcf1cbad) about Gower Distance, check it out.

The Gower distance is one minus the average, over all variables, of partial similarities. The partial similarity $ ps_{ij}^{(f)} $ between $i$ and $j$ for the variable $f$ is:
- for a numeric type : $ ps_{ij}^{(f)} = 1 - |x_{if}-x_{jf}|/R_{f} $ with $ R_f = \max f - \min f $
- For a categorical feature, the partial similarity between two individuals is one only when both observations have exactly the same value for this feature. Zero otherwise.

Code inspired by this [post](https://www.thinkdatascience.com/post/2019-12-16-introducing-python-package-gower/).

In [None]:
import gower
# we compute this distance explained above


Extension to custom distance matrix :
- if it's a numeric or a categorical feature, use Gower distance
- if it's a list, use Jaccard distance.

Here we are going to use the Jaccard distance to measure the distance between the raw genres and the raw topics. We also calculate the Jaccard distance for a vector of the selected genres.
[Jaccard distance](https://flavien-vidal.medium.com/similarity-distances-for-natural-language-processing-16f63cd5ba55) is defined for a subset A and B (here represented by a list) by :

$d_J(A,B) = 1 - J(A,B)$ where $J(A,B)$ is the Jaccard index defined by :

$J(A,B) = card(A \cap B) / card(A \cup B)$ = proportion of common points

The code is inspired by this [Statology post](https://www.statology.org/jaccard-similarity-python/)

In [None]:
#define Jaccard Similarity function
def jaccard(list1, list2):
    """Return the Jaccard similarity of two collections, treated as sets.

    The similarity is ``len(A & B) / len(A | B)`` where A and B are the sets of
    distinct elements of ``list1`` and ``list2``. Duplicated elements therefore
    do not distort the result, and the two arguments may be different sequence
    types (list, tuple, ...).

    Parameters:
        list1, list2: iterables of hashable items.

    Returns:
        1 when both collections are empty (two empty sets are considered
        identical), otherwise a float in [0, 1].
    """
    set1, set2 = set(list1), set(list2)
    union = set1 | set2
    if not union:
        return 1
    return len(set1 & set2) / len(union)
def jaccard_dissimilarity_matrix(df):
    """Build the symmetric matrix of pairwise Jaccard dissimilarities.

    ``df`` is accessed positionally through ``.iloc``; entry (r, c) of the result
    is ``1 - jaccard(df.iloc[r], df.iloc[c])``. The diagonal stays at 0.
    """
    size = len(df)
    dissimilarity = np.zeros((size, size))
    # Only the strict lower triangle is computed; each value is mirrored to the upper one.
    for row in range(1, size):
        for col in range(row):
            dissimilarity[row, col] = dissimilarity[col, row] = 1 - jaccard(df.iloc[row], df.iloc[col])
    return dissimilarity

def custom_distance_matrix(df_clustering):
    """Extend the Gower distance with Jaccard dissimilarities for list-valued columns.

    The Gower matrix of the numeric and categorical columns is multiplied by the
    number of those columns, the Jaccard dissimilarity matrix of every column in
    ``list_var`` is added, and the total is divided by the overall number of
    columns, so that each column contributes with the same weight.
    """
    gower_columns = numvar + categorical_var
    # Weight the Gower matrix by the number of columns it was computed from.
    total = len(gower_columns) * gower.gower_matrix(df_clustering[gower_columns])
    for list_column in list_var:
        total += jaccard_dissimilarity_matrix(df_clustering[list_column])
    return total / (len(gower_columns) + len(list_var))

In [None]:
distance_matrix = custom_distance_matrix(df_clustering)

In [None]:
(distance_matrix<0).any() or (distance_matrix>1).any()

### With clustering DBScan

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer
from sklearn.cluster import DBSCAN

# Density-based clustering directly on the precomputed custom distance matrix.
# The cell no longer builds an unused silhouette visualizer on the axes of the K-Means
# figure, so it does not depend on variables left over from that cell.
dbscan_cluster = DBSCAN(eps=0.11, 
                    min_samples=2, 
                    metric="precomputed")
dbscan_cluster.fit(distance_matrix)


In [None]:
df_clustering["cluster"] = dbscan_cluster.labels_

Number of clusters :

(-1 = outliers)

In [None]:
df_clustering['cluster'].nunique()

Number of song by cluster

In [None]:
# Print the size of every cluster: first the outliers (label -1), then labels 0..max.
# Iterating up to the largest label avoids the extra, non-existent label that
# range(-1, nunique()) produced whenever outliers were present.
for i in range(-1, int(df_clustering['cluster'].max()) + 1):
    print(len(df_clustering[df_clustering['cluster']==i]), end=', ')

Mostly a failure : the first cluster is too big, which is coherent, DBScan is made for outliers spotting

###  AffinityPropagation
Trying another clustering method: AffinityPropagation. The results are decent.

In [None]:
#from sklearn.cluster import SpectralClustering essayé fonctionne très mal
from sklearn.cluster import AffinityPropagation

Examples : we can set a series of examples

In [None]:
examples_id = ['6z4n862KhNJNWDYSn4aLL5','7uv632EkfwYhXoqf8rhYrg','1GMQEnykhAyTLwkIViTFQk','16tvIGzCNfRLVbm8G39DDo',
'5mY8mY7DSfuqVbY2psq3Cg','6jkN2vp6rSP1WMlPJVlWQB','6TSDRzJGwbK9cajVbtqlPV','56iv5TqfvxVa4zLMs6SvmP','0cx1vZcndRhwSDgR8NtEfk',
'4h33lJL2YU05kEMaSkao47','7Fe3ZwOjTVppElF4TMfxNP','46WOptLnXUtH3LOyYvmMO1','4Ztvl8C9Ld3IdHNo1a3UBe','5N7NLLGdrTy4QYuyM6ewm0','2Y0iGXY6m6immVb2ktbseM',
'3gcmn2CtOE9SjBevmvGVEk','2mcMoXYHmVLxmCgAvaO2cS','3XZssUmtDdhFK1tZJasgXD']

In [None]:
len(examples_id)

In [None]:
# Row labels of the hand-picked example tracks.
examples_index = list(df_playlist.loc[df_playlist['id'].isin(examples_id)].index)
# Preference vector: 0 everywhere, 1/len(examples_id) at the example tracks.
preferences = np.zeros(len(df_clustering))
preferences[examples_index] = 1/len(examples_id)

In [None]:
#choosen_model = KMeans(n_clusters=10, init='k-means++', n_init=10, max_iter=2000, random_state=100)# 200,42
choosen_model = AffinityPropagation(damping = 0.5, affinity='precomputed', max_iter=1000,preference=preferences)
# idée assigné des points au musique en fonction de mon nombre d'écoute => voir la doc dans preference

In [None]:
# With affinity='precomputed', AffinityPropagation expects a SIMILARITY matrix (larger means
# closer). distance_matrix holds dissimilarities in [0, 1], so it is converted first.
results = choosen_model.fit(1 - distance_matrix)

In [None]:
df_clustering['cluster'] = results.labels_

### Analysing results

Number of clusters : (and in the idea the number of playlist)

In [None]:
df_clustering['cluster'].nunique()

How many songs by playlist :

In [None]:
# Report the size of every cluster actually found, instead of assuming exactly ten of them.
for i in sorted(df_clustering['cluster'].unique()):
    print(f"For the cluster {i}, there are {len(df_clustering[df_clustering['cluster']==i])} songs")


What kind of characteristics has each cluster : 

In [None]:
df_clustering.groupby('cluster').sum()

We can also see for each cluster what is the central track

In [None]:
df_playlist.iloc[results.cluster_centers_indices_]

In [None]:
nb_playlist = df_clustering['cluster'].nunique()

In [None]:
"""U2FsdGVkX1/Dr36Htbd0NHeSyt7zcwtOQAlJhOZtI448GRilIUDqUXbG2VcrJ3P9vptgDzKVDtdT4Oem
8or4aSpWtATxKVoPB0zFaiDwvO9xGZ3LMmczFu4VdpSm41KwAVd47tUU3rB6GYnBfBbLgglun0vTKlp0
EkSa+Nf8A3paQIcCKVIjvQFD4jUyIdqucqh+De+Q83QfBXsawCoByXTbVrMeA+KClO6eFO5yvh8cc0GS
Xa/ulwOiGLIZR94ZatZNG4HHOHdPnFlJ5VS48olv0FzUV9tXom/9omdtF+Qm2nragQ30nfMQUG4q56Dl
D2bCxpM7LHcqUeyt8rtDCGKvLKm3Op17OJ3mgCDQpRpXFc+fqJCyKXhEglMOJ4g2gox4xvBtCVtPlKsp
9HLO2GKPXO9GsFGmYlGtVkMm1ZqfAR90tWZvfYWNUBI5ZlffvB9VGmsRi4zoM+LyETIhJtEtPOUG0NVO
/I1pVHy20yGBZC4/szf8/NUnbf0Njelz4gc6oppflk2fDeZn5pr5Zl38rL81G/Ev2gmm476UXdE="""

In [None]:
#create a dictionary with authorization keys
CREDENTIALS = {}
CREDENTIALS['client_id'] = client_id
CREDENTIALS['client_secret'] = client_secret
CREDENTIALS['redirect_url'] = redirect_url
CREDENTIALS['user_id'] = userid
CREDENTIALS['genius_access_token'] = genius_access_token # genius is optional, only

"""You also can set your credentials id on credentials.py and import from spotiscience"""

# returns 'downloader class'
sd = sps.downloader.SpotiScienceDownloader(credentials=CREDENTIALS)


In [None]:
from importlib import reload

In [None]:
reload(sps.downloader)

In [None]:
sd = sps.downloader.SpotiScienceDownloader(credentials=CREDENTIALS)

In [None]:
nb_playlist

In [None]:
# Create one playlist per cluster, filled with the tracks assigned to that cluster.
for i in range(nb_playlist):
    # Select the track ids of cluster i; df_clustering shares its index with df_playlist.
    cluster_track_ids = df_playlist.loc[df_clustering['cluster']==i, 'id']
    # Track URIs are built from the raw ids. The loop variable is not called `id`
    # so the builtin of that name stays usable in the notebook.
    song_ids = ["spotify:track:"+track_id for track_id in cluster_track_ids]
    sd.add_playlist(f"AffinityPropagation Cluster V3 {i}",song_ids)

In [None]:
# Possible improvement: give less weight to some features by computing the (weighted) mean of the distance matrices myself