In [None]:
import pandas as pd
import numpy as np

# Load the three CSV inputs used throughout the notebook
# (paths are relative to the current working directory).
general = pd.read_csv('data/Dados_totais.csv')
data_genre = pd.read_csv('data/data_by_genres.csv')
data_year = pd.read_csv('data/data_by_year.csv')

In [None]:
# Distinct values present in the `year` column of `general`.
general["year"].unique()

In [None]:
# (rows, columns) of `general`.
general.shape

In [None]:
# Remove the `explicit`, `key` and `mode` columns from `general`.
general = general.drop(columns=['explicit', 'key', 'mode'])

In [None]:
# Number of missing values in each column of `general`.
general.isnull().sum()

In [None]:
# Same per-column missing-value count; pandas' `isna` is an alias of `isnull`.
general.isna().sum()

In [None]:
# Preview the first rows of the per-genre table.
data_genre.head()

In [None]:
# Remove the `key` and `mode` columns from the per-genre table.
data_genre = data_genre.drop(columns=['key', 'mode'])

In [None]:
# Preview the first rows of the per-year table.
data_year.head()

In [None]:
# Remove the `key` and `mode` columns from the per-year table.
data_year = data_year.drop(columns=['key', 'mode'])

In [None]:
# Distinct years present in the per-year table (before filtering).
data_year['year'].unique()

In [None]:
# Keep only the rows whose `year` is 2000 or later.
from_2000_onwards = data_year['year'] >= 2000
data_year = data_year.loc[from_2000_onwards]

In [None]:
# Distinct years remaining after the `>= 2000` filter.
data_year['year'].unique()

In [None]:
# Renumber the rows from 0 after filtering; the old index is discarded.
data_year = data_year.reset_index(drop=True)

In [None]:
import plotly.express as px

# Line chart of the per-year `loudness` values against `year`.
# The figure is only built here; the `fig.show()` call below is commented out.
fig = px.line(data_year, x="year", y="loudness", title='Loudness over the years')
# fig.show() # Loudness over the years.png

In [None]:
import plotly.graph_objects as go

# One line per audio feature, all plotted against `year` on a shared figure.
# Each trace is labelled with the capitalised column name (e.g. 'Acousticness').
fig = go.Figure()

yearly_features = [
    'acousticness',
    'valence',
    'danceability',
    'energy',
    'liveness',
    'speechiness',
    'instrumentalness',
]
for feature in yearly_features:
    trace = go.Scatter(x=data_year['year'], y=data_year[feature], name=feature.capitalize())
    fig.add_trace(trace)

# plot_trace.png

In [None]:
# Heatmap of the pairwise correlations between the columns of `data_year`;
# `text_auto=True` writes each coefficient inside its cell.
fig = px.imshow(data_year.corr(), title='Correlation between features', text_auto=True)

# fig.show() # plot_corr.png

In [None]:
# Sum of the per-genre occurrence counts, i.e. the number of non-null `genres` entries.
data_genre['genres'].value_counts().sum()

In [None]:
# Copy of the per-genre table without the `genres` label column
# (this is the frame later passed to the PCA pipeline).
data_genre_1 = data_genre.drop(columns=['genres'])

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Fixed seed shared by NumPy's global RNG and the estimators below.
SEED = 1224
np.random.seed(SEED)

# Standardise the features, then project them onto 2 principal components.
pca_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2, random_state=SEED)),
])

In [None]:
# Fit the scaler + PCA on the genre features and keep the 2-D embedding
# as a DataFrame with columns `x` and `y`.
genre_embedding_pca = pca_pipeline.fit_transform(data_genre_1)
projection = pd.DataFrame(genre_embedding_pca, columns=['x', 'y'])

In [None]:
# Display the 2-D genre embedding.
projection

In [None]:
from sklearn.cluster import KMeans

# Cluster the genres in the 2-D PCA space into 5 groups.
# Only the coordinate columns are handed to KMeans: later cells add
# `cluster_pca` and `genres` to `projection`, and fitting on the whole frame
# would make this cell fail (string column) or silently change its result
# (label column used as a feature) when it is re-run.
genre_coordinates = projection[['x', 'y']]

kmeans_pca = KMeans(n_clusters=5, random_state=SEED, n_init=10)

kmeans_pca.fit(genre_coordinates)

# Store the label of every genre on both frames (they share the same row index).
data_genre['cluster_pca'] = kmeans_pca.predict(genre_coordinates)

projection['cluster_pca'] = data_genre['cluster_pca']

In [None]:
# Display the embedding, now including the `cluster_pca` label.
projection

In [None]:
# Attach the genre names to the embedding (assignment is aligned on the row index).
projection['genres'] = data_genre['genres']

In [None]:
# Display the embedding with cluster labels and genre names.
projection

In [None]:
# Scatter plot of the genre embedding, coloured by `cluster_pca`;
# hovering a point shows its coordinates and genre name.
fig = px.scatter(projection, x='x', y='y', color='cluster_pca', hover_data=['x', 'y', 'genres'])

# fig.show() # plot_cluster.png

In [None]:
# Share of the total variance captured by the retained principal components.
pca_pipeline.named_steps['pca'].explained_variance_ratio_.sum()

In [None]:
# Sum of the variances explained by the retained principal components.
pca_pipeline.named_steps['pca'].explained_variance_.sum()

In [None]:
# Preview the first rows of `general`.
general.head()

In [None]:
# Number of rows per distinct `artists` value, most frequent first.
general.value_counts('artists')

In [None]:
# Number of rows per distinct `artists_song` value, most frequent first.
general.value_counts('artists_song')

In [None]:
# OneHotEncoder turns a categorical column into one 0/1 indicator column per category.
from sklearn.preprocessing import OneHotEncoder

# `dtype=int` makes the encoder return integers; the only column encoded is `artists`.
ohe = OneHotEncoder(dtype=int)
artists_encoded = ohe.fit_transform(general[['artists']]).toarray()

# Everything except the raw `artists` column, which is replaced by its indicators.
general_2 = general.drop(columns=['artists'])

artists_dummies = pd.DataFrame(
    artists_encoded,
    columns=ohe.get_feature_names_out(['artists']),
)
general_musics_dummies = pd.concat([general_2, artists_dummies], axis=1)
general_musics_dummies

In [None]:
# Rebuild the scaler + PCA pipeline for the track-level data.
# A fractional `n_components` (0.7) asks PCA for as many components as are
# needed to reach that share of explained variance.
music_pca_steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.7, random_state=SEED)),
]
pca_pipeline = Pipeline(music_pca_steps)

# The identifier/text columns are excluded before fitting.
music_features = general_musics_dummies.drop(columns=['id', 'name', 'artists_song'])
music_embedding_pca = pca_pipeline.fit_transform(music_features)
projection_music = pd.DataFrame(data=music_embedding_pca)

In [None]:
# Number of principal components the fitted PCA actually kept.
pca_pipeline.named_steps['pca'].n_components_

In [None]:
# Cluster the tracks in PCA space into 50 groups.
# `n_init=10` is set explicitly, matching the genre KMeans earlier in the
# notebook, so the result does not depend on the scikit-learn version's default.
kmeans_pca_pipeline = KMeans(n_clusters=50, verbose=False, random_state=SEED, n_init=10)

# Fit on the raw PCA array rather than on `projection_music`: the frame gains
# label and text columns in this and the following cell, which would break
# (or alter) the clustering if this cell were run again.
kmeans_pca_pipeline.fit(music_embedding_pca)

# Predict once and reuse the labels for both frames.
music_cluster_labels = kmeans_pca_pipeline.predict(music_embedding_pca)
general['cluster_pca'] = music_cluster_labels
projection_music['cluster_pca'] = music_cluster_labels

In [None]:
# Attach the artist and the "artist - song" label to every embedded track
# (assignment is aligned on the row index shared with `general`).
projection_music['artists'] = general['artists']
projection_music['song'] = general['artists_song']

projection_music

In [None]:
# Scatter plot of the tracks on PCA columns 0 and 1, coloured by cluster;
# hovering a point shows its coordinates and song label.
fig = px.scatter(projection_music, x=0, y=1, color='cluster_pca', hover_data=[0,1, 'song'])

#fig.show() #plot_cluster_music.png

In [None]:
# Disabled 3-D variant of the cluster plot using PCA columns 0, 1 and 2.
# NOTE(review): requires the PCA above to have kept at least three components — confirm before enabling.
#fig = px.scatter_3d(projection_music, x=0, y=1, z=2, color='cluster_pca',hover_data=['song'])
#fig.update_traces(marker_size = 2)
#fig.show()

RECOMENDADOR

In [85]:
# Artist whose tracks are looked up in the next cell.
artist = 'Slipknot'

In [None]:
# All embedded tracks whose `artists` value equals `artist`.
projection_music[projection_music['artists'] == artist]

In [None]:
# Song used as the query for the recommender, in the same "artist - song"
# format as the `song` column.
music_name = 'Slipknot - Before I Forget'

# Row(s) of the embedding that match the chosen song.
projection_music[projection_music['song'] == music_name]

In [None]:
# Cluster label of the chosen song (first match if the label occurs more than once).
is_chosen_song = projection_music['song'] == music_name
cluster = projection_music.loc[is_chosen_song, 'cluster_pca'].tolist()[0]

cluster

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

# Candidate tracks: everything in the same cluster as the chosen song.
# `.copy()` gives an independent frame, so the column assignments below do not
# write into a slice of `projection_music` (avoids SettingWithCopyWarning).
recommendations = projection_music[projection_music['cluster_pca'] == cluster][[0,1,'song']].copy()

# Coordinates of the chosen song on PCA columns 0 and 1 (first match), looked up once.
chosen_song = projection_music[projection_music['song'] == music_name]
x_music = list(chosen_song[0])[0]
y_music = list(chosen_song[1])[0]

# Euclidean distance from every candidate to the chosen song
distances = euclidean_distances(recommendations[[0,1]], [[x_music, y_music]])

# `id` is aligned on the row index shared by `general` and `projection_music`.
recommendations['id'] = general['id']
recommendations['distance'] = distances

# The best recommendations come first: the 10 candidates with the smallest distance.
# The chosen song itself is part of its own cluster, so it is listed with distance 0.
recommendations = recommendations.sort_values('distance').head(10)

recommendations

In [None]:
# Spotipy
import os

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth

# Credentials are read from the environment so real secrets never have to be
# written into (or committed with) the notebook. The original literals are kept
# only as fallbacks, so the cell behaves as before when the variables are unset;
# 'client_secret' is a placeholder, not a usable secret.
client_id = os.environ.get('SPOTIPY_CLIENT_ID', '5465959bb5ee4dfdac8fe6048f015511')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET', 'client_secret')
scope = 'user-library-read playlist-modify-private'

OAuth = SpotifyOAuth(client_id=client_id, client_secret=client_secret, scope=scope, redirect_uri='http://localhost:8888/callback')

# NOTE(review): both a client-credentials manager and an OAuth manager are passed;
# confirm which one spotipy actually uses when both are supplied.
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager, auth_manager=OAuth)

In [None]:
import matplotlib.pyplot as plt
from skimage import io

music_name = 'Slipknot - Before I Forget'
# Stored as `track_id` rather than `id`, so the built-in `id()` is not
# shadowed for the rest of the kernel session.
track_id = general[general['artists_song'] == music_name]['id'].iloc[0]
track = sp.track(track_id)
# First entry of the album's image list and the track title, as returned by Spotify.
url = track['album']['images'][0]['url']
name = track['name']

# Download the cover and show it with the track title underneath.
image = io.imread(url)
plt.imshow(image)
plt.xlabel(name, fontsize=20)
plt.show() # Slipknot - Before I Forget.png

In [58]:
def recommended_id(playlist_id):
    """Look up cover-image URL and title for each Spotify track id.

    Parameters
    ----------
    playlist_id : iterable
        Track ids; each one is passed to ``sp.track`` in order.

    Returns
    -------
    tuple of (list, list)
        ``(urls, names)``: two lists in input order holding, for every track,
        the URL of the first album image and the track name.
    """
    covers_and_titles = []
    for track_id in playlist_id:
        info = sp.track(track_id)
        covers_and_titles.append((info['album']['images'][0]['url'], info['name']))

    cover_urls = [cover for cover, _ in covers_and_titles]
    titles = [title for _, title in covers_and_titles]
    return cover_urls, titles

In [60]:
# Cover URLs and titles for the ids of the recommended tracks.
url, name = recommended_id(recommendations['id'])

In [73]:
def plot_recommended(url, name):
    """Show the album covers of the recommended tracks in a 5-column grid.

    Parameters
    ----------
    url : sequence
        Image locations, each readable by ``io.imread``.
    name : sequence
        Captions; ``name[n]`` is written under the image from ``url[n]``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    columns = 5
    # Ceiling division: exactly as many rows as the images need. The previous
    # `len(url) // columns + 1` reserved a spare empty row whenever the number
    # of images was a multiple of `columns` (e.g. 3 rows for 10 covers).
    rows = -(-len(url) // columns)

    plt.figure(figsize=(15, 10))

    for n, u in enumerate(url):
        ax = plt.subplot(rows, columns, n + 1)
        image = io.imread(u)
        plt.imshow(image)
        # Hide axis decorations so that only the cover and its caption remain.
        ax.get_yaxis().set_visible(False)
        plt.xticks(color='w')
        plt.yticks(color='w')
        plt.xlabel(name[n], fontsize = 8)
        plt.tick_params(bottom=False, left=False)
        plt.grid(False)

    # Lay the grid out once, after all subplots exist, instead of on every iteration.
    if rows:
        plt.tight_layout(h_pad=0.7, w_pad=0)
    plt.show()
        

In [None]:
# Draw the covers of the recommended tracks.
plot_recommended(url, name) # top10_recommended.png

Final Product

In [77]:
def recommender(music_name):
    """Plot the covers of the 10 tracks closest to ``music_name``.

    Candidates are the rows of ``projection_music`` in the same cluster as the
    chosen song; they are ranked by Euclidean distance on PCA columns 0 and 1.
    The chosen song belongs to its own cluster, so it appears among the results
    with distance 0.

    Parameters
    ----------
    music_name : str
        Value to look up in the ``song`` column of ``projection_music``
        (the notebook uses the "artist - song" format).

    Returns
    -------
    None
        The covers are fetched with ``recommended_id`` and drawn with
        ``plot_recommended``.

    Raises
    ------
    IndexError
        If no row of ``projection_music`` has ``song == music_name``.
    """
    # Look the song up once; fail with a readable message when it is unknown
    # (same exception type as the previous `list(...)[0]` lookup).
    matches = projection_music[projection_music['song'] == music_name]
    if matches.empty:
        raise IndexError(f"song {music_name!r} not found in projection_music['song']")

    # Cluster and coordinates of the chosen song (first match).
    cluster = matches['cluster_pca'].iloc[0]
    x_music = matches[0].iloc[0]
    y_music = matches[1].iloc[0]

    # Independent copy of the candidates, so that adding columns below does not
    # write into a slice of `projection_music`.
    same_cluster = projection_music['cluster_pca'] == cluster
    recommendations = projection_music[same_cluster][[0, 1, 'song']].copy()

    # Euclidean distance from every candidate to the chosen song.
    distances = euclidean_distances(recommendations[[0, 1]], [[x_music, y_music]])

    # `id` is aligned on the row index shared by `general` and `projection_music`.
    recommendations['id'] = general['id']
    recommendations['distance'] = distances.ravel()

    # Closest candidates first.
    recommendations = recommendations.sort_values('distance').head(10)

    # Reuse the notebook's helpers instead of repeating their bodies here.
    url, name = recommended_id(recommendations['id'])
    plot_recommended(url, name)

In [None]:
# List the embedded tracks of another artist to pick a song label from.
artist = 'Green Day'
projection_music[projection_music['artists'] == artist]

In [None]:
# Run the full recommender for one song; it plots the covers of the closest tracks.
recommender('Green Day - American Idiot') # Green Day - American Idiot_recommended.png