In [351]:
!pip install pyspark



In [352]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark import SparkFiles


sessao_spark = SparkSession.builder\
                           .appName("Recomendador PySpark")\
                           .getOrCreate()

# Análise dos dados


## Tabela "musicas"

In [353]:
url_dados = 'https://github.com/IgorNascAlves/dados/blob/main/dados_musicas.csv?raw=true'
sessao_spark.sparkContext.addFile(url_dados)
path_dados_file = 'file://' + SparkFiles.get('dados_musicas.csv')
dados = sessao_spark.read.csv(path_dados_file, header=True, sep=';', inferSchema=True)

In [354]:
dados.show()

+------------------+----+------------+--------------+------------------+-----------+------------------+--------+--------------------+--------------------+---+--------+-------------------+----+--------------------+----------+-----------+------------------+--------------------+
|           valence|year|acousticness|       artists|      danceability|duration_ms|            energy|explicit|                  id|    instrumentalness|key|liveness|           loudness|mode|                name|popularity|speechiness|             tempo|        artists_song|
+------------------+----+------------+--------------+------------------+-----------+------------------+--------+--------------------+--------------------+---+--------+-------------------+----+--------------------+----------+-----------+------------------+--------------------+
|             0.285|2000|     0.00239|      Coldplay|             0.429|     266773|0.6609999999999999|       0|3AJwUDP919kvQ9Qco...|             1.21E-4| 11|   0.234|  

In [355]:
# Remoção das variáveis que não são importantes para a clusterização

dados = dados.drop(*["tempo", "explicit"])

In [356]:
dados.printSchema()

root
 |-- valence: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- artists: string (nullable = true)
 |-- danceability: double (nullable = true)
 |-- duration_ms: integer (nullable = true)
 |-- energy: double (nullable = true)
 |-- id: string (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- key: integer (nullable = true)
 |-- liveness: double (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- artists_song: string (nullable = true)



In [357]:
dados.count()

20311

In [358]:
len(dados.columns)

17

In [359]:
dados.describe().toPandas()

Unnamed: 0,summary,valence,year,acousticness,artists,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,name,popularity,speechiness,artists_song
0,count,20311.0,20311.0,20311.0,20311,20311.0,20311.0,20311.0,20311,20311.0,20311.0,20311.0,20311.0,20311.0,20311,20311.0,20311.0,20311
1,mean,0.4819282694106629,2010.1832012210127,0.2427988409640108,311.0,0.5877016444291282,228628.97582590717,0.6507661190931082,,0.0601727205204076,5.239673083550786,0.1940039436758413,-7.083095120870492,0.6590517453596574,Infinity,56.44187878489488,0.1006024715671306,
2,stddev,0.2464126044283155,6.063967678081537,0.2844158390976112,0.0,0.1713221446483973,70465.2332595994,0.2217740293079784,,0.1993150557263067,3.5834791842526643,0.1577578644501423,4.320102071415199,0.4740396670519579,,11.56095130904621,0.1060598558222307,
3,min,0.0,2000.0,0.0,$NOT,0.0,30301.0,0.0,000GyYHG4uWmlXieKLij8u,0.0,0.0,0.0,-60.0,0.0,"""""""The Take Over, The Breaks Over""""""",0.0,0.0,"""*NSYNC - Space Cowboy (Yippie-Yi-Yay) (feat. ..."
4,max,1.0,2020.0,0.996,iann dior,0.986,2558287.0,0.999,7zywdG4ysfC5XNBzjQAo2o,1.0,11.0,0.997,1.483,1.0,龍捲風,100.0,0.95,iann dior - romance361


In [360]:
# Verificação de valores ausentes

dados.select([f.count(f.when(f.isnull(c), 1)).alias(c) for c in dados.columns]).show()

+-------+----+------------+-------+------------+-----------+------+---+----------------+---+--------+--------+----+----+----------+-----------+------------+
|valence|year|acousticness|artists|danceability|duration_ms|energy| id|instrumentalness|key|liveness|loudness|mode|name|popularity|speechiness|artists_song|
+-------+----+------------+-------+------------+-----------+------+---+----------------+---+--------+--------+----+----+----------+-----------+------------+
|      0|   0|           0|      0|           0|          0|     0|  0|               0|  0|       0|       0|   0|   0|         0|          0|           0|
+-------+----+------------+-------+------------+-----------+------+---+----------------+---+--------+--------+----+----+----------+-----------+------------+



In [361]:
# Verificação dos anos

print(sorted(dados.select('year').distinct().collect()))

[Row(year=2000), Row(year=2001), Row(year=2002), Row(year=2003), Row(year=2004), Row(year=2005), Row(year=2006), Row(year=2007), Row(year=2008), Row(year=2009), Row(year=2010), Row(year=2011), Row(year=2012), Row(year=2013), Row(year=2014), Row(year=2015), Row(year=2016), Row(year=2017), Row(year=2018), Row(year=2019), Row(year=2020)]


In [362]:
# Média da variação intensidade sonora nos anos

import plotly.graph_objects as go
import plotly.express as px

mean_loudness = dados.groupBy('year').agg(f.mean('loudness').alias('mean_loudness')).sort('year')

fig = px.line(mean_loudness, x='year', y='mean_loudness', markers=True, title='Variação da intensidade sonora conforme os anos')
fig.show()

In [363]:
# Visualização da variabilidade das variaveis em relação aos anos
fig = go.Figure()

temp = dados.groupBy('year').agg(
    *[f.mean(x).alias(f'mean {x}') for x in dados.columns if x != 'year']
).sort('year').toPandas()

fig.add_trace(go.Scatter(x=temp['year'], y=temp['mean acousticness'], name='Acousticness'))
fig.add_trace(go.Scatter(x=temp['year'], y=temp['mean valence'],
                    name='Valence'))
fig.add_trace(go.Scatter(x=temp['year'], y=temp['mean danceability'],
                    name='Danceability'))
fig.add_trace(go.Scatter(x=temp['year'], y=temp['mean energy'],
                    name='Energy'))
fig.add_trace(go.Scatter(x=temp['year'], y=temp['mean instrumentalness'],
                    name='Instrumentalness'))
fig.add_trace(go.Scatter(x=temp['year'], y=temp['mean liveness'],
                    name='Liveness'))
fig.add_trace(go.Scatter(x=temp['year'], y=temp['mean speechiness'],
                    name='Speechiness'))

fig.show()

## Aula 2.3 Matriz de correlação

In [364]:
fig = px.imshow(dados.drop('mode').toPandas().corr(numeric_only=True), text_auto=True)
fig.show()

Considerações finais sobre a análise descritiva dos dados:
- O atributo relacionado a empolgação da música tem forte relação (considerando o método de Pearson) com o volume.
- Não há grandes variações das variáveis numéricas em relação ao ano.  

# Modelo de recomendação


# Aula 4 - Clusterização por música

## Aula 4.1 Redução de dimensionalidade com PCA

In [365]:
from pyspark.ml.feature import PCA, StandardScaler, VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.functions import vector_to_array
from pyspark.ml import Pipeline
import numpy as np


# Remoção das variáveis categóricas

X = dados.columns
X.remove('artists')
X.remove('id')
X.remove('name')
X.remove('artists_song')
X.remove('key')
X.remove('mode')

In [366]:
X

['valence',
 'year',
 'acousticness',
 'danceability',
 'duration_ms',
 'energy',
 'instrumentalness',
 'liveness',
 'loudness',
 'popularity',
 'speechiness']

In [367]:
# Vetorização do dataframe
dados_encoded_vector = VectorAssembler(inputCols=X, outputCol='features').transform(dados)

In [368]:
dados_encoded_vector.select('features').show(truncate=False, n=5)

+-----------------------------------------------------------------------------------------------------+
|features                                                                                             |
+-----------------------------------------------------------------------------------------------------+
|[0.285,2000.0,0.00239,0.429,266773.0,0.6609999999999999,1.21E-4,0.234,-7.227,84.0,0.0281]            |
|[0.613,2000.0,0.143,0.843,270507.0,0.8059999999999999,0.0,0.0771,-5.9460000000000015,80.0,0.269]     |
|[0.4,2000.0,0.00958,0.556,216880.0,0.8640000000000001,0.0,0.209,-5.87,84.0,0.0584]                   |
|[0.5429999999999999,2000.0,0.00664,0.545,233933.0,0.865,1.1E-5,0.168,-5.7079999999999975,78.0,0.0286]|
|[0.76,2000.0,0.0302,0.949,284200.0,0.6609999999999999,0.0,0.0454,-4.244,80.0,0.0572]                 |
+-----------------------------------------------------------------------------------------------------+
only showing top 5 rows



In [369]:
# Normalização das variáveis

scaler = StandardScaler(inputCol='features', outputCol='features_scaled')
model_scaler = scaler.fit(dados_encoded_vector)
dados_musicas_scaler = model_scaler.transform(dados_encoded_vector)

In [370]:
dados_musicas_scaler.select('features_scaled').show(truncate=False, n=5)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features_scaled                                                                                                                                                                                                     |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[1.156596679221045,329.8170613984441,0.008403188822334736,2.5040545743834373,3.7858811737298526,2.980511298201045,6.070790766862789E-4,1.4832857988766257,-1.672877140523799,7.265838057312001,0.2649447312760731]  |
|[2.4876974188157917,329.8170613984441,0.5027849379053837,4.9205547930192015,3.8388718448349017,3.634329964220942,0.0,0.48872365424524716,-1

In [371]:
k = len(X)
k

11

In [372]:
# Seleção do melhor número dimensões
lista_valores = [sum(model_pca.explainedVariance[0:i+1]) for i in range(k)]
lista_valores

[0.278665827097831,
 0.44149528137440053,
 0.559435012822716,
 0.6569888890820815,
 0.7395159941439322,
 0.8111141607920362,
 0.875421487582109,
 0.9201794397916018,
 0.9201794397916018,
 0.9201794397916018,
 0.9201794397916018]

In [373]:
k = sum(np.array(lista_valores) <= 0.85)
k

6

In [374]:
# Treinamento do PCA

pca = PCA(k=k, inputCol='features_scaled', outputCol='pca_features')
model_pca = pca.fit(dados_musicas_scaler)
dados_musicas_pca_final = model_pca.transform(dados_musicas_scaler)

In [375]:
dados_musicas_pca_final.select('pca_features').show(truncate=False, n=5)

+--------------------------------------------------------------------------------------------------------------------+
|pca_features                                                                                                        |
+--------------------------------------------------------------------------------------------------------------------+
|[-23.469331234339744,191.9372421920982,120.9814748770573,1.5563286633453421,-20.690273879418424,3.58082489129993]   |
|[-25.021543350318545,192.9401960738028,118.94458130635749,2.81700654954751,-18.80998673344184,2.7685091096427814]   |
|[-24.464333702373533,192.1962057815164,120.30332444666243,1.563159140908613,-20.92834578458773,3.1216085440427923]  |
|[-24.52833687602008,191.82623290616436,119.88246504712356,1.2584481970074808,-20.731776881421155,3.2818995808756606]|
|[-25.114795027441307,192.8897440967857,118.45153191166948,1.2840665127574855,-18.983950952921493,4.025062279791538] |
+-----------------------------------------------

In [376]:
sum(model_pca.explainedVariance) *100

81.11141607920362

## Aplicação do cluster com K-Means

In [377]:
pca_pipeline = Pipeline(stages=[VectorAssembler(inputCols=X, outputCol='features'),
                                StandardScaler(inputCol='features', outputCol='features_scaled'),
                                PCA(k=6, inputCol='features_scaled', outputCol='pca_features')])

In [378]:
model_pca_pipeline = pca_pipeline.fit(dados)

In [379]:
projection = model_pca_pipeline.transform(dados)

In [380]:
projection.select('pca_features').show(truncate=False, n=5)

+--------------------------------------------------------------------------------------------------------------------+
|pca_features                                                                                                        |
+--------------------------------------------------------------------------------------------------------------------+
|[-23.469331234339744,191.9372421920982,120.9814748770573,1.5563286633453421,-20.690273879418424,3.58082489129993]   |
|[-25.021543350318545,192.9401960738028,118.94458130635749,2.81700654954751,-18.80998673344184,2.7685091096427814]   |
|[-24.464333702373533,192.1962057815164,120.30332444666243,1.563159140908613,-20.92834578458773,3.1216085440427923]  |
|[-24.52833687602008,191.82623290616436,119.88246504712356,1.2584481970074808,-20.731776881421155,3.2818995808756606]|
|[-25.114795027441307,192.8897440967857,118.45153191166948,1.2840665127574855,-18.983950952921493,4.025062279791538] |
+-----------------------------------------------

In [381]:
kmeans = KMeans(k=50, featuresCol='pca_features', predictionCol='cluster_pca', seed=51)

In [382]:
modelo_kmeans = kmeans.fit(projection)

In [383]:
projetion_kmeans = modelo_kmeans.transform(projection)

In [384]:
projetion_kmeans.select(['pca_features','cluster_pca']).show()

+--------------------+-----------+
|        pca_features|cluster_pca|
+--------------------+-----------+
|[-23.469331234339...|         35|
|[-25.021543350318...|         47|
|[-24.464333702373...|         26|
|[-24.528336876020...|         49|
|[-25.114795027441...|         30|
|[-25.425876679541...|         30|
|[-24.727753819290...|         30|
|[-25.120275972778...|         19|
|[-25.183616298458...|         49|
|[-24.989471132729...|         16|
|[-20.701904779539...|         20|
|[-24.839269428333...|         49|
|[-24.820183029360...|         14|
|[-24.521349965141...|         30|
|[-23.248516314253...|         22|
|[-24.141133579299...|         16|
|[-24.364883359701...|         43|
|[-24.091245115852...|          5|
|[-24.850853496623...|          9|
|[-23.237755256876...|         33|
+--------------------+-----------+
only showing top 20 rows



In [385]:
projetion_kmeans = projetion_kmeans.withColumn('x', vector_to_array('pca_features')[0])\
                                   .withColumn('y', vector_to_array('pca_features')[1])

In [386]:
projetion_kmeans.show()

+------------------+----+------------+--------------+------------------+-----------+------------------+--------------------+--------------------+---+--------+-------------------+----+--------------------+----------+-----------+--------------------+--------------------+--------------------+--------------------+-----------+-------------------+------------------+
|           valence|year|acousticness|       artists|      danceability|duration_ms|            energy|                  id|    instrumentalness|key|liveness|           loudness|mode|                name|popularity|speechiness|        artists_song|            features|     features_scaled|        pca_features|cluster_pca|                  x|                 y|
+------------------+----+------------+--------------+------------------+-----------+------------------+--------------------+--------------------+---+--------+-------------------+----+--------------------+----------+-----------+--------------------+--------------------+-----

# Aula 5 - Sistemas de Recomendação

## Aula 5.3 Biblioteca Spotipy

In [403]:
!pip install spotipy



In [404]:
# Importação das bibliotecas para conexão com Spotify

import spotipy
from spotipy.oauth2 import SpotifyOAuth, SpotifyClientCredentials

**ATENÇÃO!**

Antes de rodar essa parte do código, você precisa fazer uma conta na API do Spotify e gerar suas próprias **client_id** e **client_secret**

In [405]:
scope = "user-library-read playlist-modify-private"

OAuth = SpotifyOAuth(
        scope=scope,
        redirect_uri='http://localhost:5000/callback',
        client_id = '566b010ba65541ebb537ea12f225fee3',
        client_secret = '0d71d1191ca7418d9af15433c6acfd62')

In [406]:
client_credentials_manager = SpotifyClientCredentials(client_id = '566b010ba65541ebb537ea12f225fee3',
                                                      client_secret = '0d71d1191ca7418d9af15433c6acfd62')

sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [409]:
def recomendador(nome_musica):
  # Calcula musicas recomendadas
  cluster = projetion_kmeans.filter(projetion_kmeans.artists_song == nome_musica).select('cluster_pca').collect()[0][0]
  musicas_recomendadas = projetion_kmeans.filter(projetion_kmeans.cluster_pca == cluster)\
                                       .select('artists_song', 'id', 'pca_features')
  componenetes_musica = musicas_recomendadas.filter(musicas_recomendadas.artists_song == nome_musica)\
                                          .select('pca_features').collect()[0][0]

  def calcula_distance(value):
    return euclidean(componenetes_musica, value)

  udf_calcula_distance = f.udf(calcula_distance, FloatType())

  musicas_recomendadas_dist = musicas_recomendadas.withColumn('Dist', udf_calcula_distance('pca_features'))

  recomendadas = sessao_spark.createDataFrame(musicas_recomendadas_dist.sort('Dist').take(10)).select(['artists_song', 'id', 'Dist'])

  recomendadas.select('artists_song').show(truncate=False)

  #Pegar informações da API

  playlist_id = recomendadas.select('id').collect()

  playlist_track = []

  for id in playlist_id:
    playlist_track.append(sp.track(id[0]))

  return len(playlist_track)

In [410]:
recomendador('Coldplay - Yellow')

+------------------------------------------+
|artists_song                              |
+------------------------------------------+
|Coldplay - Yellow                         |
|Avril Lavigne - When You're Gone          |
|The Script - For the First Time           |
|Cody Johnson - I Ain't Going Nowhere Baby |
|The Killers - Here With Me                |
|Chris Tomlin - At The Cross (Love Ran Red)|
|Foo Fighters - These Days                 |
|Lana Del Rey - Born To Die                |
|Kings of Leon - Back Down South           |
|Paramore - Last Hope                      |
+------------------------------------------+



10

https://developer.spotify.com/documentation/web-api/reference/#/operations/get-track