# <font color='blue'>Data Science Academy</font>
# <font color='blue'>Big Data Real-Time Analytics com Python e Spark</font>

## <font color='blue'>Mini-Projeto 7</font>

### <font color='blue'>Sistema de Recomendação em Tempo Real com Machine Learning, PySpark, Spark Streaming e Kafka</font>

![title](imagens/MP7.png)

In [None]:
# Versão da Linguagem Python
from platform import python_version
print('Versão da Linguagem Python Usada Neste Jupyter Notebook:', python_version())

In [None]:
# Para atualizar um pacote, execute o comando abaixo no terminal ou prompt de comando:
# pip install -U nome_pacote

# Para instalar a versão exata de um pacote, execute o comando abaixo no terminal ou prompt de comando:
#!pip install nome_pacote==versão_desejada

# Depois de instalar ou atualizar o pacote, reinicie o jupyter notebook.

# Instala o pacote watermark. 
# Esse pacote é usado para gravar as versões de outros pacotes usados neste jupyter notebook.
#!pip install -q -U watermark

In [1]:
# Imports
import os
import time
import random
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import Normalizer, StandardScaler

In [2]:
# Versões dos pacotes usados neste jupyter notebook
%reload_ext watermark
%watermark -a "Data Science Academy" --iversions

Author: Data Science Academy

sys    : 3.10.9 (main, Mar  1 2023, 18:23:06) [GCC 11.2.0]
pyspark: 3.3.2



In [3]:
# Endereço do servidor Kafka
SERVER = 'localhost:9092'

In [4]:
# Nome do tópico
TOPIC = "dsaminiprojeto7"

In [5]:
# Conectores do Spark para o Apache Kafka
# jar (java archive), é um arquivo compactado que contém classes java.
spark_jars =  ("{},{},{},{},{}".format(os.getcwd() + "/jars/spark-sql-kafka-0-10_2.12-3.2.1.jar",  
                                       os.getcwd() + "/jars/kafka-clients-2.1.1.jar", 
                                       os.getcwd() + "/jars/spark-streaming-kafka-0-10-assembly_2.12-3.3.2.jar", 
                                       os.getcwd() + "/jars/commons-pool2-2.8.0.jar",  
                                       os.getcwd() + "/jars/spark-token-provider-kafka-0-10_2.12-3.1.2.jar"))

In [6]:
# Inicializa sessão Spark
spark = SparkSession \
        .builder \
        .config("spark.jars", spark_jars) \
        .appName("Mini-Projeto7") \
        .getOrCreate()

23/07/20 05:15:36 WARN Utils: Your hostname, leonam-pc resolves to a loopback address: 127.0.1.1; using 192.168.15.37 instead (on interface wlx64700215b737)
23/07/20 05:15:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/07/20 05:15:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [7]:
spark.sparkContext.setLogLevel("ERROR")

In [8]:
# Usamos o Spark Streaming para leitura do streaming de dados do Kafka e salvamos em um dataframe
df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", SERVER) \
        .option("subscribe", TOPIC) \
        .option("startingOffsets", "latest") \
        .load()

In [9]:
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [10]:
# Selecionamos a coluna timestamp como string e salvamos em um novo dataframe
df1 = df.selectExpr("CAST(value AS STRING)", "timestamp") 
df1.printSchema()

root
 |-- value: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [11]:
# Definimos o schema com o nome de cada coluna e o tipo de dado
def_schema = "order_id INT, id STRING, name STRING, popularity INT, duration_ms DOUBLE, " \
             + "artists STRING, id_artists STRING, release_date STRING, " \
             + "danceability DOUBLE,energy DOUBLE, key INT, loudness DOUBLE, " \
             + "mode INT,speechiness DOUBLE," \
             + "acousticness DOUBLE, instrumentalness DOUBLE, liveness DOUBLE, " \
             + "valence DOUBLE, tempo DOUBLE, time_signature DOUBLE"

In [12]:
# Selecionamos o streaming de dados de acordo com o schema e salvamos em um novo dataframe
df2 = df1.select(from_csv(col("value"), def_schema).alias("song"), "timestamp")
df2.printSchema()

root
 |-- song: struct (nullable = true)
 |    |-- order_id: integer (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- popularity: integer (nullable = true)
 |    |-- duration_ms: double (nullable = true)
 |    |-- artists: string (nullable = true)
 |    |-- id_artists: string (nullable = true)
 |    |-- release_date: string (nullable = true)
 |    |-- danceability: double (nullable = true)
 |    |-- energy: double (nullable = true)
 |    |-- key: integer (nullable = true)
 |    |-- loudness: double (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- speechiness: double (nullable = true)
 |    |-- acousticness: double (nullable = true)
 |    |-- instrumentalness: double (nullable = true)
 |    |-- liveness: double (nullable = true)
 |    |-- valence: double (nullable = true)
 |    |-- tempo: double (nullable = true)
 |    |-- time_signature: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [13]:
# Criamos uma view na memória do Spark e visualizamos o schema
df3 = df2.select("song.*", "timestamp")  
df3.createOrReplaceTempView("df3_View");
df3.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- duration_ms: double (nullable = true)
 |-- artists: string (nullable = true)
 |-- id_artists: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: integer (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: integer (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- time_signature: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [14]:
# Selecionamos os dados com as músicas do stream
musicas_stream = spark.sql("SELECT * FROM df3_View")

In [None]:
# Não podemos visualizar ainda, pois temos que gerar o stream do Spark Streaming
# musicas_stream.show()

In [15]:
# Criamos o stream de dados no Spark Streaming
musicas_stream_spark = musicas_stream \
        .writeStream \
        .trigger(processingTime = '5 seconds') \
        .outputMode("append") \
        .option("truncate", "false") \
        .format("memory") \
        .queryName("tabela_spark") \
        .start()

musicas_stream_spark.awaitTermination(1)

False

In [16]:
# Selecionamos as músicas da tabela de stream do Spark
spark_songs = spark.sql("SELECT * FROM tabela_spark")

In [17]:
# Agora sim podemos visualizar o stream em tempo real como tabela do Spark
spark_songs.show(5)

+--------+--------------------+----------+----------+-----------+--------------------+--------------------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+--------------------+
|order_id|                  id|      name|popularity|duration_ms|             artists|          id_artists|release_date|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|           timestamp|
+--------+--------------------+----------+----------+-----------+--------------------+--------------------+------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+--------------------+
|     606|65qxSIjP13dioahJb...| sUpa Ugly|         0|   196897.0|         GodDamnChan|   AvHXmhShvORtjrNQK|  2018-02-23|       0.602| 0.681|  2| -12.059|   1|     0.0648|       0.634|           0.806|   0.086|  0.3

In [18]:
# Podemos visualizar apenas algumas colunas, por exemplo
spark_songs.select('order_id', 'id', 'name', 'popularity', 'duration_ms', 'artists').show(5)

+--------+--------------------+--------------------+----------+-----------+--------------------+
|order_id|                  id|                name|popularity|duration_ms|             artists|
+--------+--------------------+--------------------+----------+-----------+--------------------+
|     606|65qxSIjP13dioahJb...|           sUpa Ugly|         0|   196897.0|         GodDamnChan|
|     607|5UTbwnxRh9DUFMKHS...|                Dusk|         0|   255233.0|        WhyspBaribal|
|     608|29MyHIVuh2jrMIxFO...|          Give It Up|        17|   159252.0|YannLaurenWolfski...|
|     609|5YREQJO8jVZSCW1Mp...|Fight or Flight Club|         0|   151117.0|               Madge|
|     610|68PgM3ORNYfhr9iO9...|         Blue Cheese|         0|   266181.0|             ChezAli|
+--------+--------------------+--------------------+----------+-----------+--------------------+
only showing top 5 rows



In [19]:
# Contagem de músicas extraídas em tempo real
spark_songs.count()

23

Aguarde alguns minutos antes de seguir com a execução para que o streaming de dados possa ser coletado.

> Vamos agora trabalhar na extração de dados do Spotify.

In [20]:
# https://pypi.org/project/spotipy/
!pip install -q spotipy

In [21]:
# Imports
import os
import ujson
import spotipy
import spotipy.util
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
import random
warnings.filterwarnings("ignore")

In [22]:
# Versões dos pacotes usados neste jupyter notebook
%reload_ext watermark
%watermark -a "Data Science Academy" --iversions

Author: Data Science Academy

matplotlib: 3.7.0
sys       : 3.10.9 (main, Mar  1 2023, 18:23:06) [GCC 11.2.0]
pyspark   : 3.3.2
seaborn   : 0.12.2
pandas    : 1.5.3
spotipy   : 2.23.0
numpy     : 1.23.5
ujson     : 5.4.0



                                                                                

Leia o manual em pdf no Capítulo 16 do curso com os detalhes sobre a criação da API.

In [25]:
# Aqui você coloca as suas chaves da API do Spotify
os.environ["SPOTIPY_CLIENT_ID"] = 'inclua_aqui_seu_client_id'
os.environ["SPOTIPY_CLIENT_SECRET"] = 'inclua_aqui_seu_client_secret'
os.environ["SPOTIPY_REDIRECT_URI"] = 'http://localhost:7777/callback'

                                                                                

https://developer.spotify.com/documentation/general/guides/authorization/scopes/

In [24]:
# Escopo de extração das preferências do usuário
scope = 'user-library-read'

                                                                                

In [26]:
# Username no Spotify
username = 'inclua_aqui_seu_email_spotify'

                                                                                

In [27]:
# Criação do tohen de acesso
token = spotipy.util.prompt_for_user_token(username, scope)

In [28]:
# Cria o objeto de autenticação
spotipy_obj = spotipy.Spotify(auth = token)

                                                                                

In [29]:
# Extrai até 50 músicas da lista de favoritos do usuário
saved_tracks = spotipy_obj.current_user_saved_tracks(limit = 50) 

In [30]:
saved_tracks

{'href': 'https://api.spotify.com/v1/me/tracks?offset=0&limit=50',
 'items': [{'added_at': '2023-07-20T06:41:06Z',
   'track': {'album': {'album_type': 'album',
     'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/4Iyvo65peG4ImDz5AzTqQB'},
       'href': 'https://api.spotify.com/v1/artists/4Iyvo65peG4ImDz5AzTqQB',
       'id': '4Iyvo65peG4ImDz5AzTqQB',
       'name': 'Element Eighty',
       'type': 'artist',
       'uri': 'spotify:artist:4Iyvo65peG4ImDz5AzTqQB'}],
     'available_markets': ['AD',
      'AG',
      'AR',
      'AT',
      'BD',
      'BE',
      'BG',
      'BI',
      'BJ',
      'BN',
      'BO',
      'BR',
      'BT',
      'BW',
      'CA',
      'CH',
      'CI',
      'CL',
      'CM',
      'CO',
      'CR',
      'CV',
      'CY',
      'CZ',
      'DE',
      'DK',
      'DM',
      'DO',
      'EC',
      'EE',
      'ES',
      'FI',
      'FM',
      'FR',
      'GB',
      'GD',
      'GN',
      'GQ',
      'GR',
      'GT',
  

In [31]:
# Número de músicas extraídas
n_tracks = saved_tracks['total']
print('Total de Tracks: %d ' % n_tracks)

Total de Tracks: 101 


In [32]:
# Função para extrair os atributos da lista de músicas do usuário
def select_features(track_response):
    return {        
        'id': str(track_response['track']['id']),
        'name': str(track_response['track']['name']),
        'artists': [artist['name'] for artist in track_response['track']['artists']],
        'popularity': track_response['track']['popularity']
    }

In [33]:
# Aplica a função
tracks = [select_features(track) for track in saved_tracks['items']]

                                                                                

In [34]:
# Extrai os atributos das músicas preferidas do usuário
while saved_tracks['next']:
    saved_tracks = spotipy_obj.next(saved_tracks)
    tracks.extend([select_features(track) for track in saved_tracks['items']])

In [35]:
# Criamos o dataframe do pandas
df_tracks = pd.DataFrame(tracks)
pd.set_option('display.max_rows', len(tracks))
df_tracks['artists'] = df_tracks['artists'].apply(lambda artists: artists[0])

In [36]:
df_tracks.head(10)

Unnamed: 0,id,name,artists,popularity
0,2YnvR7uhlWpPdYUn1eR25y,Broken Promises,Element Eighty,55
1,4oDynzwp2QDey7zDS9TGft,Nine Thou - Grant Mohrman Superstars Remix,Styles Of Beyond,64
2,3oZObqxgEjNuCiFfE11szK,Басы долбят,SOSKA 69,61
3,3WsgkhWH001sMkbZVcjreS,And the Hero Will Drown,Story Of The Year,55
4,5Txeau6Fi96zS4THXUJ4w7,IMMACULATE,VISXGE,71
5,3vc1sGOGAVxhx8S1kERqyn,You'll Be UNDER MY WHEELS,The Prodigy,56
6,6ifNuuYjiS5NdMabN4Jvrs,Six Days - Remix,DJ Shadow,67
7,01kfSdF9zfcDLri5sSWEoL,RAVE,Dxrk ダーク,85
8,6wk8RkIpZ6mbQigwKPr1oW,Riders,onimanxd,68
9,4cpmqPshK5SbCuGEORAKUS,SUICIDE YEAR - RAIZHELL Remix,WEEDMANE,56


                                                                                

In [37]:
# Dicionário para os atributos de áudio
audio_features = {}

In [38]:
# Extrai os atributos de áudio
for idd in df_tracks['id'].tolist():
    audio_features[idd] = spotipy_obj.audio_features(idd)[0]

                                                                                

In [39]:
# Adicionamos os atributos de áudio ao dataframe
df_tracks['acousticness'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['acousticness'])
df_tracks['speechiness'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['speechiness'])
df_tracks['key'] = df_tracks['id'].apply(lambda idd: str(audio_features[idd]['key']))
df_tracks['liveness'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['liveness'])
df_tracks['instrumentalness'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['instrumentalness'])
df_tracks['energy'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['energy'])
df_tracks['tempo'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['tempo'])
df_tracks['loudness'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['loudness'])
df_tracks['danceability'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['danceability'])
df_tracks['valence'] = df_tracks['id'].apply(lambda idd: audio_features[idd]['valence'])

In [40]:
df_tracks.head()

Unnamed: 0,id,name,artists,popularity,acousticness,speechiness,key,liveness,instrumentalness,energy,tempo,loudness,danceability,valence
0,2YnvR7uhlWpPdYUn1eR25y,Broken Promises,Element Eighty,55,0.00487,0.1,1,0.0385,1.4e-05,0.947,173.625,-2.983,0.446,0.653
1,4oDynzwp2QDey7zDS9TGft,Nine Thou - Grant Mohrman Superstars Remix,Styles Of Beyond,64,0.00208,0.104,1,0.263,0.0,0.874,107.934,-6.529,0.635,0.577
2,3oZObqxgEjNuCiFfE11szK,Басы долбят,SOSKA 69,61,0.00386,0.21,3,0.474,0.162,0.726,130.039,-7.426,0.755,0.415
3,3WsgkhWH001sMkbZVcjreS,And the Hero Will Drown,Story Of The Year,55,0.000687,0.242,4,0.403,0.0,0.985,100.223,-3.358,0.433,0.327
4,5Txeau6Fi96zS4THXUJ4w7,IMMACULATE,VISXGE,71,0.00151,0.358,7,0.661,5e-06,0.385,117.931,-7.568,0.78,0.34


In [52]:
# Selecionamos uma música randomicamente
musica_randomica = random.randint(0,len(df_tracks)-1)
df_musica_randomica = df_tracks.head(musica_randomica)[-1:]
df_musica_randomica

Unnamed: 0,id,name,artists,popularity,acousticness,speechiness,key,liveness,instrumentalness,energy,tempo,loudness,danceability,valence
29,1niIoI0IVTswfNxdyzxc4U,Quarter,Fuel,41,0.00012,0.0919,2,0.254,0.683,0.93,120.118,-5.297,0.422,0.201


In [53]:
# Músicas do streaming do Spark
spark_songs.show(5)

+--------------------+--------------------+----------+--------------------+------------+------+---+--------+-----------+------------+----------------+--------+-------+-------+
|                  id|                name|popularity|             artists|danceability|energy|key|loudness|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|
+--------------------+--------------------+----------+--------------------+------------+------+---+--------+-----------+------------+----------------+--------+-------+-------+
|65qxSIjP13dioahJb...|           sUpa Ugly|         0|         GodDamnChan|       0.602| 0.681|  2| -12.059|     0.0648|       0.634|           0.806|   0.086|  0.318| 64.994|
|5UTbwnxRh9DUFMKHS...|                Dusk|         0|        WhyspBaribal|       0.732| 0.302|  1| -14.041|      0.186|       0.617|           0.272|   0.118|  0.236| 90.042|
|29MyHIVuh2jrMIxFO...|          Give It Up|        17|YannLaurenWolfski...|       0.887| 0.441|  8|  -7.192|      0.154|

In [57]:
# Não precisamos mais dessas colunas
spark_songs = spark_songs.drop('order_id', 
                               'mode', 
                               'release_date', 
                               'id_artists',
                               'time_signature', 
                               'duration_ms',
                               'timestamp')

                                                                                

In [58]:
# Cria o dataframe com a música escolhida randomicamente
df_sp = spark.createDataFrame(df_musica_randomica)

In [59]:
# Concatena músicas do streaming do Spark com a música do Spotify
df = spark_songs.union(df_sp)

In [60]:
df.show(5)

+--------------------+--------------------+----------+--------------------+------------+------+---+--------+-----------+------------+----------------+--------+-------+-------+
|                  id|                name|popularity|             artists|danceability|energy|key|loudness|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|
+--------------------+--------------------+----------+--------------------+------------+------+---+--------+-----------+------------+----------------+--------+-------+-------+
|65qxSIjP13dioahJb...|           sUpa Ugly|         0|         GodDamnChan|       0.602| 0.681|  2| -12.059|     0.0648|       0.634|           0.806|   0.086|  0.318| 64.994|
|5UTbwnxRh9DUFMKHS...|                Dusk|         0|        WhyspBaribal|       0.732| 0.302|  1| -14.041|      0.186|       0.617|           0.272|   0.118|  0.236| 90.042|
|29MyHIVuh2jrMIxFO...|          Give It Up|        17|YannLaurenWolfski...|       0.887| 0.441|  8|  -7.192|      0.154|

In [61]:
df.count()

353

## Pré-Processamento dos Dados

In [62]:
# Preparamos o VectorAssembler
vetor = VectorAssembler(inputCols = ['danceability',
                                     'energy',
                                     'loudness',
                                     'speechiness',
                                     'acousticness',
                                     'instrumentalness',
                                     'liveness',
                                     'valence',
                                     'tempo'], 
                        outputCol = 'song_features')

In [63]:
# Descartamos valores inválidos
assembled = vetor.setHandleInvalid("skip").transform(df)

In [64]:
assembled.show(5)

+--------------------+--------------------+----------+--------------------+------------+------+---+--------+-----------+------------+----------------+--------+-------+-------+--------------------+
|                  id|                name|popularity|             artists|danceability|energy|key|loudness|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|       song_features|
+--------------------+--------------------+----------+--------------------+------------+------+---+--------+-----------+------------+----------------+--------+-------+-------+--------------------+
|65qxSIjP13dioahJb...|           sUpa Ugly|         0|         GodDamnChan|       0.602| 0.681|  2| -12.059|     0.0648|       0.634|           0.806|   0.086|  0.318| 64.994|[0.602,0.681,-12....|
|5UTbwnxRh9DUFMKHS...|                Dusk|         0|        WhyspBaribal|       0.732| 0.302|  1| -14.041|      0.186|       0.617|           0.272|   0.118|  0.236| 90.042|[0.732,0.302,-14....|
|29MyHIVuh2jrMI

In [66]:
# Preparamos o padronizador
std = StandardScaler(inputCol = 'song_features', outputCol = 'standardized')

In [67]:
# Treinamos o padronizador
scale = std.fit(assembled)

In [68]:
# Dataframe com dados padronizados
df = scale.transform(assembled)

In [69]:
df.show(5)

+--------------------+--------------------+----------+--------------------+------------+------+---+--------+-----------+------------+----------------+--------+-------+-------+--------------------+--------------------+
|                  id|                name|popularity|             artists|danceability|energy|key|loudness|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|       song_features|        standardized|
+--------------------+--------------------+----------+--------------------+------------+------+---+--------+-----------+------------+----------------+--------+-------+-------+--------------------+--------------------+
|65qxSIjP13dioahJb...|           sUpa Ugly|         0|         GodDamnChan|       0.602| 0.681|  2| -12.059|     0.0648|       0.634|           0.806|   0.086|  0.318| 64.994|[0.602,0.681,-12....|[3.41857490170965...|
|5UTbwnxRh9DUFMKHS...|                Dusk|         0|        WhyspBaribal|       0.732| 0.302|  1| -14.041|      0.186|       0

## Machine Learning com Aprendizado Não Supervisionado

In [70]:
# Cria o objeto do modelo
objeto_KMeans = KMeans(featuresCol = 'standardized', k = 3)

In [71]:
# Treina o modelo
modelo_KMeans = objeto_KMeans.fit(df)

In [72]:
# Previsões do modelo
df_output = modelo_KMeans.transform(df)

In [73]:
df_output.count()

421

In [74]:
df_output.show(10)

+--------------------+--------------------+----------+--------------------+------------+------+---+--------+-----------+------------+----------------+--------+-------+-------+--------------------+--------------------+----------+
|                  id|                name|popularity|             artists|danceability|energy|key|loudness|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|       song_features|        standardized|prediction|
+--------------------+--------------------+----------+--------------------+------------+------+---+--------+-----------+------------+----------------+--------+-------+-------+--------------------+--------------------+----------+
|65qxSIjP13dioahJb...|           sUpa Ugly|         0|         GodDamnChan|       0.602| 0.681|  2| -12.059|     0.0648|       0.634|           0.806|   0.086|  0.318| 64.994|[0.602,0.681,-12....|[3.41857490170965...|         0|
|5UTbwnxRh9DUFMKHS...|                Dusk|         0|        WhyspBaribal|       0.

## Sistema de Recomendação

In [75]:
# Classe
class RecoSystem():
    
    # Método construtor
    def __init__(self, data):
        self.data_ = data
    
    # Método de recomendação
    def Recomm(self, nome_musica, amount = 1):
        
        # Lista para as distâncias
        distancias = []
        
        # Seleciona a música
        song = self.data_[(self.data_.name.str.lower() == nome_musica.lower())].head(1).values[0]
        res_dt = self.data_[self.data_.name.str.lower() != nome_musica.lower()]
        
        # Loop para o cálculo das distâncias
        for i_song in tqdm(res_dt.values):
            
            # Inicializa a distância
            distancia = 0
            
            # Loop para calcular a distância
            for col in np.arange(len(res_dt.columns)):
                if not col in [0,1,2,14]:
                    distancia = distancia + np.absolute(float(song[col]) - float(i_song[col]))
            
            # Adiciona na lista de distâncias
            distancias.append(distancia)
        
        res_dt['distance'] = distancias
        res_dt = res_dt.sort_values('distance')
        
        columns = ['id','name', 
                   'artists', 
                   'acousticness', 
                   'liveness', 
                   'instrumentalness', 
                   'energy', 
                   'danceability', 
                   'valence']
        
        return res_dt[columns][:amount]

In [76]:
# Nomes das colunas
datalabel = df_output.select('id',
                             'name',
                             'artists',
                             'danceability',
                             'energy',
                             'key',
                             'loudness',
                             'speechiness',
                             'acousticness',
                             'instrumentalness',
                             'liveness',
                             'valence',
                             'tempo',
                             'prediction')

In [77]:
# Dataset final
df_final = datalabel.toPandas()
df_final.drop(df_final[df_final['artists'] == '0'].index, inplace = True)
df_final.drop_duplicates(inplace = True)
df_final.drop(df_final[df_final['danceability'] == 0.0000].index, inplace = True)
df_final.drop(df_final[df_final['liveness'] == 0.000].index, inplace = True)
df_final.drop(df_final[df_final['instrumentalness'] == 0.000000].index, inplace = True)
df_final.drop(df_final[df_final['energy'] == 0.0000].index, inplace = True)
df_final.drop(df_final[df_final['danceability'] == 0.000].index, inplace = True)
df_final.drop(df_final[df_final['valence'] == 0.000].index, inplace = True)

In [78]:
df_final.shape

(379, 14)

In [79]:
df_final.sample(5)

Unnamed: 0,id,name,artists,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,prediction
165,4pAcD7T540cRmPhpWlYcbQ,Song About A Dream,AlZanders,0.664,0.778,8,-9.757,0.0703,0.403,0.00197,0.0993,0.32,124.513,1
355,1TdtpAUTwx99XeEDULGVkE,Ryso,PsychicHealth,0.612,0.45,8,-18.505,0.0384,0.384,0.826,0.0876,0.0378,126.007,0
393,4bO0oh8PS9lmvImY20X6ac,Trapped in Your Universe,FatCamp,0.559,0.418,9,-10.914,0.0251,0.551,0.353,0.152,0.486,92.218,0
418,6AwRgxSdb1Ly25nQWjCjzI,What You Gonna Do,Nemi,0.639,0.465,0,-8.634,0.112,0.133,4e-06,0.0566,0.302,160.08,1
217,2HP4sePgGIRL243WsLfJtQ,That's My Girl,RakeemMiles,0.746,0.487,11,-7.491,0.34,0.108,0.151,0.0733,0.146,194.008,1


In [80]:
# Cria o objeto
reco_obj = RecoSystem(df_final)

In [81]:
musica = df_musica_randomica['name'].tolist()[0]

In [82]:
print(musica)

Quarter


In [83]:
# Executa a recomendação
recomendacao = reco_obj.Recomm(musica)

100%|██████████| 378/378 [00:00<00:00, 45182.30it/s]


In [84]:
# Extrai a música randômica da lista de favoritos do Spotify
y = df_musica_randomica[['id','name', 
                         'artists',  
                         'acousticness', 
                         'liveness', 
                         'instrumentalness', 
                         'energy', 
                         'danceability', 
                         'valence']]

In [85]:
# Concatena a recomendação com a música randômica da lista de favoritos do Spotify
recomendacao = pd.concat([recomendacao, y])

In [86]:
# Salva a recomendação em disco
recomendacao.to_csv('recomendacoes/recomendacao.csv')

In [87]:
# Carrega o arquivo do disco
df_reco = (spark.read.format("csv").options(header = "true").load("recomendacoes/recomendacao.csv"))

In [88]:
# Recomendação de música
df_reco.show()

+---+--------------------+--------+----------+------------+--------+----------------+------+------------+-------+
|_c0|                  id|    name|   artists|acousticness|liveness|instrumentalness|energy|danceability|valence|
+---+--------------------+--------+----------+------------+--------+----------------+------+------------+-------+
|332|7khpPruHJK39VTBUQ...|Stranger|MildOrange|       0.412|   0.109|           0.113| 0.491|       0.334|  0.452|
| 29|1niIoI0IVTswfNxdy...| Quarter|      Fuel|     0.00012|   0.254|           0.683|  0.93|       0.422|  0.201|
+---+--------------------+--------+----------+------------+--------+----------------+------+------------+-------+



                                                                                

# Fim