# <font color='blue'>Data Science Academy</font>
# <font color='blue'>Big Data Real-Time Analytics com Python e Spark</font>

## <font color='blue'>Mini-Projeto 7</font>

### <font color='blue'>Sistema de Recomendação em Tempo Real com Machine Learning, PySpark, Spark Streaming e Kafka</font>

![title](imagens/MP7.png)

In [1]:
# Print the version of the Python interpreter running this notebook's kernel,
# so the recorded environment can be reproduced.
from platform import python_version
print('Versão da Linguagem Python Usada Neste Jupyter Notebook:', python_version())

Versão da Linguagem Python Usada Neste Jupyter Notebook: 3.10.9


In [2]:
# Para atualizar um pacote, execute o comando abaixo no terminal ou prompt de comando:
# pip install -U nome_pacote

# Para instalar a versão exata de um pacote, execute o comando abaixo no terminal ou prompt de comando:
#!pip install nome_pacote==versão_desejada

# Depois de instalar ou atualizar o pacote, reinicie o jupyter notebook.

# Instala o pacote watermark. 
# Esse pacote é usado para gravar as versões de outros pacotes usados neste jupyter notebook.
!pip install -q -U watermark

In [3]:
# https://kafka-python.readthedocs.io/en/master/
# Conector python para o kafka
!pip install -q kafka-python

In [4]:
# Imports
import time
import random
import kafka
import numpy as np
import pandas as pd
from json import dumps
from kafka import KafkaProducer
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Record the versions of the packages used in this notebook.
# %reload_ext (re)loads the watermark extension; %watermark then prints the
# author line and the versions of the imported packages (numpy, pandas and
# kafka in the recorded output).
%reload_ext watermark
%watermark -a "Data Science Academy" --iversions

Author: Data Science Academy

numpy : 1.23.5
pandas: 1.5.3
kafka : 2.0.2



In [6]:
# Address (host:port) of the Kafka server running in Docker.
# Passed to KafkaProducer as bootstrap_servers in the producer cell.
SERVER = 'localhost:9092'

In [7]:
# Name of the Kafka topic that the producer sends each song message to.
TOPIC = "dsaminiprojeto7"

In [8]:
# Read the songs dataset from the local CSV file into a DataFrame,
# then display its first five rows as a quick sanity check.
df_dsaminiprojeto7 = pd.read_csv(filepath_or_buffer = "dados/dataset.csv")
df_dsaminiprojeto7.head(n = 5)

Unnamed: 0,Spotify ID,Artist IDs,Track Name,Album Name,Artist Name(s),Release Date,Duration (ms),Popularity,Added By,Added At,...,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature
0,22a0Ji6EQKkY0tBohlN4Od,7qL7yYYh4SlsjwymwVK0wW,There You Are,There You Are,Kirsten Ludwig,2018-08-06,231240,2,spotify:user:predict0,2018-08-28T19:51:58Z,...,9,-5.596,0,0.0304,0.334,0.282,0.105,0.316,129.856,4
1,4J39ZEbwqHwtWLImUKmrn9,"5CRfAxYjJsDBH5wvWF2nja,55RRRPXwFwQmoTNqN4HBGU",88 Days,Heat,"Sara King,Ian Olney",2018-08-04,227961,8,spotify:user:predict0,2018-08-28T19:51:58Z,...,3,-10.749,1,0.0333,0.134,0.582,0.134,0.233,155.062,4
2,0a12d4HUjOmQSqHqLopWYx,0hy4t2HTGT4flktWAhKcxQ,Castaway,Castaway,ARZLEE,2018-08-10,230000,0,spotify:user:predict0,2018-08-28T19:51:58Z,...,1,-11.29,1,0.0314,0.11,3.2e-05,0.119,0.29,83.988,4
3,4u1DykFW1HjYAGNoDCiXfC,"6Wjyo1J1HR7HlT2bUTZ82T,021wqpA4geq0mtJlARXjon",Arouse,Arouse,"Shagabond,goodboy noah",2018-08-03,213913,30,spotify:user:predict0,2018-08-28T19:51:58Z,...,1,-6.066,1,0.433,0.0728,0.0,0.368,0.533,91.961,4
4,0u7JZm9ORerlZnnxxSdMwl,3Ad8KmjgFzpcTvmVf69GwR,Lonely,Lonely,Hayleau,2018-08-10,258738,21,spotify:user:predict0,2018-08-28T19:51:58Z,...,8,-3.921,0,0.0406,0.0169,0.00063,0.0542,0.577,98.954,4


In [9]:
# Prepare three columns: order_id, Artist Name(s) and Artist IDs.

# order_id is the 0-based position of each row in the DataFrame.
df_dsaminiprojeto7['order_id'] = np.arange(len(df_dsaminiprojeto7))

# Keep only ASCII letters in the two artist columns. Digits, spaces, commas and
# every other character are removed, so multiple artists collapse into one token
# (e.g. "Sara King,Ian Olney" -> "SaraKingIanOlney").
# Presumably this keeps commas out of these fields before they are joined into the
# comma-separated Kafka message -- TODO confirm with the consumer.
#
# regex=True is passed explicitly because the pattern is a regular expression:
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# silently leave both columns unchanged.
df_dsaminiprojeto7['Artist Name(s)'] = df_dsaminiprojeto7['Artist Name(s)'].str.replace('[^a-zA-Z]', '', regex = True)
df_dsaminiprojeto7['Artist IDs'] = df_dsaminiprojeto7['Artist IDs'].str.replace('[^a-zA-Z]', '', regex = True)

In [10]:
# Show the DataFrame dimensions as (rows, columns); the column count includes
# the order_id column added in the previous cell.
df_dsaminiprojeto7.shape

(4399, 24)

In [11]:
# Preview the first 10 rows to check the cleaned artist columns and order_id.
df_dsaminiprojeto7.head(10)

Unnamed: 0,Spotify ID,Artist IDs,Track Name,Album Name,Artist Name(s),Release Date,Duration (ms),Popularity,Added By,Added At,...,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature,order_id
0,22a0Ji6EQKkY0tBohlN4Od,qLyYYhSlsjwymwVKwW,There You Are,There You Are,KirstenLudwig,2018-08-06,231240,2,spotify:user:predict0,2018-08-28T19:51:58Z,...,-5.596,0,0.0304,0.334,0.282,0.105,0.316,129.856,4,0
1,4J39ZEbwqHwtWLImUKmrn9,CRfAxYjJsDBHwvWFnjaRRRPXwFwQmoTNqNHBGU,88 Days,Heat,SaraKingIanOlney,2018-08-04,227961,8,spotify:user:predict0,2018-08-28T19:51:58Z,...,-10.749,1,0.0333,0.134,0.582,0.134,0.233,155.062,4,1
2,0a12d4HUjOmQSqHqLopWYx,hytHTGTflktWAhKcxQ,Castaway,Castaway,ARZLEE,2018-08-10,230000,0,spotify:user:predict0,2018-08-28T19:51:58Z,...,-11.29,1,0.0314,0.11,3.2e-05,0.119,0.29,83.988,4,2
3,4u1DykFW1HjYAGNoDCiXfC,WjyoJHRHlTbUTZTwqpAgeqmtJlARXjon,Arouse,Arouse,Shagabondgoodboynoah,2018-08-03,213913,30,spotify:user:predict0,2018-08-28T19:51:58Z,...,-6.066,1,0.433,0.0728,0.0,0.368,0.533,91.961,4,3
4,0u7JZm9ORerlZnnxxSdMwl,AdKmjgFzpcTvmVfGwR,Lonely,Lonely,Hayleau,2018-08-10,258738,21,spotify:user:predict0,2018-08-28T19:51:58Z,...,-3.921,0,0.0406,0.0169,0.00063,0.0542,0.577,98.954,4,4
5,0wuy2BYIVLbflFDqnR9Jay,kCwrYUFSJCubbbnZrE,Orsay,Strange Affairs,TheSvens,2018-08-03,413658,6,spotify:user:predict0,2018-08-28T19:51:58Z,...,-11.858,1,0.0316,0.0486,0.886,0.128,0.283,122.992,4,5
6,6LkIZZRrPQIbHMyBR5mTc2,TOsWuafqeWtrvYXqbnYAV,Nurture,Comrade,IslandFox,2018-08-09,191641,0,spotify:user:predict0,2018-08-28T19:51:58Z,...,-11.415,0,0.0504,0.0159,0.639,0.181,0.266,133.925,4,6
7,5U27fxNSd27XtX876xUsfV,HsKUExgNcRJojPmBcNqzgwpvzedAIjuDBM,Dinosaur Hair - Remix,Dinosaur Hair,AndyFerroCharlieConway,2018-08-10,257152,1,spotify:user:predict0,2018-08-28T19:51:58Z,...,-10.086,1,0.0383,0.456,0.92,0.135,0.663,125.908,4,7
8,5ogJOpmyDsvrAdttU6JLnN,gslbnQQLLcNzfjnxQY,Breathing Underwater,Long Way Down,MorningWars,2018-08-03,174999,0,spotify:user:predict0,2018-08-28T19:51:58Z,...,-11.258,0,0.0461,5e-06,0.0204,0.115,0.477,150.042,4,8
9,65rLHt6A58MFRxlNWVDU1Z,WlYiRrlrChWktQDo,Summer,Summer,NoSo,2018-08-01,232746,22,spotify:user:predict0,2018-08-28T19:51:58Z,...,-7.517,1,0.038,0.0168,0.00723,0.0706,0.21,123.962,4,9


In [12]:
# Convertemos o dataframe em um dicionário de músicas
# Convert the DataFrame into a list of dictionaries, one per row (song),
# each keyed by column name. Despite its name, dict_musicas is a list.
dict_musicas = df_dsaminiprojeto7.to_dict(orient = "records")

In [13]:
# Inspect the first three song records.
dict_musicas[0:3]

[{'Spotify ID': '22a0Ji6EQKkY0tBohlN4Od',
  'Artist IDs': 'qLyYYhSlsjwymwVKwW',
  'Track Name': 'There You Are',
  'Album Name': 'There You Are',
  'Artist Name(s)': 'KirstenLudwig',
  'Release Date': '2018-08-06',
  'Duration (ms)': 231240,
  'Popularity': 2,
  'Added By': 'spotify:user:predict0',
  'Added At': '2018-08-28T19:51:58Z',
  'Genres': 'experimental folk',
  'Danceability': 0.487,
  'Energy': 0.707,
  'Key': 9,
  'Loudness': -5.596,
  'Mode': 0,
  'Speechiness': 0.0304,
  'Acousticness': 0.334,
  'Instrumentalness': 0.282,
  'Liveness': 0.105,
  'Valence': 0.316,
  'Tempo': 129.856,
  'Time Signature': 4,
  'order_id': 0},
 {'Spotify ID': '4J39ZEbwqHwtWLImUKmrn9',
  'Artist IDs': 'CRfAxYjJsDBHwvWFnjaRRRPXwFwQmoTNqNHBGU',
  'Track Name': '88 Days',
  'Album Name': 'Heat',
  'Artist Name(s)': 'SaraKingIanOlney',
  'Release Date': '2018-08-04',
  'Duration (ms)': 227961,
  'Popularity': 8,
  'Added By': 'spotify:user:predict0',
  'Added At': '2018-08-28T19:51:58Z',
  'Genres':

In [14]:
# Kafka Producer
# Streams every song in dict_musicas to the Kafka topic TOPIC as one
# comma-separated text message, pausing one second between messages.
if __name__ == "__main__":

    # Columns included in each message, in the exact order they are sent.
    colunas_enviadas = [
        "order_id",
        "Spotify ID",
        "Track Name",
        "Popularity",
        "Duration (ms)",
        "Artist Name(s)",
        "Artist IDs",
        "Release Date",
        "Danceability",
        "Energy",
        "Key",
        "Loudness",
        "Mode",
        "Speechiness",
        "Acousticness",
        "Instrumentalness",
        "Liveness",
        "Valence",
        "Tempo",
        "Time Signature",
    ]

    # Create the producer; each message is a str that is encoded to UTF-8 bytes.
    producer = KafkaProducer(bootstrap_servers = SERVER,
                             value_serializer = lambda x: x.encode('utf-8'))

    try:
        # Loop over the song records
        for musica in dict_musicas:

            # Join the selected values into a single comma-separated line.
            # NOTE(review): values are joined without quoting or escaping, so a
            # Track Name containing a comma yields extra fields -- confirm the
            # consumer tolerates this.
            mensagem = ','.join(str(musica[coluna]) for coluna in colunas_enviadas)

            # Send the message to the topic
            print("Próxima Música:" )
            print(mensagem)
            producer.send(TOPIC, mensagem)

            # One message per second (presumably to simulate a real-time stream).
            time.sleep(1)

        # send() is asynchronous: block until every buffered message is delivered.
        producer.flush()
    finally:
        # Always release the connection, even if the loop fails or is interrupted.
        producer.close()

    print("Concluído")

Próxima Música:
0,22a0Ji6EQKkY0tBohlN4Od,There You Are,2,231240,KirstenLudwig,qLyYYhSlsjwymwVKwW,2018-08-06,0.487,0.707,9,-5.596,0,0.0304,0.334,0.282,0.105,0.316,129.856,4
Próxima Música:
1,4J39ZEbwqHwtWLImUKmrn9,88 Days,8,227961,SaraKingIanOlney,CRfAxYjJsDBHwvWFnjaRRRPXwFwQmoTNqNHBGU,2018-08-04,0.335,0.401,3,-10.749,1,0.0333,0.134,0.582,0.134,0.233,155.062,4
Próxima Música:
2,0a12d4HUjOmQSqHqLopWYx,Castaway,0,230000,ARZLEE,hytHTGTflktWAhKcxQ,2018-08-10,0.553,0.422,1,-11.29,1,0.0314,0.11,3.25e-05,0.119,0.29,83.988,4
Próxima Música:
3,4u1DykFW1HjYAGNoDCiXfC,Arouse,30,213913,Shagabondgoodboynoah,WjyoJHRHlTbUTZTwqpAgeqmtJlARXjon,2018-08-03,0.67,0.751,1,-6.066,1,0.433,0.0728,0.0,0.368,0.533,91.961,4
Próxima Música:
4,0u7JZm9ORerlZnnxxSdMwl,Lonely,21,258738,Hayleau,AdKmjgFzpcTvmVfGwR,2018-08-10,0.67,0.709,8,-3.921,0,0.0406,0.0169,0.00063,0.0542,0.577,98.954,4
Próxima Música:
5,0wuy2BYIVLbflFDqnR9Jay,Orsay,6,413658,TheSvens,kCwrYUFSJCubbbnZrE,2018-08-03,0.61,0.444,0,-11.858,1,0.0316,0.0486,0

# Fim