In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import pandas as pd
import re
from itertools import chain
import joblib
import boto3
import pickle
import io
from io import BytesIO

# Download the NLTK resources needed by the preprocessing cells of this notebook.
# Each call fetches one package into the kernel's nltk_data directory.
nltk.download('punkt')      # tokenizer data (unpacked under tokenizers/); presumably what word_tokenize relies on
nltk.download('stopwords')  # stop-word corpus read via stopwords.words('english')
nltk.download('wordnet')    # WordNet corpus; presumably backs the WordNetLemmatizer instance
nltk.download('omw-1.4')    # NOTE(review): presumably a companion resource for WordNet - confirm it is still required

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.5 
Trying to create a Glue session for the kernel.
Session Type: glueetl
Session ID: 26614f9d-3e06-4f8a-a3e0-1e7cf90771aa
Applying the following default arguments:
--glue_kernel_version 1.0.5
--enable-glue-datacatalog true
Waiting for session 26614f9d-3e06-4f8a-a3e0-1e7cf90771aa to get into ready status...
Session 26614f9d-3e06-4f8a-a3e0-1e7cf90771aa has been created.
True
[nltk_data] Downloading package punkt to /home/spark/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/spark/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /hom

In [2]:
# English stop words held in a set so membership checks during filtering are cheap.
stop_words = {*stopwords.words('english')}

# Shared lemmatizer instance reused for every token.
wnet = nltk.WordNetLemmatizer()




In [3]:
def read_txt_file_from_s3(bucket, key):
    """Return the body of an S3 object decoded as UTF-8 text.

    The object is fetched through the module-level ``s3_client``.

    Args:
        bucket: Name of the S3 bucket that holds the object.
        key: Key of the object inside the bucket.

    Returns:
        The decoded text, or ``None`` if any step raised. In that case the
        error is printed instead of being propagated.
    """
    text = None
    try:
        # Fetch the object, read the whole body stream and decode it.
        response = s3_client.get_object(Bucket=bucket, Key=key)
        payload = response['Body'].read()
        text = payload.decode('utf-8')
    except Exception as e:
        # Best-effort read: report the failure and fall through to return None.
        print(f"Erro ao ler o arquivo do S3: {str(e)}")
    return text




In [4]:
# Genres handled by this notebook; each name is also the prefix of the
# corresponding '<genre>_model.pkl' object key used when loading models.
genres = ['Country', 'Rap', 'Rock']




In [46]:
# S3 client shared by every download/read in this notebook.
s3_client = boto3.client('s3')

# Bucket that stores the pickled artifacts and the lyric file.
s3_bucket = 'ml-models-sprint2'

# Object keys of the pickled artifacts inside the bucket.
vectorize_key = 'vectorize.pkl'
country_key = 'Country_model.pkl'
rap_key = 'Rap_model.pkl'
rock_key = 'Rock_model.pkl'

# Filled in later: genre name -> unpickled model.
loaded_models = {}




In [52]:
# Download every per-genre model from S3 and unpickle it into `loaded_models`.
# NOTE(security): pickle.load can execute arbitrary code from the file, so the
# bucket contents must be trusted.
for genre in genres:
    model_key = f'{genre}_model.pkl'
    local_model_path = f'/tmp/{model_key}'
    s3_client.download_file(s3_bucket, model_key, local_model_path)
    with open(local_model_path, 'rb') as model_file:
        genre_classifier = pickle.load(model_file)
    loaded_models[genre] = genre_classifier

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [47]:
# Temporary local path inside the Glue session for the downloaded model.
local_model_path = '/tmp/rap.pkl'

# Download the Rap model from S3. The key must be `rap_key`: downloading
# `vectorize_key` here would store the vectorizer pickle in `Rap_model`.
s3_client.download_file(s3_bucket, rap_key, local_model_path)

# Deserialize the model with pickle, reading from the path just written.
# NOTE(security): pickle.load can execute arbitrary code from the file, so the
# bucket contents must be trusted.
with open(local_model_path, "rb") as f:
    Rap_model = pickle.load(f)




In [48]:
# Temporary local path inside the Glue session for the downloaded model.
local_model_path = '/tmp/rock.pkl'

# Download the Rock model from S3. The key must be `rock_key`: downloading
# `vectorize_key` here would store the vectorizer pickle in `Rock_model`.
s3_client.download_file(s3_bucket, rock_key, local_model_path)

# Deserialize the model with pickle, reading from the path just written.
# NOTE(security): pickle.load can execute arbitrary code from the file, so the
# bucket contents must be trusted.
with open(local_model_path, "rb") as f:
    Rock_model = pickle.load(f)




In [49]:
# Temporary local path inside the Glue session for the downloaded model.
local_model_path = '/tmp/country.pkl'

# Download the Country model from S3. The key must be `country_key`: downloading
# `vectorize_key` here would store the vectorizer pickle in `Country`.
s3_client.download_file(s3_bucket, country_key, local_model_path)

# Deserialize the model with pickle, reading from the path just written.
# NOTE(security): pickle.load can execute arbitrary code from the file, so the
# bucket contents must be trusted.
with open(local_model_path, "rb") as f:
    Country = pickle.load(f)

# Alias matching the `Rap_model` / `Rock_model` naming; `Country` is kept so
# existing references continue to work.
Country_model = Country




In [50]:
# Caminho temporário local no Glue para salvar o modelo baixado
local_model_path = '/tmp/vectorize.pkl'

# Baixar o modelo do S3
s3_client.download_file(s3_bucket, vectorize_key, local_model_path)

# Carregar o modelo utilizando joblib (ou pickle)
with open("/tmp/vectorize.pkl", "rb") as f:
    vectorize = pickle.load(f)




In [14]:
# Key of the lyric text file inside the bucket.
file_path = 'lyric.txt'
text_content = read_txt_file_from_s3(s3_bucket, file_path)

# A falsy result (None after a failed read, or an empty string) is reported as
# a failure; otherwise the lyric is wrapped in a one-row frame and previewed.
if not text_content:
    print("Falha ao ler o conteúdo do arquivo do S3.")
else:
    df = pd.DataFrame({'lyrics': [text_content]})
    print(df.head())

                                              lyrics
0  I used to spend my nights out in a barroom\r\n...


In [15]:
# Fetch the lyric again and wrap it in a one-row frame.
text_content = read_txt_file_from_s3(s3_bucket, file_path)
df = pd.DataFrame({'lyrics': [text_content]})

# 1. Tokenize every lyric (values are cast to str first).
tokenized = [word_tokenize(text) for text in df['lyrics'].astype(str)]

# 2. Drop stop words.
# NOTE(review): the comparison is case-sensitive, so capitalized tokens such as
# 'I', 'But' or 'You' are kept. Confirm this matches the preprocessing used
# when the models were trained before changing it.
stop_vec = [
    [token for token in tokens if token not in stop_words]
    for tokens in tokenized
]

# 3. Keep purely alphabetic tokens (removes punctuation and numbers).
clean_vec = [list(filter(str.isalpha, tokens)) for tokens in stop_vec]

# 4. Lemmatize what is left.
lem = [list(map(wnet.lemmatize, tokens)) for tokens in clean_vec]
lem

[['I', 'used', 'spend', 'night', 'barroom', 'Liquor', 'love', 'I', 'known', 'But', 'rescued', 'reachin', 'bottom', 'And', 'brought', 'back', 'far', 'gone', 'You', 'smooth', 'Tennessee', 'whiskey', 'You', 'sweet', 'strawberry', 'wine', 'You', 'warm', 'glass', 'brandy', 'And', 'honey', 'I', 'stay', 'stoned', 'love', 'time', 'I', 'looked', 'love', 'old', 'place', 'Found', 'bottom', 'bottle', 'always', 'dry', 'But', 'poured', 'heart', 'I', 'waste', 'Cause', 'nothing', 'like', 'love', 'get', 'high', 'You', 'smooth', 'Tennessee', 'whiskey', 'You', 'sweet', 'strawberry', 'wine', 'You', 'warm', 'glass', 'brandy', 'And', 'honey', 'I', 'stay', 'stoned', 'love', 'time', 'You', 'smooth', 'Tennessee', 'whiskey', 'You', 'sweet', 'strawberry', 'wine', 'You', 'warm', 'glass', 'brandy', 'And', 'honey', 'I', 'stay', 'stoned', 'love', 'time', 'You', 'smooth', 'Tennessee', 'whiskey', 'Tennessee', 'whiskey', 'Tennessee', 'whiskey', 'You', 'smooth', 'Tennessee', 'whiskey', 'Tennessee', 'whiskey', 'Tennessee

In [54]:
# Rebuild one whitespace-separated string per lyric from its lemmatized tokens.
lyrics_tay = list(map(' '.join, lem))

# Vectorize with the loaded vectorizer, then convert the result to a dense
# list of rows before handing it to the models.
single_entry = vectorize.transform(lyrics_tay)
s_e = single_entry.todense().tolist()

# For each genre keep column 1 of the first row returned by predict_proba.
# NOTE(review): presumably each model is a binary classifier whose class at
# index 1 means "is this genre" - confirm against each model's classes_.
probabilities = {
    name: loaded_models[name].predict_proba(s_e)[0][1]
    for name in genres
}



In [55]:
# Report each genre's probability as a percentage with two decimals.
for genre in probabilities:
    prob = probabilities[genre]
    print(f"{genre}: {prob * 100:.2f}%")

Country: 80.87%
Rap: 10.04%
Rock: 47.82%
