In [256]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import pandas as pd
import re
from itertools import chain

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joao.victor.ribeiro\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joao.victor.ribeiro\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package stopwords is already up-to-date!


True

In [257]:
artists = pd.read_csv('artists-data.csv')
songs = pd.read_csv('lyrics-data.csv')

In [258]:
genres_list = [str(s).split(';') for s in artists['Genres'].unique()]
res = list(chain(*genres_list))
res = [gen.strip() for gen in res]
genres = list(set(res))

In [259]:
for genre in genres:
    contains = [True if re.search(genre, str(art_gen)) else False for art_gen in artists['Genres']]
    artists[genre] = contains

In [260]:
all_songs = songs.merge(artists, how='outer', left_on='ALink', right_on='Link')

In [261]:
no_country = all_songs[all_songs['Country']==False].sample(n=2500, random_state=1)
yes_country = all_songs[all_songs['Country']==True].sample(n=2500, random_state=1)
no_rock = all_songs[all_songs['Rock']==False].sample(n=2500, random_state=1)
yes_rock = all_songs[all_songs['Rock']==True].sample(n=2500, random_state=1)
no_rap = all_songs[all_songs['Rap']==False].sample(n=2500, random_state=1)
yes_rap = all_songs[all_songs['Rap']==True].sample(n=2500, random_state=1)

In [262]:
all_songs = pd.concat([no_country, yes_country, no_rock, yes_rock, no_rap, yes_rap]).reset_index(drop=True)

In [263]:
lyrics = all_songs['Lyric'].astype(str)
low = [lyr.lower() for lyr in lyrics]
tokenized = [word_tokenize(lyr) for lyr in low]
stop_words = set(stopwords.words('english'))
stop_vec = [[w for w in tok if w not in stop_words] for tok in tokenized]
clean_vec = [[word for word in lyr if word.isalpha()] for lyr in stop_vec]

In [264]:
lyrics = [' '.join(lyr) for lyr in clean_vec]
vectorize = TfidfVectorizer(min_df=5, max_df=0.8)
vectors = vectorize.fit_transform(lyrics)
feature_names = vectorize.get_feature_names_out()
dense = vectors.todense()
dense_list = dense.tolist()
df = pd.DataFrame(dense_list, columns=feature_names)

In [265]:
df['Country'] = all_songs['Country'].copy()
df['Rock'] = all_songs['Rock'].copy()
df['Rap'] = all_songs['Rap'].copy()

In [266]:
genres = ['Country', 'Rock', 'Rap']
models = {}

In [267]:
for genre in genres:
    y = df[genre].copy()
    x = df.drop(genres, axis=1)  # Remove all genre columns
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)
    
    clf = MultinomialNB(alpha=0.1)
    clf.fit(x_train, y_train.astype(bool))
    models[genre] = clf
    
    y_pred = clf.predict(x_test)

    accuracy = metrics.accuracy_score(y_test.astype(bool), y_pred)
    precision = metrics.precision_score(y_test.astype(bool), y_pred)
    recall = metrics.recall_score(y_test.astype(bool), y_pred)
    confusion_matrix = metrics.confusion_matrix(y_test.astype(bool), y_pred)
    
    print(f"Results for {genre}:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"Confusion Matrix: \n{confusion_matrix}\n")

Results for Country:
Accuracy: 0.8253333333333334
Precision: 0.6262626262626263
Recall: 0.21602787456445993
Confusion Matrix: 
[[3528  111]
 [ 675  186]]

Results for Rock:
Accuracy: 0.7206666666666667
Precision: 0.5460434983803795
Recall: 0.8104395604395604
Confusion Matrix: 
[[2063  981]
 [ 276 1180]]

Results for Rap:
Accuracy: 0.9042222222222223
Precision: 0.7940761636107193
Recall: 0.6639150943396226
Confusion Matrix: 
[[3506  146]
 [ 285  563]]



In [268]:
def read_txt_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    return content

In [293]:
file_path = 'lyric.txt'
text_content = read_txt_file(file_path)
df = pd.DataFrame({'lyrics': [text_content]})
df

Unnamed: 0,lyrics
0,"Yeah, I know sometimes\nThings may not always ..."


In [294]:
csv_path = 'lyric.csv'

df.to_csv(csv_path, index=False)

In [295]:
tay = pd.read_csv('lyric.csv', encoding='latin1')
tay.columns
teste = tay['lyrics']
tokenized = [word_tokenize(lyr) for lyr in teste.astype(str)]
stop_vec = [[w for w in tok if w not in stop_words] for tok in tokenized]
clean_vec = [[word for word in lyr if word.isalpha()] for lyr in stop_vec]
wnet = nltk.WordNetLemmatizer()
lem = [[wnet.lemmatize(w) for w in lyr] for lyr in clean_vec]

lem

[['Yeah',
  'I',
  'know',
  'sometimes',
  'Things',
  'may',
  'always',
  'make',
  'sense',
  'right',
  'But',
  'hey',
  'daddy',
  'always',
  'tell',
  'Straighten',
  'little',
  'soldier',
  'Stiffen',
  'upper',
  'lip',
  'What',
  'cryin',
  'You',
  'got',
  'Hailie',
  'I',
  'know',
  'miss',
  'mom',
  'I',
  'know',
  'miss',
  'dad',
  'Well',
  'I',
  'gone',
  'I',
  'tryin',
  'give',
  'life',
  'I',
  'never',
  'I',
  'see',
  'sad',
  'even',
  'smile',
  'even',
  'laugh',
  'I',
  'see',
  'eye',
  'deep',
  'inside',
  'want',
  'cry',
  'scared',
  'I',
  'ai',
  'Daddy',
  'prayer',
  'No',
  'cryin',
  'wipe',
  'tear',
  'Daddy',
  'nightmare',
  'We',
  'gon',
  'pull',
  'together',
  'gon',
  'Lainie',
  'uncle',
  'crazy',
  'ai',
  'Yeah',
  'love',
  'girl',
  'better',
  'know',
  'We',
  'got',
  'world',
  'When',
  'spin',
  'swirl',
  'When',
  'whirl',
  'twirl',
  'Two',
  'little',
  'beautiful',
  'girl',
  'Lookin',
  'puzzled',
  'daze'

In [296]:
lyrics_tay = [' '.join(lyr) for lyr in lem]
single_entry = vectorize.transform(lyrics_tay)
s_e = single_entry.todense().tolist()

lyrics_tay

['Yeah I know sometimes Things may always make sense right But hey daddy always tell Straighten little soldier Stiffen upper lip What cryin You got Hailie I know miss mom I know miss dad Well I gone I tryin give life I never I see sad even smile even laugh I see eye deep inside want cry scared I ai Daddy prayer No cryin wipe tear Daddy nightmare We gon pull together gon Lainie uncle crazy ai Yeah love girl better know We got world When spin swirl When whirl twirl Two little beautiful girl Lookin puzzled daze I know confusin Daddy always move mama always news I try keep sheltered somehow seems The harder I try backfire All thing growin daddy see Daddy want see see much We plan way mother But thing gotten bad u I see u ever bein together ever Like used teenager But course everything always happens reason I guess never meant But something control destiny But worry rest head go sleep Maybe one day wake dream Now hush little baby cry Everything gon na alright Stiffen upper lip little lady I

In [297]:
predictions = {genre: clf.predict(s_e) for genre in genres}



In [298]:
probabilities = {genre: models[genre].predict_proba(s_e)[0][1] for genre in genres}

for genre, prob in probabilities.items():
    print(f"{genre}: {prob * 100:.2f}%")

Country: 17.88%
Rock: 93.86%
Rap: 66.42%


