In [None]:
# Install (or upgrade, -U) BERTopic quietly (-q) into the kernel's environment.
# NOTE(review): the version is not pinned, so a re-run may pick up a newer
# BERTopic release than the one the results were produced with.
%pip install -Uq bertopic

In [None]:
import pandas as pd
import nltk
import string
import umap
import hdbscan
import bertopic
import sentence_transformers
import sklearn
import numpy as np

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from umap import UMAP
from hdbscan import HDBSCAN

import random

# Fetch the NLTK data used by the preprocessing cells: the English stop-word
# list and the Punkt tokenizer models behind nltk.word_tokenize.
# Recent NLTK releases load the tokenizer from the 'punkt_tab' package instead
# of the pickled 'punkt' one, so both are requested to keep word_tokenize
# working regardless of the installed NLTK version.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)


In [None]:
# Notebook switches (1 = enabled, 0 = disabled); each is tested with `== 1`
# by the cells below.
# Down-sample every genre to the size of the smallest one.
normalization = 1
# Drop all rows whose genre is "hip hop" before any further processing.
eliminate_hip_hop = 1
# Build the document visualizations and write them to Graph1_*.html.
generate_fig = 0
# Save the model to Model1_* after each stage and immediately reload it.
save_model = 0
# Reassign outlier documents to topics after topic reduction.
reduce_outliers = 0

In [None]:
# Data source:
#Moura, Luan; Fontelles, Emanuel; Sampaio, Vinicius; França, Mardônio (2020), “Music Dataset: Lyrics and Metadata from 1950 to 2019”, Mendeley Data, V2, doi: 10.17632/3t9vbwxgr5.2
# The CSV is read from a path relative to the notebook's working directory.
df = pd.read_csv('data/tcc_ceds_music.csv')

# Sanity check: (rows, columns) followed by the column names.
print(df.shape, "\n", df.columns)

#display(df)

In [None]:
# Optional quick-run variant (disabled): work on a random 8000-row subset
# instead of the full dataset.
# df_usage = df.sample(8000)
# df_usage.reset_index(inplace = True,drop = True)

# Work on a copy so the raw frame `df` stays untouched by later cells.
df_usage = df.copy()

#display(df_usage)

In [None]:
def _count_genres(frame):
  """Return {genre: row count} for ``frame`` in first-appearance order."""
  counts = {}
  for genre in frame['genre']:
    counts[genre] = counts.get(genre, 0) + 1
  return counts


def _print_genre_counts(counts):
  """Print one ``genre count`` line per entry of ``counts``."""
  for genre, count in counts.items():
    print(genre, count)


if eliminate_hip_hop == 1:
  # Show the genre distribution before the genre is removed.
  genre_dict = _count_genres(df_usage)
  _print_genre_counts(genre_dict)

  # Drop hip hop and rebuild a contiguous 0..n-1 index. The preprocessing
  # cells address rows by integer label, so the gaps left by the filter would
  # raise KeyError there whenever normalization is switched off.
  df_without_hiphop = df_usage[df_usage.genre != "hip hop"]
  df_usage = df_without_hiphop.reset_index(drop=True)
  print("\n")

if normalization == 1:
  genre_dict = _count_genres(df_usage)
  _print_genre_counts(genre_dict)

  # Size of the smallest genre; every genre is down-sampled to this size.
  min_value = min(genre_dict.values())
  print(min_value)

  # NOTE: both sampling calls are unseeded, so the balanced subset and its
  # row order differ from run to run.
  new_df = df_usage.groupby('genre').sample(min_value)

  # Shuffle the rows so genres are interleaved, then renumber them 0..n-1.
  new_df = new_df.sample(frac = 1)
  new_df.reset_index(inplace = True,drop = True)

  display(new_df)

  df_usage = new_df.copy()

In [None]:
eng_stopword_list = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer(language='english')

# Built once for the whole corpus: set membership replaces a scan of the
# stop-word list per token, and the translation table deletes every ASCII
# punctuation character in a single pass.
_stopword_set = set(eng_stopword_list)
_punctuation_table = str.maketrans('', '', string.punctuation)


def _preprocess_lyric(text):
  """Normalize one lyric for topic modelling.

  Lower-cases and tokenizes ``text``, removes English stop words, strips
  punctuation characters, stems the remaining tokens and returns them joined
  by single spaces.
  """
  tokens = nltk.word_tokenize(text.lower())
  # Stop words are matched on the raw tokens, before punctuation is stripped.
  tokens = [token for token in tokens if token not in _stopword_set]
  tokens = [token.translate(_punctuation_table) for token in tokens]
  # Tokens that consisted only of punctuation are now empty; skip them so the
  # output contains no runs of blank separators.
  return ' '.join(stemmer.stem(token) for token in tokens if token)


processed_lyrics = []
for lyric_counter, lyric in enumerate(df_usage["lyrics"]):
  processed_lyrics.append(_preprocess_lyric(lyric))

  # Coarse progress indicator.
  if lyric_counter % 1000 == 0:
    print(lyric_counter)

# Replace the column in one assignment. Writing through chained indexing
# (df_usage["lyrics"][i] = ...) depends on a 0..n-1 index and silently leaves
# the frame unchanged when pandas Copy-on-Write is active.
df_usage["lyrics"] = processed_lyrics

#display(df_usage)

In [None]:
# Tally how many rows each genre has, in order of first appearance.
genre_dict = {}
for genre_name in df_usage['genre']:
  genre_dict[genre_name] = genre_dict.get(genre_name, 0) + 1

# One "genre count" line per genre.
for genre_name, genre_total in genre_dict.items():
  print(genre_name, genre_total)

In [None]:
# Dimensionality reduction of the sentence embeddings. UMAP is stochastic, so
# a fixed random_state is set to make the resulting topics reproducible across
# runs (at the cost of UMAP's parallelism).
umap_model = UMAP(n_neighbors=30, n_components=2, min_dist=0.01, random_state=42)
# Density-based clustering of the reduced embeddings; prediction_data=True is
# kept so the fitted clusterer can be used for later predictions.
hdbscan_model = HDBSCAN(min_cluster_size=50, min_samples=2, prediction_data=True, cluster_selection_method="leaf")
# Sentence encoder that turns each preprocessed lyric into a vector.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    language='english',
    verbose=True,
    hdbscan_model = hdbscan_model
)


#Comment out next line for loading
# The documents are passed as a plain list of strings so the result does not
# depend on the DataFrame's index labels.
topics, probs = model.fit_transform(df_usage['lyrics'].tolist())

In [None]:
# Overview table of the topics found by the initial fit.
topic_overview = model.get_topic_info()
print(topic_overview, "\n")

In [None]:
# Optional: visualize the documents of the freshly fitted model.
if generate_fig == 1:
    print(model.get_topic_info(), "\n")

    fig = model.visualize_documents(df_usage['lyrics'])
    fig.write_html("Graph1_1.html")

    # A bare `fig` expression inside an `if` body is not the cell's last
    # top-level expression, so the notebook never renders it; display it
    # explicitly instead.
    display(fig)

In [None]:
# Optional: persist the freshly fitted model under the path "Model1_1".
if save_model == 1:
    model.save("Model1_1")

In [None]:
# Reload the stage-1 model from disk. This is guarded by the same flag as the
# save cell, so it only runs when "Model1_1" was written in this session.
if save_model == 1:
    # load is a classmethod, so it is invoked on the class itself.
    model = BERTopic.load("Model1_1")
    print(model.get_topic_info(), "\n")

In [None]:
# Target number of topics after merging.
size_param = 25

# Topic table before the reduction.
topics_before_reduction = model.get_topic_info()
print(topics_before_reduction)

# The return value is not used; the model is queried again afterwards to show
# the reduced topic table.
model.reduce_topics(docs=df_usage["lyrics"], nr_topics=size_param)

topics_after_reduction = model.get_topic_info()
print(topics_after_reduction)

In [None]:
# Optional: visualize the documents after topic reduction.
if generate_fig == 1:
    print(model.get_topic_info(), "\n")

    fig = model.visualize_documents(df_usage['lyrics'])
    fig.write_html("Graph1_2.html")

    # A bare `fig` expression inside an `if` body is not the cell's last
    # top-level expression, so the notebook never renders it; display it
    # explicitly instead.
    display(fig)

In [None]:
# Optional: persist the topic-reduced model under the path "Model1_2".
if save_model == 1:
    model.save("Model1_2")

In [None]:
# Reload the stage-2 model from disk. This is guarded by the same flag as the
# save cell, so it only runs when "Model1_2" was written in this session.
if save_model == 1:
    # load is a classmethod, so it is invoked on the class itself.
    model = BERTopic.load("Model1_2")
    print(model.get_topic_info(), "\n")

In [None]:
# Optional: reassign outlier documents to topics, then rebuild the topic
# representations from the new assignment.
if reduce_outliers == 1:
    lyric_docs = df_usage["lyrics"]

    new_topics = model.reduce_outliers(documents=lyric_docs, topics=model.topics_)
    model.update_topics(docs=lyric_docs, topics=new_topics)

    print(model.get_topic_info(), "\n")

In [None]:
# Optional: visualize the documents after the outlier-reduction stage.
if generate_fig == 1:
    print(model.get_topic_info(), "\n")

    fig = model.visualize_documents(df_usage['lyrics'])
    fig.write_html("Graph1_3.html")

    # A bare `fig` expression inside an `if` body is not the cell's last
    # top-level expression, so the notebook never renders it; display it
    # explicitly instead.
    display(fig)

In [None]:
# Optional: persist the final model under the path "Model1_3".
if save_model == 1:
    model.save("Model1_3")

In [None]:
# Reload the stage-3 model from disk. This is guarded by the same flag as the
# save cell, so it only runs when "Model1_3" was written in this session.
if save_model == 1:
    # load is a classmethod, so it is invoked on the class itself.
    model = BERTopic.load("Model1_3")
    print(model.get_topic_info(), "\n")

In [None]:
print(model.get_topic_info(), "\n")

topic_dict = model.get_topics()
# One (topic id, genre) pair per document, in document order.
topic_genre_index_zip = list(zip(model.topics_, df_usage["genre"]))

genre_list = df_usage['genre'].unique()

# Explicit row position for every topic id. Indexing the array with the raw
# topic id would send the outlier topic -1 to the last row via negative
# indexing and leave that row labelled as if it were an ordinary topic.
topic_ids = sorted(topic_dict)
topic_row = {topic_id: row for row, topic_id in enumerate(topic_ids)}

# Column position of every genre.
genre_dict = {genre: column for column, genre in enumerate(genre_list)}

for elem in genre_dict:
  print(elem, genre_dict[elem])

# frequency_arr[row, column] = number of documents of that genre in that topic.
frequency_arr = np.zeros((len(topic_ids), len(genre_list)))
for topic_id, genre in topic_genre_index_zip:
  frequency_arr[topic_row[topic_id], genre_dict[genre]] += 1

print("\n")

# Rows are labelled with the real topic ids, so -1 is visibly the outlier row.
output = pd.DataFrame(frequency_arr, index = topic_ids, columns = genre_list)
output.index.name = "topic"

print(output)

In [None]:
# Translation table that deletes every ASCII punctuation character.
punctuations = string.punctuation
_title_punctuation_table = str.maketrans('', '', punctuations)


def _normalize_title(title):
  """Lower-case and tokenize ``title`` and strip punctuation from its tokens.

  Titles are neither stop-word filtered nor stemmed. Returns the surviving
  tokens joined by single spaces.
  """
  tokens = nltk.word_tokenize(title.lower())
  stripped = (token.translate(_title_punctuation_table) for token in tokens)
  # Tokens that consisted only of punctuation are now empty; skip them.
  return ' '.join(token for token in stripped if token)


normalized_titles = []
for lyric_counter, title in enumerate(df_usage["track_name"]):
  normalized_titles.append(_normalize_title(title))

  # Coarse progress indicator.
  if lyric_counter % 1000 == 0:
    print(lyric_counter)

# Replace the column in one assignment. Writing through chained indexing
# (df_usage["track_name"][i] = ...) depends on a 0..n-1 index and silently
# leaves the frame unchanged when pandas Copy-on-Write is active.
df_usage["track_name"] = normalized_titles


In [None]:
print(model.get_topics())

topic_dict2 = model.get_topics()

# topic id -> the topic's words as one space-separated string.
# Each entry of a topic is indexed with [0] to take its word. Joining avoids
# the trailing-separator clean-up, which failed with IndexError on a topic
# that has no entries.
topic_dict2_pre_process = {}
for key in topic_dict2:
  topic_words = [entry[0] for entry in topic_dict2[key]]
  topic_dict2_pre_process[key] = ' '.join(topic_words).rstrip(" ")

topic_title_genre_index_zip = list(zip(model.topics_, df_usage["track_name"], df_usage["genre"]))

# One (genre, "<shuffled topic words> <title>") pair per non-outlier document.
topictitle_genre_arr = []
for topic_id, title, genre in topic_title_genre_index_zip:
  # Topic -1 collects the outliers; those documents are left out.
  if topic_id == -1:
    continue

  # split() without an argument drops empty tokens, so blank topic words do
  # not end up as stray separators in the output.
  temp_list = topic_dict2_pre_process[topic_id].split()
  # NOTE: the shuffle is unseeded, so the word order differs between runs.
  random.shuffle(temp_list)
  shuffle_topic = ' '.join(temp_list)
  topictitle_str = shuffle_topic + " " + title
  topictitle_genre_arr.append((genre, topictitle_str))

output_test = pd.DataFrame(topictitle_genre_arr, columns = ["genre", "topictitle"])

display(output_test)



In [None]:
# Write the (genre, topictitle) table to CSV without the index column.
# NOTE(review): "titel" in the file name looks like a typo for "title"; it is
# left unchanged because other code may read this exact path.
output_test.to_csv("genre_topic_titel.csv", index=False)