In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora
from gensim.models import LdaModel
import nltk
import pyLDAvis.gensim
import pickle 
import pyLDAvis
import os

In [2]:


# Download NLTK stopwords and tokenizer
nltk.download('stopwords')
nltk.download('punkt')
# Step 1: Load the CSV
file_path = './Data/all_artists_lyrics.csv'
# file_path = './Data/MoodyLyrics.csv'
# Replace with the path to your CSV
useCols = ['Artist','Title', 'Lyric']
data = pd.read_csv(file_path, encoding='utf-8', delimiter=',', usecols=useCols)
data['Lyric'] = data['Lyric'].fillna('').astype(str)

num_topics = 10

# Step 2: Preprocess Lyrics
def preprocess_lyrics(lyrics):
    # Remove square brackets and contents
    lyrics = re.sub(r'\[.*?\]', '', str(lyrics))
    # Remove parentheses and contents
    lyrics = re.sub(r'\(.*?\)', '', lyrics)
    # Remove punctuation
    lyrics = re.sub(r'[^\w\s]', '', lyrics)
    # Tokenize and remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(lyrics.lower())
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

# Group by artist and preprocess all lyrics
artist_docs = data.groupby('Artist')['Lyric'].apply(lambda x: ' '.join(x)).reset_index()
artist_docs.rename(columns={'Lyric': 'Accumulated_Lyrics'}, inplace=True)
artist_docs['processed_lyrics'] = artist_docs['Accumulated_Lyrics'].apply(preprocess_lyrics)
processed_artist_document = [doc for doc in artist_docs['processed_lyrics']]
# Step 3: Create a Dictionary and Corpus for LDA
dictionary = corpora.Dictionary(processed_artist_document)
corpus = [dictionary.doc2bow(doc) for doc in artist_docs['processed_lyrics']] ##doc2bow is a function that converts a list of tokens into a bag-of-words representation.

# Step 4: Train the LDA Model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42, passes=10)

# Step 5: Classify Each Artist into Topics
artist_docs['Topics'] = artist_docs['processed_lyrics'].apply(
    lambda doc: lda_model[dictionary.doc2bow(doc)]
)

# Display Topics for Each Artist
for idx, row in artist_docs.iterrows():
    print(f"Artist: {row['Artist']}")
    print(f"Topics: {row['Topics']}\n")



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Artist: Ariana Grande
Topics: [(3, 0.9996268)]

Artist: BTS (・ｰｩ・・・・・・・・・・・ｨ)
Topics: [(5, 0.99999034)]

Artist: Beyoncﾃｩ
Topics: [(2, 0.013210376), (3, 0.9851532)]

Artist: Billie Eilish
Topics: [(1, 0.0141680725), (3, 0.9844868)]

Artist: Cardi B
Topics: [(6, 0.44999537), (9, 0.5409651)]

Artist: Charlie Puth
Topics: [(3, 0.9932194)]

Artist: Coldplay
Topics: [(4, 0.998945)]

Artist: Drake
Topics: [(2, 0.018304847), (3, 0.18907213), (6, 0.6976738), (9, 0.08690977)]

Artist: Dua Lipa
Topics: [(3, 0.99995446)]

Artist: Ed Sheeran
Topics: [(2, 0.99299204)]

Artist: Eminem
Topics: [(9, 0.9987958)]

Artist: Justin Bieber
Topics: [(3, 0.9976032)]

Artist: Katy Perry
Topics: [(2, 0.9950101)]

Artist: Khalid
Topics: [(3, 0.99119127)]

Artist: Lady Gaga
Topics: [(0, 0.9994664)]

Artist: Maroon 5
Topics: [(3, 0.9954238)]

Artist: Nicki Minaj
Topics: [(2, 0.6410999), (6, 0.17280656), (9, 0.18562217)]

Artist: Post Malone
Topics: [(3, 0.03496946), (6, 0.9650047)]

Artist: Rihanna
Topics: [(3, 0.

In [3]:
doc_bow = dictionary.doc2bow(processed_artist_document[0])  # Example: First artist's lyrics
# topic_distribution = lda_model.get_document_topics(doc_bow)
# print(topic_distribution)  
print(doc_bow)

[(0, 12), (1, 9), (2, 8), (3, 1), (4, 1), (5, 97), (6, 1), (7, 24), (8, 6), (9, 2), (10, 19), (11, 1), (12, 1), (13, 1), (14, 1), (15, 3), (16, 1), (17, 1), (18, 5), (19, 12), (20, 1), (21, 2), (22, 9), (23, 2), (24, 1), (25, 10), (26, 2), (27, 2), (28, 4), (29, 1), (30, 1), (31, 4), (32, 1), (33, 3), (34, 1), (35, 10), (36, 5), (37, 1), (38, 1), (39, 2), (40, 2), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 5), (47, 12), (48, 1), (49, 2), (50, 1), (51, 1), (52, 1), (53, 2), (54, 6), (55, 3), (56, 1), (57, 4), (58, 3), (59, 3), (60, 2), (61, 1), (62, 22), (63, 8), (64, 1), (65, 2), (66, 1), (67, 3), (68, 4), (69, 4), (70, 1), (71, 12), (72, 2), (73, 1), (74, 1), (75, 1), (76, 149), (77, 15), (78, 9), (79, 11), (80, 1), (81, 3), (82, 1), (83, 1), (84, 1), (85, 19), (86, 8), (87, 406), (88, 1), (89, 1), (90, 31), (91, 4), (92, 3), (93, 20), (94, 8), (95, 1), (96, 1), (97, 1), (98, 1), (99, 8), (100, 14), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 3), (108, 18)

In [4]:
# print(dictionary.token2id)  # Shows word to ID mapping
topics = lda_model.print_topics(num_words=10)
    
print(topics)

print(topics[0][1])

topics_ls = []
for topic in topics:
    words = topic[1].split("+")
    topic_words = [word.split("*")[1].replace('"', '').strip() for word in words]
    topics_ls.append(topic_words)
for topics in topics_ls:
    print(topics)



[(0, '0.024*"im" + 0.021*"want" + 0.019*"love" + 0.014*"dont" + 0.013*"baby" + 0.012*"oh" + 0.012*"na" + 0.008*"way" + 0.007*"cause" + 0.007*"gaga"'), (1, '0.013*"im" + 0.013*"like" + 0.012*"know" + 0.011*"dont" + 0.010*"oh" + 0.009*"love" + 0.009*"never" + 0.008*"youre" + 0.007*"na" + 0.007*"baby"'), (2, '0.023*"im" + 0.015*"like" + 0.014*"dont" + 0.013*"love" + 0.012*"know" + 0.011*"oh" + 0.010*"youre" + 0.009*"na" + 0.009*"get" + 0.009*"got"'), (3, '0.022*"im" + 0.017*"dont" + 0.016*"love" + 0.016*"like" + 0.015*"yeah" + 0.014*"oh" + 0.014*"know" + 0.013*"got" + 0.012*"baby" + 0.011*"na"'), (4, '0.023*"oh" + 0.013*"im" + 0.012*"dont" + 0.010*"know" + 0.009*"na" + 0.009*"go" + 0.008*"get" + 0.008*"love" + 0.007*"see" + 0.007*"yeah"'), (5, '0.031*"ｴ" + 0.018*"ｰ" + 0.014*"ｧ" + 0.009*"ｼ" + 0.009*"jungkook" + 0.008*"yeah" + 0.008*"ｲ" + 0.007*"oh" + 0.007*"jimin" + 0.007*"love"'), (6, '0.024*"im" + 0.018*"yeah" + 0.017*"know" + 0.016*"like" + 0.015*"got" + 0.013*"dont" + 0.010*"get" + 0.0

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('./Results/ldavis_prepared_'+str(num_topics))
# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, './Results/ldavis_prepared_'+ str(num_topics) +'.html')
LDAvis_prepared

: 