In [2]:
# ==============================================================================
# Cell 1: Imports & Setup
# ==============================================================================
import pandas as pd
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gensim
from gensim import corpora

# Download NLTK data (run once)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Paths
PROCESSED_DATA_PATH = '../data/processed'
OUTPUT_PATH = '../output'
MODELS_PATH = '../output/models'

os.makedirs(MODELS_PATH, exist_ok=True)

print("✅ Libraries imported and NLTK data downloaded.")

✅ Libraries imported and NLTK data downloaded.


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mohammedpathariya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mohammedpathariya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/mohammedpathariya/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# ==============================================================================
# Cell 2: Load Lyrics Data
# ==============================================================================
lyrics_path = os.path.join(PROCESSED_DATA_PATH, 'song_lyrics.csv')
df_lyrics = pd.read_csv(lyrics_path)

print(f"Loaded {len(df_lyrics)} songs.")
print(df_lyrics.head(2))

Loaded 700 songs.
    artist           title              year  \
0  J. Cole  No Role Modelz  December 9, 2014   
1  J. Cole       She Knows  October 29, 2013   

                                              lyrics  
0  First things first: rest in peace, Uncle Phil\...  
1  She knows\nShe knows, ayy\nBad things happen t...  


In [4]:
# ==============================================================================
# Cell 3: Preprocessing Function (The Cleaning Crew)
# ==============================================================================
# We need to turn raw lyrics into clean lists of words.

stop_words = set(stopwords.words('english'))
# Add custom hip-hop/song stopwords that don't add semantic meaning
custom_stops = {'yeah', 'oh', 'like', 'got', 'get', 'know', 'go', 'feat', 'verse', 'chorus', 'intro', 'outro'}
stop_words.update(custom_stops)

lemmatizer = WordNetLemmatizer()

def clean_lyrics(text):
    if not isinstance(text, str): return []
    
    # 1. Lowercase
    text = text.lower()
    
    # 2. Remove bracketed text (e.g., [Verse 1], [Chorus])
    text = re.sub(r'\[.*?\]', '', text)
    
    # 3. Remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    
    # 4. Tokenize (split into words)
    tokens = text.split()
    
    # 5. Remove stopwords and Lemmatize (running -> run)
    clean_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 2]
    
    return clean_tokens

# Apply cleaning
print("Cleaning lyrics... (this may take a moment)")
df_lyrics['tokens'] = df_lyrics['lyrics'].apply(clean_lyrics)

print("✅ Lyrics cleaned.")
print(f"Example tokens: {df_lyrics['tokens'].iloc[0][:10]}")

Cleaning lyrics... (this may take a moment)
✅ Lyrics cleaned.
Example tokens: ['first', 'thing', 'first', 'rest', 'peace', 'uncle', 'phil', 'real', 'father', 'ever']


In [5]:
# ==============================================================================
# Cell 4: Prepare for LDA (Dictionary & Corpus)
# ==============================================================================
# LDA needs a "Dictionary" (id -> word mapping) and a "Corpus" (word counts per doc)

# Create Dictionary
id2word = corpora.Dictionary(df_lyrics['tokens'])

# Filter extremes (remove words that appear in less than 3 songs or more than 50% of songs)
# This helps remove rare typos and super-common words like "rap"
id2word.filter_extremes(no_below=3, no_above=0.5)

# Create Corpus
corpus = [id2word.doc2bow(text) for text in df_lyrics['tokens']]

print(f"Dictionary created with {len(id2word)} unique tokens.")
print(f"Corpus created for {len(corpus)} documents.")

Dictionary created with 5762 unique tokens.
Corpus created for 700 documents.


In [6]:
# ==============================================================================
# Cell 5: Train LDA Model
# ==============================================================================
# This is the AI part. We ask it to find 'k' topics.
# We'll start with 5 topics to match our 5 communities, but you can change this number.

NUM_TOPICS = 5

print(f"Training LDA model with {NUM_TOPICS} topics...")
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=NUM_TOPICS, 
                                       random_state=42,
                                       passes=10,
                                       workers=2)

print("✅ Model training complete.")

# Save the model
model_path = os.path.join(MODELS_PATH, 'lda_model')
lda_model.save(model_path)
print(f"Model saved to {model_path}")

Training LDA model with 5 topics...
✅ Model training complete.
Model saved to ../output/models/lda_model


In [7]:
# ==============================================================================
# Cell 6: View Topics
# ==============================================================================
print("--- Discovered Topics ---")
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}\n")
    
# Assign the dominant topic to each song
def get_dominant_topic(bow):
    topic_probs = lda_model.get_document_topics(bow)
    # Sort by probability and get the top one
    topic_probs = sorted(topic_probs, key=lambda x: x[1], reverse=True)
    return topic_probs[0][0]

df_lyrics['Dominant_Topic'] = [get_dominant_topic(bow) for bow in corpus]

# Save the results with topics
final_lyrics_path = os.path.join(PROCESSED_DATA_PATH, 'song_topics.csv')
df_lyrics.to_csv(final_lyrics_path, index=False)
print(f"✅ Saved songs with assigned topics to: {final_lyrics_path}")

--- Discovered Topics ---
Topic 0: 0.019*"ooh" + 0.010*"versace" + 0.008*"feel" + 0.008*"young" + 0.007*"need" + 0.006*"dope" + 0.006*"bad" + 0.006*"light" + 0.006*"gon" + 0.005*"keep"

Topic 1: 0.010*"way" + 0.010*"woo" + 0.009*"money" + 0.009*"ayy" + 0.007*"woah" + 0.006*"lil" + 0.006*"hey" + 0.006*"gon" + 0.006*"gotta" + 0.005*"baby"

Topic 2: 0.006*"think" + 0.006*"fuckin" + 0.004*"tell" + 0.004*"hope" + 0.004*"fucking" + 0.004*"youre" + 0.004*"gon" + 0.004*"come" + 0.004*"look" + 0.004*"give"

Topic 3: 0.009*"life" + 0.008*"right" + 0.007*"feel" + 0.007*"walk" + 0.006*"god" + 0.006*"real" + 0.006*"talk" + 0.006*"man" + 0.005*"gang" + 0.005*"night"

Topic 4: 0.012*"girl" + 0.010*"wanna" + 0.009*"baby" + 0.008*"youre" + 0.008*"right" + 0.007*"tell" + 0.007*"need" + 0.006*"ill" + 0.006*"way" + 0.006*"even"

✅ Saved songs with assigned topics to: ../data/processed/song_topics.csv
