In [9]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

In [10]:
import nltk
nltk.download('wordnet')

[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


False

In [11]:
df = pd.read_csv('../data/hiphop_eras_songs.csv')

In [12]:
print(df.shape)
print(df.columns)
print(df.head(5))


(70, 5)
Index(['Song Title', 'Artist', 'Year', 'Era', 'Lyrics'], dtype='object')
         Song Title                     Artist  Year         Era  \
0  Rapper's Delight             Sugarhill Gang  1979  Old School   
1        The Breaks                Kurtis Blow  1980  Old School   
2       The Message          Grandmaster Flash  1982  Old School   
3       Planet Rock           Afrika Bambaataa  1982  Old School   
4            Apache  The Incredible Bongo Band  1981  Old School   

                                              Lyrics  
0  [Chorus: Wonder Mike]\nI said a hip-hop, the h...  
1  [Intro]\nClap your hands everybody\nIf you got...  
2  [Intro: Duke Bootee]\nIt's like a jungle somet...  
3  [Intro: Afrika Bambaataa]\nParty peopleParty p...  
4                                                NaN  


After 

Data cleaning

In [13]:
# 1. Data Exploration and Initial Cleaning
def explore_data(df):
    # Check missing values
    print("Missing Values:\n", df.isnull().sum())

    # Remove rows with missing lyrics
    df_clean = df.dropna(subset=['Lyrics'])

    # Check distribution of eras
    print("\nEra Distribution:\n", df_clean['Era'].value_counts())
    # Check distribution of years
    print("\nYear Distribution:\n", df_clean['Year'].value_counts())
    
    
    return df_clean
df_clean = explore_data(df)
print(df_clean.shape)

Missing Values:
 Song Title    0
Artist        0
Year          0
Era           0
Lyrics        9
dtype: int64

Era Distribution:
 Era
Blog Era & Trap         10
Conscious Resurgence    10
East vs. West            9
Southern Rap             9
Old School               8
Bling Era                8
Golden Age               7
Name: count, dtype: int64

Year Distribution:
 Year
2017    6
1994    5
2018    4
2004    3
2003    3
1996    3
2015    3
1995    2
2000    2
2005    2
1980    2
1992    2
1989    2
1982    2
1983    2
2010    2
1999    2
1987    1
1997    1
1993    1
1984    1
1979    1
1988    1
1991    1
2001    1
2006    1
2002    1
2012    1
1998    1
2016    1
2019    1
Name: count, dtype: int64
(61, 5)


After removing rows with missing lyrics, we end up with 61 songs.

In [14]:
# 2. Text Preprocessing for Lyrics
def preprocess_lyrics(df):
    # Lowercase all lyrics
    df['Lyrics'] = df['Lyrics'].str.lower()

    # Remove punctuation
    df['Lyrics'] = df['Lyrics'].str.translate(str.maketrans('', '', string.punctuation))

    # Remove extra whitespace
    df['Lyrics'] = df['Lyrics'].str.strip()
    df['Lyrics'] = df['Lyrics'].str.replace(r'\s+', ' ', regex=True)

    # Remove stopwords
    stop = stopwords.words('english')
    df['Lyrics'] = df['Lyrics'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop]))

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    df['Lyrics'] = df['Lyrics'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

    return df


In [15]:

# 3. TF-IDF Vectorization
def tfidf_embedding(df):
    tfidf = TfidfVectorizer(max_features=5000)
    lyrics_tfidf = tfidf.fit_transform(df['Lyrics'])
    return lyrics_tfidf

# 4. Exploratory Data Analysis (EDA)
def exploratory_analysis(df):
    # Word frequency
    all_words = ' '.join(df['Lyrics']).split()
    word_freq = Counter(all_words)
    most_common_words = word_freq.most_common(20)
    print("\nMost Common Words:\n", most_common_words)

    # Length of lyrics
    df['Lyrics_Length'] = df['Lyrics'].apply(lambda x: len(x.split()))
    print("\nLyrics Length Statistics:\n", df['Lyrics_Length'].describe())

    # Era-based analysis
    era_lyrics_length = df.groupby('Era')['Lyrics_Length'].mean()
    print("\nAverage Lyrics Length by Era:\n", era_lyrics_length)




In [16]:
# Main function to run all steps
def main(df):
    # Step 1: Explore and clean data
    df_clean = explore_data(df)

    # Step 2: Preprocess lyrics
    df_clean = preprocess_lyrics(df_clean)

    # Step 3: Generate TF-IDF Embeddings
    lyrics_tfidf = tfidf_embedding(df_clean)

    # Step 4: Perform Exploratory Data Analysis
    exploratory_analysis(df_clean)

    return df_clean, lyrics_tfidf

# Example usage:
df = pd.read_csv('../data/hiphop_eras_songs.csv')
df_clean, lyrics_tfidf = main(df)  
# Save preprocessed data if needed
# df_clean.to_csv("preprocessed_lyrics.csv", index=False)

Missing Values:
 Song Title    0
Artist        0
Year          0
Era           0
Lyrics        9
dtype: int64

Era Distribution:
 Era
Blog Era & Trap         10
Conscious Resurgence    10
East vs. West            9
Southern Rap             9
Old School               8
Bling Era                8
Golden Age               7
Name: count, dtype: int64

Year Distribution:
 Year
2017    6
1994    5
2018    4
2004    3
2003    3
1996    3
2015    3
1995    2
2000    2
2005    2
1980    2
1992    2
1989    2
1982    2
1983    2
2010    2
1999    2
1987    1
1997    1
1993    1
1984    1
1979    1
1988    1
1991    1
2001    1
2006    1
2002    1
2012    1
1998    1
2016    1
2019    1
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Lyrics'] = df['Lyrics'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Lyrics'] = df['Lyrics'].str.translate(str.maketrans('', '', string.punctuation))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Lyrics'] = df['Lyrics'].str.strip()
A value is trying to be set on a copy of


Most Common Words:
 [('im', 467), ('get', 346), ('like', 317), ('know', 294), ('got', 290), ('nigga', 282), ('dont', 268), ('yeah', 243), ('go', 185), ('verse', 183), ('aint', 164), ('chorus', 158), ('cause', 152), ('back', 151), ('say', 144), ('make', 133), ('thats', 130), ('shit', 124), ('see', 119), ('one', 116)]

Lyrics Length Statistics:
 count      61.000000
mean      425.868852
std       214.312192
min       121.000000
25%       315.000000
50%       397.000000
75%       461.000000
max      1636.000000
Name: Lyrics_Length, dtype: float64

Average Lyrics Length by Era:
 Era
Bling Era               428.500000
Blog Era & Trap         291.400000
Conscious Resurgence    434.900000
East vs. West           457.000000
Golden Age              355.857143
Old School              578.125000
Southern Rap            450.888889
Name: Lyrics_Length, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Lyrics'] = df['Lyrics'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Lyrics_Length'] = df['Lyrics'].apply(lambda x: len(x.split()))
