In [164]:
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import re

In [165]:
# Import the data
df = pd.read_csv("song_lyrics.csv", nrows=200)


# Select only the English songs
df = df.query('language == "en" and language_cld3 == "en" and language_ft == "en"')

# Drop the columns that are not needed
df = df.drop(['title',
              'tag',
              'artist',
              'year',
              'views',
              'features',
              'language_cld3',
              'language_ft',
              'language'], axis=1)

# Show the first 5 rows of the data
df.head()

Unnamed: 0,lyrics,id
0,"[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1
1,"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3
2,Maybe cause I'm eatin\nAnd these bastards fien...,4
3,[Produced by Kanye West and Brian Miller]\n\n[...,5
4,"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6


In [166]:
# Function to normalize the text
def normalize(texts):
    new_texts = []
    for text in texts:
        text = text.lower()
        text = re.sub('\r','',text)
        text = re.sub('\t',' ',text)
        text = re.sub('\f','',text)
        text = re.sub('\n','',text)
        text = re.sub('\x80','',text)
        text = re.sub('\x99t','',text)
        text = re.sub("\(.*?\)",'',text)
        text = re.sub("\[.*?\]",'',text) 
        
        if text != '':
            new_texts.append(text)
        
    return new_texts

In [167]:
# function to remove the stopwords
corpus = []
for text in df['lyrics']:
    line = []  # Reset line for each text
    splitted = text.split('\n')
    for i in splitted:   
        if len(i.strip().split()) < 3:
            continue
        else:
            line.append(i)
    corpus.append(line)

# update the lyrics column
df['lyrics'] = corpus
del corpus

# show the first 5 rows of the data
df.head()

Unnamed: 0,lyrics,id
0,"[[Chorus: Opera Steve & Cam'ron], Killa Cam, K...",1
1,"[[Produced by Irv Gotti], Yeah, hah, yeah, Roc...",3
2,"[Maybe cause I'm eatin, And these bastards fie...",4
3,"[[Produced by Kanye West and Brian Miller], [I...",5
4,"[So they ask me, What you gon' do the second t...",6


In [168]:
# Normalize the lyrics and join them back together
df['lyrics'] = df['lyrics'].apply(normalize).apply(lambda x: ' '.join(x))

# View the first normalized lyrics
df.head()

Unnamed: 0,lyrics,id
0,"killa cam, killa cam, cam killa cam, killa cam...",1
1,"yeah, hah, yeah, roc-a-fella we invite you to ...",3
2,maybe cause i'm eatin and these bastards fiend...,4
3,"kanye, this that 1970s heron flow, huh? yeah, ...",5
4,so they ask me what you gon' do the second tim...,6


In [170]:
# Create the TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['lyrics'])


# K-means Clustering
k = 5  # Number of clusters
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(tfidf_matrix)

# Add cluster labels to the dataframe
df['cluster_label'] = cluster_labels

# Show the first 5 rows of the data
print(df)

                                                lyrics   id  cluster_label
0    killa cam, killa cam, cam killa cam, killa cam...    1              0
1    yeah, hah, yeah, roc-a-fella we invite you to ...    3              1
2    maybe cause i'm eatin and these bastards fiend...    4              2
3    kanye, this that 1970s heron flow, huh? yeah, ...    5              1
4    so they ask me what you gon' do the second tim...    6              2
..                                                 ...  ...            ...
195  them niggas actors they deserve oscars we pull...  196              1
196  we gon drink, we gon ride we gon smoke and get...  197              2
197  okay, i'm going to attempt to drown myself you...  198              2
198  uh huh, ge-ge ge-ge-geah ye-ye-yeah, ye-ye-yea...  199              1
199  i'm at your door, your eyes are like, "why are...  200              1

[199 rows x 3 columns]
