In [6]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Download required NLTK data.
# 'punkt' is kept for older NLTK installs; 'punkt_tab' is the tokenizer
# resource that word_tokenize looks up on recent NLTK releases (the pickled
# 'punkt' models were replaced). On an NLTK version whose index has no
# 'punkt_tab', nltk.download just reports the miss and returns False, so
# requesting it is harmless there.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Load the dataset; the rest of the script needs a 'Lyrics' column, so fail
# early with a clear message if it is absent.
data = pd.read_csv('whatyouknow.csv')
if 'Lyrics' not in data.columns:
    raise KeyError("The dataset does not contain a 'Lyrics' column.")

lyrics = data['Lyrics'].fillna('')  # Handle missing or NaN values

# Matches every character that is neither a word character nor whitespace,
# plus the underscore: '_' is part of \w, so [^\w\s] alone would leave it in.
_PUNCTUATION_RE = re.compile(r'[^\w\s]|_')


# Combined lowercase conversion and punctuation removal
def preprocess_lyrics(text: str) -> str:
    """Return *text* lowercased with all punctuation deleted.

    Punctuation characters (including underscores) are removed outright, not
    replaced by a space, so "it's" becomes "its". Letters, digits and
    whitespace are kept unchanged.

    Args:
        text: One lyric line. Must be a string; missing values should be
            replaced (e.g. with '') before calling.

    Returns:
        The lowercased, punctuation-free string.
    """
    return _PUNCTUATION_RE.sub('', text.lower())

# English stopword set, loaded on first use and then reused, so the NLTK
# corpus is not re-read and re-hashed for every row passed through .apply().
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


# Clean lyrics by removing stopwords
def clean_lyrics(text: str) -> str:
    """Tokenize *text* and drop English stopwords.

    Tokens are compared in lowercase, so capitalised stopwords ("The", "And")
    are removed as well; the surviving tokens keep their original casing.

    NOTE(review): contractions whose apostrophe was stripped earlier in the
    pipeline (e.g. "dont") presumably no longer match the NLTK entries such
    as "don't" and are therefore kept -- confirm whether that is intended.

    Args:
        text: One lyric line as a string.

    Returns:
        The remaining tokens joined by single spaces, or '' when nothing is
        left.
    """
    stop_words = _english_stop_words()
    kept = [word for word in word_tokenize(text) if word.lower() not in stop_words]
    # ' '.join([]) is already '', so no special case for an empty token list.
    return ' '.join(kept)

# Apply transformations
# Stage 1: lowercase and strip punctuation; missing lyrics are treated as ''.
normalized_lyrics = data['Lyrics'].fillna('').apply(preprocess_lyrics)
data['Processed Lyrics'] = normalized_lyrics

# Stage 2: remove stopwords from the normalised text.
data['Cleaned Lyrics'] = normalized_lyrics.apply(clean_lyrics)

# Persist the original column plus both derived columns, without the index.
data.to_csv("whatyouknow_cleaned.csv", index=False)

# Quick visual check of the first rows.
preview = data.head()
print(preview)


                                              Lyrics  \
0                     In a few weeks I will get time   
1               To realize it's right before my eyes   
2        And I can take it if it's what I want to do   
3  And I am leaving and this is starting to feel ...   
4                          It's right before my eyes   

                                    Processed Lyrics  \
0                     in a few weeks i will get time   
1                to realize its right before my eyes   
2         and i can take it if its what i want to do   
3  and i am leaving and this is starting to feel ...   
4                           its right before my eyes   

               Cleaned Lyrics  
0              weeks get time  
1          realize right eyes  
2                   take want  
3  leaving starting feel like  
4                  right eyes  


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
