In [27]:
import pandas as pd

In [28]:
# Read the Portishead track table (audio features plus lyrics) and preview it.
source_csv = "Portishead_songs_with_lyrics2.csv"
df = pd.read_csv(source_csv)
df.head()

Unnamed: 0,track_name,album_name,release_date,track_id,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,lyrics_url,lyrics_text
0,Silence,Third,2008-01-01,6CLLkUajKRN9LS9PeK3OZF,299586,0.419,0.825,11,-7.453,1,0.091,0.195,0.146,0.339,0.104,127.528,https://genius.com/Portishead-silence-lyrics,[Intro]\nEsteja alerta para as regras dos três...
1,Hunter,Third,2008-01-01,0J3bnjae0jkfUpbdHr81Sr,237413,0.523,0.408,10,-9.563,0,0.0369,0.896,0.134,0.098,0.157,134.179,https://genius.com/Portishead-hunter-lyrics,[Verse 1]\nNo one said\nWe'd ever known each o...
2,Nylon Smile,Third,2008-01-01,67oK2xcjCuLe8i3OuUjEUr,196600,0.696,0.674,7,-9.953,1,0.159,0.867,0.371,0.229,0.35,127.909,https://genius.com/Portishead-nylon-smile-lyrics,[Verse]\nI'd like to laugh at what you said\nB...
3,The Rip,Third,2008-01-01,5KX2DSPC6aCA0pdDidTmBC,270946,0.471,0.484,9,-9.414,0,0.0317,0.925,0.499,0.0987,0.236,74.982,https://genius.com/Portishead-the-rip-lyrics,[Verse 1]\nAs she walks in the room\nCentered ...
4,Plastic,Third,2008-01-01,6K3gtAIEWNqYHfd4mSEKcT,207573,0.353,0.553,2,-8.145,1,0.0856,0.753,0.0415,0.0861,0.0652,153.656,https://genius.com/Portishead-plastic-lyrics,[Verse 1]\nI wonder why\nI don't know what you...


In [29]:
# Peek at the track titles as a quick sanity check of the load.
df.loc[:, 'track_name']

0                           Silence
1                            Hunter
2                       Nylon Smile
3                           The Rip
4                           Plastic
                  ...              
66                             Numb
67                 Numbed In Moscow
68            Revenge Of The Number
69                Numb-Earth Linger
70    A Tribute To Monk & Canatella
Name: track_name, Length: 71, dtype: object

In [30]:
# List the column labels of the loaded frame.
df.columns

Index(['track_name', 'album_name', 'release_date', 'track_id', 'duration_ms',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'lyrics_url', 'lyrics_text'],
      dtype='object')

In [31]:
# Drop rows whose lyrics_text is one of the placeholder strings
# "Lyrics not found", "Error" or "null".
# NOTE(review): pd.read_csv parses a bare "null" cell as NaN by default, so the
# "null" entry probably never matches and rows with missing lyrics are kept --
# confirm whether those rows should be dropped as well.
placeholder_lyrics = ["Lyrics not found", "Error", "null"]

# .copy() makes the filtered result an independent frame, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
df = df[~df['lyrics_text'].isin(placeholder_lyrics)].copy()

In [32]:
# Standardize case and trim whitespace in text columns
import re

# A "word" is a run of letters that may continue across apostrophes, so "it's"
# is capitalised as one word. str.title() treats the apostrophe as a word
# boundary and would turn "It's" into "It'S".
_TITLE_WORD = re.compile(r"[^\W\d_]+(?:['’][^\W\d_]+)*")


def _title_case(text):
    """Return ``text`` stripped, with each word capitalised.

    Non-string values (e.g. NaN for a missing name) are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return _TITLE_WORD.sub(lambda word: word.group(0).capitalize(), text).strip()


df['track_name'] = df['track_name'].map(_title_case)
df['album_name'] = df['album_name'].map(_title_case)
# Lyrics keep their original casing; only surrounding whitespace is removed.
df['lyrics_text'] = df['lyrics_text'].str.strip()


In [33]:
# Parse release_date into pandas datetimes; values that cannot be parsed become NaT.
parsed_release = pd.to_datetime(df['release_date'], errors='coerce')
df['release_date'] = parsed_release


In [34]:
# Columns that should hold numeric values.
numeric_columns = [
    'duration_ms', 'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
]

# Convert one column at a time; anything that cannot be parsed becomes NaN.
for feature in numeric_columns:
    df[feature] = pd.to_numeric(df[feature], errors='coerce')


In [35]:
# Discard every row that is missing a value in at least one numeric column.
df = df.dropna(subset=numeric_columns)


In [36]:
# Keep only the first row seen for each track_id.
df = df.drop_duplicates(subset='track_id', keep='first')


In [37]:
import re


# Remove bracketed section tags such as "[Verse 1]" from the lyrics.
# Values that are not strings (e.g. NaN) are passed through untouched.
section_tag = re.compile(r"\[.*?\]")


def _remove_section_tags(lyrics):
    """Return ``lyrics`` without bracketed tags and surrounding whitespace."""
    if isinstance(lyrics, str):
        return section_tag.sub("", lyrics).strip()
    return lyrics


df['lyrics_text'] = df['lyrics_text'].map(_remove_section_tags)


In [38]:
# Analysis dataset: every column except the track identifier and the lyrics URL.
analysis_excluded = ['track_id', 'lyrics_url']
data_analysis_df = df.drop(columns=analysis_excluded)


# Auto-lyrics dataset: only the track title and its lyrics.
auto_lyrics_columns = ['track_name', 'lyrics_text']
auto_lyrics_df = df[auto_lyrics_columns]

In [39]:
# Show the first rows of the analysis dataset under a heading.
analysis_preview = data_analysis_df.head()
print("Data Analysis Dataset:")
display(analysis_preview)


Data Analysis Dataset:


Unnamed: 0,track_name,album_name,release_date,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,lyrics_text
0,Silence,Third,2008-01-01,299586,0.419,0.825,11,-7.453,1,0.091,0.195,0.146,0.339,0.104,127.528,Esteja alerta para as regras dos três\nO que v...
1,Hunter,Third,2008-01-01,237413,0.523,0.408,10,-9.563,0,0.0369,0.896,0.134,0.098,0.157,134.179,No one said\nWe'd ever known each other\nAnd n...
2,Nylon Smile,Third,2008-01-01,196600,0.696,0.674,7,-9.953,1,0.159,0.867,0.371,0.229,0.35,127.909,I'd like to laugh at what you said\nBut I just...
3,The Rip,Third,2008-01-01,270946,0.471,0.484,9,-9.414,0,0.0317,0.925,0.499,0.0987,0.236,74.982,As she walks in the room\nCentered and tall\nH...
4,Plastic,Third,2008-01-01,207573,0.353,0.553,2,-8.145,1,0.0856,0.753,0.0415,0.0861,0.0652,153.656,I wonder why\nI don't know what you see\nOf co...


In [40]:
# Show the first rows of the auto-lyrics dataset under a heading.
lyrics_preview = auto_lyrics_df.head()
print("\nAuto-Lyrics Dataset:")
display(lyrics_preview)


Auto-Lyrics Dataset:


Unnamed: 0,track_name,lyrics_text
0,Silence,Esteja alerta para as regras dos três\nO que v...
1,Hunter,No one said\nWe'd ever known each other\nAnd n...
2,Nylon Smile,I'd like to laugh at what you said\nBut I just...
3,The Rip,As she walks in the room\nCentered and tall\nH...
4,Plastic,I wonder why\nI don't know what you see\nOf co...


In [41]:
# Write each dataset to its own CSV file, leaving out the index column.
csv_exports = {
    'portishead_data_analysis.csv': data_analysis_df,
    'portishead_auto_lyrics.csv': auto_lyrics_df,
}
for csv_name, frame in csv_exports.items():
    frame.to_csv(csv_name, index=False)