In [1]:
import pandas as pd
import numpy as np


In [2]:
# Read the songs-with-lyrics CSV into the working frame and preview its first rows.
raw_csv_path = 'Björk_songs_with_lyrics_final.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,track_name,album_name,release_date,track_id,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,lyrics_url,lyrics_text
0,Ocean,Mount Wittenberg Orca (Expanded Edition),2023-06-16,7oMy4krSWbBPVHtXBiyHrk,129026,0.104,0.0475,10,-13.914,1,0.0451,0.59,0.987,0.13,0.038,76.759,https://genius.com/Bjork-living-by-the-ocean-i...,Anchor Song I wrote moving to my house\nWhich ...
1,On and Ever Onward,Mount Wittenberg Orca (Expanded Edition),2023-06-16,2GxGDzq6dpPXUADQsa6o9T,121013,0.719,0.236,7,-9.063,1,0.033,0.774,0.0,0.0621,0.836,112.27,Lyrics not found,Lyrics not found
2,When the World Comes to an End,Mount Wittenberg Orca (Expanded Edition),2023-06-16,09TNNle5fDBPUtLpO6a7BK,188053,0.439,0.318,5,-11.552,1,0.046,0.579,7e-06,0.121,0.507,156.127,Lyrics not found,Lyrics not found
3,Beautiful Mother,Mount Wittenberg Orca (Expanded Edition),2023-06-16,3shbvAgUyh9nhJa53JN8mB,136360,0.86,0.154,7,-10.016,1,0.0539,0.679,0.00113,0.116,0.398,126.6,Lyrics not found,Lyrics not found
4,Sharing Orb,Mount Wittenberg Orca (Expanded Edition),2023-06-16,1M4euFTqRYW2xTLhqYGC0C,168000,0.496,0.0524,7,-9.198,1,0.0411,0.888,0.0,0.19,0.241,118.455,Lyrics not found,Lyrics not found


# Handle Missing Values

In [3]:
# Report the per-column null counts before anything is removed or filled.
print("Missing values before cleaning:")
print(df.isnull().sum())

# Rows with a null in any of these columns are dropped.
df.dropna(subset=['track_name', 'release_date', 'tempo'], inplace=True)  # Critical columns

# Assign the filled Series back to the column instead of calling
# `df['lyrics_text'].fillna(..., inplace=True)`: that form mutates an
# intermediate object, emits a FutureWarning on current pandas, and stops
# updating `df` once the intermediate always behaves as a copy (pandas 3.0).
df['lyrics_text'] = df['lyrics_text'].fillna("Lyrics not found")


Missing values before cleaning:
track_name          0
album_name          0
release_date        0
track_id            0
duration_ms         0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
lyrics_url          0
lyrics_text         0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['lyrics_text'].fillna("Lyrics not found", inplace=True)


# Remove Duplicates

In [4]:
# A row counts as a duplicate when both its track name and album name repeat
# an earlier row; the first occurrence is the one that is kept.
dedupe_keys = ['track_name', 'album_name']
duplicate_count = df.duplicated(subset=dedupe_keys).sum()
print(f"Duplicates before cleaning: {duplicate_count}")
df.drop_duplicates(subset=dedupe_keys, inplace=True)

Duplicates before cleaning: 9


# Validate Data

In [5]:
# Coerce columns to their intended dtypes; with errors='coerce', values that
# cannot be parsed become NaT/NaN instead of raising.
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')  # Convert to datetime
for numeric_col in ('duration_ms', 'tempo'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')
df.head()

Unnamed: 0,track_name,album_name,release_date,track_id,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,lyrics_url,lyrics_text
0,Ocean,Mount Wittenberg Orca (Expanded Edition),2023-06-16,7oMy4krSWbBPVHtXBiyHrk,129026,0.104,0.0475,10,-13.914,1,0.0451,0.59,0.987,0.13,0.038,76.759,https://genius.com/Bjork-living-by-the-ocean-i...,Anchor Song I wrote moving to my house\nWhich ...
1,On and Ever Onward,Mount Wittenberg Orca (Expanded Edition),2023-06-16,2GxGDzq6dpPXUADQsa6o9T,121013,0.719,0.236,7,-9.063,1,0.033,0.774,0.0,0.0621,0.836,112.27,Lyrics not found,Lyrics not found
2,When the World Comes to an End,Mount Wittenberg Orca (Expanded Edition),2023-06-16,09TNNle5fDBPUtLpO6a7BK,188053,0.439,0.318,5,-11.552,1,0.046,0.579,7e-06,0.121,0.507,156.127,Lyrics not found,Lyrics not found
3,Beautiful Mother,Mount Wittenberg Orca (Expanded Edition),2023-06-16,3shbvAgUyh9nhJa53JN8mB,136360,0.86,0.154,7,-10.016,1,0.0539,0.679,0.00113,0.116,0.398,126.6,Lyrics not found,Lyrics not found
4,Sharing Orb,Mount Wittenberg Orca (Expanded Edition),2023-06-16,1M4euFTqRYW2xTLhqYGC0C,168000,0.496,0.0524,7,-9.198,1,0.0411,0.888,0.0,0.19,0.241,118.455,Lyrics not found,Lyrics not found


# Trim Whitespace

In [None]:
# Strip leading and trailing whitespace from the two name columns.
for name_col in ('track_name', 'album_name'):
    df[name_col] = df[name_col].str.strip()

# Standardize Case

In [None]:
# Lower-case the two name columns so their values share one casing.
for label_col in ('track_name', 'album_name'):
    df[label_col] = df[label_col].str.lower()

# Remove Unnecessary Columns

In [6]:
# Columns removed from the working frame. Add any column you don't want to keep.
columns_to_drop = [
    'track_id',
    'lyrics_url',
]
# errors='ignore' skips labels that are not present, so re-running this cell
# after the columns are already gone does not raise.
df.drop(columns_to_drop, axis='columns', inplace=True, errors='ignore')
df.head()

Unnamed: 0,track_name,album_name,release_date,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,lyrics_text
0,Ocean,Mount Wittenberg Orca (Expanded Edition),2023-06-16,129026,0.104,0.0475,10,-13.914,1,0.0451,0.59,0.987,0.13,0.038,76.759,Anchor Song I wrote moving to my house\nWhich ...
1,On and Ever Onward,Mount Wittenberg Orca (Expanded Edition),2023-06-16,121013,0.719,0.236,7,-9.063,1,0.033,0.774,0.0,0.0621,0.836,112.27,Lyrics not found
2,When the World Comes to an End,Mount Wittenberg Orca (Expanded Edition),2023-06-16,188053,0.439,0.318,5,-11.552,1,0.046,0.579,7e-06,0.121,0.507,156.127,Lyrics not found
3,Beautiful Mother,Mount Wittenberg Orca (Expanded Edition),2023-06-16,136360,0.86,0.154,7,-10.016,1,0.0539,0.679,0.00113,0.116,0.398,126.6,Lyrics not found
4,Sharing Orb,Mount Wittenberg Orca (Expanded Edition),2023-06-16,168000,0.496,0.0524,7,-9.198,1,0.0411,0.888,0.0,0.19,0.241,118.455,Lyrics not found


# Validate Audio Features

In [None]:
# Replace out-of-range audio feature values with NaN.
#
# The previous version checked every feature, including `tempo`, against
# [0, 1]. The tempo values shown in this notebook (e.g. 76.759, 156.127) are
# far above 1, so that check would have blanked the whole column. Each feature
# now has its own inclusive bounds; tempo is only required to be non-negative.
audio_features = ['danceability', 'energy', 'valence', 'tempo']
feature_bounds = {
    'danceability': (0, 1),
    'energy': (0, 1),
    'valence': (0, 1),
    'tempo': (0, np.inf),
}
for feature in audio_features:
    lower, upper = feature_bounds[feature]
    # `between` is inclusive on both ends; `where` keeps in-range values and
    # sets the rest to NaN. Existing NaNs stay NaN.
    df[feature] = df[feature].where(df[feature].between(lower, upper))


# Clean Lyrics Text

In [7]:
# Collapse each run of consecutive newlines into a single newline, then strip
# leading/trailing whitespace from every lyric.
lyrics = df['lyrics_text']
lyrics = lyrics.str.replace(r'\n+', '\n', regex=True)
df['lyrics_text'] = lyrics.str.strip()

# Save Cleaned Dataset

In [8]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
cleaned_csv_path = 'Björk_songs_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)
print("Data cleaning complete. Saved as 'Björk_songs_cleaned.csv'.")

Data cleaning complete. Saved as 'Björk_songs_cleaned.csv'.
