In [2]:
import pandas as pd
import numpy as np
from WrongData import introduce_filthy_data
from sklearn.impute import KNNImputer

## Adding filthy data

In [3]:
df = pd.read_csv("./dataset/spotify_songs_restructured.csv", sep=",")
new_data = introduce_filthy_data(df, 1500)
new_data.to_csv("./dataset/spotify_songs_restructured_filthy.csv", index=False)
new_data.describe()

Unnamed: 0,track_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
count,31232.0,31233.0,31215.0,31208.0,31206.0,31160.0,31191.0,31206.0,31183.0,31202.0,31279.0,31161.0,31169.0
mean,42.495485,0.654683,0.698614,5.370578,-6.717708,0.566175,0.106941,0.175068,0.084605,0.190169,0.510443,120.850429,225809.484039
std,24.981764,0.145071,0.180778,3.611493,2.990833,0.49561,0.101192,0.21933,0.223996,0.153978,0.233081,26.911912,59927.264879
min,0.0,0.0,0.000175,0.0,-46.448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4000.0
25%,24.0,0.563,0.581,2.0,-8.167,0.0,0.041,0.0151,0.0,0.0928,0.331,99.951,187742.0
50%,45.0,0.671,0.721,6.0,-6.164,1.0,0.0624,0.0802,1.6e-05,0.127,0.512,121.978,216021.0
75%,62.0,0.76,0.84,9.0,-4.64625,1.0,0.131,0.254,0.00484,0.248,0.693,133.947,253627.0
max,100.0,0.983,1.0,11.0,1.275,1.0,0.918,0.994,0.994,0.996,0.991,239.44,517810.0


## Transforming the data

In [4]:
# nan van naam weghalen
df_clean = new_data.dropna(subset=['track_name', 'track_artist', 'track_album_name', 'track_album_release_date'])
df_clean.head(10)

Unnamed: 0,track_name,track_artist,track_popularity,track_album_name,track_album_release_date,playlist_name,playlist_genre,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66.0,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,pop,dance pop,0.748,0.916,6.0,,1.0,0.0583,0.102,0.0,0.0653,0.518,122.036,194754.0
1,Memories - Dillon Francis Remix,Maroon 5,67.0,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,pop,dance pop,0.726,0.815,,-4.969,1.0,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600.0
2,All the Time - Don Diablo Remix,Zara Larsson,70.0,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,pop,dance pop,0.675,0.931,1.0,-3.432,0.0,,0.0794,2.3e-05,0.11,0.613,124.008,176616.0
3,Call You Mine - Keanu Silva Remix,The Chainsmokers,60.0,Call You Mine - The Remixes,2019-07-19,Pop Remix,pop,dance pop,0.718,0.93,7.0,-3.778,,0.102,0.0287,9e-06,0.204,0.277,121.956,169093.0
5,Beautiful People (feat. Khalid) - Jack Wins Remix,Ed Sheeran,67.0,Beautiful People (feat. Khalid) [Jack Wins Remix],2019-07-11,Pop Remix,pop,dance pop,0.675,0.919,8.0,-5.385,,0.127,0.0799,0.0,0.143,0.585,124.982,163049.0
6,Never Really Over - R3HAB Remix,Katy Perry,62.0,Never Really Over (R3HAB Remix),2019-07-26,Pop Remix,pop,dance pop,0.449,0.856,5.0,-4.788,0.0,0.0623,0.187,0.0,0.176,,112.648,187675.0
9,If I Can't Have You - Gryffin Remix,Shawn Mendes,67.0,If I Can't Have You (Gryffin Remix),2019-06-20,Pop Remix,pop,dance pop,0.642,0.818,2.0,-4.552,1.0,0.032,0.0567,0.0,,0.59,124.957,253040.0
10,Cross Me (feat. Chance the Rapper & PnB Rock) ...,Ed Sheeran,58.0,Cross Me (feat. Chance the Rapper & PnB Rock) ...,2019-06-21,Pop Remix,pop,dance pop,0.679,0.923,6.0,-6.5,1.0,0.181,,5e-06,0.124,0.752,121.984,207894.0
12,Body On My,Loud Luxury,67.0,Body On My,2019-03-29,Pop Remix,pop,dance pop,0.744,0.726,1.0,-4.675,1.0,0.0463,0.0399,0.0,0.374,0.687,,192507.0
13,SOS - Laidback Luke Tribute Remix / Radio Edit,Avicii,68.0,SOS (Laidback Luke Tribute Remix),2019-05-17,Pop Remix,,dance pop,0.572,0.915,5.0,-4.451,0.0,0.0625,0.0111,0.0,0.339,0.678,123.919,164516.0


### Imputing missing values

In [5]:
df_clean = df_clean.fillna({
    "danceability": df_clean["danceability"].mean(), 
    "energy": df_clean.energy.mean(), 
    "key": df_clean.key.mean(), 
    "loudness": df_clean.loudness.mean(), 
    "mode": df_clean['mode'].mean(), 
    "speechiness": df_clean.speechiness.mean(), 
    "acousticness": df_clean.acousticness.mean(), 
    "instrumentalness": df_clean.instrumentalness.mean(), 
    "liveness": df_clean.liveness.mean(), 
    "valence": df_clean.valence.mean(), 
    "tempo": df_clean.tempo.mean(), 
    "duration_ms": df_clean.duration_ms.mean(),
    "track_popularity": df_clean.track_popularity.mean()
})

df_clean.describe()

Unnamed: 0,track_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
count,26241.0,26241.0,26241.0,26241.0,26241.0,26241.0,26241.0,26241.0,26241.0,26241.0,26241.0,26241.0,26241.0
mean,42.43056,0.654814,0.698468,5.373157,-6.711124,0.56484,0.107319,0.175537,0.085134,0.189597,0.510806,120.89734,225976.246653
std,24.186369,0.140709,0.175156,3.500774,2.889417,0.479723,0.098401,0.213206,0.217475,0.148058,0.226519,26.055057,58337.937373
min,0.0,0.0,0.000175,0.0,-46.448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4000.0
25%,26.0,0.571,0.592,2.0,-7.989,0.0,0.0421,0.0174,0.0,0.0949,0.343,100.05,189600.0
50%,43.0,0.659,0.706,5.373157,-6.376,1.0,0.0671,0.0958,3.3e-05,0.135,0.510806,120.89734,220440.0
75%,61.0,0.754,0.832,8.0,-4.744,1.0,0.123,0.234,0.0182,0.234,0.681,131.059,250710.0
max,100.0,0.979,1.0,11.0,1.275,1.0,0.918,0.994,0.994,0.996,0.991,239.44,517810.0


### outliers removal

In [6]:
# Define the columns where you want to remove outliers
columns_to_clean = ["danceability", "energy", "key", "loudness", "mode", 
                    "speechiness", "acousticness", "instrumentalness", 
                    "liveness", "valence", "tempo", "duration_ms"]

# Loop through each column and apply the custom quantile-based outlier removal
for col in columns_to_clean:
    Q01 = df_clean[col].quantile(0.01)  # 1st percentile
    Q99 = df_clean[col].quantile(0.99)  # 99th percentile
    
    # Remove rows where the values in the column are outside the 1st and 99th percentile bounds
    df_clean = df_clean[(df_clean[col] >= Q01) & (df_clean[col] <= Q99)]

# Display the cleaned DataFrame
df_clean.describe()

Unnamed: 0,track_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
count,21666.0,21666.0,21666.0,21666.0,21666.0,21666.0,21666.0,21666.0,21666.0,21666.0,21666.0,21666.0,21666.0
mean,43.14915,0.662288,0.707684,5.372966,-6.463788,0.561024,0.10419,0.162094,0.061936,0.185037,0.518507,120.467124,224331.466183
std,24.247196,0.129555,0.155107,3.508978,2.390884,0.480666,0.089632,0.187676,0.177201,0.130892,0.216275,24.339928,49471.807416
min,0.0,0.276,0.215,0.0,-14.918,0.0,0.0264,7.6e-05,0.0,0.0376,0.0605,74.98,123333.0
25%,27.0,0.582,0.61,2.0,-7.67575,0.0,0.0423,0.0192,0.0,0.0956,0.35525,100.06425,190577.5
50%,44.0,0.666,0.711,5.373157,-6.238,1.0,0.0671,0.0939,1.8e-05,0.137,0.510806,120.89734,219709.5
75%,62.0,0.756,0.828,8.0,-4.72625,1.0,0.122,0.22,0.007092,0.234,0.683,130.053,247546.0
max,100.0,0.925,0.98,11.0,-2.068,1.0,0.444,0.848,0.905,0.77,0.961,189.372,420000.0


### Categorical Encoding

In [7]:
# Define categories and labels for track_popularity
popularity_bins = [0, 20, 40, 60, 80, 100]
popularity_labels = ['Very Low Popularity', 'Low Popularity', 'Medium Popularity', 'High Popularity', 'Very High Popularity']
df_clean['popularity_category'] = pd.Categorical(pd.cut(df_clean['track_popularity'], bins=popularity_bins, labels=popularity_labels))

# Define categories and labels for tempo
tempo_bins = [0, 60, 90, 120, 150, 180, float('inf')]
tempo_labels = ['Very Slow', 'Slow', 'Moderate', 'Fast', 'Very Fast', 'Extremely Fast']
df_clean['tempo_category'] = pd.Categorical(pd.cut(df_clean['tempo'], bins=tempo_bins, labels=tempo_labels))

# Define categories and labels for loudness
loudness_bins = [-float('inf'), -15, -10, -5, 0, float('inf')]
loudness_labels = ['Very Quiet', 'Quiet', 'Moderate', 'Loud', 'Very Loud']
df_clean['loudness_category'] = pd.Categorical(pd.cut(df_clean['loudness'], bins=loudness_bins, labels=loudness_labels))
df_clean

Unnamed: 0,track_name,track_artist,track_popularity,track_album_name,track_album_release_date,playlist_name,playlist_genre,playlist_subgenre,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,popularity_category,tempo_category,loudness_category
0,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66.00000,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,pop,dance pop,0.748,0.916000,...,0.058300,0.102000,0.000000,0.065300,0.5180,122.036,194754.0,High Popularity,Fast,Moderate
1,Memories - Dillon Francis Remix,Maroon 5,67.00000,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,pop,dance pop,0.726,0.815000,...,0.037300,0.072400,0.004210,0.357000,0.6930,99.972,162600.0,High Popularity,Moderate,Loud
2,All the Time - Don Diablo Remix,Zara Larsson,70.00000,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,pop,dance pop,0.675,0.931000,...,0.107319,0.079400,0.000023,0.110000,0.6130,124.008,176616.0,High Popularity,Fast,Loud
3,Call You Mine - Keanu Silva Remix,The Chainsmokers,60.00000,Call You Mine - The Remixes,2019-07-19,Pop Remix,pop,dance pop,0.718,0.930000,...,0.102000,0.028700,0.000009,0.204000,0.2770,121.956,169093.0,Medium Popularity,Fast,Loud
5,Beautiful People (feat. Khalid) - Jack Wins Remix,Ed Sheeran,67.00000,Beautiful People (feat. Khalid) [Jack Wins Remix],2019-07-11,Pop Remix,pop,dance pop,0.675,0.919000,...,0.127000,0.079900,0.000000,0.143000,0.5850,124.982,163049.0,High Popularity,Fast,Moderate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32826,Wasted,Tiësto,47.00000,Wasted,2014-04-22,♥ EDM LOVE 2020,edm,progressive electro house,0.645,0.832000,...,0.107319,0.001060,0.002640,0.199000,0.3750,112.028,188371.0,Medium Popularity,Moderate,Moderate
32829,Closer - Sultan & Ned Shepard Remix,Tegan and Sara,20.00000,Closer Remixed,2013-03-08,♥ EDM LOVE 2020,edm,progressive electro house,0.522,0.786000,...,0.042000,0.001710,0.004270,0.189597,0.4000,128.041,353120.0,Very Low Popularity,Fast,Loud
32830,Sweet Surrender - Radio Edit,Starkillers,14.00000,Sweet Surrender (Radio Edit),2014-04-21,♥ EDM LOVE 2020,edm,progressive electro house,0.529,0.698468,...,0.048100,0.108000,0.000001,0.150000,0.4360,127.989,210112.0,Very Low Popularity,Fast,Loud
32831,Only For You - Maor Levi Remix,Mat Zo,15.00000,Only For You (Remixes),2014-01-01,♥ EDM LOVE 2020,edm,progressive electro house,0.626,0.888000,...,0.109000,0.007920,0.127000,0.343000,0.3080,128.008,367432.0,Very Low Popularity,Fast,Loud


## Feature splitting for release_date (Domain expertise)

In [8]:
# Convert track_album_release_date to datetime, coercing errors
df_clean['track_album_release_date'] = pd.to_datetime(df_clean['track_album_release_date'], errors='coerce')

# Extract the month name and create a new column
df_clean['release_month'] = df_clean['track_album_release_date'].dt.strftime('%B')

# Display the DataFrame
df_clean.drop(columns=['instrumentalness', 'acousticness', 'mode', 'playlist_subgenre', 'playlist_name'], inplace=True)
df_clean

Unnamed: 0,track_name,track_artist,track_popularity,track_album_name,track_album_release_date,playlist_genre,danceability,energy,key,loudness,speechiness,liveness,valence,tempo,duration_ms,popularity_category,tempo_category,loudness_category,release_month
0,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66.00000,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,pop,0.748,0.916000,6.000000,-6.711124,0.058300,0.065300,0.5180,122.036,194754.0,High Popularity,Fast,Moderate,June
1,Memories - Dillon Francis Remix,Maroon 5,67.00000,Memories (Dillon Francis Remix),2019-12-13,pop,0.726,0.815000,5.373157,-4.969000,0.037300,0.357000,0.6930,99.972,162600.0,High Popularity,Moderate,Loud,December
2,All the Time - Don Diablo Remix,Zara Larsson,70.00000,All the Time (Don Diablo Remix),2019-07-05,pop,0.675,0.931000,1.000000,-3.432000,0.107319,0.110000,0.6130,124.008,176616.0,High Popularity,Fast,Loud,July
3,Call You Mine - Keanu Silva Remix,The Chainsmokers,60.00000,Call You Mine - The Remixes,2019-07-19,pop,0.718,0.930000,7.000000,-3.778000,0.102000,0.204000,0.2770,121.956,169093.0,Medium Popularity,Fast,Loud,July
5,Beautiful People (feat. Khalid) - Jack Wins Remix,Ed Sheeran,67.00000,Beautiful People (feat. Khalid) [Jack Wins Remix],2019-07-11,pop,0.675,0.919000,8.000000,-5.385000,0.127000,0.143000,0.5850,124.982,163049.0,High Popularity,Fast,Moderate,July
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32826,Wasted,Tiësto,47.00000,Wasted,2014-04-22,edm,0.645,0.832000,2.000000,-5.595000,0.107319,0.199000,0.3750,112.028,188371.0,Medium Popularity,Moderate,Moderate,April
32829,Closer - Sultan & Ned Shepard Remix,Tegan and Sara,20.00000,Closer Remixed,2013-03-08,edm,0.522,0.786000,0.000000,-4.462000,0.042000,0.189597,0.4000,128.041,353120.0,Very Low Popularity,Fast,Loud,March
32830,Sweet Surrender - Radio Edit,Starkillers,14.00000,Sweet Surrender (Radio Edit),2014-04-21,edm,0.529,0.698468,6.000000,-4.899000,0.048100,0.150000,0.4360,127.989,210112.0,Very Low Popularity,Fast,Loud,April
32831,Only For You - Maor Levi Remix,Mat Zo,15.00000,Only For You (Remixes),2014-01-01,edm,0.626,0.888000,5.373157,-3.361000,0.109000,0.343000,0.3080,128.008,367432.0,Very Low Popularity,Fast,Loud,January


In [9]:
df_clean.to_csv("./dataset/spotify_songs_restructured_cleaned.csv", index=False)