## **Import Packages**


In [75]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from scipy.cluster.hierarchy import linkage, dendrogram
import statsmodels.api as sm

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

print("All packages imported successfully!")

All packages imported successfully!


## **Upload files**


In [76]:
df = pd.read_csv('dataset.csv')
print(f"Shape: {df.shape[0]} rows × {df.shape[1]} columns")

Shape: 114000 rows × 21 columns


## **Data Cleaning**


#### Data inspection


In [77]:
print("First 5 rows:")
display(df.head())
print("\n" + "="*100 + "\n")
print("Last 5 rows:")
display(df.tail())

First 5 rows:


Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic




Last 5 rows:


Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
113995,113995,2C3TZjDRiAzdyViavDJ217,Rainy Lullaby,#mindfulness - Soft Rain for Mindful Meditatio...,Sleep My Little Boy,21,384999,False,0.172,0.235,5,-16.393,1,0.0422,0.64,0.928,0.0863,0.0339,125.995,5,world-music
113996,113996,1hIz5L4IB9hN3WRYPOCGPw,Rainy Lullaby,#mindfulness - Soft Rain for Mindful Meditatio...,Water Into Light,22,385000,False,0.174,0.117,0,-18.318,0,0.0401,0.994,0.976,0.105,0.035,85.239,4,world-music
113997,113997,6x8ZfSoqDjuNa5SVP5QjvX,Cesária Evora,Best Of,Miss Perfumado,22,271466,False,0.629,0.329,0,-10.895,0,0.042,0.867,0.0,0.0839,0.743,132.378,4,world-music
113998,113998,2e6sXL2bYv4bSz6VTdnfLs,Michael W. Smith,Change Your World,Friends,41,283893,False,0.587,0.506,7,-10.889,1,0.0297,0.381,0.0,0.27,0.413,135.96,4,world-music
113999,113999,2hETkH7cOfqmz3LqZDHZf5,Cesária Evora,Miss Perfumado,Barbincor,22,241826,False,0.526,0.487,1,-10.204,0,0.0725,0.681,0.0,0.0893,0.708,79.198,4,world-music


In [78]:
print("Data types of all columns:")
print(df.dtypes)
print("\n" + "="*100 + "\n")
print("Dataset info:")
df.info()

Data types of all columns:
Unnamed: 0            int64
track_id             object
artists              object
album_name           object
track_name           object
popularity            int64
duration_ms           int64
explicit               bool
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
track_genre          object
dtype: object


Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3  

In [79]:
print("Summary statistics for numerical columns:")
display(df.describe())
print("\n" + "="*100 + "\n")
print("Summary statistics (including non-numeric):")
display(df.describe(include='all'))



Summary statistics for numerical columns:


Unnamed: 0.1,Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
count,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0
mean,56999.5,33.238535,228029.2,0.5668,0.641383,5.30914,-8.25896,0.637553,0.084652,0.31491,0.15605,0.213553,0.474068,122.147837,3.904035
std,32909.109681,22.305078,107297.7,0.173542,0.251529,3.559987,5.029337,0.480709,0.105732,0.332523,0.309555,0.190378,0.259261,29.978197,0.432621
min,0.0,0.0,0.0,0.0,0.0,0.0,-49.531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,28499.75,17.0,174066.0,0.456,0.472,2.0,-10.013,0.0,0.0359,0.0169,0.0,0.098,0.26,99.21875,4.0
50%,56999.5,35.0,212906.0,0.58,0.685,5.0,-7.004,1.0,0.0489,0.169,4.2e-05,0.132,0.464,122.017,4.0
75%,85499.25,50.0,261506.0,0.695,0.854,8.0,-5.003,1.0,0.0845,0.598,0.049,0.273,0.683,140.071,4.0
max,113999.0,100.0,5237295.0,0.985,1.0,11.0,4.532,1.0,0.965,0.996,1.0,1.0,0.995,243.372,5.0




Summary statistics (including non-numeric):


Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
count,114000.0,114000,113999,113999,113999,114000.0,114000.0,114000,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000
unique,,89741,31437,46589,73608,,,2,,,,,,,,,,,,,114
top,,6S3JlDAGk3uu3NtZbPnuhS,The Beatles,Alternative Christmas 2022,Run Rudolph Run,,,False,,,,,,,,,,,,,acoustic
freq,,9,279,195,151,,,104253,,,,,,,,,,,,,1000
mean,56999.5,,,,,33.238535,228029.2,,0.5668,0.641383,5.30914,-8.25896,0.637553,0.084652,0.31491,0.15605,0.213553,0.474068,122.147837,3.904035,
std,32909.109681,,,,,22.305078,107297.7,,0.173542,0.251529,3.559987,5.029337,0.480709,0.105732,0.332523,0.309555,0.190378,0.259261,29.978197,0.432621,
min,0.0,,,,,0.0,0.0,,0.0,0.0,0.0,-49.531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
25%,28499.75,,,,,17.0,174066.0,,0.456,0.472,2.0,-10.013,0.0,0.0359,0.0169,0.0,0.098,0.26,99.21875,4.0,
50%,56999.5,,,,,35.0,212906.0,,0.58,0.685,5.0,-7.004,1.0,0.0489,0.169,4.2e-05,0.132,0.464,122.017,4.0,
75%,85499.25,,,,,50.0,261506.0,,0.695,0.854,8.0,-5.003,1.0,0.0845,0.598,0.049,0.273,0.683,140.071,4.0,


In [80]:
## Drop irrelevant features
print(df.columns.tolist())
drop_cols = ['time_signature','key']
df.drop(columns = drop_cols,inplace = True)

['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name', 'popularity', 'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'track_genre']


### Check for and handle missing values

In [81]:
missing_values = df.isnull().sum()
print("Columns with missing values:")
print(missing_values[missing_values > 0])
print(f"\nTotal missing values: {missing_values.sum()}")
df_clean = df.dropna()
print(f"\nRows before: {len(df)}")
print(f"Rows after: {len(df_clean)}")
print(f"Rows removed: {len(df) - len(df_clean)}")

Columns with missing values:
artists       1
album_name    1
track_name    1
dtype: int64

Total missing values: 3

Rows before: 114000
Rows after: 113999
Rows removed: 1


### Split artyist names


In [82]:
# Split on artist names
# Split artists by semicolon
artists_split = df_clean['artists'].str.split(';', expand=True)

# Add the split columns to the dataframe with proper naming and cleaning
for i in range(artists_split.shape[1]):
    col_name = f'artist_{i+1}'
    # Clean each artist column: strip whitespace and convert to lowercase
    df_clean[col_name] = artists_split[i].str.strip().str.lower()

df_clean.columns

# # Only keep first 3 artists
df_clean = df_clean[['track_id', 'album_name', 'track_name','duration_ms', 'explicit','popularity', 'danceability', 'energy', 'loudness', 'mode','speechiness', 'acousticness', 'instrumentalness', 'liveness','valence', 'tempo', 'track_genre', 'artist_1', 'artist_2', 'artist_3']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[col_name] = artists_split[i].str.strip().str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[col_name] = artists_split[i].str.strip().str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[col_name] = artists_split[i].str.strip().str.lower()
A value is trying t

### Check for Duplicate Rows


In [83]:
# Remove duplicates based on track_name and artist_1
df_clean_unique = df_clean.drop_duplicates(subset=['track_name', 'artist_1'], keep='first')

print(f"Original: {len(df_clean)} rows")
print(f"Removed: {len(df_clean) - len(df_clean_unique)} duplicates")
print(f"Final: {len(df_clean_unique)} rows")
df_clean=df_clean_unique

Original: 113999 rows
Removed: 32866 duplicates
Final: 81133 rows


### Data Type Conversion (if needed)


In [84]:
print("Converting data types:")
#Converting explicit from boolean to binary
if 'explicit' in df_clean.columns:
    df_clean['explicit'] = df_clean['explicit'].astype(int)
    print("✓ Converted 'explicit' to binary (0/1)")

unnamed_cols = [
    col for col in df_clean.columns if 'Unnamed' in col or col == '']
if unnamed_cols:
    df_clean = df_clean.drop(columns=unnamed_cols)
    print(f"✓ Dropped unnamed index columns: {unnamed_cols}")
else:
    print("✓ No unnamed columns to drop")

print("\nData types after conversion:")
print(df_clean.dtypes)

Converting data types:
✓ Converted 'explicit' to binary (0/1)
✓ No unnamed columns to drop

Data types after conversion:
track_id             object
album_name           object
track_name           object
duration_ms           int64
explicit              int64
popularity            int64
danceability        float64
energy              float64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
track_genre          object
artist_1             object
artist_2             object
artist_3             object
dtype: object


In [85]:
date_columns = [col for col in df_clean.columns if 'date' in col.lower()
                or 'time' in col.lower()]
if date_columns:
    print(f"Date columns found: {date_columns}")
else:
    print("No date columns to parse")

No date columns to parse


In [86]:
categorical_candidates = df_clean.select_dtypes(
    include=['object']).columns.tolist()
print(
    f"Remaining object columns that could be categorical: {categorical_candidates}")
print(
    f"\nCurrent categorical columns: {df_clean.select_dtypes(include=['category']).columns.tolist()}")

Remaining object columns that could be categorical: ['track_id', 'album_name', 'track_name', 'track_genre', 'artist_1', 'artist_2', 'artist_3']

Current categorical columns: []


### Handle Inconsistent Data


In [87]:
text_columns = df_clean.select_dtypes(include=['object']).columns.tolist()
print(f"Text columns to standardize: {text_columns}")

for col in text_columns:
    df_clean[col] = df_clean[col].str.strip()
    df_clean[col] = df_clean[col].str.lower()
    print(f"✓ Standardized '{col}': lowercase and stripped whitespace")

print("\nSample of standardized data:")
display(df_clean[text_columns].head())

Text columns to standardize: ['track_id', 'album_name', 'track_name', 'track_genre', 'artist_1', 'artist_2', 'artist_3']
✓ Standardized 'track_id': lowercase and stripped whitespace
✓ Standardized 'album_name': lowercase and stripped whitespace
✓ Standardized 'track_name': lowercase and stripped whitespace
✓ Standardized 'track_genre': lowercase and stripped whitespace
✓ Standardized 'artist_1': lowercase and stripped whitespace
✓ Standardized 'artist_2': lowercase and stripped whitespace
✓ Standardized 'artist_3': lowercase and stripped whitespace

Sample of standardized data:


Unnamed: 0,track_id,album_name,track_name,track_genre,artist_1,artist_2,artist_3
0,5suoikwirypmvoiqdjugsv,comedy,comedy,acoustic,gen hoshino,,
1,4qpndbw1i3p13qlct0ki3a,ghost (acoustic),ghost - acoustic,acoustic,ben woodward,,
2,1ijbsr7s7jyxzm8egcbk5b,to begin again,to begin again,acoustic,ingrid michaelson,zayn,
3,6lfxq3cg4xttieg7opycyx,crazy rich asians (original motion picture sou...,can't help falling in love,acoustic,kina grannis,,
4,5vjlsffimiip26qg5wcn2k,hold on,hold on,acoustic,chord overstreet,,


In [88]:
if 'track_genre' in df_clean.columns:
    print("Unique genres before cleaning:")
    print(f"Total unique genres: {df_clean['track_genre'].nunique()}")

    print("\nChecking for inconsistent values in track_genre...")
    genre_counts = df_clean['track_genre'].value_counts()
    print(f"\nTop 10 genres:")
    print(genre_counts.head(10))
else:
    print("No track_genre column to check for inconsistencies")

Unique genres before cleaning:
Total unique genres: 113

Checking for inconsistent values in track_genre...

Top 10 genres:
track_genre
study          996
black-metal    991
comedy         987
heavy-metal    983
bluegrass      978
forro          966
grindcore      965
malay          963
idm            958
iranian        957
Name: count, dtype: int64


### Validate Data Integrity


In [89]:
print("Checking for logical inconsistencies:\n")

if 'popularity' in df_clean.columns:
    invalid_popularity = df_clean[(df_clean['popularity'] < 0) | (
        df_clean['popularity'] > 100)]
    print(
        f"  - Popularity out of range [0-100]: {len(invalid_popularity)} rows")

if 'duration_ms' in df_clean.columns:
    invalid_duration = df_clean[df_clean['duration_ms'] < 90000]         #any track below 1 min 30 seconds 
    print(f"  - Duration < 90000: {len(invalid_duration)} rows")
if 'duration_ms' in df_clean.columns:
    invalid_duration = df_clean[df_clean['duration_ms'] > 900000]         #any track 15 minutes or longer
    print(f"  - Duration > 900000: {len(invalid_duration)} rows")

if 'tempo' in df_clean.columns:
    invalid_tempo = df_clean[df_clean['tempo'] < 0]
    print(f"  - Tempo < 0: {len(invalid_tempo)} rows")

print("\n✓ Logical consistency check complete")

Checking for logical inconsistencies:

  - Popularity out of range [0-100]: 0 rows
  - Duration < 90000: 2438 rows
  - Duration > 900000: 139 rows
  - Tempo < 0: 0 rows

✓ Logical consistency check complete


### As you can see, 2846 tracks have a duration of less than 1 min 30 seconds
### And 153 had a duration of longer than 15 mins
### these will all be removed

In [90]:
# Remove tracks with duration < 90 seconds (90000 ms)
df_clean = df_clean[df_clean['duration_ms'] >= 90000]
df_clean = df_clean[df_clean['duration_ms'] <= 900000]


In [91]:
print("Verifying ranges and constraints:\n")

audio_features = ['danceability', 'energy', 'speechiness', 'acousticness',
                  'instrumentalness', 'liveness', 'valence']

for feature in audio_features:
    if feature in df_clean.columns:
        out_of_range = df_clean[(df_clean[feature] < 0)
                                | (df_clean[feature] > 1)]
        print(f"  - {feature} out of range [0-1]: {len(out_of_range)} rows")


if 'mode' in df_clean.columns:
    invalid_mode = df_clean[(df_clean['mode'] < 0) | (df_clean['mode'] > 1)]
    print(f"  - Mode out of range [0-1]: {len(invalid_mode)} rows")

print("\n✓ Range validation complete")


Verifying ranges and constraints:

  - danceability out of range [0-1]: 0 rows
  - energy out of range [0-1]: 0 rows
  - speechiness out of range [0-1]: 0 rows
  - acousticness out of range [0-1]: 0 rows
  - instrumentalness out of range [0-1]: 0 rows
  - liveness out of range [0-1]: 0 rows
  - valence out of range [0-1]: 0 rows
  - Mode out of range [0-1]: 0 rows

✓ Range validation complete


In [92]:
print("Cross-checking related columns:\n")

if 'loudness' in df_clean.columns:
    print(
        f"  - Loudness range: [{df_clean['loudness'].min():.2f}, {df_clean['loudness'].max():.2f}] dB")
    print(f"    Expected: typically between -60 and 0 dB")

if 'tempo' in df_clean.columns:
    print(
        f"  - Tempo range: [{df_clean['tempo'].min():.2f}, {df_clean['tempo'].max():.2f}] BPM")
    print(f"    Expected: typically between 50 and 200 BPM")
if 'duration_ms' in df_clean.columns:
    duration_seconds = df_clean['duration_ms'] / 1000
    print(
        f"  - Duration range: [{duration_seconds.min():.2f}, {duration_seconds.max():.2f}] seconds")
    print(
        f"    ({duration_seconds.min()/60:.2f} to {duration_seconds.max()/60:.2f} minutes)")

print("\n✓ Cross-check complete")

Cross-checking related columns:

  - Loudness range: [-46.59, 4.53] dB
    Expected: typically between -60 and 0 dB
  - Tempo range: [0.00, 243.37] BPM
    Expected: typically between 50 and 200 BPM
  - Duration range: [90.00, 896.45] seconds
    (1.50 to 14.94 minutes)

✓ Cross-check complete


### There appear to be some tracks with loudness ratings above 0. These are higher than the system can handle and will distort, so we must remove these values

In [93]:
df_clean = df_clean[df_clean['loudness'] <= 0]

## Scale unscaled features

In [94]:
df_clean.describe()
# From df_clean.describe it is clear that a few features are not scaled from the raw data:
# These include loudness in which more negative values actually represent higher loudness, and tempo which is the opposite way arround

from sklearn.preprocessing import MinMaxScaler
from joblib import dump

print("Creating and saving scalers:\n")

# Tempo: Higher BPM = 1, Lower BPM = 0
tempo_scaler = MinMaxScaler()
df_clean['tempo_scaled'] = tempo_scaler.fit_transform(df_clean[['tempo']])
print(f"✓ Tempo: [{df_clean['tempo'].min():.0f}, {df_clean['tempo'].max():.0f}] BPM → [0, 1]")

# Loudness: Lower dB = 1, Higher dB = 0 (reversed)
loudness_scaler = MinMaxScaler()
df_clean['loudness_scaled'] = 1 - loudness_scaler.fit_transform(df_clean[['loudness']])
print(f"✓ Loudness: [{df_clean['loudness'].min():.1f}, {df_clean['loudness'].max():.1f}] dB → [0, 1] (reversed)")

# Save scalers
dump(tempo_scaler, 'tempo_scaler.joblib')
dump(loudness_scaler, 'loudness_scaler.joblib')
print("\n✓ Scalers saved to .joblib files")

df_clean.drop(columns=['tempo','loudness'],inplace=True)

Creating and saving scalers:

✓ Tempo: [0, 243] BPM → [0, 1]
✓ Loudness: [-46.6, -0.0] dB → [0, 1] (reversed)

✓ Scalers saved to .joblib files


### Create Clean Dataset


In [95]:
df_clean.to_csv('dataset_cleaned.csv', index=False)
print("✓ Cleaned dataset saved to: dataset_cleaned.csv")
print(f"  Rows: {df_clean.shape[0]}")
print(f"  Columns: {df_clean.shape[1]}")

✓ Cleaned dataset saved to: dataset_cleaned.csv
  Rows: 78497
  Columns: 20


### Summary of cleaning process

## Data Cleaning Summary

### Initial Dataset
- **114,000 rows × 21 columns**
- Spotify track data with audio features and metadata

### 1. Data Inspection
- Examined first/last rows, data types, and shape
- Generated summary statistics for all columns
- Identified data structure and feature distributions

### 2. Missing Values
- **Found:** 3 missing values (1 each in artists, album_name, track_name)
- **Action:** Filled missing text columns with mode values
  - artists → "The Beatles"
  - album_name → "Alternative Christmas 2022"
  - track_name → "Run Rudolph Run"
- **Result:** 0 missing values remaining

### 3. Duplicate Rows.  FIX THIS BLUD
- **Found:** 0 duplicate rows
- **Action:** No removal needed

### 4. Data Type Conversion
- Converted `explicit` to boolean type
- Dropped `Unnamed: 0` index column
- Verified all other data types are appropriate

### 5. Text Standardization
- Standardized all text columns (track_id, artists, album_name, track_name, track_genre)
- Applied lowercase transformation
- Stripped leading/trailing whitespace
- Verified genre consistency (114 unique genres, balanced distribution)

### 6. Data Integrity Validation
- **Popularity:** All values within valid range [0-100]
- **Duration:** 1 row with duration ≤ 0 identified
- **Tempo:** 157 rows with tempo ≤ 0 identified
- **Audio features:** All within expected [0-1] range
- **Loudness:** Range [-49.53, 4.53] dB (some positive values indicate clipping)
- **Time signature:** 163 rows out of typical range [1-7]

### Current Dataset Status
- **114,000 rows × 20 columns** (after dropping Unnamed: 0)
- **Columns:** track_id, artists, album_name, track_name, track_genre, popularity, duration_ms, explicit, danceability, energy, key, loudness, mode, speechiness, acousticness, instrumentalness, liveness, valence, tempo, time_signature
- **Output:** Saved to `dataset_cleaned.csv`

### Next Steps
- Remove invalid rows (duration ≤ 0, tempo ≤ 0, loudness > 0)
- Separate metadata columns from feature columns
- Drop irrelevant features (key, mode, time_signature)
- Scale features for clustering analysis