In [1]:
# 02_preprocessing.ipynb

# -----------------------
# STEP 1: Import libraries
# -----------------------
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib
import os

# -----------------------
# STEP 2: Load dataset
# -----------------------
# Read the raw CSV from the project's shared "../data" folder (the same folder
# this cell writes processed_features.csv to) instead of a user-specific
# absolute Windows path, so the notebook runs on any checkout of the project.
raw_csv_path = os.path.join("..", "data", "single_genre_artists.csv")
df = pd.read_csv(raw_csv_path)

# Quick sanity check on the number of rows and columns that were loaded.
print("Shape:", df.shape)

# -----------------------
# STEP 3: Select features for clustering
# -----------------------
# Columns used as clustering inputs, in the order kept for the rest of the cell.
features = [
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
]

# Work on an independent copy so later column edits never touch `df`.
X = df.loc[:, features].copy()

# -----------------------
# STEP 4: Handle skewed features (optional)
# Log-transform duration_ms (song length)
# -----------------------
# Swap duration_ms for log(1 + duration_ms); every other column is left as is
# and the column order is unchanged.
X = X.assign(duration_ms=np.log1p(X['duration_ms']))

# -----------------------
# STEP 5: Scale features
# -----------------------
# Fit the scaler on X, then apply it to the same data; the fitted object is
# kept in `scaler` so it can be saved later in this cell.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Wrap the scaled array in a DataFrame labelled with the feature names so the
# preview below is readable.
X_scaled_df = pd.DataFrame(data=X_scaled, columns=features)
print(X_scaled_df.head())

# -----------------------
# STEP 6: Save processed features and scaler
# -----------------------
# Create the output folders up front; folders that already exist are left alone.
for out_dir in ("../data", "../models"):
    os.makedirs(out_dir, exist_ok=True)

# Write the scaled feature table as CSV without the index column.
X_scaled_df.to_csv("../data/processed_features.csv", index=False)

# Serialize the fitted scaler with joblib.
joblib.dump(scaler, "../models/scaler.pkl")

# Report both outputs, one message per line.
print(
    "✅ Preprocessing complete. Scaled features saved to data/processed_features.csv",
    "✅ Scaler object saved to models/scaler.pkl",
    sep="\n",
)


Shape: (95837, 23)
   danceability    energy  loudness  speechiness  acousticness  \
0     -0.153476 -1.511128 -0.757909    -0.427109      1.616187   
1     -1.028518 -1.528056 -1.098629    -0.369741      1.604081   
2     -0.488051 -1.418027 -1.127899    -0.398062      1.622240   
3      0.573581 -0.313510 -0.470677    -0.013914      1.610134   
4      0.406294 -1.028695 -0.768227    -0.111948      1.610134   

   instrumentalness  liveness   valence     tempo  duration_ms  
0         -0.353338  0.538584  0.321287  0.515005    -0.311345  
1         -0.353405 -0.521537 -0.577455 -1.294487     0.368718  
2         -0.353405  1.039046 -0.375943 -1.560003     0.221609  
3         -0.353405 -0.031836  0.611464  0.405268    -0.304798  
4         -0.353405  0.796887  1.087031 -1.377593    -0.228190  
✅ Preprocessing complete. Scaled features saved to data/processed_features.csv
✅ Scaler object saved to models/scaler.pkl
