# Data Preprocessing
## Amazon Music Clustering Project

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import pickle
import os

### 1. Load Data

In [2]:
# Read the raw CSV (path is relative to this notebook), report the frame's
# (rows, columns) shape, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../data/raw/single_genre_artists.csv')
print(df.shape)
df.head(n=5)

(95837, 23)


Unnamed: 0,id_songs,name_song,popularity_songs,duration_ms,explicit,id_artists,release_date,danceability,energy,key,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,followers,genres,name_artists,popularity_artists
0,0IA0Hju8CAgYfV1hwhidBH,La Java,0,161427,0,4AxgXfD7ISvJSTObqm4aIE,1922,0.563,0.184,4,...,0.993,1.6e-05,0.325,0.654,133.088,3,5078.0,['vintage chanson'],Mistinguett,22
1,1b8HZQCqcqwbzlA1jRTp6E,En Douce,0,223440,0,4AxgXfD7ISvJSTObqm4aIE,1922,0.427,0.18,10,...,0.989,0.0,0.128,0.431,78.459,3,5078.0,['vintage chanson'],Mistinguett,22
2,5d5gQxHwYovxR5pqETOIAa,J'en Ai Marre,0,208267,0,4AxgXfD7ISvJSTObqm4aIE,1922,0.511,0.206,0,...,0.995,0.0,0.418,0.481,70.443,4,5078.0,['vintage chanson'],Mistinguett,22
3,1EO65UEEPfy7CR0NK2sDxy,Ils n'ont pas ca,0,161933,0,4AxgXfD7ISvJSTObqm4aIE,1924,0.676,0.467,9,...,0.991,0.0,0.219,0.726,129.775,4,5078.0,['vintage chanson'],Mistinguett,22
4,6a58gXSgqbIsXUhVZ6ZJqe,La belote,0,167973,0,4AxgXfD7ISvJSTObqm4aIE,1924,0.65,0.298,9,...,0.991,0.0,0.373,0.844,75.95,4,5078.0,['vintage chanson'],Mistinguett,22


### 2. Feature Selection

In [3]:
# Columns of df that make up the feature matrix X; every other column of df
# is left out of X.
features = [
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
]
X = df[features]
X.head(n=5)

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,0.563,0.184,-13.757,0.0512,0.993,1.6e-05,0.325,0.654,133.088,161427
1,0.427,0.18,-15.375,0.067,0.989,0.0,0.128,0.431,78.459,223440
2,0.511,0.206,-15.514,0.0592,0.995,0.0,0.418,0.481,70.443,208267
3,0.676,0.467,-12.393,0.165,0.991,0.0,0.219,0.726,129.775,161933
4,0.65,0.298,-13.806,0.138,0.991,0.0,0.373,0.844,75.95,167973


### 3. Data Scaling

In [4]:
# Fit a StandardScaler on the selected features, then apply it to the same
# rows. The fitted scaler object is kept because a later cell pickles it.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Wrap the scaled values in a DataFrame so the columns keep their feature names.
X_scaled_df = pd.DataFrame(data=X_scaled, columns=features)
X_scaled_df.head(n=5)

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,-0.153476,-1.511128,-0.757909,-0.427109,1.616187,-0.353338,0.538584,0.321287,0.515005,-0.401735
1,-1.028518,-1.528056,-1.098629,-0.369741,1.604081,-0.353405,-0.521537,-0.577455,-1.294487,0.124906
2,-0.488051,-1.418027,-1.127899,-0.398062,1.62224,-0.353405,1.039046,-0.375943,-1.560003,-0.003949
3,0.573581,-0.31351,-0.470677,-0.013914,1.610134,-0.353405,-0.031836,0.611464,0.405268,-0.397437
4,0.406294,-1.028695,-0.768227,-0.111948,1.610134,-0.353405,0.796887,1.087031,-1.377593,-0.346143


### 4. Save Scaler and Processed Data

In [5]:
# Create the output directories first so this cell also succeeds on a fresh
# checkout where they do not exist yet; exist_ok=True makes re-running the
# cell harmless.
os.makedirs('../models', exist_ok=True)
os.makedirs('../data/processed', exist_ok=True)

# Save Scaler
# The fitted scaler is pickled so the same transformation can be reloaded later.
with open('../models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Save Processed Data
# index=False keeps the DataFrame row index out of both CSV files.
X_scaled_df.to_csv('../data/processed/scaled_features.csv', index=False)
df.to_csv('../data/processed/original_with_features.csv', index=False)