# Step 2: Preprocess the Data:
Run each cell to:
* Clean the raw dataset (resources/dataset.csv).
* Generate:
    - features_dataset.csv: Extracted features for training the neural network. Saved in the notebooks directory.
    - metadata_dataset.csv: Metadata for future analysis, saved in the same directory.

In [1]:
import pandas as pd
import os

# Get the current working directory
base_path = os.getcwd()  # This works in Jupyter notebooks
file_path = os.path.join(base_path, 'resources', 'dataset.csv')

# Load the dataset
spotify_df = pd.read_csv(file_path)

# Display a preview to confirm it's loaded correctly
print(spotify_df.head())

   Unnamed: 0                track_id                 artists  \
0           0  5SuOikwiRyPMVoIQDJUgSV             Gen Hoshino   
1           1  4qPNDBW1i3p13qLCt0Ki3A            Ben Woodward   
2           2  1iJBSr7s7jYXzM8EGcbK5b  Ingrid Michaelson;ZAYN   
3           3  6lfxq3CG4xtTiEg7opyCyx            Kina Grannis   
4           4  5vjLSffimiIP26QG5WcN2K        Chord Overstreet   

                                          album_name  \
0                                             Comedy   
1                                   Ghost (Acoustic)   
2                                     To Begin Again   
3  Crazy Rich Asians (Original Motion Picture Sou...   
4                                            Hold On   

                   track_name  popularity  duration_ms  explicit  \
0                      Comedy          73       230666     False   
1            Ghost - Acoustic          55       149610     False   
2              To Begin Again          57       210826     False   


In [2]:
# Drop the 'Unnamed: 0' column
spotify_df.drop(columns=['Unnamed: 0'], inplace=True)

# Check for missing values
missing_values = spotify_df.isnull().sum()
print("Missing values per column:")
print(missing_values)

# Preview the dataset after dropping the column
print("\nDataset after dropping 'Unnamed: 0':")
print(spotify_df.head())

Missing values per column:
track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64

Dataset after dropping 'Unnamed: 0':
                 track_id                 artists  \
0  5SuOikwiRyPMVoIQDJUgSV             Gen Hoshino   
1  4qPNDBW1i3p13qLCt0Ki3A            Ben Woodward   
2  1iJBSr7s7jYXzM8EGcbK5b  Ingrid Michaelson;ZAYN   
3  6lfxq3CG4xtTiEg7opyCyx            Kina Grannis   
4  5vjLSffimiIP26QG5WcN2K        Chord Overstreet   

                                          album_name  \
0                                             Comedy   
1                                   Ghost (Acoustic

In [None]:
# Impute missing values for critical columns
spotify_df['artists'].fillna('Unknown', inplace=True)
spotify_df['album_name'].fillna('Unknown', inplace=True)
spotify_df['track_name'].fillna('Unknown', inplace=True)

# Check again for missing values (should not be any)
print("\nMissing values after imputation:")
print(spotify_df.isnull().sum())



Missing values after imputation:
track_id            0
artists             0
album_name          0
track_name          0
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  spotify_df['artists'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  spotify_df['album_name'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we a

In [4]:
# Save cleaned dataset
spotify_df.to_csv('cleaned_dataset.csv', index=False)
print("Cleaned dataset saved as 'cleaned_dataset.csv'.")

Cleaned dataset saved as 'cleaned_dataset.csv'.


In [5]:
# Select relevant features for training
features_columns = [
    'popularity', 'duration_ms', 'danceability', 'energy', 'key',
    'loudness', 'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo'
]
features_df = spotify_df[features_columns]

# Save the features dataset
features_df.to_csv('features_dataset.csv', index=False)
print("Features dataset saved as 'features_dataset.csv'.")

Features dataset saved as 'features_dataset.csv'.


In [6]:
# Add a placeholder for 'predicted_mood'
spotify_df['predicted_mood'] = "Unknown"

# Save the metadata dataset for testing!
spotify_df.to_csv('metadata_dataset.csv', index=False)
print("Metadata dataset saved as 'metadata_dataset.csv'.")

Metadata dataset saved as 'metadata_dataset.csv'.


In [7]:
# Peak at the top of what we have for the extracted features
features_df.head(5)

Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,73,230666,0.676,0.461,1,-6.746,0.143,0.0322,1e-06,0.358,0.715,87.917
1,55,149610,0.42,0.166,1,-17.235,0.0763,0.924,6e-06,0.101,0.267,77.489
2,57,210826,0.438,0.359,0,-9.734,0.0557,0.21,0.0,0.117,0.12,76.332
3,71,201933,0.266,0.0596,0,-18.515,0.0363,0.905,7.1e-05,0.132,0.143,181.74
4,82,198853,0.618,0.443,2,-9.681,0.0526,0.469,0.0,0.0829,0.167,119.949


In [8]:
# Peak at meta data
spotify_df.head(5)

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,predicted_mood
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,...,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic,Unknown
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,...,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic,Unknown
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,...,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic,Unknown
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,...,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic,Unknown
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,...,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic,Unknown
