#### Processing numerical values
We would perform the following steps:

- Reduce the skewness of the continuous numerical features, since they will be used for prediction with regression algorithms.
- Standardise the values of the variables to the same range.

In [1]:
import pandas as pd
import numpy as np
import json
import os

# Directory that holds 'dataset.csv' and receives the processed CSV files.
# Setting the SMML_DATA_DIR environment variable overrides the location so the
# notebook can run on another machine; the original folder stays the default.
path_original_data = os.environ.get(
    "SMML_DATA_DIR",
    r"C:\Users\Usuario\Documents\FOLDER_JupyterNotebook\unimi_files\SMML",
)
df = pd.read_csv(os.path.join(path_original_data, 'dataset.csv'), low_memory=False)

# Drop the first column (presumably an exported row counter - verify against the raw file),
# then discard rows with missing values and exact duplicate rows.
# Rebinding instead of inplace=True keeps each step a plain expression.
df = df.drop(columns=df.columns[0])
df = df.dropna()
df = df.drop_duplicates()

df.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


### Repeat the numerical features preprocessing

In [2]:
# Columns whose dtype is not 'object' are treated as numerical
feature_numerical = [column for column in df.columns if df[column].dtypes != 'O']

# Numerical columns with fewer than 50 distinct values count as discrete
feature_discrete_numerical = [
    column for column in feature_numerical if df[column].nunique() < 50
]

# Every other numerical column is considered continuous
feature_continuous_numerical = [
    column for column in feature_numerical if column not in feature_discrete_numerical
]

# Snapshot of the frame before the transforms below, kept for later comparison
saved_df = df.copy()

# Power / log transforms on selected columns: square root, fifth root and log(x + 1)
df['acousticness'] = df['acousticness'].pow(1/2)
df[['speechiness', 'instrumentalness', 'liveness']] = df[['speechiness', 'instrumentalness', 'liveness']].pow(1/5)
df['duration_ms'] = np.log(df['duration_ms'].add(1))

# Count NaN values per column after the transforms and print the result
missing_values_count = df.isna().sum()
print(missing_values_count)

df.head()

track_id            0
artists             0
album_name          0
track_name          0
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64


Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,12.34873,False,0.676,0.461,1,-6.746,0,0.677746,0.179444,0.063221,0.814285,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,11.915794,False,0.42,0.166,1,-17.235,1,0.59773,0.961249,0.088923,0.632214,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,12.258793,False,0.438,0.359,0,-9.734,1,0.561269,0.458258,0.0,0.651084,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,12.215696,False,0.266,0.0596,0,-18.515,1,0.515206,0.951315,0.147871,0.666983,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,12.200326,False,0.618,0.443,2,-9.681,1,0.554878,0.684836,0.0,0.60773,0.167,119.949,4,acoustic


## Process discrete features

In [3]:
# Work on an independent copy so the steps applied to 'dataset' leave 'df' untouched
dataset = df.copy(deep=True)

# Show which columns were classified as discrete numerical
feature_discrete_numerical

['explicit', 'key', 'mode', 'time_signature']

In [4]:
# Encode 'explicit' as an integer flag: 0 where the value equals False, 1 otherwise
dataset['explicit'] = np.where(dataset['explicit'].eq(False), 0, 1)

from sklearn.preprocessing import StandardScaler

# Discrete numerical columns to standardise; 'explicit' and 'mode' are excluded
features_scaling = [
    name for name in feature_discrete_numerical if name not in ('explicit', 'mode')
]

# Learn the per-column mean / standard deviation and standardise in one step
scaler = StandardScaler()
data_to_replace = pd.DataFrame(
    scaler.fit_transform(dataset[features_scaling]),
    columns=features_scaling,
)

# Write the scaled columns back as raw arrays: 'data_to_replace' carries a fresh
# default index, so assigning arrays copies the values positionally
for feature in features_scaling:
    dataset[feature] = data_to_replace[feature].to_numpy()

# Preview of the scaled columns
data_to_replace.head()

Unnamed: 0,key,time_signature
0,-1.210476,0.22166
1,-1.210476,0.22166
2,-1.491364,0.22166
3,-1.491364,-2.092538
4,-0.929587,0.22166


## Process categorical features

In [5]:
# The first column of 'df' is treated as a row identifier rather than a feature
# (presumably 'track_id' - verify against the loaded data).
id_column = df.columns[0]

# Categorical features are the non-numerical columns, minus the identifier.
# Excluding it by name (instead of popping position 0) keeps this list and the
# column dropped below in agreement even if the column order changes.
feature_categorical = [
    feature for feature in df.columns
    if feature not in feature_numerical and feature != id_column
]

# Remove the identifier column from 'dataset'. errors='ignore' makes the cell safe
# to re-run: an in-place drop of an already-removed column raises KeyError.
dataset = dataset.drop(columns=[id_column], errors='ignore')

# Display the first few rows of the modified DataFrame 'dataset'
dataset.head()

Unnamed: 0,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,Gen Hoshino,Comedy,Comedy,73,12.34873,0,0.676,0.461,-1.210476,-6.746,0,0.677746,0.179444,0.063221,0.814285,0.715,87.917,0.22166,acoustic
1,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,11.915794,0,0.42,0.166,-1.210476,-17.235,1,0.59773,0.961249,0.088923,0.632214,0.267,77.489,0.22166,acoustic
2,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,12.258793,0,0.438,0.359,-1.491364,-9.734,1,0.561269,0.458258,0.0,0.651084,0.12,76.332,0.22166,acoustic
3,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,12.215696,0,0.266,0.0596,-1.491364,-18.515,1,0.515206,0.951315,0.147871,0.666983,0.143,181.74,-2.092538,acoustic
4,Chord Overstreet,Hold On,Hold On,82,12.200326,0,0.618,0.443,-0.929587,-9.681,1,0.554878,0.684836,0.0,0.60773,0.167,119.949,0.22166,acoustic


In [6]:
# Report the number of distinct values and of missing values for each categorical column
for feature in feature_categorical:
    column_values = df[feature]
    distinct_count = column_values.nunique()
    missing_count = column_values.isna().sum()
    print(feature, ': {}, missing values {}'.format(distinct_count, missing_count))

artists : 31437, missing values 0
album_name : 46589, missing values 0
track_name : 73608, missing values 0
track_genre : 114, missing values 0


Observations:

- The track genre can definitely affect the popularity as it would depend on the individual which genre they like. 
- The artist name can also affect a song's popularity, as a famous artist is likely to have more popular tracks. `track_name` and `album_name` can also affect the popularity.

In [7]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import *
from sklearn.metrics import *
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from imblearn.datasets import make_imbalance
from category_encoders.target_encoder import TargetEncoder
import statsmodels.api as sm

### Due to the high number of labels, one-hot encoding cannot be applied
- MemoryError: Unable to allocate 9.49 GiB for an array with shape (89740, 113549) and data type uint8

### BaseN encoding

In [8]:
import category_encoders as ce

# Base-N encoder over the categorical columns listed in 'feature_categorical',
# using base 10 and returning a DataFrame
encoder1 = ce.BaseNEncoder(cols=feature_categorical, base=10, return_df=True)

# Fit the encoder on 'dataset' and keep the encoded frame in 'data'
data = encoder1.fit_transform(dataset)

# Scale every column of 'data' by its maximum absolute value.
# MaxAbsScaler scales each column independently, so fitting it once on the whole
# frame matches fitting a separate scaler per column.
data = pd.DataFrame(
    MaxAbsScaler().fit_transform(data),
    columns=data.columns,
    index=data.index,
)

In [9]:
# Persist the Base-N encoded, scaled frame in the data folder (row index omitted)
basen_output_path = os.path.join(path_original_data, 'processed_categorical_dataset_basen.csv')
data.to_csv(basen_output_path, index=False)
data.head()

Unnamed: 0,artists_0,artists_1,artists_2,artists_3,artists_4,album_name_0,album_name_1,album_name_2,album_name_3,album_name_4,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre_0,track_genre_1,track_genre_2
0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.111111,...,0.682593,0.179804,0.063221,0.814285,0.718593,0.361245,0.024533,0.0,0.0,0.111111
1,0.0,0.0,0.0,0.0,0.222222,0.0,0.0,0.0,0.0,0.222222,...,0.602004,0.963177,0.088923,0.632214,0.268342,0.318397,0.024533,0.0,0.0,0.111111
2,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.333333,...,0.565283,0.459177,0.0,0.651084,0.120603,0.313643,0.024533,0.0,0.0,0.111111
3,0.0,0.0,0.0,0.0,0.444444,0.0,0.0,0.0,0.0,0.444444,...,0.518891,0.953223,0.147871,0.666983,0.143719,0.746758,-0.2316,0.0,0.0,0.111111
4,0.0,0.0,0.0,0.0,0.555556,0.0,0.0,0.0,0.0,0.555556,...,0.558845,0.68621,0.0,0.60773,0.167839,0.492863,0.024533,0.0,0.0,0.111111


In [10]:
# From here on 'df' is an independent copy of the preprocessed 'dataset';
# the encoding cells below start from this copy
df = dataset.copy(deep=True)
df.head()

Unnamed: 0,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,Gen Hoshino,Comedy,Comedy,73,12.34873,0,0.676,0.461,-1.210476,-6.746,0,0.677746,0.179444,0.063221,0.814285,0.715,87.917,0.22166,acoustic
1,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,11.915794,0,0.42,0.166,-1.210476,-17.235,1,0.59773,0.961249,0.088923,0.632214,0.267,77.489,0.22166,acoustic
2,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,12.258793,0,0.438,0.359,-1.491364,-9.734,1,0.561269,0.458258,0.0,0.651084,0.12,76.332,0.22166,acoustic
3,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,12.215696,0,0.266,0.0596,-1.491364,-18.515,1,0.515206,0.951315,0.147871,0.666983,0.143,181.74,-2.092538,acoustic
4,Chord Overstreet,Hold On,Hold On,82,12.200326,0,0.618,0.443,-0.929587,-9.681,1,0.554878,0.684836,0.0,0.60773,0.167,119.949,0.22166,acoustic


### Label encoding

In [11]:
from sklearn.preprocessing import LabelEncoder

# One encoder instance is refitted for every column, so after the loop it only
# holds the classes of the last column processed
le = LabelEncoder()

# Encode into a copy; 'df' keeps the original string labels
label_encoded_df = df.copy()

for col in feature_categorical:
    # df[col] still holds the raw labels, identical to the not-yet-encoded copy
    label_encoded_df[col] = le.fit_transform(df[col])

# Preview the label-encoded frame
label_encoded_df.head()

Unnamed: 0,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,10357,8100,11741,73,12.34873,0,0.676,0.461,-1.210476,-6.746,0,0.677746,0.179444,0.063221,0.814285,0.715,87.917,0.22166,0
1,3287,14796,22528,55,11.915794,0,0.42,0.166,-1.210476,-17.235,1,0.59773,0.961249,0.088923,0.632214,0.267,77.489,0.22166,0
2,12397,39162,60774,57,12.258793,0,0.438,0.359,-1.491364,-9.734,1,0.561269,0.458258,0.0,0.651084,0.12,76.332,0.22166,0
3,14839,8580,9580,71,12.215696,0,0.266,0.0596,-1.491364,-18.515,1,0.515206,0.951315,0.147871,0.666983,0.143,181.74,-2.092538,0
4,5255,16899,25689,82,12.200326,0,0.618,0.443,-0.929587,-9.681,1,0.554878,0.684836,0.0,0.60773,0.167,119.949,0.22166,0


### Target encoding

In [12]:
# Define the custom function 'target_encoding' that performs target encoding on a specified column
def target_encoding(data, column, target):
    """Replace each category in ``column`` with the mean of ``target`` for that category.

    The means are computed from ``data`` itself and written back into
    ``data[column]`` in place. The same (mutated) DataFrame is returned, so
    callers may either ignore the return value or rebind it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both ``column`` and ``target``. Modified in place.
    column : str
        Name of the categorical column to encode.
    target : str
        Name of the numeric column whose per-category mean becomes the encoding.

    Returns
    -------
    pandas.DataFrame
        ``data`` with ``column`` replaced by per-category target means. Rows
        whose category is missing (NaN/None) are encoded as NaN.

    Notes
    -----
    Each row's own target value contributes to its category mean, because the
    statistics come from the very rows being encoded.
    """
    # Per-category mean of the target, indexed by category value
    category_means = data.groupby(column)[target].mean()

    # Vectorised lookup: categories without an entry in 'category_means'
    # (missing values, which groupby skips) become NaN instead of raising KeyError
    data[column] = data[column].map(category_means)

    return data

# Target-encode a copy so 'df' keeps its categorical labels
te_df = df.copy()

# target_encoding mutates the frame it receives and returns that same frame;
# rebinding the result makes the data flow explicit
for col in feature_categorical:
    te_df = target_encoding(te_df, col, 'popularity')

# Preview the target-encoded frame
te_df.head()

Unnamed: 0,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,58.0,73.0,73.0,73,12.34873,0,0.676,0.461,-1.210476,-6.746,0,0.677746,0.179444,0.063221,0.814285,0.715,87.917,0.22166,42.483
1,42.923077,55.0,50.666667,55,11.915794,0,0.42,0.166,-1.210476,-17.235,1,0.59773,0.961249,0.088923,0.632214,0.267,77.489,0.22166,42.483
2,57.0,57.0,57.0,57,12.258793,0,0.438,0.359,-1.491364,-9.734,1,0.561269,0.458258,0.0,0.651084,0.12,76.332,0.22166,42.483
3,53.933333,71.0,58.833333,71,12.215696,0,0.266,0.0596,-1.491364,-18.515,1,0.515206,0.951315,0.147871,0.666983,0.143,181.74,-2.092538,42.483
4,41.727273,39.0,39.85,82,12.200326,0,0.618,0.443,-0.929587,-9.681,1,0.554878,0.684836,0.0,0.60773,0.167,119.949,0.22166,42.483


In [13]:
# Iterate over each column in 'label_encoded_df' (including numerical and label-encoded features)
# and apply max absolute scaling to the respective column in 'label_encoded_df'
for col in label_encoded_df.columns:
    label_encoded_df[col] = MaxAbsScaler().fit_transform(label_encoded_df[[col]])

# Iterate over each column in 'te_df' (including numerical and target-encoded features)
# and apply max absolute scaling to the respective column in 'te_df'
for col in te_df.columns:
    te_df[col] = MaxAbsScaler().fit_transform(te_df[[col]])

In [14]:
# Persist the label-encoded, scaled frame in the data folder (row index omitted)
label_output_path = os.path.join(path_original_data, 'processed_categorical_dataset_label.csv')
label_encoded_df.to_csv(label_output_path, index=False)
label_encoded_df.head()

Unnamed: 0,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0.329463,0.173865,0.159509,0.73,0.798169,0.0,0.686294,0.461,-0.7573,-0.136198,0.0,0.682593,0.179804,0.063221,0.814285,0.718593,0.361245,0.024533,0.0
1,0.104562,0.317593,0.306058,0.55,0.770186,0.0,0.426396,0.166,-0.7573,-0.347964,1.0,0.602004,0.963177,0.088923,0.632214,0.268342,0.318397,0.024533,0.0
2,0.394357,0.840603,0.825655,0.57,0.792356,0.0,0.44467,0.359,-0.93303,-0.196523,1.0,0.565283,0.459177,0.0,0.651084,0.120603,0.313643,0.024533,0.0
3,0.472038,0.184168,0.130151,0.71,0.789571,0.0,0.270051,0.0596,-0.93303,-0.373806,1.0,0.518891,0.953223,0.147871,0.666983,0.143719,0.746758,-0.2316,0.0
4,0.167165,0.362733,0.349002,0.82,0.788577,0.0,0.627411,0.443,-0.58157,-0.195453,1.0,0.558845,0.68621,0.0,0.60773,0.167839,0.492863,0.024533,0.0


In [15]:
# Persist the target-encoded, scaled frame in the data folder (row index omitted)
target_output_path = os.path.join(path_original_data, 'processed_categorical_dataset_target.csv')
te_df.to_csv(target_output_path, index=False)
te_df.head()

Unnamed: 0,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0.58,0.73,0.73,0.73,0.798169,0.0,0.686294,0.461,-0.7573,-0.136198,0.0,0.682593,0.179804,0.063221,0.814285,0.718593,0.361245,0.024533,0.716646
1,0.429231,0.55,0.506667,0.55,0.770186,0.0,0.426396,0.166,-0.7573,-0.347964,1.0,0.602004,0.963177,0.088923,0.632214,0.268342,0.318397,0.024533,0.716646
2,0.57,0.57,0.57,0.57,0.792356,0.0,0.44467,0.359,-0.93303,-0.196523,1.0,0.565283,0.459177,0.0,0.651084,0.120603,0.313643,0.024533,0.716646
3,0.539333,0.71,0.588333,0.71,0.789571,0.0,0.270051,0.0596,-0.93303,-0.373806,1.0,0.518891,0.953223,0.147871,0.666983,0.143719,0.746758,-0.2316,0.716646
4,0.417273,0.39,0.3985,0.82,0.788577,0.0,0.627411,0.443,-0.58157,-0.195453,1.0,0.558845,0.68621,0.0,0.60773,0.167839,0.492863,0.024533,0.716646
