# Extra Trees Regressor

#### Importing the libraries

In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import  ExtraTreesRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

#### Importing the dataset

In [2]:
data = pd.read_csv('dataset/spotify_features.csv', sep=',')
data.head()

Unnamed: 0,track_popularity,instrumentalness,duration_ms,energy,acousticness,album_month,album_year,danceability,loudness,liveness,genre_code,release_month_code,valence,artist_code,speechiness,tempo
0,67.0,0.00421,162600.0,0.815,0.0724,12,2019,0.726,-4.969,0.357,2,2,0.693,4535,0.106983,99.972
1,70.0,2.3e-05,176616.0,0.931,0.0794,7,2019,0.675,-3.432,0.19031,2,5,0.613,7724,0.0742,124.008
2,60.0,9e-06,169093.0,0.93,0.0287,7,2019,0.718,-3.778,0.204,2,5,0.509838,6862,0.102,121.956
3,62.0,0.0,187675.0,0.856,0.187,7,2019,0.449,-4.788,0.176,2,5,0.152,3635,0.0623,112.648
4,58.0,5e-06,207894.0,0.923,0.146,6,2019,0.679,-6.5,0.124,2,6,0.752,2056,0.181,121.984


#### Splitting the dataset into the Training set and Test set

In [3]:
X = data.drop(columns=['track_popularity'])
y = data['track_popularity']

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25)

print(f'Training set: {X_train.shape}, {y_train.shape}')
print(f'Validation set: {X_val.shape}, {y_val.shape}')
print(f'Test set: {X_test.shape}, {y_test.shape}')

Training set: (11908, 15), (11908,)
Validation set: (3970, 15), (3970,)
Test set: (3970, 15), (3970,)


#### Training the Extra Trees Regressor model

In [4]:
model = Pipeline([
    ('scaler', StandardScaler()),
    ('extra_trees_regressor', ExtraTreesRegressor(n_estimators=200, random_state=0))
])

model.fit(X_train, y_train)

#### Model evaluation

In [5]:
train_score = model.score(X_train, y_train)
val_score = model.score(X_val, y_val)
test_score = model.score(X_test, y_test)

print(f"Train R^2: {train_score:.2f}")
print(f"Validation R^2: {val_score:.2f}")
print(f"Test R^2: {test_score:.2f}")

mse = mean_squared_error(y_test, model.predict(X_test))
mae = mean_absolute_error(y_test, model.predict(X_test))
print(f"Mean Squared Error: {mse:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")

Train R^2: 1.00
Validation R^2: 0.26
Test R^2: 0.24
Mean Squared Error: 439.00
Mean Absolute Error: 16.95


#### hyperparameter tuning

In [6]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Define the model pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('extra_trees_regressor', ExtraTreesRegressor(random_state=0))
])

# Define the hyperparameters for tuning
param_grid = {
    'extra_trees_regressor__n_estimators': [100, 200, 300],
    'extra_trees_regressor__max_features': ['auto', 'sqrt', 'log2'],
    'extra_trees_regressor__min_samples_split': [2, 5, 10],
    'extra_trees_regressor__min_samples_leaf': [1, 2, 4]
}

# Set up GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Retrieve and print best parameters
print(f"Best parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

# Evaluate the best model on training, validation, and test sets
train_score = best_model.score(X_train, y_train)
val_score = best_model.score(X_val, y_val)
test_score = best_model.score(X_test, y_test)

print(f"Best Model Train R^2: {train_score:.2f}")
print(f"Best Model Validation R^2: {val_score:.2f}")
print(f"Best Model Test R^2: {test_score:.2f}")

# Calculate error metrics
mse = mean_squared_error(y_test, best_model.predict(X_test))
mae = mean_absolute_error(y_test, best_model.predict(X_test))
print(f"Mean Squared Error: {mse:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")

135 fits failed out of a total of 405.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
62 fits failed with the following error:
Traceback (most recent call last):
  File "C:\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\venv\Lib\site-packages\sklearn\base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\venv\Lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\venv\Lib\site-packages\sklearn\base.py", line 1344, in wrapper
    estimator._validate_params()
  File "C:\ve

Best parameters: {'extra_trees_regressor__max_features': 'sqrt', 'extra_trees_regressor__min_samples_leaf': 1, 'extra_trees_regressor__min_samples_split': 2, 'extra_trees_regressor__n_estimators': 300}
Best Model Train R^2: 1.00
Best Model Validation R^2: 0.24
Best Model Test R^2: 0.23
Mean Squared Error: 442.24
Mean Absolute Error: 17.03
