# Model Selection (& fine tuning)

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics as metrics 
import warnings
warnings.filterwarnings("ignore")

In [24]:
models_list = pd.DataFrame()

In [25]:
def regressionMetrics(y, yhat):
    res = {'MSE': metrics.mean_squared_error(y,yhat),
           'RMSE': np.sqrt(metrics.mean_squared_error(y,yhat)),
           'MAE': metrics.mean_absolute_error(y,yhat),
           
          }
    # Calculate RMSLE using absolute values to avoid negative issues
    res['RMSLE'] = np.sqrt(metrics.mean_squared_log_error(np.abs(y), np.abs(yhat)))
    
    return res

In [26]:
df = pd.read_pickle('pickle/05_feature_selection/feature_selection.pkl')

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28356 entries, 0 to 28355
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   danceability            28356 non-null  float64
 1   energy                  28356 non-null  float64
 2   key                     28356 non-null  int64  
 3   loudness                28356 non-null  float64
 4   acousticness            28356 non-null  float64
 5   instrumentalness        28356 non-null  float64
 6   liveness                28356 non-null  float64
 7   tempo                   28356 non-null  float64
 8   duration_ms             28356 non-null  int64  
 9   playlist_count          28356 non-null  int64  
 10  edm                     28356 non-null  bool   
 11  pop                     28356 non-null  bool   
 12  r&b                     28356 non-null  bool   
 13  rap                     28356 non-null  bool   
 14  rock                    28356 non-null

In [28]:
df.head()

Unnamed: 0,danceability,energy,key,loudness,acousticness,instrumentalness,liveness,tempo,duration_ms,playlist_count,...,rap,rock,year,month,day,decade,feat,Remix,track_artist_followers,track_popularity
0,0.682,0.401,2,-10.068,0.279,0.0117,0.0887,97.091,235440,1,...,False,True,2001,1,1,2000,False,False,103090.0,41
1,0.582,0.704,5,-6.242,0.0651,0.0,0.212,150.863,197286,1,...,False,False,2018,1,26,2010,False,False,366482.0,15
2,0.303,0.88,9,-4.739,0.0117,0.00994,0.347,135.225,373512,1,...,False,True,2017,11,21,2010,False,False,4132.0,28
3,0.659,0.794,10,-5.644,0.000761,0.132,0.322,128.041,228565,1,...,False,False,2015,8,7,2010,False,False,557.0,24
4,0.662,0.838,1,-6.3,0.114,0.000697,0.0881,129.884,236308,1,...,False,False,2018,11,16,2010,False,False,2913.0,38


In [29]:
# deprecated due to lack of support for categorical columns in XGBoost

# # Cast df.key, df.decade to category type
# df['key'] = df['key'].astype('category')
# df['decade'] = df['decade'].astype('category')

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28356 entries, 0 to 28355
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   danceability            28356 non-null  float64
 1   energy                  28356 non-null  float64
 2   key                     28356 non-null  int64  
 3   loudness                28356 non-null  float64
 4   acousticness            28356 non-null  float64
 5   instrumentalness        28356 non-null  float64
 6   liveness                28356 non-null  float64
 7   tempo                   28356 non-null  float64
 8   duration_ms             28356 non-null  int64  
 9   playlist_count          28356 non-null  int64  
 10  edm                     28356 non-null  bool   
 11  pop                     28356 non-null  bool   
 12  r&b                     28356 non-null  bool   
 13  rap                     28356 non-null  bool   
 14  rock                    28356 non-null

## Create and train the model

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
y = df['track_popularity']
X = df.drop(columns=['track_popularity'])

In [33]:
# Split into train+val and test sets (80% train+val, 20% test)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Split train+val into train and val sets (75% train, 25% val from the train+val set)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42
)

In [34]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
#!pip install xgboost
import xgboost as xgb

In [36]:
# List of models to evaluate
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'AdaBoostRegressor': AdaBoostRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'SVR': SVR(),
    'XGBoost': xgb.XGBRegressor()
}



In [None]:
# Dictionary to store the results
results = {}

# Fit and predict using each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    results[name] = regressionMetrics(y_val, y_pred)

# Display the results
for name, metrics in results.items():
    print(f"Model: {name}")
    for metric, value in metrics.items():
        print(f"  {metric}: {value}")
    print()

Model: LinearRegression
  MSE: 463.23831507532424
  RMSE: 21.522971799343235
  MAE: 17.86629886651494
  RMSLE: 1.312507683594363

Model: DecisionTreeRegressor
  MSE: 760.1553517898078
  RMSE: 27.570914961056477
  MAE: 20.65261858578734
  RMSLE: 1.6046894915123213

Model: RandomForestRegressor
  MSE: 391.88592615603636
  RMSE: 19.796108864017604
  MAE: 15.673094952599275
  RMSLE: 1.232535461624786

Model: AdaBoostRegressor
  MSE: 468.26478360096473
  RMSE: 21.639426600558636
  MAE: 18.555497199185634
  RMSLE: 1.2963095290134077

Model: GradientBoostingRegressor
  MSE: 400.26336156962054
  RMSE: 20.00658295585782
  MAE: 16.19303648927646
  RMSLE: 1.268597243258889

Model: SVR
  MSE: 570.4613591516761
  RMSE: 23.88433292247611
  MAE: 18.778443352101235
  RMSLE: 1.4014299937135324

Model: XGBoost
  MSE: 402.0318666748082
  RMSE: 20.050732322656152
  MAE: 15.905198201890546
  RMSLE: 1.2345041437195001



In [42]:
models['XGBoost']

In [44]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

In [46]:
lighter_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, 40, None],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 10, 100]
}

print(lighter_grid)

# Reduced number of iterations and cross-validation folds
xgb_random = RandomizedSearchCV(estimator=models['XGBoost'], param_distributions=lighter_grid, n_iter=25, cv=3, 
                               verbose=2, random_state=42, n_jobs=-1)

# Fit the random search model
xgb_random.fit(X_train, y_train)

{'n_estimators': [100, 200, 300], 'max_depth': [10, 20, 30, 40, None], 'learning_rate': [0.01, 0.1, 0.2], 'subsample': [0.8, 1.0], 'colsample_bytree': [0.8, 1.0], 'gamma': [0, 1, 5], 'reg_alpha': [0, 0.1, 1], 'reg_lambda': [1, 10, 100]}
Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] END colsample_bytree=1.0, gamma=1, learning_rate=0.1, max_depth=None, n_estimators=300, reg_alpha=0.1, reg_lambda=100, subsample=0.8; total time=   8.0s
[CV] END colsample_bytree=1.0, gamma=1, learning_rate=0.1, max_depth=None, n_estimators=300, reg_alpha=0.1, reg_lambda=100, subsample=0.8; total time=   8.2s
[CV] END colsample_bytree=1.0, gamma=1, learning_rate=0.1, max_depth=None, n_estimators=300, reg_alpha=0.1, reg_lambda=100, subsample=0.8; total time=   8.4s
[CV] END colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=10, n_estimators=300, reg_alpha=1, reg_lambda=10, subsample=0.8; total time=  13.1s
[CV] END colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=10, 

In [47]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mae = 100 * np.mean(errors)
    print('Model Performance')
    print('Mean Absolute Error: {:0.4f}'.format(np.mean(errors)))
    return mae

In [48]:
base_accuracy = evaluate(models['XGBoost'], X_test, y_test)

Model Performance
Mean Absolute Error: 15.7765


In [50]:
best_random = xgb_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

Model Performance
Mean Absolute Error: 15.5040


In [51]:
print('Improvement of {:0.2f}%.'.format( 100 * (base_accuracy - random_accuracy) / base_accuracy))

Improvement of 1.73%.
