In [22]:
import numpy as np
import pandas as pd
import seaborn as sns  
import time
 
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
import catboost as cb


import tensorflow as tf
import keras
from keras import layers
import keras_tuner
from keras import regularizers
from keras.layers import LeakyReLU
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau


from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV, KFold

In [23]:
# Read the cleaned regression dataset from disk into a DataFrame.
dataset_path = 'dataset_regression/parkinsons_updrs_cleaned.data'
df = pd.read_csv(dataset_path)

In [24]:
# Preview the first rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,age,motor_UPDRS,HNR,RPDE,DFA,PPE,Jitter_combined,Shimmer_combined
0,72,28.447,20.533,0.55096,0.55348,0.26094,0.0064,0.0927
1,72,30.917,21.571,0.56359,0.5566,0.27912,0.0055,0.0638
2,72,29.682,25.347,0.43478,0.5514,0.26728,0.0058,0.0462
3,58,11.078,20.632,0.541,0.75905,0.19288,0.0042,0.0841
4,58,11.218,18.254,0.48799,0.76679,0.22277,0.0059,0.1041


In [25]:
# (rows, columns) of the loaded frame.
df.shape

(2296, 8)

In [26]:
# motor_UPDRS is the regression target; every remaining column is a feature.
y = df['motor_UPDRS']
X = df.drop(columns=['motor_UPDRS'])

In [27]:
# Hold out 20% of the rows, then cut that hold-out in half, which gives
# 80% train / 10% validation / 10% test. Both splits use the same fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=101,
)

In [28]:
# Summary statistics of the target over the whole dataset; used as the
# baseline when checking that the splits are representative.
print("Overall distribution:")
df['motor_UPDRS'].describe()

Overall distribution:


count    2296.000000
mean       18.923689
std         6.867635
min         5.037700
25%        13.677500
50%        18.000000
75%        24.140000
max        37.364000
Name: motor_UPDRS, dtype: float64

In [29]:
# Summary statistics of the target in the training split, for comparison
# with the overall distribution.
print("\nTraining set distribution:")
y_train.describe()


Training set distribution:


count    1836.000000
mean       18.941759
std         6.888827
min         5.037700
25%        13.691000
50%        18.000000
75%        24.110000
max        37.364000
Name: motor_UPDRS, dtype: float64

In [30]:
# Summary statistics of the target in the test split, for comparison with
# the overall distribution.
print("\nTest set distribution:")
y_test.describe()


Test set distribution:


count    230.000000
mean      18.813099
std        6.871624
min        5.437100
25%       13.589500
50%       17.936500
75%       24.182000
max       36.567000
Name: motor_UPDRS, dtype: float64

In [31]:
# Report the dimensions of every split, in the same order as before.
for split_name, split_data in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
    ('X_val', X_val),
    ('y_val', y_val),
):
    print(f'\nShape of {split_name}: {split_data.shape}')


Shape of X_train: (1836, 7)

Shape of X_test: (230, 7)

Shape of y_train: (1836,)

Shape of y_test: (230,)

Shape of X_val: (230, 7)

Shape of y_val: (230,)


In [32]:
# Learn the scaling statistics from the training split only, then apply that
# same transformation to the training, validation and test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [12]:
# (rows, feature columns) of the feature frame; the column count is the
# network's input width.
X.shape

(2296, 7)

In [13]:
# Hypermodel builder for Keras Tuner: every tunable choice is read from `hp`.
def build_model(hp, n_features=None):
    """Build and compile a feed-forward regression network for Keras Tuner.

    The network is: input -> BatchNormalization -> Dense (L1-regularized)
    -> optional Dropout -> 1 or 2 further Dense layers -> Dense(1).

    Args:
        hp: The ``keras_tuner.HyperParameters`` object from which all tunable
            values (layer widths, activations, regularization strength,
            dropout, learning rate, optimizer) are drawn.
        n_features: Width of the input. When omitted, falls back to
            ``len(X.columns)`` of the module-level feature frame ``X``.

    Returns:
        A compiled ``keras.Sequential`` model with ``mse`` loss and ``mae``
        as the reported metric.
    """
    if n_features is None:
        n_features = len(X.columns)

    model = keras.Sequential()

    # Declare the input explicitly instead of passing `input_shape` to the
    # first layer, which Keras reports as a warning.
    model.add(keras.Input(shape=(n_features,)))
    model.add(keras.layers.BatchNormalization(
        momentum=hp.Float('bn_momentum', 0.1, 0.9, step=0.1),
    ))

    # First hidden layer: width, activation (relu or tanh) and L1 strength
    # are all tuned.
    model.add(keras.layers.Dense(
        units=hp.Int('units', min_value=8, max_value=64, step=2),
        activation=hp.Choice('activation', ['relu', "tanh"]),
        kernel_regularizer=keras.regularizers.l1(l1=hp.Float('l1', 0, 0.1, step=0.01)),
    ))

    # Optional dropout; the rate is only drawn when dropout is switched on.
    if hp.Boolean('dropout'):
        model.add(keras.layers.Dropout(rate=hp.Float('dropout_rate', 0.1, 0.5, step=0.1)))

    # One or two additional hidden layers.
    # NOTE: the hyperparameter names are kept exactly as they were
    # ("units_{i+1}" but "activation_{i}") so existing tuner results keep
    # their keys, even though the two index bases differ.
    for i in range(hp.Int("num_layers", 1, 2)):
        units = hp.Int(f"units_{i+1}", 8, 64, step=2)
        activation = hp.Choice(f"activation_{i}", ['relu', 'tanh', 'LeakyReLU'])

        if activation == 'LeakyReLU':
            # LeakyReLU is applied as its own layer after a linear Dense
            # layer; the slope hyperparameter is shared by all such layers.
            model.add(keras.layers.Dense(units))
            model.add(keras.layers.LeakyReLU(
                negative_slope=hp.Float('leaky_relu_slope', 0.1, 0.5, step=0.1),
            ))
        else:
            model.add(keras.layers.Dense(units, activation=activation))

    # Single linear output unit for regression.
    model.add(keras.layers.Dense(1))

    # Draw the optimizer hyperparameters in the original order (lr, momentum,
    # optimizer) so the search space is registered identically, but build
    # only the optimizer that was actually selected.
    learning_rate = hp.Float('lr', min_value=1e-4, max_value=1e-2, sampling="log")
    sgd_momentum = hp.Float('momentum', 0.0, 0.9, step=0.1)
    optimizer_name = hp.Choice('optimizer', ['adam', 'sgd', 'rmsprop'])

    if optimizer_name == 'adam':
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    elif optimizer_name == 'sgd':
        optimizer = keras.optimizers.SGD(learning_rate=learning_rate, momentum=sgd_momentum)
    else:
        optimizer = keras.optimizers.RMSprop(learning_rate=learning_rate)

    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return model

# Smoke-test the builder once with default hyperparameters so that mistakes
# in build_model surface before the much longer search starts.
build_model(keras_tuner.HyperParameters())

# Random search over the space defined in build_model: 10 trials, each
# trained 3 times. The seed makes the sampled configurations repeatable
# between runs.
tuner = keras_tuner.RandomSearch(
    hypermodel=build_model,
    objective="val_loss",
    max_trials=10,
    executions_per_trial=3,
    overwrite=True,
    directory='dataset_regression/model_tuning',
    project_name="regression_model",
    seed=101,
)

# Callbacks applied to every training run of the search.
# A ModelCheckpoint is deliberately NOT passed here: Keras Tuner copies the
# user callbacks for every execution, so a shared checkpoint file would be
# overwritten by each run and end up holding the last run's best epoch
# rather than the best model of the whole search.
callback = [
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5),
]

# Run the search, validating on the held-out validation split.
tuner.search(X_train_scaled, y_train, epochs=250, validation_data=(X_val_scaled, y_val), callbacks=callback)

# Persist the best model found by the search under the same file name the
# checkpoint callback used to write.
tuner.get_best_models(num_models=1)[0].save('best_model_regression.keras')

  
# The main idea for this code comes from the Deep Learning lecture notes; I adapted it to my own dataset and also consulted https://keras.io/api/models/model/
# I used an LLM for debugging.

Trial 10 Complete [00h 00m 23s]
val_loss: 12.85354487101237

Best val_loss So Far: 10.944008827209473
Total elapsed time: 00h 08m 16s


In [14]:
# Load the two best models found by the tuner and inspect the top one.
# NOTE: the name `models` is rebound to a dict of estimators in a later cell,
# so only `best_model` should be relied on after this point.
models = tuner.get_best_models(num_models=2)
best_model = models[0]
best_model.summary()

  super().__init__(**kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


In [15]:
# Print the tuner's summary of the best trials: the hyperparameters of each
# trial and its objective score (val_loss, lower is better).
tuner.results_summary()

Results summary
Results in dataset_regression/model_tuning\regression_model
Showing 10 best trials
Objective(name="val_loss", direction="min")

Trial 05 summary
Hyperparameters:
bn_momentum: 0.6
units: 42
activation: tanh
l1: 0.01
dropout: True
num_layers: 2
units_1: 8
activation_0: tanh
lr: 0.0001787125964217817
momentum: 0.0
optimizer: sgd
units_2: 30
activation_1: LeakyReLU
dropout_rate: 0.5
Score: 10.944008827209473

Trial 04 summary
Hyperparameters:
bn_momentum: 0.8
units: 40
activation: tanh
l1: 0.0
dropout: True
num_layers: 1
units_1: 4
activation_0: LeakyReLU
lr: 0.0011055415225593402
momentum: 0.0
optimizer: rmsprop
units_2: 18
activation_1: relu
dropout_rate: 0.1
Score: 10.965479850769043

Trial 08 summary
Hyperparameters:
bn_momentum: 0.30000000000000004
units: 24
activation: tanh
l1: 0.05
dropout: True
num_layers: 2
units_1: 6
activation_0: relu
lr: 0.006320810242350413
momentum: 0.4
optimizer: rmsprop
units_2: 18
activation_1: relu
dropout_rate: 0.5
Score: 11.3220224380493

In [20]:
# Hyperparameter set of the single best trial.
best_hps = tuner.get_best_hyperparameters(1)[0]
# Lay the name -> value mapping out as a two-column table for display.
best_hps_dict = best_hps.values
best_hps_df = pd.DataFrame({
    'Hyperparameter': list(best_hps_dict.keys()),
    'Value': list(best_hps_dict.values()),
})
best_hps_df

Unnamed: 0,Hyperparameter,Value
0,bn_momentum,0.6
1,units,42
2,activation,tanh
3,l1,0.01
4,dropout,True
5,num_layers,2
6,units_1,8
7,activation_0,tanh
8,lr,0.000179
9,momentum,0.0


In [33]:
# Candidate regressors keyed by display name. Hyperparameters are fixed by
# hand here; the training loop looks models up by these exact names to
# decide which input (scaled or raw) each one receives.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=300, max_depth=10, min_samples_split=2, max_features=3),
    "SVR": SVR(C=10, epsilon=0.1, kernel='rbf', gamma='scale'),
    "KNN": KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto'),
    "MLP Regressor": MLPRegressor(hidden_layer_sizes=(128, 64, 32), activation='relu', solver='adam', early_stopping=True, alpha=0.01, learning_rate_init=0.001),
    'Decision Tree': DecisionTreeRegressor(max_depth=10, min_samples_split=10),
    'Extra Trees Regressor': ExtraTreesRegressor(n_estimators=400, max_depth=20, min_samples_split=2, max_features='sqrt'),
    'Gradient Boosting Regressor': GradientBoostingRegressor(n_estimators=500, learning_rate=0.01, max_depth=10),
    "XGBoost": xgb.XGBRegressor(n_estimators=400, learning_rate=0.01, max_depth=10),
    'LightGBM': lgb.LGBMRegressor(n_estimators=500, learning_rate=0.01, max_depth=12, verbose=0),
    'CatBoost': cb.CatBoostRegressor(n_estimators=400, learning_rate=0.01, depth=6, verbose=0),
    'Neural Network': keras.models.Sequential(
        [
            # Declare the input explicitly rather than passing `input_shape`
            # to the first layer, which Keras reports as a warning.
            keras.Input(shape=(len(X.columns),)),
            layers.BatchNormalization(),
            layers.Dense(50, activation='tanh', kernel_regularizer=regularizers.l1(l1=0.02)),
            layers.BatchNormalization(momentum=0.2),
            layers.Dense(14, activation='tanh'),
            layers.BatchNormalization(momentum=0.2),
            layers.Dense(4, activation='relu'),
            layers.BatchNormalization(momentum=0.2),
            # Single linear output unit for regression.
            layers.Dense(1),
        ]
    ),
}

  super().__init__(**kwargs)


In [34]:
# Models that are trained and evaluated on the standardized features; every
# other model receives the raw feature frame.
scaled_input_models = {'SVR', 'KNN', 'MLP Regressor', 'Neural Network'}

# One row per model: [name, MAE, MSE, R2, training time in seconds].
results = []
for name, model in models.items():

    print("Starting ... " + name)
    is_keras_model = name == 'Neural Network'

    # Seed each model through its supported API. Assigning
    # `model.random_state` directly has no effect on the Keras model and only
    # adds a stray attribute to estimators that have no such parameter.
    if is_keras_model:
        # The network's initial weights were already drawn when the model was
        # constructed, so this seeds what happens from here on (e.g. the
        # shuffling of batches during fit).
        keras.utils.set_random_seed(78)
    elif hasattr(model, 'get_params') and 'random_state' in model.get_params():
        model.set_params(random_state=78)

    if name in scaled_input_models:
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        train_features, test_features = X_train, X_test

    # Time the training step only, so that Train_Time means what it says.
    start = time.time()

    if is_keras_model:
        # Fixed learning rate -- presumably taken from an earlier tuning run;
        # TODO confirm where this value comes from.
        model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0048640356869126), loss='mse', metrics=['mean_absolute_error'])
        # Stop when validation loss stops improving (keeping the best
        # weights) and lower the learning rate on plateaus.
        callback = [
            keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
            keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5),
        ]
        model.fit(train_features, y_train, validation_data=(X_val_scaled, y_val), callbacks=callback, epochs=100, batch_size=32, verbose=0)
    else:
        model.fit(train_features, y_train)

    end = time.time()
    Train_Time = round(end - start, 2)

    # Score on the held-out test split.
    predictions = model.predict(test_features)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    results.append([name, mae, mse, r2, Train_Time])
    
# The main idea for this code comes from the lecture notes; I adapted it to my own dataset. I ran into some errors and used an LLM to debug them.

Starting ... Linear Regression
Starting ... Random Forest Regressor
Starting ... SVR
Starting ... KNN
Starting ... MLP Regressor
Starting ... Decision Tree
Starting ... Extra Trees Regressor
Starting ... Gradient Boosting Regressor
Starting ... XGBoost
Starting ... LightGBM
Starting ... CatBoost
Starting ... Neural Network
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step


In [35]:
# Turn the collected rows into a table and show it ranked by R2, best first.
metric_columns = ['Model', 'MAE', 'MSE', 'R2', 'Train_Time']
results_df = pd.DataFrame(data=results, columns=metric_columns)
results_df.sort_values(by='R2', ascending=False)

Unnamed: 0,Model,MAE,MSE,R2,Train_Time
9,LightGBM,1.749707,6.476291,0.862247,0.88
6,Extra Trees Regressor,1.91958,6.576456,0.860117,1.48
7,Gradient Boosting Regressor,1.709234,6.810534,0.855138,8.05
1,Random Forest Regressor,1.899221,6.856325,0.854164,2.18
8,XGBoost,1.744329,7.032418,0.850418,3.74
11,Neural Network,2.115607,7.278244,0.84519,16.45
3,KNN,2.205991,8.803995,0.812736,0.01
2,SVR,2.288863,8.863075,0.81148,0.3
4,MLP Regressor,2.394122,8.916438,0.810345,1.95
5,Decision Tree,1.973544,9.21285,0.80404,0.02


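**From the table above, LightGBM performs best overall: it has the highest R² score (0.862) and the lowest MSE (6.48). Its MAE (1.75) is not the lowest, though: Gradient Boosting (1.71) and XGBoost (1.74) are slightly better on that metric.**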