In [195]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler


import xgboost as xgb
import lightgbm as lgb
import catboost as cb

from sklearn.model_selection import KFold
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor

from tensorflow import keras
from keras import layers

In [196]:
# Loading the clean dataset
df = pd.read_csv('dataset_regression/parkinsons_updrs_cleaned.data')

In [197]:
df.head()

Unnamed: 0,age,motor_UPDRS,HNR,RPDE,DFA,PPE,Jitter_combined,Shimmer_combined
0,72,28.447,20.533,0.55096,0.55348,0.26094,0.0064,0.0927
1,72,30.917,21.571,0.56359,0.5566,0.27912,0.0055,0.0638
2,72,29.682,25.347,0.43478,0.5514,0.26728,0.0058,0.0462
3,58,11.078,20.632,0.541,0.75905,0.19288,0.0042,0.0841
4,58,11.218,18.254,0.48799,0.76679,0.22277,0.0059,0.1041


In [198]:
df.shape

(2247, 8)

In [199]:
X= df.drop('motor_UPDRS', axis=1)
y = df['motor_UPDRS']

In [200]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [201]:
# Here, I am gonna to use only a few models that had the best performance in the Benchmarking notebook
meta_model = LinearRegression()
stacking_model = StackingRegressor(
    estimators=[
        ('Extra Trees Regressor', ExtraTreesRegressor(n_estimators=400, max_depth=20, min_samples_split=2, max_features='sqrt')),
        ('Gradient Boosting Regressor', GradientBoostingRegressor(n_estimators=500, learning_rate=0.01, max_depth=10)),
        ('XGBoost', xgb.XGBRegressor(n_estimators=400, learning_rate=0.01, max_depth=10)),
        ('LightGBM', lgb.LGBMRegressor(n_estimators=500, learning_rate=0.01, max_depth=12, verbose=0)),
        ('CatBoost', cb.CatBoostRegressor(n_estimators=400, learning_rate=0.01, depth=6, verbose=0)),
    ],
    final_estimator=meta_model, 
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
)

In [202]:
meta_model.fit(X_train, y_train)  

# test predictions and metrics
predictions = meta_model.predict(X_test)

mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse).round(2)    
r2 = r2_score(y_test, predictions)

print()
print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R2: {r2:.2f}")


MAE: 2.90
MSE: 11.62
RMSE: 3.41
R2: 0.72


**My model is not accurate enough for health purposes. 2.8 point error in UPDRS is important.This means that my model's predictions is not reliable for doctors or patients.**

In [206]:
# Create new dataframe to test the model 
new_data = pd.DataFrame({
    'age': [72],
    'HNR': [20.12],
    'RPDE': [0.5],
    'DFA': [0.6],
    'PPE': [0.2],
    'Jitter_combined': [0.003],
    'Shimmer_combined': [0.0256]
})

In [207]:
predictions = meta_model.predict(new_data)
print("Predictions:")   
print(predictions[0].round(2))

Predictions:
18.14
