In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import pickle

In [45]:
def hourPreProcessing(df, columnName):
    """Convert a column of durations to decimal hours, in place.

    Each entry is either an ``"H:MM"`` string (hours and minutes separated by
    a colon) or a value accepted by ``float()``. ``"H:MM"`` becomes
    ``(H * 60 + MM) / 60``; anything else is passed through ``float()``.

    The conversion is applied per value with ``Series.map``, so it does not
    depend on ``df`` having a default ``0..n-1`` index. Non-string cells
    (NaN, already-numeric values) are converted with ``float()`` rather than
    failing on the ``':' in value`` membership test.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; ``df[columnName]`` is replaced by the converted values.
    columnName : str
        Name of the column holding the durations.

    Returns
    -------
    None
        The frame is modified in place.

    Raises
    ------
    ValueError
        If a string entry is not numeric or contains more than one colon.
    TypeError
        If an entry is neither a string nor something ``float()`` accepts.
    """
    def _to_hours(value):
        # Only strings can carry the "H:MM" form; everything else is numeric.
        if isinstance(value, str) and ':' in value:
            hour, minute = value.split(':')
            return (float(hour) * 60 + float(minute)) / 60
        return float(value)

    df[columnName] = df[columnName].map(_to_hours)


In [46]:
def columnDropper(df, columnName):
    """Remove ``columnName`` from ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    columnName : label or list of labels
        Column(s) to remove; passed straight to ``DataFrame.drop``.

    Returns
    -------
    None
        The frame is modified in place.
    """
    df.drop(columns=columnName, inplace=True)


In [47]:
def ColumnSeparator(df, columnName):
    """Expand a column of dict-like strings into one column per key, in place.

    Each string entry is split on ``', '`` into items and each item on
    ``': '`` into a key and a value. The characters ``{`` and ``'`` are
    stripped from both ends of the key, and ``'`` and ``}`` from both ends of
    the value. Values are kept as strings. After expansion the source column
    is dropped.

    Values are collected per key in plain lists and assigned as whole columns,
    so the function works by row position: it does not require a default
    ``0..n-1`` index and does not write the frame one cell at a time.

    Entries that are not strings (e.g. NaN for a missing cell) contribute no
    keys. Their rows are left as NaN in newly created columns, or keep their
    existing value when the key names a column already present in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    columnName : str
        Column holding the dict-like strings.

    Returns
    -------
    None
        The frame is modified in place.

    Raises
    ------
    ValueError
        If an item does not split into exactly one key and one value on
        ``': '``.
    """
    row_count = len(df)
    expanded = {}  # key -> list of per-row values, in order of first appearance

    for position, raw in enumerate(df[columnName]):
        if not isinstance(raw, str):
            # Nothing to parse for this row; leave its cells untouched.
            continue
        for item in raw.split(', '):
            key, val = item.split(': ')
            key = key.strip("{'")
            if key not in expanded:
                # Start from existing values when the key shadows a column
                # that is already present, otherwise from all-missing.
                if key in df.columns:
                    expanded[key] = df[key].tolist()
                else:
                    expanded[key] = [float('nan')] * row_count
            expanded[key][position] = val.strip("'}")

    for key, values in expanded.items():
        df[key] = values
    df.drop(columns=[columnName], inplace=True)

In [48]:
def encodingData(CateData):
    """Label-encode every column of ``CateData`` with the saved encoders.

    The encoders are read from ``label_encoders.pkl`` and looked up by column
    name. Any value that is not among an encoder's ``classes_`` is replaced by
    the literal ``"Others"`` before ``transform`` is called.

    Unknown values are replaced with a boolean mask (``isin`` + ``where``)
    rather than the chained ``CateData[column].iloc[i] = ...`` assignment.
    The chained form is not guaranteed to write back into the frame (pandas
    warns about it and ignores it under copy-on-write).

    Parameters
    ----------
    CateData : pandas.DataFrame
        Categorical columns to encode. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``CateData`` object, with each column replaced by the
        output of its encoder's ``transform``.

    Raises
    ------
    KeyError
        If a column has no entry in the loaded encoder mapping.

    Notes
    -----
    If ``"Others"`` is itself not one of an encoder's classes, ``transform``
    is still called with it and its own unseen-label handling applies.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # encoder files produced by a trusted training run.
    with open('label_encoders.pkl', 'rb') as f:
        label_encoders = pickle.load(f)

    for column in CateData.columns:
        encoder = label_encoders[column]
        known = CateData[column].isin(encoder.classes_)
        # Keep known labels; everything else falls into the "Others" bucket.
        CateData[column] = encoder.transform(CateData[column].where(known, "Others"))
    return CateData

In [49]:
def regressionScriptTester(csv_path="parkinsons_disease_data_reg.csv"):
    """Load the dataset and apply the saved preprocessing artefacts.

    Steps, in order:

    1. Read the CSV, convert ``'WeeklyPhysicalActivity (hr)'`` to decimal
       hours and drop ``'PatientID'`` and ``'DoctorInCharge'``.
    2. Select the numeric columns and fill missing values with the object
       stored in ``mean.pk1``. Set ``'UPDRS'`` aside as the target, then
       transform the remaining numeric columns with the scaler stored in
       ``scaler.pk1``.
    3. Expand ``'MedicalHistory'`` and ``'Symptoms'`` into separate columns,
       select the categorical columns, fill missing values with the object
       stored in ``mode.pk1`` and label-encode them.
    4. Replace Tremor / Bradykinesia / SleepDisorders / PosturalInstability
       with their mean (``'Disease Symptoms'``), and Hypertension / Diabetes
       with their mean (``'ChronicDiseasesScore'``).

    Parameters
    ----------
    csv_path : str, optional
        Path of the CSV file to read. Defaults to the file name that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Scaled numeric features, then ``'UPDRS'``, then the encoded and
        engineered categorical columns, concatenated column-wise.

    Notes
    -----
    The three artefact names end in ``.pk1`` (digit one), unlike the ``.pkl``
    files loaded elsewhere in this notebook. They are left unchanged because
    they must match the files on disk.
    """
    def _unpickle(path):
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # artefacts produced by a trusted training run.
        with open(path, 'rb') as f:
            return pickle.load(f)

    data = pd.read_csv(csv_path)
    hourPreProcessing(data, 'WeeklyPhysicalActivity (hr)')
    columnDropper(data, 'PatientID')
    columnDropper(data, 'DoctorInCharge')

    # 'UPDRS' is the regression target; it is listed first so the remaining
    # entries are exactly the columns handed to the scaler.
    numerical_columns = [
        'UPDRS', 'CholesterolHDL', 'BMI', 'MoCA',
        'CholesterolTotal', 'DiastolicBP', 'AlcoholConsumption',
        'CholesterolTriglycerides', 'SystolicBP', 'Age'
    ]
    feature_columns = numerical_columns[1:]

    numeric_Data = data[numerical_columns].fillna(_unpickle('mean.pk1'))
    target = numeric_Data['UPDRS']
    scaled = _unpickle('scaler.pk1').transform(numeric_Data[feature_columns])
    # Reuse the source index so the target and the categorical block line up
    # by label, not by an assumed 0..n-1 numbering.
    numeric_Data = pd.DataFrame(scaled, columns=feature_columns, index=target.index)
    numeric_Data['UPDRS'] = target

    ColumnSeparator(data, 'MedicalHistory')
    ColumnSeparator(data, 'Symptoms')
    categorical_columns = ['PosturalInstability', 'Depression', 'Gender', 'Hypertension',
                           'Bradykinesia', 'FamilyHistoryParkinsons', 'Diabetes', 'Stroke',
                           'SleepDisorders', 'Tremor']

    category_Data = data[categorical_columns].fillna(_unpickle('mode.pk1'))
    encoded_df = encodingData(category_Data)

    symptom_columns = ['Tremor', 'Bradykinesia', 'SleepDisorders', 'PosturalInstability']
    chronic_columns = ['Hypertension', 'Diabetes']
    encoded_df['Disease Symptoms'] = (
        encoded_df['Tremor'] + encoded_df['Bradykinesia']
        + encoded_df['SleepDisorders'] + encoded_df['PosturalInstability']
    ) / 4
    encoded_df['ChronicDiseasesScore'] = (encoded_df['Hypertension'] + encoded_df['Diabetes']) / 2
    encoded_df = encoded_df.drop(columns=symptom_columns + chronic_columns)

    # Concatenate the frame that actually carries the engineered features,
    # instead of relying on `category_Data` being the same object as the
    # encoder's return value.
    return pd.concat([numeric_Data, encoded_df], axis=1)

In [50]:
def loadModels():
    """Load the pickled result tables, polynomial transformer and models.

    Files are read from the current working directory in this order: the two
    result tables, the polynomial transformer, the models for the plain
    feature set, then the models for the polynomial feature set.

    Returns
    -------
    tuple
        ``(models, polymodels, results, transformer)`` where

        * ``models`` and ``polymodels`` map each file name without ``.pkl``
          to the unpickled object,
        * ``results`` has the keys ``'normal'`` and ``'polynomial'``,
        * ``transformer`` is the object stored in ``poly_transformer.pkl``.
    """
    def read_pickle(path):
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    plain_files = (
        'linear_regression.pkl',
        'random_forest.pkl',
        'ridge_alpha=100.pkl',
        'lasso_regression.pkl',
    )
    poly_files = (
        'lasso_regression_poly.pkl',
        'linear_regression_poly.pkl',
        'ridge_regression_poly.pkl',
        'gradient_boosting.pkl',
    )

    results = {}
    results['normal'] = read_pickle('normal_model_results.pkl')
    results['polynomial'] = read_pickle('polynomial_model_results.pkl')

    transformer = read_pickle('poly_transformer.pkl')

    models = {}
    for file_name in plain_files:
        loaded = read_pickle(file_name)
        models[file_name.replace('.pkl', '')] = loaded

    polymodels = {}
    for file_name in poly_files:
        loaded = read_pickle(file_name)
        polymodels[file_name.replace('.pkl', '')] = loaded

    return models, polymodels, results, transformer

     

In [51]:
def testNormalData(models, data):
    y = data['UPDRS']
    X = data.drop('UPDRS', axis=1)

    results = []

    for name, model in models.items():
        print(f"Evaluating {name} ...")

        y_pred = model.predict(X)

        r2 = r2_score(y, y_pred)
        mse = mean_squared_error(y, y_pred)

        print(f" Test R²: {r2:.4f}")
        print(f" Test MSE: {mse:.4f}\n")

        results.append({
            "Model": name,
            "Test MSE": mse,
            "Test R²": r2
        })

    results_df = pd.DataFrame(results).sort_values(by="Test R²", ascending=False)

    print("Summary of all models:")
    print(results_df)

    return results_df

In [58]:
def testPolyData(models, transformer, data):
    y = data['UPDRS']
    X = data.drop('UPDRS', axis=1)
    
    X_poly = transformer.fit_transform(X)
    results = []

    for name, model in models.items():
        print(f"Evaluating {name} ...")

        y_pred = model.predict(X_poly)

        r2 = r2_score(y, y_pred)
        mse = mean_squared_error(y, y_pred)

        print(f" Test R²: {r2:.4f}")
        print(f" Test MSE: {mse:.4f}\n")

        results.append({
            "Model": name,
            "Test MSE": mse,
            "Test R²": r2
        })

    results_df = pd.DataFrame(results).sort_values(by="Test R²", ascending=False)

    print("Summary of all models:")
    print(results_df)

    return results_df

In [52]:
# Build the preprocessed evaluation frame (scaled numeric features, the
# 'UPDRS' target and the encoded categorical features).
data = regressionScriptTester()


In [56]:
# Load every saved artefact once, then score the models that take the
# un-expanded feature set.
models, poly_models, results, transformer = loadModels()
normal_r = testNormalData(models, data)


    # print("Loaded models:", list(models.keys()))
    # print("Results keys:", list(results.keys()))

Evaluating linear_regression ...
 Test R²: 0.0114
 Test MSE: 3155.8568

Evaluating random_forest ...
 Test R²: 0.6743
 Test MSE: 1039.5419

Evaluating ridge_alpha=100 ...
 Test R²: 0.0105
 Test MSE: 3158.7841

Evaluating lasso_regression ...
 Test R²: 0.0114
 Test MSE: 3155.8156

Summary of all models:
               Model     Test MSE   Test R²
1      random_forest  1039.541867  0.674346
3   lasso_regression  3155.815585  0.011386
0  linear_regression  3155.856842  0.011373
2    ridge_alpha=100  3158.784068  0.010456


In [59]:
# Score the models that take the transformer-expanded feature set.
poly_r = testPolyData(poly_models, transformer, data)


Evaluating lasso_regression_poly ...
 Test R²: 0.0304
 Test MSE: 3095.2467

Evaluating linear_regression_poly ...
 Test R²: 0.0564
 Test MSE: 3012.2199

Evaluating ridge_regression_poly ...
 Test R²: 0.0556
 Test MSE: 3014.6839

Evaluating gradient_boosting ...
 Test R²: 0.6037
 Test MSE: 1265.2090

Summary of all models:
                    Model     Test MSE   Test R²
3       gradient_boosting  1265.209023  0.603651
1  linear_regression_poly  3012.219920  0.056370
2   ridge_regression_poly  3014.683917  0.055598
0   lasso_regression_poly  3095.246713  0.030360
