In [54]:
import pandas as pd
import numpy as np
from selenium.webdriver.common.devtools.v85.schema import Domain
from sklearn import metrics
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [55]:
df = pd.read_csv('./alzheimer.csv')
df.sample(3)

Unnamed: 0,PatientID,Domain,Age,BMI,HeartRate,BloodPressure,Cholesterol,CognitiveTestScore,StressMobilityTestScore,RiskLevel,FamilyHistoryDementia,TreatmentResponse
8960,P008961,Orthopedics,39.0,33.46,80.0,103.0,213.0,63.21,74.85,Medium Risk,Yes,Stable
14014,P014015,Orthopedics,39.0,25.94,83.0,135.0,214.0,62.74,81.4,High Risk,No,Stable
20203,P020204,Dermatology,45.0,31.41,89.0,122.0,215.0,99.68,83.34,Medium Risk,No,Stable


In [56]:
df.drop(columns=["PatientID","Domain"],inplace=True)

In [57]:
for cat in df.select_dtypes(include=["object","category"]):
    for num in df.select_dtypes(include=["number"]):
        print(df.groupby(cat)[num].mean())
        

RiskLevel
High Risk      52.984142
Low Risk       53.763592
Medium Risk    53.311000
Name: Age, dtype: float64
RiskLevel
High Risk      25.004167
Low Risk       24.917062
Medium Risk    24.983888
Name: BMI, dtype: float64
RiskLevel
High Risk      71.960462
Low Risk       72.101552
Medium Risk    72.151137
Name: HeartRate, dtype: float64
RiskLevel
High Risk      119.948133
Low Risk       120.021193
Medium Risk    120.098037
Name: BloodPressure, dtype: float64
RiskLevel
High Risk      199.825141
Low Risk       199.857339
Medium Risk    199.881044
Name: Cholesterol, dtype: float64
RiskLevel
High Risk      74.911137
Low Risk       74.989610
Medium Risk    74.841456
Name: CognitiveTestScore, dtype: float64
RiskLevel
High Risk      69.998989
Low Risk       69.917259
Medium Risk    69.937634
Name: StressMobilityTestScore, dtype: float64
FamilyHistoryDementia
No     53.401486
Yes    53.293930
Name: Age, dtype: float64
FamilyHistoryDementia
No     24.997969
Yes    24.921944
Name: BMI, dtype: fl

In [58]:
df.select_dtypes(include=["number"]).corr()

Unnamed: 0,Age,BMI,HeartRate,BloodPressure,Cholesterol,CognitiveTestScore,StressMobilityTestScore
Age,1.0,-0.007673,0.000789,-0.000861,-0.004067,-0.005183,0.001233
BMI,-0.007673,1.0,0.011005,0.008129,-0.004704,0.005204,-0.001462
HeartRate,0.000789,0.011005,1.0,0.003418,0.005879,5e-05,0.005204
BloodPressure,-0.000861,0.008129,0.003418,1.0,0.01256,0.007127,-0.005517
Cholesterol,-0.004067,-0.004704,0.005879,0.01256,1.0,0.010273,0.002025
CognitiveTestScore,-0.005183,0.005204,5e-05,0.007127,0.010273,1.0,-0.001955
StressMobilityTestScore,0.001233,-0.001462,0.005204,-0.005517,0.002025,-0.001955,1.0


In [59]:
def label_encoder(dataframe: pd.DataFrame, columns: list) -> pd.DataFrame:
    df_copy = dataframe.copy()
    for column in columns:
        le = LabelEncoder()
        mask = df_copy[column].isnull()
        df_copy[column] = le.fit_transform(df_copy[column].astype(str))
        df_copy.loc[mask, column] = None
    return df_copy


def simple_imputor(dataframe: pd.DataFrame, strategy: str, column_name: str) -> pd.DataFrame:
    df_copy = dataframe.copy()
    imputer = SimpleImputer(strategy=strategy)
    df_copy[column_name] = imputer.fit_transform(df_copy[[column_name]])
    return df_copy

In [60]:
df.isnull().sum()

Age                        200
BMI                        200
HeartRate                   50
BloodPressure              200
Cholesterol                200
CognitiveTestScore          50
StressMobilityTestScore    200
RiskLevel                    0
FamilyHistoryDementia        0
TreatmentResponse            0
dtype: int64

In [61]:
df = simple_imputor(df,"median","Age")
df = simple_imputor(df,"median","BMI")
df = simple_imputor(df,"median","HeartRate")
df = simple_imputor(df,"median","BloodPressure")
df = simple_imputor(df,"median","Cholesterol")
df = simple_imputor(df,"median","CognitiveTestScore")
df = simple_imputor(df,"median","StressMobilityTestScore")

In [62]:
df.isnull().sum()

Age                        0
BMI                        0
HeartRate                  0
BloodPressure              0
Cholesterol                0
CognitiveTestScore         0
StressMobilityTestScore    0
RiskLevel                  0
FamilyHistoryDementia      0
TreatmentResponse          0
dtype: int64

In [63]:
df

Unnamed: 0,Age,BMI,HeartRate,BloodPressure,Cholesterol,CognitiveTestScore,StressMobilityTestScore,RiskLevel,FamilyHistoryDementia,TreatmentResponse
0,62.0,28.09,69.0,115.0,206.0,82.78,59.68,High Risk,No,Stable
1,19.0,22.83,78.0,116.0,196.0,76.00,60.13,Low Risk,No,Improved
2,18.0,27.40,71.0,116.0,200.0,58.52,68.76,Low Risk,Yes,Deteriorated
3,47.0,27.81,78.0,105.0,176.0,83.56,70.71,Medium Risk,Yes,Improved
4,38.0,17.92,69.0,113.0,208.0,74.45,76.36,High Risk,No,Stable
...,...,...,...,...,...,...,...,...,...,...
29995,30.0,29.28,65.0,138.0,286.0,80.59,81.70,Medium Risk,No,Improved
29996,42.0,34.00,68.0,117.0,190.0,62.90,69.58,Low Risk,No,Improved
29997,69.0,22.87,59.0,107.0,212.0,82.72,50.81,Low Risk,No,Stable
29998,42.0,29.74,74.0,115.0,191.0,74.63,63.08,High Risk,No,Improved


In [64]:
df = label_encoder(df,df.select_dtypes(include=["object","category"]))

In [65]:
df

Unnamed: 0,Age,BMI,HeartRate,BloodPressure,Cholesterol,CognitiveTestScore,StressMobilityTestScore,RiskLevel,FamilyHistoryDementia,TreatmentResponse
0,62.0,28.09,69.0,115.0,206.0,82.78,59.68,0.0,0.0,2.0
1,19.0,22.83,78.0,116.0,196.0,76.00,60.13,1.0,0.0,1.0
2,18.0,27.40,71.0,116.0,200.0,58.52,68.76,1.0,1.0,0.0
3,47.0,27.81,78.0,105.0,176.0,83.56,70.71,2.0,1.0,1.0
4,38.0,17.92,69.0,113.0,208.0,74.45,76.36,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...
29995,30.0,29.28,65.0,138.0,286.0,80.59,81.70,2.0,0.0,1.0
29996,42.0,34.00,68.0,117.0,190.0,62.90,69.58,1.0,0.0,1.0
29997,69.0,22.87,59.0,107.0,212.0,82.72,50.81,1.0,0.0,2.0
29998,42.0,29.74,74.0,115.0,191.0,74.63,63.08,0.0,0.0,1.0


In [66]:
X = df.drop(columns=["RiskLevel"],axis=1)
Y = df["RiskLevel"]


In [69]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [70]:
model = XGBClassifier(max_depth=5, min_child_weight=1, n_estimators=100, n_jobs=-1, learning_rate=0.1)
model.fit(X_train, Y_train)

y_pred = model.predict(X_test)

In [71]:
print("Accuracy:", metrics.accuracy_score(Y_test, y_pred))
print(confusion_matrix(Y_test, y_pred))
print(classification_report(Y_test, y_pred))

Accuracy: 0.33116666666666666
[[586 795 534]
 [631 872 579]
 [622 852 529]]
              precision    recall  f1-score   support

         0.0       0.32      0.31      0.31      1915
         1.0       0.35      0.42      0.38      2082
         2.0       0.32      0.26      0.29      2003

    accuracy                           0.33      6000
   macro avg       0.33      0.33      0.33      6000
weighted avg       0.33      0.33      0.33      6000



In [72]:
classifier = DecisionTreeClassifier(max_depth=3)
classifier = classifier.fit(X_train,Y_train)

y_pred = classifier.predict(X_test)

In [73]:
print("Accuracy:", metrics.accuracy_score(Y_test, y_pred))
print(confusion_matrix(Y_test, y_pred))
print(classification_report(Y_test, y_pred))

Accuracy: 0.345
[[ 900 1015    0]
 [ 912 1170    0]
 [ 962 1041    0]]
              precision    recall  f1-score   support

         0.0       0.32      0.47      0.38      1915
         1.0       0.36      0.56      0.44      2082
         2.0       0.00      0.00      0.00      2003

    accuracy                           0.34      6000
   macro avg       0.23      0.34      0.27      6000
weighted avg       0.23      0.34      0.28      6000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
