In [22]:
!gdown 1YAQHOBSmpDfwYeFq1dMQv94DyYPkp7Xc

Downloading...
From: https://drive.google.com/uc?id=1YAQHOBSmpDfwYeFq1dMQv94DyYPkp7Xc
To: /content/alzheimer.csv
  0% 0.00/2.38M [00:00<?, ?B/s]100% 2.38M/2.38M [00:00<00:00, 107MB/s]


In [23]:
import pandas as pd
import numpy as np
from sklearn import metrics
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [24]:
data = pd.read_csv('alzheimer.csv')
data.sample(3)

Unnamed: 0,PatientID,Domain,Age,BMI,HeartRate,BloodPressure,Cholesterol,CognitiveTestScore,StressMobilityTestScore,RiskLevel,FamilyHistoryDementia,TreatmentResponse
3579,P003580,Orthopedics,39.0,28.79,69.0,84.0,191.0,76.2,74.54,Medium Risk,No,Improved
7572,P007573,Pediatrics,51.0,29.2,71.0,105.0,216.0,80.3,72.66,Low Risk,No,Stable
1044,P001045,Dermatology,36.0,24.54,78.0,122.0,213.0,70.04,70.96,Low Risk,No,Stable


In [25]:
data.isna().sum()

Unnamed: 0,0
PatientID,0
Domain,200
Age,200
BMI,200
HeartRate,50
BloodPressure,200
Cholesterol,200
CognitiveTestScore,50
StressMobilityTestScore,200
RiskLevel,0


In [26]:
def drop_data(data:pd.DataFrame, columns:list):
  data_copy = data.copy()
  data_copy.drop(columns, axis=1, inplace=True)
  return data_copy

In [27]:
def simple_impute_data(data:pd.DataFrame, columns:list, strategy:str):
  imputer = SimpleImputer(strategy=strategy)
  data_copy = data.copy()

  for column in columns:
    data_copy[column] = imputer.fit_transform(data_copy[[column]])
  return data_copy

In [28]:
def label_data(data:pd.DataFrame, columns:list):
  encoder = LabelEncoder()
  data_copy = data.copy()

  for column in columns:
    data_copy[column] = encoder.fit_transform(data_copy[[column]].astype(str).values.ravel())

    # if 'nan' in encoder.classes_:
    #   data_copy.loc[data_copy[column] == -1, column] = np.nan
  return data_copy

In [29]:
# def knn_impute_data(data:pd.DataFrame, columns:list, n_neighbors:int, weights:str):
#   imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights)
#   data_copy = data.copy()

#   for column in columns:
#     data_copy[column] = imputer.fit_transform(data_copy[column].to_numpy().reshape(-1, 1))
#   return data_copy

In [30]:
data = drop_data(data=data, columns=['PatientID', 'Domain'])

data = simple_impute_data(data=data, columns=['Age', 'BMI', 'HeartRate', 'BloodPressure', 'Cholesterol', 'CognitiveTestScore', 'StressMobilityTestScore'], strategy='mean')
data = label_data(data=data, columns=['RiskLevel', 'FamilyHistoryDementia', 'TreatmentResponse'])

In [42]:
data.sample(5)

Unnamed: 0,Age,BMI,HeartRate,BloodPressure,Cholesterol,CognitiveTestScore,StressMobilityTestScore,RiskLevel,FamilyHistoryDementia,TreatmentResponse
17924,80.0,29.26,85.0,124.0,194.0,81.05,86.13,2,1,0
1820,52.0,18.63,76.0,95.0,249.0,76.27,68.08,2,1,2
6538,31.0,17.64,87.0,116.0,194.0,61.16,80.15,1,0,1
22889,72.0,29.77,62.0,125.0,161.0,61.05,59.82,1,0,2
21644,18.0,30.31,55.0,132.0,174.0,76.79,88.39,2,0,1


In [37]:
data.isna().sum()

Unnamed: 0,0
Age,0
BMI,0
HeartRate,0
BloodPressure,0
Cholesterol,0
CognitiveTestScore,0
StressMobilityTestScore,0
RiskLevel,0
FamilyHistoryDementia,0
TreatmentResponse,0


In [38]:
input_data = data.copy()
input_data = drop_data(data=input_data, columns=['RiskLevel'])
input_data.sample(3)

Unnamed: 0,Age,BMI,HeartRate,BloodPressure,Cholesterol,CognitiveTestScore,StressMobilityTestScore,FamilyHistoryDementia,TreatmentResponse
6142,82.0,28.23,75.0,137.0,205.0,79.8,81.08,0,2
23,26.0,34.47,89.0,105.0,210.0,85.18,49.93,0,1
9212,86.0,27.35,80.0,111.0,181.0,85.18,82.19,0,1


In [39]:
target_data = data.copy()['RiskLevel']
target_data.sample(3)

Unnamed: 0,RiskLevel
3322,1
28737,0
28698,0


In [43]:
X_train, X_test, Y_train, Y_test = train_test_split(input_data, target_data, test_size=0.2)

In [52]:
model = XGBClassifier(max_depth=5, min_child_weight=1, n_estimators=100, n_jobs=-1, learning_rate=0.1)
model.fit(X_train, Y_train)

y_pred = model.predict(X_test)

In [53]:
print("Accuracy:", metrics.accuracy_score(Y_test, y_pred))
print(confusion_matrix(Y_test, y_pred))
print(classification_report(Y_test, y_pred))

Accuracy: 0.3313333333333333
[[539 911 506]
 [550 951 550]
 [595 900 498]]
              precision    recall  f1-score   support

           0       0.32      0.28      0.30      1956
           1       0.34      0.46      0.40      2051
           2       0.32      0.25      0.28      1993

    accuracy                           0.33      6000
   macro avg       0.33      0.33      0.32      6000
weighted avg       0.33      0.33      0.32      6000



In [54]:
classifier = DecisionTreeClassifier(max_depth=3)
classifier = classifier.fit(X_train,Y_train)

y_pred = classifier.predict(X_test)

In [55]:
print("Accuracy:", metrics.accuracy_score(Y_test, y_pred))
print(confusion_matrix(Y_test, y_pred))
print(classification_report(Y_test, y_pred))

Accuracy: 0.3338333333333333
[[ 958  998    0]
 [1006 1045    0]
 [ 976 1017    0]]
              precision    recall  f1-score   support

           0       0.33      0.49      0.39      1956
           1       0.34      0.51      0.41      2051
           2       0.00      0.00      0.00      1993

    accuracy                           0.33      6000
   macro avg       0.22      0.33      0.27      6000
weighted avg       0.22      0.33      0.27      6000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
