In [1]:
#pip install seaborn

In [2]:
#pip install openpyxl

In [3]:
#pip install dython

In [4]:
#pip install imblearn

In [5]:
#import necessary libraries
import pandas as pd
import numpy as np
import openpyxl
import seaborn as sns
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score , confusion_matrix , classification_report
from sklearn.ensemble import AdaBoostClassifier
from imblearn.ensemble import RUSBoostClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import KFold, cross_val_score,cross_val_predict
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import svm, tree, metrics
from sklearn.feature_selection import *
from sklearn.model_selection import *

In [None]:
#Read the dataset
df = pd.read_excel('Dataset.xlsx')
df

In [7]:
print("============= Data Summary =============")
print("Missing Values: " ,np.sum(df.isnull().any(axis=1)))
print("Shape: ", df.shape)
print("Columns: ", len(df.columns))
print("Data types:")
print(df.dtypes)
     

Missing Values:  895
Shape:  (899, 385)
Columns:  385
Data types:
STUDY_ID                                     object
SUBJ_ID                                       int64
VISIT_NAME                                   object
Source Of Information (PTCGBOTH)             object
Experienced Cognitive Decline (COGDECLN)     object
                                             ...   
CSF Hemoglobin (ng/ml) (Textual)             object
CSF Alpha-synuclein (pg/ml)                 float64
Abeta 42 (pg/ml)                            float64
Total tau (pg/ml)                           float64
Triglycerides (mg/dL)                       float64
Length: 385, dtype: object


In [8]:
#We need to convert the different type of values to categorical so we can run correlation function and machine learning algorithms 
#We convert them to categorical type and assign codes to values
df[[col for col in df.columns if df[col].dtypes == object]] = df[[col for col in df.columns if df[col].dtypes == object]].astype('category')#.cat.codes

In [9]:
cat_cols = [col for col in df.columns if df[col].dtype == 'category']

In [10]:
for col in cat_cols:
    df[col] = df[col].cat.codes

In [11]:
#Export the correlation matrix to csv
(df.corr()).to_csv('CorrelationMatrix.csv')
#Export the matrix to find the correlation between the "target" attribute with the other attributes 
(df.corr()['Cognitive State (COGSTATE)']).to_csv('ClassCorellation.csv')

In [12]:
#Fill the empty values with value of previous cell
df = df.ffill()

In [13]:
#Drop non-assgined values
df = df.dropna()

In [14]:
#Assign correlation method
corr_mat = df.corr(method='pearson')

In [15]:
#Filter and keep the values which have correlation greater than 2
dfCorr = corr_mat
filteredDf = dfCorr[(dfCorr >= .2) & (dfCorr !=1.000)]
filteredDf.to_csv('ClassCorellationFiltered.csv')
filteredDf


Unnamed: 0,STUDY_ID,SUBJ_ID,VISIT_NAME,Source Of Information (PTCGBOTH),Experienced Cognitive Decline (COGDECLN),Functional Impairment Due To Cognitive (FNCDTCOG),Cognitive State (COGSTATE),Level Of Confidence Cognitive Diagnosis (COGDXCL),Review Neuropsychological Tests (RVWNPSY),Age,...,Standing Blood Pressure - Systolic (mmHg) (SYSSTND),Standing Blood Pressure - Diastolic (mmHg) (DIASTND),Standing Heart Rate (beats/min) (HRSTND),Serum IGF-1 (ng/mL),p-Tau181P (pg/ml),CSF Hemoglobin (ng/ml) (Textual),CSF Alpha-synuclein (pg/ml),Abeta 42 (pg/ml),Total tau (pg/ml),Triglycerides (mg/dL)
STUDY_ID,,,,,,,,,,,...,,,,,,,,,,
SUBJ_ID,,,,,,,,,,,...,,,,,,,0.563972,,,
VISIT_NAME,,,,,,,,,,,...,,,,0.315246,,0.411656,,,,
Source Of Information (PTCGBOTH),,,,,,0.230773,,,,,...,,,,,,,,,,
Experienced Cognitive Decline (COGDECLN),,,,,,0.417194,0.732831,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CSF Hemoglobin (ng/ml) (Textual),,,0.411656,,,,,,,,...,,,,,,,,,,
CSF Alpha-synuclein (pg/ml),,0.563972,,,,,,,,,...,,,,,0.250449,,,,0.663215,
Abeta 42 (pg/ml),,,,,,,,,,,...,,,,,0.257285,,,,,
Total tau (pg/ml),,,,,,,,,,0.236175,...,,,,,0.461550,,0.663215,,,


In [16]:
#Drop columns which have correlation less than 0.2 with the target attribute
to_drop = [column for column in corr_mat.columns if abs(corr_mat['Cognitive State (COGSTATE)'][column]) <= 0.2]

In [17]:
df1 = df.drop(to_drop, axis=1)

In [18]:
df1

Unnamed: 0,STUDY_ID,Experienced Cognitive Decline (COGDECLN),Functional Impairment Due To Cognitive (FNCDTCOG),Cognitive State (COGSTATE),Level Of Confidence Cognitive Diagnosis (COGDXCL),Age,Identify Self As Hawaiian/Other Pacific Islander (RAHAWOPI),Maternal Grandparents (MAGPAR),Paternal Grandparents (PAGPAR),MDS-UPDRS Total Score,...,Derived-Semantic Fluency-Animal Scaled Score (DVS_SFTANIM),SDMT - Total Score (SDMT0101),Age At Assessment - Symbol Digit (AGE_ASSESS_SDM),Derived-Symbol Digit SD (DVSD_SDM),Derived-Symbol Digit T-Score (DVT_SDM),Score From Booklet #1 (UPSITBK1),Score From Booklet #2 (UPSITBK2),Score From Booklet #3 (UPSITBK3),Score From Booklet #4 (UPSITBK4),UPSIT Total Score
1,0,1,0,0,3,65,0,2.0,2.0,22.0,...,10.0,42.0,65.0,-0.167,48.330002,4.0,6.0,9.0,6.0,25.0
2,0,1,0,0,2,68,0,2.0,2.0,40.0,...,12.0,41.0,67.0,-0.250,47.500000,3.0,5.0,3.0,6.0,17.0
3,0,0,0,0,3,57,0,2.0,2.0,47.0,...,10.0,37.0,56.0,-1.250,37.500000,5.0,7.0,5.0,6.0,23.0
4,0,0,0,0,3,60,0,2.0,2.0,4.0,...,18.0,47.0,59.0,-0.100,49.000000,10.0,10.0,8.0,8.0,36.0
5,0,0,0,0,2,82,0,2.0,2.0,20.0,...,9.0,34.0,81.0,-0.833,41.669998,9.0,10.0,9.0,9.0,37.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,0,0,0,0,3,63,0,2.0,2.0,4.0,...,11.0,31.0,66.0,-1.100,39.000000,5.0,3.0,4.0,3.0,15.0
895,0,1,0,1,2,82,0,2.0,2.0,20.0,...,7.0,27.0,82.0,-1.500,35.000000,5.0,5.0,4.0,5.0,19.0
896,0,0,0,0,3,67,0,2.0,2.0,9.0,...,9.0,45.0,67.0,0.100,51.000000,5.0,4.0,5.0,4.0,18.0
897,0,0,0,0,2,72,0,2.0,2.0,17.0,...,14.0,53.0,72.0,2.200,72.000000,3.0,1.0,3.0,3.0,10.0


In [19]:

X = df1.drop('Cognitive State (COGSTATE)', axis=1)

y = df1['Cognitive State (COGSTATE)']

In [20]:
y

1      0
2      0
3      0
4      0
5      0
      ..
894    0
895    1
896    0
897    0
898    0
Name: Cognitive State (COGSTATE), Length: 898, dtype: int8

In [21]:
X.to_csv("X.csv")

In [22]:
#Split the test to 70-30 for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [23]:
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTEN
from imblearn.over_sampling import SMOTENC




In [24]:
#Use SMOTE to oversample as "State 1" in the target attribute is undersampled
oversample =SMOTEN(sampling_strategy='minority',k_neighbors=1)
X_res,y_res = oversample.fit_resample(X_train,y_train)
print(X_res.shape)
print(y_res.shape)


(1042, 36)
(1042,)


RusBoost Classifier

In [25]:
clf = RUSBoostClassifier(random_state=0)
clf.fit(X_res, y_res)
predRUS = clf.predict(X_test)


In [26]:
report = classification_report(y_test, predRUS)
print(report)

              precision    recall  f1-score   support

           0       0.97      0.96      0.96       222
           1       0.82      0.85      0.84        48

    accuracy                           0.94       270
   macro avg       0.89      0.91      0.90       270
weighted avg       0.94      0.94      0.94       270



AdaBoost Classifier

In [27]:
abc = AdaBoostClassifier(n_estimators=50,learning_rate=1)
model = abc.fit(X_res, y_res)
predABC = model.predict(X_test)
                         

In [28]:
report = classification_report(y_test, predABC)
print(report)

              precision    recall  f1-score   support

           0       0.97      0.96      0.96       222
           1       0.82      0.85      0.84        48

    accuracy                           0.94       270
   macro avg       0.89      0.91      0.90       270
weighted avg       0.94      0.94      0.94       270



XGB Classifier

In [29]:
xgb_cl = xgb.XGBClassifier()
xgb_cl.fit(X_res, y_res)
predXGB = xgb_cl.predict(X_test)

In [30]:
print(classification_report(y_test,predXGB))


              precision    recall  f1-score   support

           0       0.95      0.95      0.95       222
           1       0.79      0.77      0.78        48

    accuracy                           0.92       270
   macro avg       0.87      0.86      0.87       270
weighted avg       0.92      0.92      0.92       270

