# Mini Project: Dementia Patients  
## Are we able to predict if a person has dementia based on its attributes? 

### General Code

In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
sb.set()

#For MinMaxScaler it keeps numerical data within a given range, usually between 0 and 1, 
#OneHotEncoder increases processing efficiency for machine learning algorithms by converting   
#categorical variables into a binary matrix without imposing a linear relationship.
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

#ColumnTransformer allows different columns or portions of the input dataset to be transformed separately and 
#the features generated by each transformer will be joined to form a single feature space. 
from sklearn.compose import ColumnTransformer

In [2]:
#Import csv file into dataframe using pandas
data = pd.read_csv('data/dementia_patients_health_data.csv')
data.head(5)

Unnamed: 0,Diabetic,AlcoholLevel,HeartRate,BloodOxygenLevel,BodyTemperature,Weight,MRI_Delay,Prescription,Dosage in mg,Age,...,Smoking_Status,APOE_ε4,Physical_Activity,Depression_Status,Cognitive_Test_Scores,Medication_History,Nutrition_Diet,Sleep_Quality,Chronic_Health_Conditions,Dementia
0,1,0.084974,98,96.230743,36.224852,57.563978,36.421028,,,60,...,Current Smoker,Negative,Sedentary,No,10,No,Low-Carb Diet,Poor,Diabetes,0
1,0,0.016973,78,93.032122,36.183874,56.832335,31.157633,Galantamine,12.0,61,...,Former Smoker,Positive,Moderate Activity,No,1,Yes,Low-Carb Diet,Poor,Heart Disease,1
2,0,0.009,89,93.566504,37.326321,59.759066,37.640435,,,69,...,Former Smoker,Negative,Moderate Activity,No,8,No,Mediterranean Diet,Poor,Heart Disease,0
3,0,0.086437,60,93.90651,37.03062,58.266471,50.673992,Donepezil,23.0,78,...,Never Smoked,Negative,Mild Activity,Yes,5,Yes,Balanced Diet,Poor,Hypertension,1
4,1,0.150747,67,97.508994,36.062121,67.705027,27.810601,Memantine,20.0,77,...,Never Smoked,Positive,Mild Activity,No,0,Yes,Low-Carb Diet,Good,Diabetes,1


### 1) Data Cleaning on duplication and missing values to check on which of the data is not needed to detemining a person has dementia.

In [3]:
#Remove duplicate entries
data.drop_duplicates(inplace=True)

#Check data remaining missing values
missing_data_summary = data.isnull().sum()

#Display the updated info and any remaining missing data
data.info(), missing_data_summary

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Diabetic                   1000 non-null   int64  
 1   AlcoholLevel               1000 non-null   float64
 2   HeartRate                  1000 non-null   int64  
 3   BloodOxygenLevel           1000 non-null   float64
 4   BodyTemperature            1000 non-null   float64
 5   Weight                     1000 non-null   float64
 6   MRI_Delay                  1000 non-null   float64
 7   Prescription               485 non-null    object 
 8   Dosage in mg               485 non-null    float64
 9   Age                        1000 non-null   int64  
 10  Education_Level            1000 non-null   object 
 11  Dominant_Hand              1000 non-null   object 
 12  Gender                     1000 non-null   object 
 13  Family_History             1000 non-null   object

(None,
 Diabetic                       0
 AlcoholLevel                   0
 HeartRate                      0
 BloodOxygenLevel               0
 BodyTemperature                0
 Weight                         0
 MRI_Delay                      0
 Prescription                 515
 Dosage in mg                 515
 Age                            0
 Education_Level                0
 Dominant_Hand                  0
 Gender                         0
 Family_History                 0
 Smoking_Status                 0
 APOE_ε4                        0
 Physical_Activity              0
 Depression_Status              0
 Cognitive_Test_Scores          0
 Medication_History             0
 Nutrition_Diet                 0
 Sleep_Quality                  0
 Chronic_Health_Conditions    179
 Dementia                       0
 dtype: int64)

We found out that the following attributes has mising data: <br>
-Prescription (will be removed from model) <br>
-Dosage in mg (will be removed from model) <br>
-Chronic_Health_Conditions (missing value indicates no chronic health conditions) 

In [4]:
#Replace string entries in Smoking_Status to 1,2,3 for Never Smoked, Former Smoker and Current Smoker respectively.
#Repeat for physical activity,SleepQuality,Education

data['Smoking_Status'] = data['Smoking_Status'].replace('Never Smoked',0)
data['Smoking_Status'] = data['Smoking_Status'].replace('Former Smoker',1)
data['Smoking_Status'] = data['Smoking_Status'].replace('Current Smoker',2)

#Physical activity ranked in order
data['Physical_Activity'] = data['Physical_Activity'].replace('Sedentary',1)
data['Physical_Activity'] = data['Physical_Activity'].replace('Mild Activity',2)
data['Physical_Activity'] = data['Physical_Activity'].replace('Moderate Activity',3)

#Education ranked in order
data['Education_Level'] = data['Education_Level'].replace('No School',1)
data['Education_Level'] = data['Education_Level'].replace('Primary School',2)
data['Education_Level'] = data['Education_Level'].replace('Secondary School',3)
data['Education_Level'] = data['Education_Level'].replace('Diploma/Degree',4)

#Sleep quality ranked in order
data['Sleep_Quality'] = data['Sleep_Quality'].replace('Poor',0)
data['Sleep_Quality'] = data['Sleep_Quality'].replace('Good',1)
data['Sleep_Quality'] = data['Sleep_Quality'].astype(bool)

#Convert Family History to Boolean
data['Family_History'] = data['Family_History'].replace('No',0)
data['Family_History'] = data['Family_History'].replace('Yes',1)
data['Family_History'] = data['Family_History'].astype(bool)

#Convert Diabetic to Boolean
data['Diabetic'] = data['Diabetic'].astype(bool)

#Convert Depression to Boolean
data['Depression_Status'] = data['Depression_Status'].astype(bool)

#Convert Chronic Health Conditions to Boolean
data['Chronic_Health_Conditions'] = data['Chronic_Health_Conditions'].replace('Diabetes',1)
data['Chronic_Health_Conditions'] = data['Chronic_Health_Conditions'].replace('Heart Disease',1)
data['Chronic_Health_Conditions'] = data['Chronic_Health_Conditions'].replace('Hypertension',1)
data['Chronic_Health_Conditions'] = data['Chronic_Health_Conditions'].replace('None',0)
data['Chronic_Health_Conditions'] = data['Chronic_Health_Conditions'].astype(bool)

#Convert APOE_ε4 to Boolean
data['APOE_ε4'] = data['APOE_ε4'].replace('Positive',1)
data['APOE_ε4'] = data['APOE_ε4'].replace('Negative',0)
data['APOE_ε4'] = data['APOE_ε4'].astype(bool)

In [5]:
data.info(), missing_data_summary

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Diabetic                   1000 non-null   bool   
 1   AlcoholLevel               1000 non-null   float64
 2   HeartRate                  1000 non-null   int64  
 3   BloodOxygenLevel           1000 non-null   float64
 4   BodyTemperature            1000 non-null   float64
 5   Weight                     1000 non-null   float64
 6   MRI_Delay                  1000 non-null   float64
 7   Prescription               485 non-null    object 
 8   Dosage in mg               485 non-null    float64
 9   Age                        1000 non-null   int64  
 10  Education_Level            1000 non-null   int64  
 11  Dominant_Hand              1000 non-null   object 
 12  Gender                     1000 non-null   object 
 13  Family_History             1000 non-null   bool  

(None,
 Diabetic                       0
 AlcoholLevel                   0
 HeartRate                      0
 BloodOxygenLevel               0
 BodyTemperature                0
 Weight                         0
 MRI_Delay                      0
 Prescription                 515
 Dosage in mg                 515
 Age                            0
 Education_Level                0
 Dominant_Hand                  0
 Gender                         0
 Family_History                 0
 Smoking_Status                 0
 APOE_ε4                        0
 Physical_Activity              0
 Depression_Status              0
 Cognitive_Test_Scores          0
 Medication_History             0
 Nutrition_Diet                 0
 Sleep_Quality                  0
 Chronic_Health_Conditions    179
 Dementia                       0
 dtype: int64)

We convert Smoking_Status, Physical_Activity, Education_Level to numerical and not other data like dominant_hand, gender, nutrition because the above variables can be placed on a scale in terms of worst to best. Hence, it makes more sense to convert them to numerical figures for easier data analysis.

We changed Diabetes,Family_History,APOE_ε4,Depression_Status,Sleep_Quality,Chronic_Health_Conditions to Boolean types as it suits them more as they are variables that are either a yes or no. 

We keep Dementia as numerical (0 and 1) for the correlation heatmap

In [6]:
# We excluded 'Prescription' and 'Dosage in mg' because they are attributes for dementia patients 
# and do not help in predicting the presence of dementia.

# We excluded 'Medication_History' because it does not help with our prediction

#Remove prescription, dosage
data = data.drop('Prescription', axis = 1)
data = data.drop('Dosage in mg', axis = 1)
data = data.drop('Medication_History', axis = 1)

num_columns = len(data.columns)
print('Number of Variables used:')
print(num_columns)



Number of Variables used:
21


In [7]:
# Identifying numerical and categorical columns
num_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = data.select_dtypes(include=['object']).columns.tolist()

# Creating transformers for numerical and categorical data
numeric_transformer = MinMaxScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combining transformers into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])

# Fitting and transforming the data
data_transformed = preprocessor.fit_transform(data)

# Display the shape of the transformed data to confirm the changes
data_transformed.shape

(1000, 19)

In [8]:
# Remove 'AlcoholLevel' from the clean data and save into a csv file
data = data.drop('AlcoholLevel', axis = 1)
dementia_patients_cleaned_without_AlcoholLevel = data
dementia_patients_cleaned_without_AlcoholLevel.to_csv('data/dementia_patients_cleaned_2_After EDA.csv', index=False )