<a href="https://colab.research.google.com/github/MarkNCI/AI-Ml-Diploma/blob/main/MHayden_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment Description
This assignment uses the Personal Key Indicators of Heart Disease dataset from [Kaggle](https://www.kaggle.com/datasets/kamilpytlak/personal-key-indicators-of-heart-disease). The data comes from the 2020 edition of an annual telephone survey conducted by the United States Centers for Disease Control and Prevention (CDC), with over 400,000 participants.

## Column Descriptions


***HeartDisease***: Respondents that have ever reported having coronary heart disease (CHD) or myocardial infarction (MI).

***BMI***: Body Mass Index (BMI).

***Smoking***: Have you smoked at least 100 cigarettes in your entire life?

***AlcoholDrinking***: Heavy drinkers (adult men having more than 14 drinks per week and adult women having more than 7 drinks per week).

***Stroke***: (Ever told) (you had) a stroke?

***PhysicalHealth***: Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good? (0-30 days).

***MentalHealth***: Thinking about your mental health, for how many days during the past 30 days was your mental health not good? (0-30 days).

***DiffWalking***: Do you have serious difficulty walking or climbing stairs?

***Sex***: Are you male or female?

***AgeCategory***: Fourteen-level age category (each age band is later replaced by its approximate mean age).

***Race***: Imputed race/ethnicity value.

***Diabetic***: (Ever told) (you had) diabetes?

***PhysicalActivity***: Adults who reported doing physical activity or exercise during the past 30 days other than their regular job.

***GenHealth***: Would you say that in general your health is...

***SleepTime***: On average, how many hours of sleep do you get in a 24-hour period?

***Asthma***: (Ever told) (you had) asthma?

***KidneyDisease***: Not including kidney stones, bladder infection or incontinence, were you ever told you had kidney disease?

***SkinCancer***: (Ever told) (you had) skin cancer?

In [None]:
# Load Libraries
from google.colab import files
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers import Dense

## Upload the heart disease dataset (heart_2020_cleaned.csv) via the Google Colab file picker ##
# files.upload() opens an interactive upload dialog; its return value is kept in `uploaded`
uploaded = files.upload()

In [None]:
# Read the uploaded CSV into a DataFrame and summarise its columns, dtypes and memory use
dataset_path = '/content/heart_2020_cleaned.csv'
df = pd.read_csv(dataset_path)
df.info()

# Dataset Details

In [None]:
# Column index (followed by a blank line), then a preview of the first rows
column_index = df.columns
print(column_index, '\n')
df.head()

In [None]:
# Number of respondents without / with heart disease (raw 'No' / 'Yes' labels)
label_counts = df['HeartDisease'].value_counts()
for label in ('No', 'Yes'):
  print(label_counts[label])

In [None]:
# Missing values per column
missing_mask = df.isna()
missing_mask.sum()

In [None]:
# Categorical columns and the distinct values each one takes
categorical = [
  'HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'Race',
  'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer',
]

for column_name in categorical:
  distinct_values = np.unique(df[column_name].values)
  print(column_name)
  print(distinct_values)

# Stats per Categorical columns

In [None]:
# One pie chart per categorical column, in the same order as before.
pie_columns = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'Sex', 'DiffWalking', 'Race',
               'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer']

for column in pie_columns:
  counts = df[column].value_counts()
  # Labels must come from the index of the same value_counts() result that supplies the
  # slice sizes: value_counts() is ordered by frequency while unique() is ordered by first
  # appearance, so pairing the two can attach a label to the wrong slice.
  # One colour per category, so columns with more than five categories do not reuse one.
  colors = sns.color_palette('pastel', len(counts))
  plt.pie(counts, labels=counts.index, colors=colors, autopct='%.0f%%')
  plt.title(column)
  plt.show()

# Stats per Numerical columns

In [None]:
# Distribution of BMI over all rows
bmi_values = df['BMI']
BMI = sns.histplot(bmi_values)
BMI

In [None]:
# Distribution of the PhysicalHealth column over all rows
physical_health_values = df['PhysicalHealth']
PH = sns.histplot(physical_health_values)
PH

In [None]:
# Distribution of the MentalHealth column over all rows
mental_health_values = df['MentalHealth']
MH = sns.histplot(mental_health_values)
MH

In [None]:
# Share of respondents per age band.
# sort_index() orders the slices by age band, and the labels are taken from the same
# Series as the sizes; pairing value_counts() (frequency order) with unique() (order of
# first appearance) can attach a label to the wrong slice.
age_counts = df['AgeCategory'].value_counts().sort_index()
colors = sns.color_palette('pastel', len(age_counts))
plt.pie(age_counts, labels=age_counts.index, colors=colors, autopct='%.0f%%')
plt.title('AgeCategory')
plt.show()

In [None]:
# Distribution of the SleepTime column over all rows
sleep_time_values = df['SleepTime']
ST = sns.histplot(sleep_time_values)
ST

# Feature Extraction

In [None]:
# Converting each age band into a single representative age (same values as before,
# but stored as integers instead of strings so Age is a genuine numeric feature).
print(df['AgeCategory'].unique())
age_by_category = {
  '18-24': 20, '25-29': 27, '30-34': 32, '35-39': 37, '40-44': 42,
  '45-49': 47, '50-54': 52, '55-59': 57, '60-64': 62, '65-69': 67,
  '70-74': 72, '75-79': 77, '80 or older': 80,
}
age = df['AgeCategory'].map(age_by_category)
# .map() yields NaN for a band missing from the table; fail here rather than let an
# unconverted value reach the float conversion further down the notebook.
unmapped = df.loc[age.isna(), 'AgeCategory'].unique()
assert len(unmapped) == 0, f'Unmapped AgeCategory values: {unmapped}'
df['Age'] = age.astype(int)
print(df['Age'].unique())
df = df.drop(columns=['AgeCategory'])
df.columns

In [None]:
# Numerical columns: Normalise between 0 and 1 by dividing by the column maximum
# (all of these columns are non-negative).
# Age is included so it sits on the same scale as the other numeric features instead of
# dominating distance-based models; pd.to_numeric guards against Age being stored as text.
df['Age'] = pd.to_numeric(df['Age'])
numerical = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime', 'Age']
for num in numerical:
  df[num] = df[num]/df[num].max()

# Preview only: the first rows are enough to confirm the scaling
df[numerical].head()

In [None]:
# Flag the categorical columns that have more than two distinct values
category_columns = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'Race',
                    'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer']
distinct_per_column = df[category_columns].nunique()
print(distinct_per_column > 2)

In [None]:
# Two-valued columns: replace the labels with integer codes using sklearn's LabelEncoder
two_valued_columns = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking',
                      'Sex', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']
encoder = preprocessing.LabelEncoder()
for column_name in two_valued_columns:
  df[column_name] = encoder.fit_transform(df[column_name])

# Columns with more than two values: one indicator column per category (one-hot encoding)
multi_valued_columns = ['Race', 'Diabetic', 'GenHealth']
df = pd.get_dummies(df, columns=multi_valued_columns, prefix=multi_valued_columns)
df.head()

In [None]:
# Number of rows per encoded label value (0 first, then 1)
encoded_label_counts = df['HeartDisease'].value_counts()
for label_value in (0, 1):
  print(encoded_label_counts[label_value])

In [None]:
# Checking to see if dataset is balanced
train_df = df[:int(len(df)*0.8)]
test_df = df[int(len(df)*0.8):]
print(train_df['HeartDisease'].value_counts())
print(test_df['HeartDisease'].value_counts())

In [None]:
# As dataset is imbalanced, will oversample to balance out
no_heartdisease = train_df[train_df['HeartDisease'] == 0]
has_heartdisease = train_df[train_df['HeartDisease'] == 1]

has_heartdisease = has_heartdisease.sample(len(no_heartdisease),replace=True)
train_df = pd.concat([no_heartdisease, has_heartdisease], axis=0)
print(train_df['HeartDisease'].value_counts())

# As dataset is imbalanced, will oversample to balance out
no_heartdisease = test_df[test_df['HeartDisease'] == 0]
has_heartdisease = test_df[test_df['HeartDisease'] == 1]

has_heartdisease = has_heartdisease.sample(len(no_heartdisease),replace=True)
test_df = pd.concat([no_heartdisease, has_heartdisease], axis=0)
print(test_df['HeartDisease'].value_counts())

In [None]:
# Concat into single dataframe
df = pd.concat([train_df,test_df],axis=0)
df['HeartDisease'].value_counts()

In [None]:
# Encode as Numpy arrays for Train/Test split
X = df[['BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'Age',
       'PhysicalActivity', 'SleepTime', 'Asthma', 'KidneyDisease',
       'SkinCancer', 'Race_American Indian/Alaskan Native', 'Race_Asian',
       'Race_Black', 'Race_Hispanic', 'Race_Other', 'Race_White',
       'Diabetic_No', 'Diabetic_No, borderline diabetes', 'Diabetic_Yes',
       'Diabetic_Yes (during pregnancy)', 'GenHealth_Excellent',
       'GenHealth_Fair', 'GenHealth_Good', 'GenHealth_Poor',
       'GenHealth_Very good']]
y = df['HeartDisease']
X = np.asarray(X).astype(np.float32)
y = np.asarray(y).astype(np.float32)

In [None]:
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ML Models

In [None]:
# Decision Tree
%%time
dt = DecisionTreeClassifier().fit(X_train, y_train)
score = dt.score(X_test, y_test)
predict = dt.predict(X_test)
print(str(score))
print("Total points: %d Correctly labeled points : %d" %(y_test.shape[0],(y_test == predict).sum()))
cm = np.array(confusion_matrix(y_test, predict, labels=[1,0]))
confusion = pd.DataFrame(cm, index=['has_heartdisease', 'no_heartdisease'],
columns=['predicted_heartdisease','predicted_healthy'])
print(confusion)
print(classification_report(y_test, predict))

In [None]:
# Random Forest
%%time
rf = RandomForestClassifier().fit(X_train, y_train)
score = rf.score(X_test, y_test)
predict = rf.predict(X_test)
print(str(score))
print("Total points: %d Correctly labeled points : %d" %(y_test.shape[0],(y_test == predict).sum()))
cm = np.array(confusion_matrix(y_test, predict, labels=[1,0]))
confusion = pd.DataFrame(cm, index=['has_heartdisease', 'no_heartdisease'],
columns=['predicted_heartdisease','predicted_healthy'])
print(confusion)
print(classification_report(y_test, predict))

In [None]:
# K-Nearset Neighbours
%%time
knn = KNeighborsClassifier().fit(X_train, y_train)
score = knn.score(X_test, y_test)
predict = knn.predict(X_test)
print(str(score))
print("Total points: %d Correctly labeled points : %d" %(y_test.shape[0],(y_test == predict).sum()))
cm = np.array(confusion_matrix(y_test, predict, labels=[1,0]))
confusion = pd.DataFrame(cm, index=['has_heartdisease', 'no_heartdisease'],
columns=['predicted_heartdisease','predicted_healthy'])
print(confusion)
print(classification_report(y_test, predict))

In [None]:
# Naive Bayes
%%time
nb = GaussianNB().fit(X_train, y_train)
score = nb.score(X_test, y_test)
print(str(score))
predict = nb.predict(X_test)
print("Total points: %d Correctly labeled points : %d" %(y_test.shape[0],(y_test == predict).sum()))
cm = np.array(confusion_matrix(y_test, predict, labels=[1,0]))
confusion = pd.DataFrame(cm, index=['has_heartdisease', 'no_heartdisease'],
columns=['predicted_heartdisease','predicted_healthy'])
print(confusion)
print(classification_report(y_test, predict))

In [None]:
# Multi-Level Perceptron
%%time
mlp = MLPClassifier(random_state=1, max_iter=300,alpha=1).fit(X_train, y_train)
score = mlp.score(X_test, y_test)
print(str(score))
predict = mlp.predict(X_test)
print("Total points: %d Correctly labeled points : %d" %(y_test.shape[0],(y_test == predict).sum()))
cm = np.array(confusion_matrix(y_test, predict, labels=[1,0]))
confusion = pd.DataFrame(cm, index=['has_heartdisease', 'no_heartdisease'],
columns=['predicted_heartdisease','predicted_healthy'])
print(confusion)
print(classification_report(y_test, predict))

In [None]:
# Linear Regression
%%time
lr = LinearRegression().fit(X_train, y_train)
score = lr.score(X_test, y_test)
print(str(score))

In [None]:
# Neural Network: four hidden layers of 50 ReLU units and one sigmoid output unit
# The input width is read from the training matrix rather than hard-coded, so the model
# keeps working if the feature list changes.
n_features = X_train.shape[1]
model = Sequential([
  Dense(50, input_dim=n_features, activation='relu'),
  Dense(50, activation='relu'),
  Dense(50, activation='relu'),
  Dense(50, activation='relu'),
  Dense(1, activation='sigmoid'),
])

In [None]:
# Compile the network for binary classification, then train it on the training arrays
model.compile(
  loss='binary_crossentropy',
  optimizer='adam',
  metrics=['accuracy'],
)
model.fit(X_train, y_train, verbose=1, epochs=100)

In [None]:
# Show the layer summary, then report accuracy on the test arrays
model.summary()
# evaluate() returns the loss followed by the compiled metrics, so index 1 is the accuracy
score = model.evaluate(X_test, y_test, verbose=0)
test_accuracy = score[1]
print('Model Accuracy = ', test_accuracy)