# Kieran Robson - 600092 - Data Mining ACW

# Business Understanding
1. To predict whether a patient is at risk of mortality, based on any current or previous health issues they may have.
2. To have an efficient model with 99% or above accuracy.

# Data Understanding
## Import the modules needed

In [None]:
#Imports various modules
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as mat
import numpy as np
%matplotlib inline

import sklearn.model_selection

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score

from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## Import the data and copy it to a new variable so the original data is not affected

In [None]:
# Read the raw data set from the CSV file
trainData = pd.read_csv("cardio-vascular--ACWData.csv")
# Work on an independent copy: a plain assignment would only create a second
# name for the same DataFrame, so the in-place cleaning steps applied to
# dataCopy would also change trainData
dataCopy = trainData.copy()

## Data Analysis showcasing all attributes, data type and number of null values

In [None]:
# Print a summary of the working DataFrame: column names, dtypes and non-null counts
dataCopy.info()

# Data Preparation
## Data Cleaning

In [None]:
# NOTE: column-level updates are written as explicit assignments
# (df[col] = df[col].method(...)). Calling a method with inplace=True on
# df[col] is chained assignment, which pandas has deprecated and which does
# not update the DataFrame once Copy-on-Write is enabled.

# Drop columns that are not medically relevant
dataCopy.drop(['Random', 'Id'], axis=1, inplace=True)

# Replace the lower-case 'Asx' spelling with 'ASx' so the category is consistent
dataCopy['Indication'] = dataCopy['Indication'].replace({'Asx': 'ASx'})

# Coerce non-numeric entries in Contra to NaN, then fill the gaps with the column mean
dataCopy['Contra'] = pd.to_numeric(dataCopy['Contra'], errors='coerce')
dataCopy['Contra'] = dataCopy['Contra'].fillna(dataCopy['Contra'].mean())

# Fill missing IPSI values with the column mean
dataCopy['IPSI'] = dataCopy['IPSI'].fillna(dataCopy['IPSI'].mean())

# Fill missing categorical values with the most frequent value (mode) of each column
for column in ['Indication', 'Diabetes', 'IHD', 'Hypertension', 'Arrhythmia', 'History']:
    dataCopy[column] = dataCopy[column].fillna(dataCopy[column].mode()[0])

# Convert the yes/no columns to binary 1/0. infer_objects() gives the column a
# numeric dtype when every value was mapped, without relying on replace()'s
# deprecated implicit downcasting.
for column in ['Diabetes', 'IHD', 'Hypertension', 'Arrhythmia', 'History']:
    dataCopy[column] = dataCopy[column].replace({'yes': 1, 'no': 0}).infer_objects()

# Drop rows whose label is 'Unknown'
dataCopy.drop(dataCopy.index[dataCopy['label'] == 'Unknown'], inplace=True)
# Drop rows whose label is missing
dataCopy.dropna(subset=['label'], inplace=True)
# Convert Risk / NoRisk to binary 1 / 0
dataCopy['label'] = dataCopy['label'].replace({'Risk': 1, 'NoRisk': 0}).infer_objects()

# Modelling
## Create Dummies

In [None]:
# One-hot encode the remaining categorical columns
dataCopy = pd.get_dummies(dataCopy)
# Show the encoded frame as the cell output; there is no need to run the
# encoding a second time just to display it
dataCopy

## Creating Test and Train Data

In [None]:
# Separate the target column from the predictor columns
y = dataCopy['label']
x = dataCopy.drop('label', axis=1)

# Hold out 30% of the rows for testing. random_state fixes the shuffle so the
# split, and therefore every reported metric, is the same on each run.
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x, y, test_size=0.3, shuffle=True, random_state=0
)

## Create Classifiers 

In [None]:
# Multi-layer perceptron with logistic activation, trained with the Adam solver.
# random_state fixes the weight initialisation so results are repeatable,
# matching the seeded RandomForest below.
modelMLP = MLPClassifier(
    solver='adam', activation='logistic', max_iter=500000, random_state=0
)
# Random forest with a fixed seed
modelRF = RandomForestClassifier(random_state=0)

## Fitting Data and Predicting Data

In [None]:
# Train each classifier on the training split, then predict labels for the
# held-out test split (fit() returns the fitted estimator, so the calls chain)
YPredMLP = modelMLP.fit(x_train, y_train).predict(x_test)
YPredRF = modelRF.fit(x_train, y_train).predict(x_test)

# Model 1 - RandomForest
## M1 Confusion Matrix

In [None]:
# Tabulate true vs. predicted labels for the RandomForest and draw the matrix
cm = confusion_matrix(y_true=y_test, y_pred=YPredRF)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
# Report the fraction of test rows the RandomForest classified correctly
print(accuracy_score(y_true=y_test, y_pred=YPredRF))

# Model 2 - MultiLayer Perceptron (MLP)

## M2 Confusion Matrix

In [None]:
# Tabulate true vs. predicted labels for the MLP and draw the matrix
cm = confusion_matrix(y_true=y_test, y_pred=YPredMLP)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
# Report the fraction of test rows the MLP classified correctly
print(accuracy_score(y_true=y_test, y_pred=YPredMLP))

# Evaluation
The MLP's overall performance could be improved. While it meets the first business objective, it misses the second by a large margin, with an accuracy below 0.98.
The RandomForest model was a success, with most runs producing 0-5 false positives and 0-3 false negatives. With this model, objective 1 was met, since it predicted whether patients were at risk of mortality. It also met business objective 2, as most runs showed 99% accuracy or better.
