In [33]:
# Just a test model on heart prediction, since I'm still working on familiarity with these things
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import classification_report # Cool library, remember this

In [34]:
data = pd.read_csv("heartattackdata.csv")

In [35]:
print(data.columns)
print(data.shape)

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num       '],
      dtype='object')
(294, 14)


In [36]:
# Clean the data
data = data.replace({"?":np.NaN})

In [37]:
print(data.isnull().sum())

age             0
sex             0
cp              0
trestbps        1
chol           23
fbs             8
restecg         1
thalach         1
exang           1
oldpeak         0
slope         190
ca            291
thal          266
num             0
dtype: int64


In [38]:
# ca has basically all null, so it would be best to drop it

data.drop("ca", axis = 1, inplace = True)
print(data.isnull().sum())

age             0
sex             0
cp              0
trestbps        1
chol           23
fbs             8
restecg         1
thalach         1
exang           1
oldpeak         0
slope         190
thal          266
num             0
dtype: int64


In [39]:
# Replace null with median data
data.fillna(data.median(), inplace=True)

# Get rid of duplicate data
print(data.duplicated().sum())
data.drop_duplicates(inplace=True)

1


In [40]:
# Check columns again to make sure data is ok before ml algorithms
data.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'thal', 'num       '],
      dtype='object')

In [41]:
# Num looks a little weird, change the name so it's more human friendly
data = data.rename(columns={"num       ":"num"})
data.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'thal', 'num'],
      dtype='object')

In [42]:
# Much better. Now let's split the data
X = data.drop("num", axis=1)
Y = data["num"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [43]:
logReg = LogisticRegression()
logReg.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [44]:
# Make a prediction
logRegPrediction = logReg.predict(X_test)

In [45]:
print(classification_report(Y_test, logRegPrediction))

              precision    recall  f1-score   support

           0       0.92      0.85      0.89        41
           1       0.71      0.83      0.77        18

    accuracy                           0.85        59
   macro avg       0.82      0.84      0.83        59
weighted avg       0.86      0.85      0.85        59



In [50]:
# Let's try our own model to see what difference it makes

sgd_clf = SGDClassifier()
sgd_clf.fit(X_train, Y_train)

SGDClassifier()

In [54]:
SGDPrediction = sgd_clf.predict(X_test)
print(classification_report(Y_test, SGDPrediction)) # Not the best accuracy, but it still works!

              precision    recall  f1-score   support

           0       0.74      0.90      0.81        41
           1       0.56      0.28      0.37        18

    accuracy                           0.71        59
   macro avg       0.65      0.59      0.59        59
weighted avg       0.68      0.71      0.68        59



In [53]:
# Kaggle dataset: https://www.kaggle.com/imnikhilanand/heart-attack-prediction
# Model that helped me solve my NaN issues: https://www.kaggle.com/terrifictitan12/heart-attack-prediction-83-accuracy