In [67]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier ##Stochastic Gradient Descent
from sklearn.model_selection import train_test_split

In [68]:
df = pd.read_csv("heart.csv.xls")

In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [70]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [71]:
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [72]:
df['ChestPainType'].unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

In [73]:
df['RestingECG'].unique()

array(['Normal', 'ST', 'LVH'], dtype=object)

In [74]:
sex = pd.get_dummies(df['Sex'])
chest_pain_type = pd.get_dummies(df['ChestPainType'])
resting_ECG = pd.get_dummies(df['RestingECG'])
st_slope = pd.get_dummies(df['ST_Slope'])
exercise = pd.get_dummies(df['ExerciseAngina'])

In [75]:
df.drop(['Sex', 'ChestPainType', 'RestingECG', 'ST_Slope', 'ExerciseAngina'], axis=1, inplace=True)

In [76]:
df = pd.concat([df, sex, chest_pain_type, resting_ECG, st_slope, exercise], axis=1)

In [77]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('HeartDisease',axis=1), 
                                                    df['HeartDisease'], test_size=0.30, 
                                                    random_state=10)

In [78]:
SGD_model = SGDClassifier(max_iter=1000, tol=0, random_state=10)

In [79]:
SGD_model.fit(X_train, y_train)

In [80]:
SGD_predictions = SGD_model.predict(X_test)

In [81]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,SGD_predictions))

[[126   1]
 [ 94  55]]


In [82]:
from sklearn import metrics
print("Accuracy without scaling: ", metrics.accuracy_score(y_test, SGD_predictions))

Accuracy without scaling:  0.6557971014492754


In [83]:
from sklearn.preprocessing import MinMaxScaler

In [84]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)

In [85]:
SGD_model.fit(X_train, y_train)

In [86]:
X_test = scaler.fit_transform(X_test)

In [87]:
SGD_predictions = SGD_model.predict(X_test)

In [88]:
print(confusion_matrix(y_test,SGD_predictions))

[[ 77  50]
 [  6 143]]


In [89]:
print("Accuracy with scaling: ", metrics.accuracy_score(y_test, SGD_predictions))

Accuracy with scaling:  0.7971014492753623


### Choose which hypermeter is the best

In [90]:
from sklearn.model_selection import GridSearchCV
param_grid = {'max_iter': [100, 500,1000,10000], 'tol': [0, 0.0000001, 0.001,0.01]} 

model=SGDClassifier(random_state=10)
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'max_iter': 100, 'tol': 0}
Best Score: 0.8472989341085271


In [95]:
from sklearn.metrics import classification_report

In [98]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('HeartDisease',axis=1), 
                                                    df['HeartDisease'], test_size=0.30, 
                                                    random_state=20)
SGD_model_2 = SGDClassifier(max_iter=1000, tol=0, random_state=10)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)

SGD_model.fit(X_train, y_train)

X_test = scaler.fit_transform(X_test)
SGD_predictions = SGD_model.predict(X_test)

print(metrics.accuracy_score(y_test, SGD_predictions))
print(confusion_matrix(y_test,SGD_predictions))
print(classification_report(y_test,SGD_predictions))


0.8514492753623188
[[ 91  36]
 [  5 144]]
              precision    recall  f1-score   support

           0       0.95      0.72      0.82       127
           1       0.80      0.97      0.88       149

    accuracy                           0.85       276
   macro avg       0.87      0.84      0.85       276
weighted avg       0.87      0.85      0.85       276

