In [1]:
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

In [2]:
train_df = pd.read_csv('dataset/train.csv')
train_df.head()

Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0,58,1,4,152,239,0,0,158,1,3.6,2,2,7,Presence
1,1,52,1,1,125,325,0,2,171,0,0.0,1,0,3,Absence
2,2,56,0,2,160,188,0,2,151,0,0.0,1,0,3,Absence
3,3,44,0,3,134,229,0,2,150,0,1.0,2,0,3,Absence
4,4,58,1,4,140,234,0,2,125,1,3.8,2,3,3,Presence


In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(train_df.drop(columns=['id', 'Heart Disease']), train_df['Heart Disease'])

In [4]:
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

columns_to_remove = [

]

X_train = X_train.drop(columns=columns_to_remove)
X_test= X_test.drop(columns=columns_to_remove)

test_label_encoder = LabelEncoder()
Y_train = pd.Series(test_label_encoder.fit_transform(Y_train), index=Y_train.index, name='Heart Disease')
Y_test = pd.Series(test_label_encoder.transform(Y_test), index=Y_test.index, name='Heart Disease')

standard_scalers = {}
for col in X_train.columns:
  standard_scaler = StandardScaler()
  X_train[col] = standard_scaler.fit_transform(np.array(X_train[col]).reshape(-1, 1))
  X_test[col] = standard_scaler.transform(np.array(X_test[col]).reshape(-1, 1))
  standard_scalers[col] = standard_scaler

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

logistic_regression = LogisticRegression()

param_grid = [
    {
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1, 10],
        'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
        'max_iter': [100, 200]
    },
    {
        'penalty': ['l1'],
        'C': [0.01, 0.1, 1, 10],
        'solver': ['liblinear', 'saga'],
        'max_iter': [100, 200]
    },
    {
        'penalty': ['elasticnet'],
        'C': [0.1, 1, 10],
        'solver': ['saga'],
        'l1_ratio': [0.1, 0.5, 0.9],
        'max_iter': [100, 200]
    },
    {
        'penalty': ['none'],
        'solver': ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
        'max_iter': [100, 200]
    }
]


grid_search_cv = GridSearchCV(logistic_regression, param_grid=param_grid, scoring='roc_auc')
grid_search_cv.fit(X_train, Y_train)
print(grid_search_cv.best_params_)

predictions = grid_search_cv.predict(X_test)

{'C': 0.01, 'max_iter': 200, 'penalty': 'l1', 'solver': 'liblinear'}


In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

print('===Accuracy===')
print(accuracy_score(Y_test, predictions))

print('\n===Precision===')
print(precision_score(Y_test, predictions))

print('\n===Recall===')
print(recall_score(Y_test, predictions))

print('\n===ROC AUC===')
print(roc_auc_score(Y_test, predictions))

===Accuracy===
0.883352380952381

===Precision===
0.8789642129764329

===Recall===
0.8573617890793507

===ROC AUC===
0.8808801867215019


In [7]:
sub_df = pd.read_csv('dataset/test.csv')
sub_ids = sub_df['id']
sub_df = sub_df.drop(columns=['id'])

for col in sub_df.columns:
  sub_df[col] = standard_scalers.get(col).transform(np.array(sub_df[col]).reshape(-1, 1))

sub_predictions = grid_search_cv.predict(sub_df)
pd.concat([sub_ids, pd.DataFrame(sub_predictions)], axis=1).to_csv("predictions.csv", index=False, header=['id', 'Heart Disease'])