In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Load Data

In [3]:
# Read the dataset from the CSV file and preview its first rows.
df = pd.read_csv('data/heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [4]:
# Show column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [5]:
# Count the missing entries in each column
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

In [8]:
!pip install tqdm
from tqdm.auto import tqdm



  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Collect every row that repeats an earlier row, ordered by all columns,
# and report how many there are.
duplicated_rows = df.loc[df.duplicated()].sort_values(by=df.columns.tolist())
print(f"Number of duplicated rows: {len(duplicated_rows)}")

Number of duplicated rows: 723


Split the dataset

In [10]:
# Hold out 20% of the rows for testing, then split 25% of the remainder
# off as the validation set. df_full_train keeps its target column.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Renumber the rows of each split from zero.
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

# Keep the labels of each split as arrays ...
y_train, y_val, y_test = (
    split['target'].values for split in (df_train, df_val, df_test)
)

# ... and strip the target column from the feature frames.
df_train, df_val, df_test = (
    split.drop(columns='target') for split in (df_train, df_val, df_test)
)

In [11]:
# Continuous measurements.
numerical = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
# Discrete codes. `sex` is a 0/1 indicator (see the describe() output above),
# so it belongs with the categorical columns rather than the numerical ones.
# The combined feature set is unchanged.
categorical = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

In [12]:
def train(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic regression on the training data.

    Parameters
    ----------
    df_train : DataFrame
        Must contain the columns listed in the module-level ``categorical``
        and ``numerical`` lists; any other columns are ignored.
    y_train : array-like
        Target values aligned with the rows of ``df_train``.
    C : float, default 1.0
        Forwarded to ``LogisticRegression(C=...)``.

    Returns
    -------
    tuple
        ``(dv, model)``: the fitted ``DictVectorizer`` and the fitted
        ``LogisticRegression``.

    NOTE(review): the ``categorical`` columns are integer-coded in this data
    set, and DictVectorizer only one-hot encodes string values, so they
    presumably enter the model as plain numbers. Confirm this is intended.
    """
    feature_columns = categorical + numerical
    records = df_train[feature_columns].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(records)

    # The iteration budget is ten times the number of training rows.
    model = LogisticRegression(C=C, max_iter=len(X_train) * 10).fit(X_train, y_train)

    return dv, model


In [13]:
# Fit the vectorizer and the model on the training split with C=1.0.
dv, model = train(df_train, y_train, C=1.0)

In [14]:
def predict(df, dv, model):
    """Score the rows of ``df`` with a fitted vectorizer and model.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns listed in the module-level ``categorical``
        and ``numerical`` lists.
    dv : fitted DictVectorizer used to encode the rows.
    model : fitted classifier exposing ``predict_proba``.

    Returns
    -------
    ndarray
        Second column of ``model.predict_proba`` (the probability of
        ``model.classes_[1]``), one value per row of ``df``.
    """
    records = df[categorical + numerical].to_dict(orient='records')
    probabilities = model.predict_proba(dv.transform(records))
    return probabilities[:, 1]

In [15]:
# Score the validation split with the model fitted on the training split.
y_pred = predict(df_val, dv, model)

# ROC AUC of the predicted probabilities against the validation labels.
auc = roc_auc_score(y_val, y_pred)
print(f"AUC on validation set: {auc:.3f}")

AUC on validation set: 0.914


In [16]:
# K-fold cross-validation over df_full_train to compare values of C.
n_splits = 5

# The folds depend only on n_splits and the integer seed, so the splitter is
# built once instead of once per candidate C; every C sees the same folds.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=1)

for C in tqdm([0.001, 0.01, 0.1, 1, 10, 100]):
    aucs = []
    for train_index, val_index in kf.split(df_full_train):
        # Fold-specific names keep the hold-out df_train / df_val / y_train /
        # y_val / dv / model from the earlier cells intact after this loop.
        fold_train = df_full_train.iloc[train_index]
        fold_val = df_full_train.iloc[val_index]

        fold_y_train = fold_train['target'].values
        fold_y_val = fold_val['target'].values

        # Drop the label from the feature frames without mutating the slices.
        fold_dv, fold_model = train(fold_train.drop(columns='target'), fold_y_train, C=C)
        fold_pred = predict(fold_val.drop(columns='target'), fold_dv, fold_model)
        aucs.append(roc_auc_score(fold_y_val, fold_pred))
    print(f"C={C}: AUC={np.mean(aucs):.3f} +/- {np.std(aucs):.3f}")

 17%|█▋        | 1/6 [00:00<00:01,  2.91it/s]

C=0.001: AUC=0.799 +/- 0.023


 33%|███▎      | 2/6 [00:00<00:01,  2.61it/s]

C=0.01: AUC=0.888 +/- 0.024


 50%|█████     | 3/6 [00:01<00:01,  1.86it/s]

C=0.1: AUC=0.915 +/- 0.028


 67%|██████▋   | 4/6 [00:02<00:01,  1.44it/s]

C=1: AUC=0.917 +/- 0.027


 83%|████████▎ | 5/6 [00:03<00:00,  1.36it/s]

C=10: AUC=0.917 +/- 0.026


100%|██████████| 6/6 [00:04<00:00,  1.33it/s]

C=100: AUC=0.917 +/- 0.026





Choose C=0.1 first and examine the confusion matrix

In [17]:
# Refit on the full training split with C=0.1 and score the test split.
dv, model = train(df_full_train, df_full_train['target'].values, C=0.1)
y_pred = predict(df_test, dv, model)
auc = roc_auc_score(y_test, y_pred)
print(f"AUC on test set: {auc:.3f}\n")

# Decision threshold applied to the predicted probabilities.
t = 0.6

# Boolean masks for the actual and the predicted classes.
actual_positive, actual_negative = (y_test == 1), (y_test == 0)
predicted_positive, predicted_negative = (y_pred >= t), (y_pred < t)

# Count each cell of the confusion matrix.
tp = np.sum(predicted_positive & actual_positive)
fp = np.sum(predicted_positive & actual_negative)
fn = np.sum(predicted_negative & actual_positive)
tn = np.sum(predicted_negative & actual_negative)

print(f"Confusion Matrix (threshold={t}):")
print(f"TP: {tp}, FP: {fp}")
print(f"FN: {fn}, TN: {tn}")

AUC on test set: 0.920

Confusion Matrix (threshold=0.6):
TP: 81, FP: 23
FN: 15, TN: 86


Choose C=1 next and examine the confusion matrix

In [18]:
# Refit on the full training split with C=1 and score the test split.
dv, model = train(df_full_train, df_full_train['target'].values, C=1)
y_pred = predict(df_test, dv, model)
auc = roc_auc_score(y_test, y_pred)
print(f"AUC on test set: {auc:.3f}\n")

# Decision threshold applied to the predicted probabilities.
t = 0.55

# Boolean masks for the actual and the predicted classes.
actual_positive, actual_negative = (y_test == 1), (y_test == 0)
predicted_positive, predicted_negative = (y_pred >= t), (y_pred < t)

# Count each cell of the confusion matrix.
tp = np.sum(predicted_positive & actual_positive)
fp = np.sum(predicted_positive & actual_negative)
fn = np.sum(predicted_negative & actual_positive)
tn = np.sum(predicted_negative & actual_negative)

print(f"Confusion Matrix (threshold={t}):")
print(f"TP: {tp}, FP: {fp}")
print(f"FN: {fn}, TN: {tn}")

AUC on test set: 0.921

Confusion Matrix (threshold=0.55):
TP: 84, FP: 23
FN: 12, TN: 86


Choose C=10 next and examine the confusion matrix

In [19]:
# Refit on the full training split with C=10 and score the test split.
dv, model = train(df_full_train, df_full_train['target'].values, C=10)
y_pred = predict(df_test, dv, model)
auc = roc_auc_score(y_test, y_pred)
print(f"AUC on test set: {auc:.3f}\n")

# Decision threshold applied to the predicted probabilities.
t = 0.55

# Boolean masks for the actual and the predicted classes.
actual_positive, actual_negative = (y_test == 1), (y_test == 0)
predicted_positive, predicted_negative = (y_pred >= t), (y_pred < t)

# Count each cell of the confusion matrix.
tp = np.sum(predicted_positive & actual_positive)
fp = np.sum(predicted_positive & actual_negative)
fn = np.sum(predicted_negative & actual_positive)
tn = np.sum(predicted_negative & actual_negative)

print(f"Confusion Matrix (threshold={t}):")
print(f"TP: {tp}, FP: {fp}")
print(f"FN: {fn}, TN: {tn}")

AUC on test set: 0.920

Confusion Matrix (threshold=0.55):
TP: 85, FP: 24
FN: 11, TN: 85


The chosen model: C=1 with threshold 0.55

In [22]:
# Refit the chosen configuration (C=1) on the full training split and score
# the test split.
dv, model = train(df_full_train, df_full_train.target.values, C=1)
y_pred = predict(df_test, dv, model)
auc = roc_auc_score(y_test, y_pred)
print(f"AUC on test set: {auc:.3f}\n")

actual_positive = (y_test == 1)
actual_negative = (y_test == 0)

# Decision threshold applied to the predicted probabilities.
t = 0.55
predicted_positive = (y_pred >= t)
predicted_negative = (y_pred < t)

tp = (predicted_positive & actual_positive).sum()
tn = (predicted_negative & actual_negative).sum()
fp = (predicted_positive & actual_negative).sum()
fn = (predicted_negative & actual_positive).sum()

# The unused total that used to be assigned to `sum` here is removed: it
# shadowed the builtin sum() for every later cell.

# Each row is normalised by the number of *predicted* cases of that class:
# the first line divides by tp + fp, the second by fn + tn.
print(f"Confusion Matrix (threshold={t}):")
print(f"TP: {(tp/(tp+fp)).round(2)}, FP: {(fp/(tp+fp)).round(2)}")
print(f"FN: {(fn/(fn+tn)).round(2)}, TN: {(tn/(fn+tn)).round(2)}")

AUC on test set: 0.921

Confusion Matrix (threshold=0.55):
TP: 0.79, FP: 0.21
FN: 0.12, TN: 0.88


Try training the model another way (after removing the duplicated rows)

In [28]:
# Display duplicated rows
duplicated_rows = df[df.duplicated()].sort_values(by=list(df.columns))
print(f"Number of duplicated rows: {len(duplicated_rows)}")
#print("\nThe duplicated rows:")
#duplicated_rows                                                                                           

Number of duplicated rows: 723


In [31]:
# Keep one copy of each distinct row and renumber the index from zero.
df = df.drop_duplicates(ignore_index=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302 entries, 0 to 301
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       302 non-null    int64  
 1   sex       302 non-null    int64  
 2   cp        302 non-null    int64  
 3   trestbps  302 non-null    int64  
 4   chol      302 non-null    int64  
 5   fbs       302 non-null    int64  
 6   restecg   302 non-null    int64  
 7   thalach   302 non-null    int64  
 8   exang     302 non-null    int64  
 9   oldpeak   302 non-null    float64
 10  slope     302 non-null    int64  
 11  ca        302 non-null    int64  
 12  thal      302 non-null    int64  
 13  target    302 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.2 KB


In [32]:
# Split the dataset
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Reset indexes
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Separate target variable
y_train = df_train.target.values
y_val = df_val.target.values
y_test = df_test.target.values

# Remove target from dataframes
del df_train['target']
del df_val['target']
del df_test['target']

In [33]:
# K-fold cross-validation over df_full_train to compare values of C.
n_splits = 5

# The folds depend only on n_splits and the integer seed, so the splitter is
# built once instead of once per candidate C; every C sees the same folds.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=1)

for C in tqdm([0.001, 0.01, 0.1, 1, 10, 100]):
    aucs = []
    for train_index, val_index in kf.split(df_full_train):
        # Fold-specific names keep the hold-out df_train / df_val / y_train /
        # y_val / dv / model from the earlier cells intact after this loop.
        fold_train = df_full_train.iloc[train_index]
        fold_val = df_full_train.iloc[val_index]

        fold_y_train = fold_train['target'].values
        fold_y_val = fold_val['target'].values

        # Drop the label from the feature frames without mutating the slices.
        fold_dv, fold_model = train(fold_train.drop(columns='target'), fold_y_train, C=C)
        fold_pred = predict(fold_val.drop(columns='target'), fold_dv, fold_model)
        aucs.append(roc_auc_score(fold_y_val, fold_pred))
    print(f"C={C}: AUC={np.mean(aucs):.3f} +/- {np.std(aucs):.3f}")

 33%|███▎      | 2/6 [00:00<00:00,  6.43it/s]

C=0.001: AUC=0.755 +/- 0.025
C=0.01: AUC=0.837 +/- 0.020


 50%|█████     | 3/6 [00:00<00:00,  3.65it/s]

C=0.1: AUC=0.905 +/- 0.036


 67%|██████▋   | 4/6 [00:01<00:00,  2.41it/s]

C=1: AUC=0.908 +/- 0.048


 83%|████████▎ | 5/6 [00:02<00:00,  1.68it/s]

C=10: AUC=0.907 +/- 0.052


100%|██████████| 6/6 [00:03<00:00,  1.90it/s]

C=100: AUC=0.907 +/- 0.052





In [37]:
# Refit with C=1 on the full training split and score the test split.
dv, model = train(df_full_train, df_full_train.target.values, C=1)
y_pred = predict(df_test, dv, model)
auc = roc_auc_score(y_test, y_pred)
print(f"AUC on test set: {auc:.3f}\n")

actual_positive = (y_test == 1)
actual_negative = (y_test == 0)

# Decision threshold applied to the predicted probabilities.
t = 0.55
predicted_positive = (y_pred >= t)
predicted_negative = (y_pred < t)

tp = (predicted_positive & actual_positive).sum()
tn = (predicted_negative & actual_negative).sum()
fp = (predicted_positive & actual_negative).sum()
fn = (predicted_negative & actual_positive).sum()

# The unused total that used to be assigned to `sum` here is removed: it
# shadowed the builtin sum() for every later cell.

# Each row is normalised by the number of *predicted* cases of that class:
# the first line divides by tp + fp, the second by fn + tn.
print(f"Confusion Matrix (threshold={t}):")
print(f"TP: {(tp/(tp+fp)).round(2)}, FP: {(fp/(tp+fp)).round(2)}")
print(f"FN: {(fn/(fn+tn)).round(2)}, TN: {(tn/(fn+tn)).round(2)}")

AUC on test set: 0.864

Confusion Matrix (threshold=0.55):
TP: 0.81, FP: 0.19
FN: 0.33, TN: 0.67
