In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from imblearn.over_sampling import SMOTE
import plotly.express as px
import numpy as np
import optuna
import matplotlib.pyplot as plt
import warnings 
import seaborn as sns

### Read in data

In [2]:
# Load the four processed heart dataset CSVs (paths are relative to the notebook's working directory).
df1 = pd.read_csv('../data/processed/heart_dataset_1_processed.csv')
df2 = pd.read_csv('../data/processed/heart_dataset_2_processed.csv')
df3 = pd.read_csv('../data/processed/heart_dataset_3_processed.csv')
df4 = pd.read_csv('../data/processed/heart_dataset_4_processed.csv')

# DataFrame.info() prints its summary and returns None, so it is called on its
# own; passing it to display() would render a stray "None" after the shape.
df1.info()
display(df1.head(), df1.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  918 non-null    float64
 1   sex                  918 non-null    int64  
 2   chest_pain_type      918 non-null    int64  
 3   resting_bp           918 non-null    float64
 4   cholesterol          918 non-null    float64
 5   fasting_blood_sugar  918 non-null    int64  
 6   resting_ecg          918 non-null    int64  
 7   max_hr               918 non-null    float64
 8   exercise_angina      918 non-null    int64  
 9   oldpeak              918 non-null    float64
 10  st_slope             918 non-null    int64  
 11  HeartDisease         918 non-null    int64  
dtypes: float64(5), int64(7)
memory usage: 86.2 KB


Unnamed: 0,age,sex,chest_pain_type,resting_bp,cholesterol,fasting_blood_sugar,resting_ecg,max_hr,exercise_angina,oldpeak,st_slope,HeartDisease
0,-1.43314,1,0,0.410909,0.82507,0,0,1.382928,0,-0.832432,0,0
1,-0.478484,0,1,1.491752,-0.171961,0,0,0.754157,0,0.105664,1,1
2,-1.751359,1,0,-0.129513,0.770188,0,1,-1.525138,0,-0.832432,0,0
3,-0.584556,0,2,0.302825,0.13904,0,0,-1.132156,1,0.574711,1,1
4,0.051881,1,1,0.951331,-0.034755,0,0,-0.581981,0,-0.832432,0,0


(918, 12)

None

### Collect column keys for comparison

In [3]:
# Grab the column labels of every dataset in one pass so they can be compared.
keys1, keys2, keys3, keys4 = (frame.keys() for frame in (df1, df2, df3, df4))

In [4]:
# Column-name subsets kept for each dataset, one list per dataset.

# NOTE(review): the df1.info() output shown earlier lists a single
# 'chest_pain_type' column, not 'chest_pain_type_0/1/3' — confirm these names
# exist in whatever frame this list is applied to.
selected_keys1 = [
    'age',
    'sex',
    'cholesterol',
    'fasting_blood_sugar',
    'resting_ecg',
    'max_hr',
    'exercise_angina',
    'oldpeak',
    'st_slope',
    'chest_pain_type_0',
    'chest_pain_type_1',
    'chest_pain_type_3',
]

selected_keys2 = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'hypertension',
    'serum_creatinine',
    'serum_sodium',
    'sex',
    'time',
]

# NOTE(review): 'cholestorl' presumably mirrors the spelling of the dataset 3
# column — confirm before "correcting" it.
selected_keys3 = [
    'age',
    'white_blood_cell_count',
    'chest_pain_type',
    'cholestorl',
    'oldpeak',
    'ca',
    'reaction',
]

selected_keys4 = ['age']

### Importing Optuna study histories

In [16]:
# Dataset 1: open the stored Optuna study from its SQLite file.
study_name = "accuracy"
storage_name = f"sqlite:///../optuna_database/{study_name}.db"
dataset1_history = optuna.load_study(
    storage=storage_name,
    study_name=study_name,
)

In [17]:
# Dataset 2: open the stored Optuna study from its SQLite file.
study_name = "accuracy_dataset2"
storage_name = f"sqlite:///../optuna_database/{study_name}.db"
dataset2_history = optuna.load_study(
    storage=storage_name,
    study_name=study_name,
)

In [18]:
# Dataset 3: open the stored Optuna study from its SQLite file.
study_name = "accuracy_dataset3"
storage_name = f"sqlite:///../optuna_database/{study_name}.db"
dataset3_history = optuna.load_study(
    storage=storage_name,
    study_name=study_name,
)

In [19]:
# Dataset 4 (the "no_id" study): open the stored Optuna study from its SQLite file.
study_name = "accuracy_dataset4_no_id"
storage_name = f"sqlite:///../optuna_database/{study_name}.db"
dataset4_history = optuna.load_study(
    storage=storage_name,
    study_name=study_name,
)

In [20]:
# Turn the loaded study into a per-trial DataFrame and pull out the columns of interest.
# `dataset1_history` is rebound from the study object to the DataFrame, so the
# conversion is guarded: re-running this cell would otherwise call
# .trials_dataframe() on a DataFrame and raise AttributeError.
if hasattr(dataset1_history, "trials_dataframe"):
    dataset1_history = dataset1_history.trials_dataframe()
accuracy1 = dataset1_history['value']        # objective value per trial (presumably accuracy, per the study name)
C1 = dataset1_history['params_C']            # value recorded for the parameter named "C" in each trial
trial_number1 = dataset1_history['number']   # trial number

In [21]:
# Turn the loaded study into a per-trial DataFrame and pull out the columns of interest.
# `dataset2_history` is rebound from the study object to the DataFrame, so the
# conversion is guarded: re-running this cell would otherwise call
# .trials_dataframe() on a DataFrame and raise AttributeError.
if hasattr(dataset2_history, "trials_dataframe"):
    dataset2_history = dataset2_history.trials_dataframe()
accuracy2 = dataset2_history['value']        # objective value per trial (presumably accuracy, per the study name)
C2 = dataset2_history['params_C']            # value recorded for the parameter named "C" in each trial
trial_number2 = dataset2_history['number']   # trial number

In [22]:
# Turn the loaded study into a per-trial DataFrame and pull out the columns of interest.
# `dataset3_history` is rebound from the study object to the DataFrame, so the
# conversion is guarded: re-running this cell would otherwise call
# .trials_dataframe() on a DataFrame and raise AttributeError.
if hasattr(dataset3_history, "trials_dataframe"):
    dataset3_history = dataset3_history.trials_dataframe()
accuracy3 = dataset3_history['value']        # objective value per trial (presumably accuracy, per the study name)
C3 = dataset3_history['params_C']            # value recorded for the parameter named "C" in each trial
trial_number3 = dataset3_history['number']   # trial number

In [23]:
# Turn the loaded study into a per-trial DataFrame and pull out the columns of interest.
# `dataset4_history` is rebound from the study object to the DataFrame, so the
# conversion is guarded: re-running this cell would otherwise call
# .trials_dataframe() on a DataFrame and raise AttributeError.
if hasattr(dataset4_history, "trials_dataframe"):
    dataset4_history = dataset4_history.trials_dataframe()
accuracy4 = dataset4_history['value']        # objective value per trial (presumably accuracy, per the study name)
C4 = dataset4_history['params_C']            # value recorded for the parameter named "C" in each trial
trial_number4 = dataset4_history['number']   # trial number

### Separating features from target and splitting into train/test sets

In [25]:
def _split_features_target(frame, extra_drop=(), target='HeartDisease', test_size=0.2, seed=42):
    """Separate features from the target and make a stratified train/test split.

    Parameters
    ----------
    frame : pandas.DataFrame
        Dataset containing the ``target`` column.
    extra_drop : iterable of str, optional
        Additional columns to exclude from the feature matrix.
    target : str, optional
        Name of the label column.
    test_size : float, optional
        Fraction of rows held out for the test set.
    seed : int, optional
        ``random_state`` passed to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X, y, X_train, X_test, y_train, y_test)``.
    """
    X = frame.drop(columns=[target, *extra_drop])
    y = frame[target]
    # Stratify on the label so both partitions keep the same class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed, stratify=y
    )
    return X, y, X_train, X_test, y_train, y_test


# Same split settings for every dataset; only dataset 4 has extra columns to exclude.
X1, y1, X1_train, X1_test, y1_train, y1_test = _split_features_target(df1)
X2, y2, X2_train, X2_test, y2_train, y2_test = _split_features_target(df2)
X3, y3, X3_train, X3_test, y3_train, y3_test = _split_features_target(df3)

non_feature_cols_4 = ['id', 'length_of_stay']
X4, y4, X4_train, X4_test, y4_train, y4_test = _split_features_target(df4, extra_drop=non_feature_cols_4)

# Rebind keys4 to df4's columns without 'id' and 'length_of_stay'
# (the 'HeartDisease' column is still included here).
keys4 = df4.drop(columns=non_feature_cols_4).keys()