In [None]:
!pip install ucimlrepo



In [None]:
import pandas as pd
import numpy as np
import joblib

from ucimlrepo import fetch_ucirepo

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from imblearn.over_sampling import SMOTE

from sklearn.impute import KNNImputer

from xgboost import XGBClassifier

from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from google.colab import drive
# Mount Google Drive at /content/drive; the model artifact written at the end
# of this notebook is saved under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Download dataset id 45 from the UCI ML Repository.
heart_disease = fetch_ucirepo(id=45)

# Unpack the feature and target tables (pandas DataFrames) in one step.
X, y = heart_disease.data.features, heart_disease.data.targets

In [None]:
# Display options: show every column, but cap the number of rows so that
# displaying a frame does not dump the whole dataset into the notebook output.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', None)

# Preview the first rows of the feature frame.
X.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0
5,56,1,2,120,236,0,0,178,0,0.8,1,0.0,3.0
6,62,0,4,140,268,0,2,160,0,3.6,3,2.0,3.0
7,57,0,4,120,354,0,0,163,1,0.6,1,0.0,3.0
8,63,1,4,130,254,0,2,147,0,1.4,2,1.0,7.0
9,53,1,4,140,203,1,2,155,1,3.1,3,0.0,7.0


In [None]:
# Inspect the dtype of every feature column.
X.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca          float64
thal        float64
dtype: object

In [None]:
# Display the target frame (single column `num`).
y

Unnamed: 0,num
0,0
1,2
2,1
3,0
4,0
5,0
6,3
7,0
8,2
9,1


In [None]:
# Count the rows per target value to see how the classes are distributed.
y.value_counts()

num
0      164
1       55
2       36
3       35
4       13
Name: count, dtype: int64

In [None]:
# Fill missing feature values with a k-nearest-neighbours imputer
# (5 neighbours, uniform weights, NaN-aware Euclidean distance).
imputer = KNNImputer(
    metric='nan_euclidean',
    weights='uniform',
    n_neighbors=5,
)

# fit_transform returns a plain array, so re-attach the original column labels.
imputed_X = pd.DataFrame(data=imputer.fit_transform(X), columns=X.columns)

In [None]:
# Sanity check: count the remaining missing values per column after imputation.
imputed_X.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
dtype: int64

In [None]:
# Column groups: categorical features are label-encoded later,
# numerical features are min-max scaled later.
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']

# Slice the imputed frame into the two groups.
categorical_data = imputed_X.loc[:, categorical_cols]
numerical_data = imputed_X.loc[:, numerical_cols]

In [None]:
# Print a heading followed by the categorical slice on the next line.
print("Categorical DataFrame:", categorical_data, sep="\n")

Categorical DataFrame:
     sex   cp  fbs  restecg  exang  slope  thal
0    1.0  1.0  1.0      2.0    0.0    3.0   6.0
1    1.0  4.0  0.0      2.0    1.0    2.0   3.0
2    1.0  4.0  0.0      2.0    1.0    2.0   7.0
3    1.0  3.0  0.0      0.0    0.0    3.0   3.0
4    0.0  2.0  0.0      2.0    0.0    1.0   3.0
5    1.0  2.0  0.0      0.0    0.0    1.0   3.0
6    0.0  4.0  0.0      2.0    0.0    3.0   3.0
7    0.0  4.0  0.0      0.0    1.0    1.0   3.0
8    1.0  4.0  0.0      2.0    0.0    2.0   7.0
9    1.0  4.0  1.0      2.0    1.0    3.0   7.0
10   1.0  4.0  0.0      0.0    0.0    2.0   6.0
11   0.0  2.0  0.0      2.0    0.0    2.0   3.0
12   1.0  3.0  1.0      2.0    1.0    2.0   6.0
13   1.0  2.0  0.0      0.0    0.0    1.0   7.0
14   1.0  3.0  1.0      0.0    0.0    1.0   7.0
15   1.0  3.0  0.0      0.0    0.0    1.0   3.0
16   1.0  2.0  0.0      0.0    0.0    3.0   7.0
17   1.0  4.0  0.0      0.0    0.0    1.0   3.0
18   0.0  3.0  0.0      0.0    0.0    1.0   3.0
19   1.0  2.0  0.

In [None]:
# Print a heading (preceded by a blank line) followed by the numerical slice.
print("\nNumerical DataFrame:", numerical_data, sep="\n")


Numerical DataFrame:
      age  trestbps   chol  thalach  oldpeak   ca
0    63.0     145.0  233.0    150.0      2.3  0.0
1    67.0     160.0  286.0    108.0      1.5  3.0
2    67.0     120.0  229.0    129.0      2.6  2.0
3    37.0     130.0  250.0    187.0      3.5  0.0
4    41.0     130.0  204.0    172.0      1.4  0.0
5    56.0     120.0  236.0    178.0      0.8  0.0
6    62.0     140.0  268.0    160.0      3.6  2.0
7    57.0     120.0  354.0    163.0      0.6  0.0
8    63.0     130.0  254.0    147.0      1.4  1.0
9    53.0     140.0  203.0    155.0      3.1  0.0
10   57.0     140.0  192.0    148.0      0.4  0.0
11   56.0     140.0  294.0    153.0      1.3  0.0
12   56.0     130.0  256.0    142.0      0.6  1.0
13   44.0     120.0  263.0    173.0      0.0  0.0
14   52.0     172.0  199.0    162.0      0.5  0.0
15   57.0     150.0  168.0    174.0      1.6  0.0
16   48.0     110.0  229.0    168.0      1.0  0.0
17   54.0     140.0  239.0    160.0      1.2  0.0
18   48.0     130.0  275.0  

In [None]:
def _snap_to_observed(column):
    """Map every value of `column` to the closest category observed in the raw data.

    KNN imputation averages neighbouring rows, so an imputed categorical cell
    can hold a fractional value. Label-encoding such a value would invent an
    extra category, so it is snapped back to the nearest value that occurs in
    the non-missing entries of the same column of `X`.
    """
    observed = np.sort(X[column.name].dropna().unique())
    # Distance of every cell to every observed category; pick the closest one.
    nearest = np.abs(column.to_numpy()[:, None] - observed[None, :]).argmin(axis=1)
    return pd.Series(observed[nearest], index=column.index, name=column.name)


encoder = LabelEncoder()

# Snap first, then encode each column independently to consecutive integers.
# `encoder` is refitted per column, so afterwards it holds the last column's classes.
encoded_categorical_data = (
    categorical_data
    .apply(_snap_to_observed)
    .apply(encoder.fit_transform)
)
encoded_categorical_data

Unnamed: 0,sex,cp,fbs,restecg,exang,slope,thal
0,1,0,1,2,0,2,3
1,1,3,0,2,1,1,0
2,1,3,0,2,1,1,4
3,1,2,0,0,0,2,0
4,0,1,0,2,0,0,0
5,1,1,0,0,0,0,0
6,0,3,0,2,0,2,0
7,0,3,0,0,1,0,0
8,1,3,0,2,0,1,4
9,1,3,1,2,1,2,4


In [None]:
# Rescale every numerical feature to the range [0, 3].
scaler = MinMaxScaler(feature_range=(0, 3))

# The scaler returns a bare array; wrap it in a frame carrying the same
# column labels as the input.
numerical_scaled_data = pd.DataFrame(
    scaler.fit_transform(numerical_data),
    columns=list(numerical_data.columns),
)

In [None]:
# Display the scaled numerical features.
numerical_scaled_data

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,ca
0,2.125,1.443396,0.732877,1.80916,1.112903,0.0
1,2.375,1.867925,1.09589,0.847328,0.725806,3.0
2,2.375,0.735849,0.705479,1.328244,1.258065,2.0
3,0.5,1.018868,0.849315,2.656489,1.693548,0.0
4,0.75,1.018868,0.534247,2.312977,0.677419,0.0
5,1.6875,0.735849,0.753425,2.450382,0.387097,0.0
6,2.0625,1.301887,0.972603,2.038168,1.741935,2.0
7,1.75,0.735849,1.561644,2.10687,0.290323,0.0
8,2.125,1.018868,0.876712,1.740458,0.677419,1.0
9,1.5,1.301887,0.527397,1.923664,1.5,0.0


In [None]:
# Column layout to restore after merging the two processed groups.
order = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

# Put the encoded categorical and scaled numerical columns side by side ...
X_merged = pd.concat((encoded_categorical_data, numerical_scaled_data), axis='columns')

# ... and arrange them in the layout above.
X_merged_reordered = X_merged.reindex(columns=order)

In [None]:
# Display the final feature matrix passed to the classifier.
X_merged_reordered

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,2.125,1,0,1.443396,0.732877,1,2,1.80916,0,1.112903,2,0.0,3
1,2.375,1,3,1.867925,1.09589,0,2,0.847328,1,0.725806,1,3.0,0
2,2.375,1,3,0.735849,0.705479,0,2,1.328244,1,1.258065,1,2.0,4
3,0.5,1,2,1.018868,0.849315,0,0,2.656489,0,1.693548,2,0.0,0
4,0.75,0,1,1.018868,0.534247,0,2,2.312977,0,0.677419,0,0.0,0
5,1.6875,1,1,0.735849,0.753425,0,0,2.450382,0,0.387097,0,0.0,0
6,2.0625,0,3,1.301887,0.972603,0,2,2.038168,0,1.741935,2,2.0,0
7,1.75,0,3,0.735849,1.561644,0,0,2.10687,1,0.290323,0,0.0,0
8,2.125,1,3,1.018868,0.876712,0,2,1.740458,0,0.677419,1,1.0,4
9,1.5,1,3,1.301887,0.527397,1,2,1.923664,1,1.5,2,0.0,4


In [None]:
# Flatten the target to a 1-D array for scikit-learn. np.asarray accepts both
# the original DataFrame and an already-flattened array, so re-running this
# cell does not fail the way `y.values` would on an ndarray.
y = np.asarray(y).ravel()

In [None]:
# Baseline model: XGBoost with default hyperparameters and a fixed seed so
# reruns are reproducible.
clf = XGBClassifier(random_state=42)

# Scorer names for the metrics reported below. The list is not consumed in
# this cell; it is kept because other cells may reference it.
scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted',
           'precision', 'recall', 'f1']

# Out-of-fold predictions from 5-fold cross-validation. cross_val_predict fits
# clones of `clf`, so `clf` itself is still unfitted after this call.
# NOTE(review): the imputer and scaler were fitted on the full dataset in
# earlier cells, so each fold has seen statistics from its held-out rows.
y_pred = cross_val_predict(clf, X_merged_reordered, y, cv=5)

# Metrics over the pooled out-of-fold predictions. The micro-averaged
# precision/recall/F1 coincide with accuracy here (see the printed output).
# NOTE(review): zero_division=1 scores a class that is never predicted as
# precision 1.0 - confirm this is intended.
scores = {
    'accuracy': accuracy_score(y, y_pred),
    'precision_weighted': precision_score(y, y_pred, average='weighted', zero_division=1),
    'recall_weighted': recall_score(y, y_pred, average='weighted', zero_division=1),
    'f1_weighted': f1_score(y, y_pred, average='weighted'),
    'precision': precision_score(y, y_pred, average='micro', zero_division=1),
    'recall': recall_score(y, y_pred, average='micro', zero_division=1),
    'f1': f1_score(y, y_pred, average='micro')
}

print(f"\nClassifier: {clf.__class__.__name__}")
for metric, score in scores.items():
    print(f"{metric.replace('_', ' ').capitalize()}: {score:.4f}")
print('-' * 30)

# Fit on the full dataset before persisting; without this the saved artifact
# would be an untrained estimator that cannot make predictions when loaded.
clf.fit(X_merged_reordered, y)

joblib.dump(clf, f'/content/drive/MyDrive/CS345_Project/{clf.__class__.__name__}-Baseline.joblib')


Classifier: XGBClassifier
Accuracy: 0.5809
Precision weighted: 0.5386
Recall weighted: 0.5809
F1 weighted: 0.5566
Precision: 0.5809
Recall: 0.5809
F1: 0.5809
------------------------------


['/content/drive/MyDrive/CS345_Project/XGBClassifier-Baseline.joblib']