In [39]:
import pandas as pd

In [40]:
# Load CSV file into a DataFrame
df = pd.read_csv('../data/cleaveland.csv')

# Print the first few rows of the DataFrame to check if it loaded correctly
print(df.head(10))

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope   
0   63    1   1       145   233    1        2      150      0      2.3      3  \
1   67    1   4       160   286    0        2      108      1      1.5      2   
2   67    1   4       120   229    0        2      129      1      2.6      2   
3   37    1   3       130   250    0        0      187      0      3.5      3   
4   41    0   2       130   204    0        2      172      0      1.4      1   
5   56    1   2       120   236    0        0      178      0      0.8      1   
6   62    0   4       140   268    0        2      160      0      3.6      3   
7   57    0   4       120   354    0        0      163      1      0.6      1   
8   63    1   4       130   254    0        2      147      0      1.4      2   
9   53    1   4       140   203    1        2      155      1      3.1      3   

  ca thal  num  
0  0    6    0  
1  3    3    2  
2  2    7    1  
3  0    3    0  
4  0    3    0  
5  0  

In [41]:
columns = list(df.columns)

columns.remove("num")

In [42]:
missing_cols = []

for column in columns:

    try :

        col = df[column]

        # Use isna() to check for missing values
        miss_val_count = col.value_counts()["?"]

        print(miss_val_count,"- missing values in column ",column)
        missing_cols.append(column)
    
    except : 
        continue

4 - missing values in column  ca
2 - missing values in column  thal


In [43]:
for column in missing_cols:

    mode = df[column].mode()[0]

    df[column] = df[column].replace('?', mode).astype('int64')

In [44]:
df_scaled = df.copy()

# Iterate over the columns of the DataFrame
for column in columns:
    # Find the minimum and maximum values of the column
    col_min = df[column].min()
    col_max = df[column].max()

    # Apply min-max scaling to the column
    df_scaled[column] = (df[column] - col_min) / (col_max - col_min)

In [45]:
df_scaled.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,0.529978,0.679868,0.719472,0.355564,0.275555,0.148515,0.49505,0.600055,0.326733,0.167678,0.30033,0.221122,0.430693,0.937294
std,0.188305,0.467299,0.320042,0.166035,0.118212,0.356198,0.497486,0.174618,0.469794,0.18727,0.308113,0.311458,0.484596,1.228536
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.395833,0.0,0.666667,0.245283,0.194064,0.0,0.0,0.477099,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.5625,1.0,0.666667,0.339623,0.262557,0.0,0.5,0.625954,0.0,0.129032,0.5,0.0,0.0,0.0
75%,0.666667,1.0,1.0,0.433962,0.340183,0.0,1.0,0.725191,1.0,0.258065,0.5,0.333333,1.0,2.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0


In [46]:
from sklearn.model_selection import KFold

num_folds = 5

# Split the dataset into folds using KFold
kf = KFold(n_splits=num_folds, shuffle=True)
fold_indices = kf.split(df_scaled)

In [47]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [48]:
for fold, (train_indices, test_indices) in enumerate(fold_indices):

    train_data = df_scaled.iloc[train_indices]
    test_data = df_scaled.iloc[test_indices]

    train_X = train_data.drop('num', axis=1)
    train_y = train_data['num']

    test_X = test_data.drop('num', axis=1)
    test_y = test_data['num']

    xgb_model = xgb.XGBClassifier()
    xgb_model.fit(train_X, train_y)

    y_pred = xgb_model.predict(test_X)

    accuracy = accuracy_score(test_y, y_pred)

    print('Accuracy:', accuracy)


Accuracy: 0.5573770491803278
Accuracy: 0.5245901639344263
Accuracy: 0.4262295081967213
Accuracy: 0.55
Accuracy: 0.65
