In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the dataset CSV (path is relative to the notebook's working directory).
csv_path = "heart_statlog_cleveland_hungary_final.csv"
df = pd.read_csv(csv_path)

# Print (rows, columns); the trailing expression renders a preview of the first rows.
print(df.shape)
df.head()


(1190, 12)


Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,40,1,2,140,289,0,0,172,0,0.0,1,0
1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,54,1,3,150,195,0,0,122,0,0.0,1,0


In [None]:
# The label is taken to be whatever column sits last in the frame.
target_col = df.columns[-1]

# Labels first, then the feature matrix (every column except the label).
# NOTE(review): X and y are snapshots of df as it is right now; the later
# cells that reassign df (duplicate removal) do not update them.
y = df[target_col]
X = df.drop(columns=[target_col])

# Show which column was picked and how the classes are distributed.
print("Target column:", target_col)
print("Target value counts:\n", y.value_counts())


Target column: target
Target value counts:
 target
1    629
0    561
Name: count, dtype: int64


In [None]:
# Strip leading/trailing spaces from column names (the wording itself is kept).
df.columns = [c.strip() for c in df.columns]

# Text (object) columns: trim whitespace and turn obvious "missing" placeholders
# into NaN. Tokens are listed in lowercase and compared against the lowercased
# value, so "NA", "N/A", "NULL" and "None" (what astype(str) produces for a
# real None) are caught as well as their lowercase forms.
missing_tokens = ["", "na", "n/a", "null", "?", "none", "nan"]
obj_cols = df.select_dtypes(include=["object"]).columns
for c in obj_cols:
    # astype(str) also turns genuine NaN into the text "nan", which the mask
    # below converts back to NaN.
    text = df[c].astype(str).str.strip()
    # Series.mask fills matching entries with NaN by default, so this cell
    # does not rely on numpy being imported.
    df[c] = text.mask(text.str.lower().isin(missing_tokens))

print("after name/text tidy:", df.shape)


after name/text tidy: (1190, 12)


In [None]:
# Drop rows that exactly repeat an earlier row and report how many were removed.
before = len(df)
df = df.drop_duplicates()
print(f"removed duplicate rows: {before - len(df)}")

# Flag columns whose values match an earlier column in every row:
# transposing lets the row-wise duplicate check operate on columns.
dupe_cols = df.T.duplicated()
if not dupe_cols.any():
    print("no duplicate columns found")
else:
    cols_to_drop = list(df.columns[dupe_cols])
    # keep only the columns that were not flagged
    df = df.loc[:, ~dupe_cols]
    print("dropped duplicate columns:", cols_to_drop)


removed duplicate rows: 272
no duplicate columns found


In [None]:
# Convert object columns whose every value parses as a number to a numeric dtype.
# pd.to_numeric(errors="ignore") is deprecated (pandas 2.2+), so parse strictly
# and skip the columns that fail instead.
for c in obj_cols:
    if c not in df.columns:
        # obj_cols was collected before the duplicate-column drop; a column
        # removed there must not be looked up again.
        continue
    try:
        df[c] = pd.to_numeric(df[c])
    except (ValueError, TypeError):
        # At least one value is not numeric: leave the column as text.
        continue

print("dtypes after coercion:")
print(df.dtypes)


dtypes after coercion:
age                      int64
sex                      int64
chest pain type          int64
resting bp s             int64
cholesterol              int64
fasting blood sugar      int64
resting ecg              int64
max heart rate           int64
exercise angina          int64
oldpeak                float64
ST slope                 int64
target                   int64
dtype: object
