In [None]:
import pandas as pd

In [None]:
# Load the training data, decoding the file as latin1.
df = pd.read_csv(
    "heart_train.csv",
    encoding="latin1",
)

In [None]:
# (rows, columns) of the raw frame.
print(f"Shape: {df.shape}")

In [None]:
# Preview the first two rows of the raw frame.
df.head(n=2)

In [None]:
# Preview the last two rows of the raw frame.
df.tail(n=2)

In [None]:
# List every column label as a plain Python list.
print(f"\nColumn names: {df.columns.tolist()}")

In [None]:
# Per-column count of missing entries (isna is the alias of isnull).
print("\nMissing values:\n", df.isna().sum())

In [None]:
# Heading and dtype listing emitted on separate lines by a single print call.
print("\nColumn data types:", df.dtypes, sep="\n")

In [None]:
# Frequency of each distinct value in the HeartDisease target column.
print(
    "\nValue counts for target column:",
    df["HeartDisease"].value_counts(),
    sep="\n",
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Labels of the int64/float64 columns; consumed by the boxplot and z-score cells.
# NOTE(review): this also picks up HeartDisease if the target is stored as a number.
numeric_dtypes = ['int64', 'float64']
numerical_columns = df.select_dtypes(include=numeric_dtypes).columns

In [None]:
# One boxplot per numeric column on a single wide axes.
fig, ax = plt.subplots(figsize=(15, 8))
df[numerical_columns].boxplot(ax=ax)
ax.set_title("Boxplots for Numerical Features")
# Tilt the column labels so long names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
from scipy.stats import zscore

In [None]:
# Flag rows whose absolute z-score exceeds the threshold in any numeric FEATURE.
Z_SCORE_THRESHOLD = 3

# The target must not take part in outlier detection: rows would otherwise be
# dropped because of their label (for a rare positive class, every positive
# row has |z| > 3 and would be removed).
outlier_columns = numerical_columns.drop("HeartDisease", errors="ignore")

# nan_policy="omit" computes mean/std over the non-missing values. With the
# default ("propagate") a single NaN turns the whole column's z-scores into
# NaN, which silently disables the check for that column.
z_scores = df[outlier_columns].apply(zscore, nan_policy="omit")

# NaN z-scores compare as False, so missing cells never flag a row.
outliers = (z_scores.abs() > Z_SCORE_THRESHOLD).any(axis=1)
print("Number of rows with outliers:", outliers.sum())

In [None]:
# Keep only the rows not flagged as outliers and renumber the index from 0.
keep_mask = ~outliers
df_clean = df.loc[keep_mask].reset_index(drop=True)

In [None]:
# (rows, columns) after outlier removal.
print(f"Shape: {df_clean.shape}")

In [None]:
# One-hot encode the categorical columns; drop_first removes the first level of
# each so the remaining dummies are not perfectly collinear.
df_encoded = pd.get_dummies(data=df_clean, drop_first=True)
print(f"Encoded dataframe shape: {df_encoded.shape}")

In [None]:
# Preview the first five encoded rows (5 is head's default).
df_encoded.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Separate the encoded features from the target.
X = df_encoded.drop("HeartDisease", axis=1)
y = df_encoded["HeartDisease"]

# Only int64/float64 features are standardized; columns of any other dtype
# are passed through unchanged.
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns

# Split BEFORE scaling. Fitting the scaler on the full frame lets the held-out
# rows influence the mean/variance used for preprocessing (data leakage), which
# biases the test metrics and the scaler that is pickled later on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Work on copies so the column assignments below never write into slices of X.
X_train = X_train.copy()
X_test = X_test.copy()

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the test rows.
scaler = StandardScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

# Retained so anything that still refers to X_scaled keeps working; it now
# holds all rows transformed with the train-fitted scaler.
X_scaled = X.copy()
X_scaled[numerical_columns] = scaler.transform(X[numerical_columns])

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier

In [None]:
# Baseline classifiers, all seeded with the same random_state.
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
}

# Fit each candidate and report train/test accuracy plus the per-class test report.
for name in models:
    model = models[name]
    print(f"\n===== {name} =====")

    model.fit(X_train, y_train)
    y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print(f"Train Accuracy: {train_accuracy:.4f}")
    print(f"Test Accuracy:  {test_accuracy:.4f}")

    print(
        "Classification Report (Test):",
        classification_report(y_test, y_test_pred),
        sep="\n",
    )

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Base estimator whose hyperparameters are tuned below.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)

# 2 * 3 * 3 * 2 = 36 candidate settings, each scored with 5-fold CV.
param_grid = dict(
    n_estimators=[100, 150],
    max_depth=[3, 5, 7],
    learning_rate=[0.05, 0.1, 0.2],
    subsample=[0.8, 1.0],
)

grid_search = GridSearchCV(
    xgb,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# The estimator exposed by the search as its best configuration.
best_xgb = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

from sklearn.metrics import classification_report, accuracy_score

# Score the tuned model on the held-out split.
y_pred = best_xgb.predict(X_test)
tuned_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy after tuning: {tuned_accuracy:.4f}")
print(
    "Classification Report (Test):",
    classification_report(y_test, y_pred),
    sep="\n",
)

In [None]:
from sklearn.metrics import confusion_matrix, roc_curve, auc

In [None]:
# Recompute the predictions so this cell does not depend on the tuning cell's y_pred.
y_pred = best_xgb.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap: rows are true labels, columns are predicted labels.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
# Scores taken from column 1 of predict_proba
# (presumably the positive class for 0/1 labels; confirm against best_xgb.classes_).
y_proba = best_xgb.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
# Dashed diagonal as the chance-level reference line.
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate (Recall)")
ax.set_title("ROC Curve")
ax.legend()
ax.grid()
plt.show()

In [None]:
import json

In [None]:
# Record the feature column order and which of those columns were scaled,
# so the same layout can be rebuilt when the saved model is used.
model_metadata = dict(
    columns=X.columns.tolist(),
    numerical_columns=numerical_columns.tolist(),
)

with open("model_columns.json", "w") as metadata_file:
    metadata_file.write(json.dumps(model_metadata))

print("model_columns.json saved.")

In [None]:
import pickle

In [None]:
# Persist the fitted scaler next to the notebook.
# NOTE: only unpickle this file from a trusted source.
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

print("scaler.pkl saved.")

In [None]:
# Persist the tuned XGBoost estimator.
# NOTE: only unpickle this file from a trusted source.
with open("best_xgb_model.pkl", "wb") as model_file:
    pickle.dump(best_xgb, model_file)

print("Model saved as best_xgb_model.pkl")