In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, classification_report, confusion_matrix, make_scorer


In [9]:
data = pd.read_csv(r"C:\Users\Hp\OneDrive\Documents\project\heart.csv")

In [10]:
# Split the data into features and target
X = data.drop(columns='target')
y = data['target']

In [11]:
# Split into train and test sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [12]:
# Handle outliers using the IQR method
numeric_columns = X_train.select_dtypes(include=[np.number]).columns


In [13]:
for col in numeric_columns:
    Q1 = X_train[col].quantile(0.25)
    Q3 = X_train[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

In [15]:
# Cap the outliers
X_train[col] = np.where(X_train[col] < lower_bound, lower_bound, X_train[col])
X_train[col] = np.where(X_train[col] > upper_bound, upper_bound, X_train[col])


In [21]:
# Perform one-hot encoding on categorical features
categorical_columns = ['cp', 'restecg', 'slope', 'thal', 'ca']  # Adjust as necessary
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')


In [23]:
# Fit and transform the categorical columns in the training data
encoded_categorical_train = encoder.fit_transform(X_train[categorical_columns])

# Transform the categorical columns in the test data
encoded_categorical_test = encoder.transform(X_test[categorical_columns])



In [24]:
# Replace the categorical columns with the encoded ones
X_train_encoded = np.hstack([X_train.drop(columns=categorical_columns).values, encoded_categorical_train])
X_test_encoded = np.hstack([X_test.drop(columns=categorical_columns).values, encoded_categorical_test])

In [25]:
# Standardize the data
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

In [26]:
# Save the preprocessed data
train_data = pd.DataFrame(X_train_scaled)
test_data = pd.DataFrame(X_test_scaled)


In [27]:
# Print the first few rows of the preprocessed training data
print(train_data.head())

         0        1         2         3         4         5         6   \
0  1.610480 -1.46385  0.461840 -0.123235 -0.404304  0.039942 -0.715891   
1  1.282051  0.68313  1.575595  0.023366 -0.404304 -1.373486  1.396861   
2  0.077810 -1.46385 -0.206413 -0.835301 -0.404304 -0.917541  1.396861   
3  0.296763  0.68313 -1.208792  1.887302 -0.404304 -0.324813  1.396861   
4  1.829433 -1.46385 -1.208792  0.421285  2.473388 -0.917541 -0.715891   

         7         8         9   ...        12        13        14        15  \
0  0.604853 -0.451642 -0.644364  ... -0.129641 -0.897758  1.033623 -0.247797   
1 -0.890458  2.214145 -0.644364  ... -0.129641  1.113885 -0.967471  4.035556   
2  0.770998 -0.451642 -0.644364  ...  7.713624  1.113885 -0.967471 -0.247797   
3  1.601727 -0.451642 -0.644364  ... -0.129641  1.113885 -0.967471 -0.247797   
4 -0.890458 -0.451642  1.551918  ... -0.129641 -0.897758  1.033623 -0.247797   

         16        17        18        19        20        21  
0  0.93593

In [38]:
# Train logistic regression model
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8524590163934426

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.82      0.84        28
           1       0.85      0.88      0.87        33

    accuracy                           0.85        61
   macro avg       0.85      0.85      0.85        61
weighted avg       0.85      0.85      0.85        61


Confusion Matrix:
[[23  5]
 [ 4 29]]


In [40]:
# Define models and hyperparameter grids for tuning
models = {
    "Logistic Regression": {
        "model": LogisticRegression(random_state=42, max_iter=1000),
        "params": {"C": [0.1, 1, 10], "solver": ["lbfgs", "liblinear"]}
    },
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20], "min_samples_split": [2, 5, 10]}
    },
    "SVC": {
        "model": SVC(random_state=42),
        "params": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]}
    },
    "KNN": {
        "model": KNeighborsClassifier(),
        "params": {"n_neighbors": [3, 5, 7], "weights": ["uniform", "distance"]}
    }
}

# Perform hyperparameter tuning and evaluate each model
best_models = {}
for name, model_data in models.items():
    print(f"Tuning hyperparameters for {name}...")
    grid = GridSearchCV(model_data["model"], model_data["params"], cv=5, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train_scaled, y_train)
    
    best_models[name] = grid.best_estimator_
    print(f"Best parameters for {name}: {grid.best_params_}")
    print(f"Best cross-validated score: {grid.best_score_}\n")

# Evaluate the best models on the test set
for name, model in best_models.items():
    print(f"Evaluating {name} on the test set...")
    y_pred = model.predict(X_test_scaled)
    print(f"F1 Score: {f1_score(y_test, y_pred):.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("-" * 50)

Tuning hyperparameters for Logistic Regression...
Best parameters for Logistic Regression: {'C': 0.1, 'solver': 'lbfgs'}
Best cross-validated score: 0.8433673469387755

Tuning hyperparameters for Random Forest...
Best parameters for Random Forest: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
Best cross-validated score: 0.810204081632653

Tuning hyperparameters for SVC...
Best parameters for SVC: {'C': 1, 'kernel': 'linear'}
Best cross-validated score: 0.8474489795918367

Tuning hyperparameters for KNN...
Best parameters for KNN: {'n_neighbors': 7, 'weights': 'uniform'}
Best cross-validated score: 0.8227040816326531

Evaluating Logistic Regression on the test set...
F1 Score: 0.87

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.75      0.82        28
           1       0.82      0.94      0.87        33

    accuracy                           0.85        61
   macro avg       0.86      0.84      0.85       