In [None]:
# Experiment 12: random-forest classifier on the UCI Cleveland heart-disease data, tuned with GridSearchCV
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

def binarize_target(target):
    """Collapse the multi-level diagnosis column into a binary label.

    Args:
        target: pandas Series of diagnosis codes, where 0 means no disease
            and any other value means disease.

    Returns:
        Integer Series with the same index as ``target``: 0 where the input
        equals 0, 1 everywhere else.
    """
    # Vectorised comparison instead of calling a Python lambda per row.
    return (target != 0).astype(int)


# Experiment pipeline: load the data, tune a random forest with a grid
# search, then report test accuracy, cross-validation scores and feature
# importances. Every intermediate result stays a top-level name.
try:
    # Load the dataset from URL; column names are supplied explicitly and
    # '?' entries are read as NaN.
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
    column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']

    df = pd.read_csv(url, names=column_names, na_values='?')

    # Validate the schema BEFORE the column is used; the original check ran
    # after `df['target']` had already been accessed and could never fire.
    if 'target' not in df.columns:
        raise ValueError("'target' column not found in the dataset")

    # Handle missing values (if any)
    df = df.dropna()

    # Convert target to binary (0 = no disease, 1-4 = disease)
    df['target'] = binarize_target(df['target'])

    X = df.drop('target', axis=1)
    y = df['target']

    # Split the dataset (fixed seed so the reported numbers are reproducible)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Hyperparameter Tuning with GridSearchCV (5-fold, on the training split only)
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, None],
        'min_samples_split': [2, 5, 10]
    }

    rf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # Best parameters and evaluation on the held-out test split
    best_rf = grid_search.best_estimator_
    y_pred = best_rf.predict(X_test)

    print("\nBest Parameters:", grid_search.best_params_)
    print("Test Accuracy:", accuracy_score(y_test, y_pred))

    # Cross-Validation
    # NOTE: this is run on the full X, y (train and test rows together), so
    # these scores are not independent of the test split used above.
    cv_scores = cross_val_score(best_rf, X, y, cv=5, scoring='accuracy')
    print("\nCross-Validation Scores:", [f"{score:.2f}" for score in cv_scores])
    print(f"Mean CV Accuracy: {np.mean(cv_scores):.2f}")

    # Feature Importance, sorted from most to least important
    importances = best_rf.feature_importances_
    features = X.columns
    importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)
    print("\nFeature Importances:\n", importance_df.to_string(index=False))

except Exception as e:
    # Top-level boundary of the experiment: keep the original one-line
    # report (and do not crash), but also emit the traceback so the failing
    # step and exception type are visible.
    import traceback

    print(f"An error occurred: {str(e)}")
    traceback.print_exc()


Best Parameters: {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 50}
Test Accuracy: 0.8666666666666667

Cross-Validation Scores: ['0.83', '0.87', '0.81', '0.76', '0.81']
Mean CV Accuracy: 0.82

Feature Importances:
  Feature  Importance
      ca    0.165798
    thal    0.140149
 thalach    0.109960
 oldpeak    0.103106
      cp    0.101327
     age    0.082037
   exang    0.074760
trestbps    0.068732
    chol    0.065344
   slope    0.046733
     sex    0.024332
     fbs    0.010020
 restecg    0.007701
