#**Load Data and organize it**

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.manifold import MDS
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
data = pd.read_csv(url, header=None)

# Add column names
data.columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach",
    "exang", "oldpeak", "slope", "ca", "thal", "hd"
]

# Replace "?" with NaN
data.replace("?", np.nan, inplace=True)

# Convert appropriate columns to numeric
data["ca"] = pd.to_numeric(data["ca"], errors='coerce')
data["thal"] = pd.to_numeric(data["thal"], errors='coerce')

# Encode categorical variables
data["hd"] = data["hd"].map({0: "Healthy", 1: "Unhealthy"})

categorical_cols = ["sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal", "hd"]
for col in categorical_cols:
    data[col] = data[col].astype('category')

# Impute missing values
imputer = SimpleImputer(strategy='most_frequent')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Convert categorical columns back to category type after imputation
for col in categorical_cols:
    data_imputed[col] = data_imputed[col].astype('category')

# Split data into features and target
X = data_imputed.drop(columns=['hd'])
y = data_imputed['hd']

data_imputed

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,hd
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,Healthy
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,Healthy
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,Unhealthy
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,Healthy
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,Healthy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,Unhealthy
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,Healthy
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,Healthy
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,Unhealthy


#**Split the data**

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=100)

#**Training model**

In [None]:
from sklearn.ensemble import RandomForestClassifier


rf = RandomForestClassifier(n_estimators=1000, random_state=42)
rf.fit(X_train, y_train)                                      #Treinando o modelo

#**Testing the model and comparing values**

In [None]:
# Predict on the test set
y_pred = rf.predict(X_test)

##**Calculating the accuracy**


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.84


##**Generating a confusion matrix**

In [None]:
# Generate a confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred) # evaluate the performance of a classification model by comparing the actual and predicted classifications.
print("Confusion Matrix:")
print(conf_matrix)
#The model correctly identified 50 healthy individuals.
#It incorrectly classified 3 unhealthy individuals as healthy.
#It missed 7 healthy individuals and incorrectly classified them as unhealthy.
#It correctly identified 1 unhealthy individual.

Confusion Matrix:
[[50  3]
 [ 7  1]]


##**Generating a classification report**

In [None]:
# Generate a classification report
class_report = classification_report(y_test, y_pred) #evaluation of the performance of the classification
print("Classification Report:")
print(class_report)

Classification Report:
              precision    recall  f1-score   support

     Healthy       0.88      0.94      0.91        53
   Unhealthy       0.25      0.12      0.17         8

    accuracy                           0.84        61
   macro avg       0.56      0.53      0.54        61
weighted avg       0.79      0.84      0.81        61

