In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle

In [2]:
# Read the heart-disease CSV into a DataFrame
file_path = 'Heart D Dataset.csv'
heart_df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Summarise the columns, then preview the first few rows.
# DataFrame.info() prints its report and returns None, so it is run as its own
# statement. Combining it with head() in a single tuple expression makes the
# cell's result `(None, <frame>)`, which is shown as a plain-text tuple repr
# instead of the rich table rendering of the frame.
heart_df.info()
heart_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 31 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Age                          1000 non-null   int64 
 1   Chest pain                   1000 non-null   object
 2   Shortness of breath          1000 non-null   object
 3   Fatigue                      1000 non-null   object
 4   Systolic                     1000 non-null   int64 
 5   Diastolic                    1000 non-null   int64 
 6   Heart rate (bpm)             1000 non-null   int64 
 7   Lung sounds                  1000 non-null   object
 8   Cholesterol level (mg/dL)    1000 non-null   int64 
 9   LDL level (mg/dL)            1000 non-null   int64 
 10  HDL level (mg/dL)            1000 non-null   int64 
 11  Diabetes                     1000 non-null   object
 12  Atrial fibrillation          1000 non-null   object
 13  Rheumatic fever              1000 

(None,
    Age Chest pain Shortness of breath Fatigue  Systolic  Diastolic  \
 0   56        Yes                 Yes      No       150        102   
 1   41        Yes                  No     Yes       133        101   
 2   70        Yes                 Yes     Yes       106         69   
 3   61        Yes                 Yes      No       157        109   
 4   51        Yes                  No     Yes        91        110   
 
    Heart rate (bpm) Lung sounds  Cholesterol level (mg/dL)  LDL level (mg/dL)  \
 0                78      Wheeze                        268                112   
 1                71     Rhonchi                        160                125   
 2                99      Wheeze                        287                136   
 3                64      Wheeze                        275                 96   
 4                73     Crackle                        163                 99   
 
    ...  Fever Chills Alcoholism Hypertension Fainting Dizziness  Smoki

In [4]:
# Encode categorical feature columns as integer codes; the target column
# 'Heart disease name' is left untouched.
#
# Every encoded column gets its own fitted LabelEncoder, stored in
# `label_encoders` under the column name. Reusing a single encoder across
# columns refits it on each call to fit_transform, so only the last column's
# mapping survives and the other columns' string -> code mappings can no longer
# be reapplied to new samples or inverted.
label_encoders = {}
label_encoder = LabelEncoder()  # stays defined even if no column needs encoding
for column in heart_df.columns:
    if column == 'Heart disease name':
        continue
    # Accept plain object columns as well as dedicated string dtypes, which
    # some pandas versions/configurations infer for text columns.
    is_text_column = (
        pd.api.types.is_object_dtype(heart_df[column])
        or pd.api.types.is_string_dtype(heart_df[column])
    )
    if is_text_column:
        # Fresh encoder per column; after the loop `label_encoder` refers to
        # the encoder of the last encoded column.
        label_encoder = LabelEncoder()
        heart_df[column] = label_encoder.fit_transform(heart_df[column])
        label_encoders[column] = label_encoder

In [5]:
# Separate the label column (y) from the predictor columns (X)
target_column = 'Heart disease name'
y = heart_df[target_column]
X = heart_df.drop(columns=[target_column])

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [7]:
# Build an unfitted random forest classifier with a fixed seed
rf_params = {'random_state': 42}
rf_model = RandomForestClassifier(**rf_params)

In [8]:
# Fit the forest on the training split
rf_model.fit(X=X_train, y=y_train)

In [9]:
# Predict labels for the held-out test rows
y_pred = rf_model.predict(X=X_test)

In [10]:
# Score the predictions against the held-out labels:
# overall accuracy plus a per-class precision/recall/F1 text report
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)

In [11]:
# Show the overall accuracy (4 decimal places) followed by the per-class report
print(
    f'Accuracy: {accuracy:.4f}',
    'Classification Report:',
    classification_rep,
    sep='\n',
)

Accuracy: 0.8150
Classification Report:
                               precision    recall  f1-score   support

                   Arrhythmia       1.00      1.00      1.00        30
               Cardiomyopathy       1.00      1.00      1.00        38
Coronary Artery Disease (CAD)       0.58      0.70      0.63        46
                Heart Failure       0.53      0.41      0.46        39
       Valvular Heart Disease       1.00      1.00      1.00        47

                     accuracy                           0.81       200
                    macro avg       0.82      0.82      0.82       200
                 weighted avg       0.81      0.81      0.81       200



In [12]:
# Serialise the fitted model to a pickle file in the working directory
pickle_file_path = 'heart_disease_rf_model.pkl'
with open(pickle_file_path, mode='wb') as file:
    pickle.dump(obj=rf_model, file=file)