<a href="https://colab.research.google.com/github/HamzaSa1t/Test1/blob/main/Heart_Disease_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv('heart_disease_uci.csv')
df.head()


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [None]:
# Check for missing values
print(df.isnull().sum())

# Handle missing values
# Separate numeric and non-numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns
non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns

# Fill missing values for numeric columns with mean
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Fill missing values for non-numeric columns with mode
for col in non_numeric_cols:
    df[col] = df[col].fillna(df[col].mode()[0])


id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64


In [None]:
# Example of one-hot encoding
df = pd.get_dummies(df, drop_first=True)
df.head()


Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num,sex_Male,dataset_Hungary,...,cp_non-anginal,cp_typical angina,fbs_True,restecg_normal,restecg_st-t abnormality,exang_True,slope_flat,slope_upsloping,thal_normal,thal_reversable defect
0,1,63,145.0,233.0,150.0,2.3,0.0,0,True,False,...,False,True,True,False,False,False,False,False,False,False
1,2,67,160.0,286.0,108.0,1.5,3.0,2,True,False,...,False,False,False,False,False,True,True,False,True,False
2,3,67,120.0,229.0,129.0,2.6,2.0,1,True,False,...,False,False,False,False,False,True,True,False,False,True
3,4,37,130.0,250.0,187.0,3.5,0.0,0,True,False,...,True,False,False,True,False,False,False,False,True,False
4,5,41,130.0,204.0,172.0,1.4,0.0,0,False,False,...,False,False,False,False,False,False,False,True,True,False


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the numerical features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df)
df = pd.DataFrame(scaled_features, columns=df.columns)
df.head()


Unnamed: 0,id,age,trestbps,chol,fbs,thalch,exang,oldpeak,ca,num,...,dataset_VA Long Beach,cp_atypical angina,cp_non-anginal,cp_typical angina,restecg_normal,restecg_st-t abnormality,slope_flat,slope_upsloping,thal_normal,thal_reversable defect
0,-1.730169,1.007386,0.698041,0.311021,2.380476,0.495698,-0.760292,1.349421,-1.249371,-0.871794,...,-0.527046,-0.482953,-0.533775,4.358899,-1.227523,-0.491493,-1.568007,-0.532094,-1.692792,-0.513553
1,-1.726404,1.432034,1.511761,0.797713,-0.420084,-1.175955,1.315283,0.589832,4.292099,0.879408,...,-0.527046,-0.482953,-0.533775,-0.229416,-1.227523,-0.491493,0.637752,-0.532094,0.59074,-0.513553
2,-1.722639,1.432034,-0.658158,0.274289,-0.420084,-0.340128,1.315283,1.634267,2.444942,0.003807,...,-0.527046,-0.482953,-0.533775,-0.229416,-1.227523,-0.491493,0.637752,-0.532094,-1.692792,1.94722
3,-1.718873,-1.752828,-0.115679,0.46713,-0.420084,1.968345,-0.760292,2.488805,-1.249371,-0.871794,...,-0.527046,-0.482953,1.873447,-0.229416,0.814649,-0.491493,-1.568007,-0.532094,0.59074,-0.513553
4,-1.715108,-1.32818,-0.115679,0.044717,-0.420084,1.371326,-0.760292,0.494884,-1.249371,-0.871794,...,-0.527046,2.070593,-0.533775,-0.229416,-1.227523,-0.491493,-1.568007,1.879367,0.59074,-0.513553


In [None]:
target_column = 'num'  # Adjust this line to the correct target column name

# Convert the target column to binary (0 for no disease, 1 for disease)
df[target_column] = df[target_column].apply(lambda x: 1 if x > 0 else 0)

# Separate features and target variable
X = df.drop(target_column, axis=1)
y = df[target_column]

# Apply SMOTE to balance the classes
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)


In [None]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Initialize models
log_reg = LogisticRegression()
rf_clf = RandomForestClassifier()
svc_clf = SVC()

# Train and evaluate models
models = [log_reg, rf_clf, svc_clf]
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'{model.__class__.__name__} Accuracy: {accuracy_score(y_test, y_pred)}')

LogisticRegression Accuracy: 0.8333333333333334
RandomForestClassifier Accuracy: 0.8872549019607843
SVC Accuracy: 0.8284313725490197


In [None]:
# Train Random Forest model
rf_clf.fit(X_train, y_train)

# Evaluate the model
y_pred = rf_clf.predict(X_test)
print(f'Random Forest Accuracy: {accuracy_score(y_test, y_pred)}')

Random Forest Accuracy: 0.8872549019607843


In [None]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search
grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters
print(f'Best Parameters: {grid_search.best_params_}')

# Evaluate the optimized model
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
print(f'Optimized Random Forest Accuracy: {accuracy_score(y_test, y_pred_best)}')


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Optimized Random Forest Accuracy: 0.8970588235294118


## Project Report

### Aim of the Project
The aim of the project is to predict the presence of heart disease based on patient data using machine learning models.

### Data Characteristics
- **Origin**: UCI Machine Learning Repository
- **Size**: 303 instances
- **Attributes**: 14 attributes including age, sex, chest pain type, etc.

### Data Preparation Steps
1. Handled missing values using mean imputation.
2. Encoded categorical variables using one-hot encoding.
3. Scaled numerical features using StandardScaler.
4. Converted target variable to binary classification.
5. Balanced classes using SMOTE.

### Models Applied and Performance
- **Logistic Regression Accuracy**: 0.8333333333333334
- **Random Forest Accuracy**: 0.8872549019607843
- **SVC Accuracy**: 0.8284313725490197

### Optimization and Tuning Results
- **Best Parameters for Random Forest**: {'n_estimators': 200, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 2}
- **Optimized Random Forest Accuracy**: 0.8970588235294118

### Resources
- Python, pandas, scikit-learn, imbalanced-learn, matplotlib, seaborn
