In [1]:
%pip install pandas numpy matplotlib seaborn scikit-learn

Collecting pandas
  Downloading pandas-2.3.2-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Downloading numpy-2.3.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.9 kB ? eta -:--:--
     -------------------- ------------------- 30.7/60.9 kB 1.3 MB/s eta 0:00:01
     ---------------------------------------- 60.9/60.9 kB 1.1 MB/s eta 0:00:00
Collecting matplotlib
  Downloading matplotlib-3.10.6-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.3-cp312-cp312-win_amd64.whl.met


[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
df = pd.read_csv('heart_disease_uci.csv')
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [6]:


# To Set plot style 
sns.set_style('whitegrid')

# 1. LOADDING AND CLEANING THE DATA
print("--- Loading and Cleaning Data ---")
df = pd.read_csv('heart_disease_uci.csv')

# --- Data Cleaning and Preprocessing ---
df['target'] = df['num'].apply(lambda x: 1 if x > 0 else 0)
df = df.drop(['id', 'dataset', 'num'], axis=1)
df = df.replace('?', np.nan)

# Convert columns that should be numeric
for col in ['trestbps', 'chol', 'fbs', 'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Handle missing values by filling with the median
for col in df.columns:
    if df[col].isnull().any():
        df[col].fillna(df[col].median(), inplace=True)

# Encode categorical text columns into numbers
for column in ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])

print("Data cleaning complete. Here's a preview:")
print(df.head())


# 2. THE ANALYSIS (EDA)
print("\n--- Generating Investigative Plots (saved as PNG files) ---")

# Investigation 1: Is Heart Disease an "Old Man's Disease"?
plt.figure(figsize=(10, 6))
sns.violinplot(x='sex', y='age', hue='target', data=df, split=True, palette='viridis')
plt.title('Investigation 1: Age Distribution by Gender and Disease Status')
plt.xticks([0, 1], ['Female', 'Male'])
plt.legend(title='Heart Disease', labels=['No', 'Yes'])
plt.savefig('plot1_age_gender_violin.png')
plt.close()

# Analysis 2: What Type of Chest Pain is the Real Red Flag?
pain_crosstab = pd.crosstab(df['cp'], df['target'])
pain_crosstab_normalized = pain_crosstab.div(pain_crosstab.sum(axis=1), axis=0)
pain_crosstab_normalized.plot(kind='bar', stacked=True, figsize=(10, 7), colormap='coolwarm')
plt.title('Investigation 2: Proportion of Heart Disease by Chest Pain Type')
plt.xlabel('Chest Pain Type')
plt.ylabel('Proportion')
plt.xticks(rotation=0)
plt.legend(title='Heart Disease', labels=['No', 'Yes'])
plt.savefig('plot2_chest_pain_proportion.png')
plt.close()

# Analysis 3: The Relationship Between Key Vitals
g = sns.jointplot(x='thalch', y='chol', data=df, hue='target', palette='magma', height=8)
g.fig.suptitle('Investigation 3: Cholesterol vs. Max Heart Rate', y=1.02)
g.savefig('plot3_chol_vs_thalach_jointplot.png')
plt.close()


# 3. THE MACHINE LEARNING MODEL
print("\n--- Building and Evaluating Machine Learning Model ---")
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

model = RandomForestClassifier(n_estimators=150, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'\nModel Accuracy: {accuracy * 100:.2f}%')
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Display a Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No Disease', 'Disease'], yticklabels=['No Disease', 'Disease'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('plot4_confusion_matrix.png')
plt.close()

# 4. FEATURE IMPORTANCE
print("\n--- Identifying Key Predictive Factors ---")
importances = model.feature_importances_
feature_df = pd.DataFrame({'feature': X.columns, 'importance': importances}).sort_values('importance', ascending=False)

plt.figure(figsize=(10, 8))
sns.barplot(x='importance', y='feature', data=feature_df, palette='rocket')
plt.title('Key Features Predicting Heart Disease')
plt.savefig('plot5_feature_importance.png')
plt.close()

print("\nAnalysis complete!")

--- Loading and Cleaning Data ---
Data cleaning complete. Here's a preview:
   age  sex  cp  trestbps   chol  fbs  restecg  thalch  exang  oldpeak  slope  \
0   63    1   3     145.0  233.0    1        0   150.0      0      2.3      0   
1   67    1   0     160.0  286.0    0        0   108.0      1      1.5      0   
2   67    1   0     120.0  229.0    0        0   129.0      1      2.6      0   
3   37    1   2     130.0  250.0    0        0   187.0      0      3.5      0   
4   41    0   1     130.0  204.0    0        0   172.0      0      1.4      0   

    ca  thal  target  
0  0.0     0       0  
1  3.0     0       1  
2  2.0     0       1  
3  0.0     0       0  
4  0.0     0       0  

--- Generating Investigative Plots (saved as PNG files) ---


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values


--- Building and Evaluating Machine Learning Model ---

Model Accuracy: 83.15%

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.78      0.81        82
           1       0.83      0.87      0.85       102

    accuracy                           0.83       184
   macro avg       0.83      0.83      0.83       184
weighted avg       0.83      0.83      0.83       184


--- Identifying Key Predictive Factors ---



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='importance', y='feature', data=feature_df, palette='rocket')



Analysis complete!
