# Predictive Model for Surgical Outcomes

This notebook demonstrates data cleaning, exploratory data analysis (EDA), and predictive modeling for surgical outcomes using a Random Forest Classifier.

## Data Loading

In [None]:

import os

import numpy as np
import pandas as pd

# Load the dataset.
# The workbook location can be overridden with the SURGICAL_DATA_PATH environment
# variable so the notebook can run on machines other than the one it was written
# on. When the variable is unset, the original local path is used unchanged.
data_path = os.environ.get(
    "SURGICAL_DATA_PATH",
    r"C:\Users\Leek\Desktop\Personal Docs\P\OU SYS ENG MS\DECISON ANALYSIS\Project\SurgdecMODEL\CompleteDataExample_OperationsFor20232024.xlsx",
)
data = pd.read_excel(data_path)

# Display the first few rows
data.head()


## Data Cleaning

Impute missing values (median for numerical columns, most frequent value for categorical columns) and drop columns with excessive missing values.

In [None]:

# Fill missing values for numerical and categorical columns.
# Numerical columns are imputed with their per-column median.
numerical_cols = ['AGE', 'BMI', 'CREATININE', 'HEMATOCRIT', 'WBC COUNT', 'PLATELET COUNT']
data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].median())

# Categorical columns are imputed with their most frequent value
# (first row of .mode(), i.e. the first mode when there are ties).
categorical_cols = ['SEX', 'ETHNICITY', 'Insurer']
data[categorical_cols] = data[categorical_cols].fillna(data[categorical_cols].mode().iloc[0])

# Drop columns with excessive missing values (>50%).
# The list is fixed by hand rather than computed from the data.
# errors='ignore' keeps this cell re-runnable: `data` is reassigned here, so on a
# second execution these columns are already gone and a plain drop() would raise
# KeyError.
sparse_cols = ['TEAM LEAD', 'COMPLICATION', 'VASCULAR RF', 'NYHA CLASS', 'EF', 'CAD',
               'PREVIOUS PCI', 'WEIGHT', 'HEIGHT']
data = data.drop(columns=sparse_cols, errors='ignore')

# Verify the cleaned data
data.info()


## Exploratory Data Analysis (EDA)

Visualize distributions, correlations, and other insights.

### Age Distribution

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of patient ages (20 bins) with a kernel density overlay.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['AGE'], kde=True, bins=20, color='teal', ax=ax)

# Title and axis labels so the figure stands on its own.
ax.set_title('Age Distribution of Patients', fontsize=16)
ax.set_xlabel('Age (Years)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)

# Light horizontal guide lines only.
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


### Morbidity and Mortality by Age Group

In [None]:

# Group by age.
# pd.cut uses right-closed intervals here: (0, 40], (40, 60], (60, 80], (80, 100].
# Ages outside (0, 100] get a missing 'Age Group' and drop out of the groupby.
# NOTE(review): the '0-40' label suggests age 0 is included, but it is not — confirm intent.
data['Age Group'] = pd.cut(data['AGE'], bins=[0, 40, 60, 80, 100], labels=['0-40', '41-60', '61-80', '81-100'])

# 'Age Group' is categorical; observed=False is passed explicitly so every age
# band stays on the chart even if it has no patients. This matches the behavior
# of the implicit default, which newer pandas versions deprecate with a
# FutureWarning.
morbidity_by_age_group = data.groupby('Age Group', observed=False)['MORBIDITY & MORTALITY'].mean()

# Plot the per-group mean as a bar chart.
# NOTE(review): the y-axis label says "(%)" but the values are plotted as stored,
# with no scaling — confirm the column is already expressed in percent.
plt.figure(figsize=(10, 6))
morbidity_by_age_group.plot(kind='bar', color='lightgreen', edgecolor='black')
plt.title('Average Morbidity & Mortality by Age Group', fontsize=16)
plt.xlabel('Age Group', fontsize=12)
plt.ylabel('Average Morbidity & Mortality (%)', fontsize=12)
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


## Feature Importance

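Build a binary high/low operative-mortality target, train a Random Forest classifier on it, and visualize the importance of each feature in predicting that target.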

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Create target variable: 1 when 'OPERATIVE MORTALITY' is above the cohort median, else 0.
# A missing 'OPERATIVE MORTALITY' compares False against the median, so such rows
# receive 0 in this column; they are excluded from the modeling frame below.
outcome_col = 'OPERATIVE MORTALITY'
data['MORTALITY_HIGH'] = (data[outcome_col] > data[outcome_col].median()).astype(int)
target = 'MORTALITY_HIGH'
# NOTE(review): several of these columns ('MORBIDITY & MORTALITY', 'STROKE',
# 'RENAL FAILURE', ...) look outcome-related — confirm they are legitimately
# available as predictors and do not leak the target.
# NOTE(review): only the first six columns are imputed in the cleaning cell
# visible in this notebook; confirm the remaining ones have no missing values.
features = ['AGE', 'BMI', 'CREATININE', 'HEMATOCRIT', 'WBC COUNT', 'PLATELET COUNT',
            'MORBIDITY & MORTALITY', 'STROKE', 'RENAL FAILURE', 'REOPERATION',
            'PROLONGED VENTILATION', 'LONG HOSPITAL STAY', 'SHORT HOSPITAL STAY']

# Model only on rows whose outcome was actually recorded, so that missing
# outcomes are not trained on as if they were "low mortality".
labelled = data[data[outcome_col].notna()]
X = labelled[features]
y = labelled[target]

# Train-test split: 70/30, stratified on the label, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train Random Forest model (200 shallow trees, fixed seed).
model = RandomForestClassifier(random_state=42, n_estimators=200, max_depth=5, min_samples_split=2)
model.fit(X_train, y_train)

# Feature importance, sorted so the most important feature is listed first.
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances}).sort_values(by='Importance', ascending=False)

# Plot importances as horizontal bars.
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, palette='viridis')
plt.title('Feature Importance in Predictive Model', fontsize=16)
plt.xlabel('Importance', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.show()
