<a href="https://colab.research.google.com/github/HarshG01001/AIHC-Assignment-2-Case-Studies/blob/main/CaseStudy4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Models
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, IsolationForest

# Metrics
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error

# NOTE(review): this silences every warning for the rest of the session,
# including scikit-learn convergence and pandas deprecation warnings; consider
# narrowing it to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

In [11]:
# Read the readmissions CSV, resolved relative to the working directory.
file_name = 'hospital_readmissions.csv'
df = pd.read_csv(file_name)
print(f"'{file_name}' loaded successfully. Shape: {df.shape}")

# Convert the literal '?' placeholder to NaN so the per-column tally below
# counts it as missing.
# NOTE(review): the recorded run reports zero missing values in every column;
# confirm that '?' is really the placeholder this file uses.
df = df.replace('?', np.nan)

print("\n--- Missing Values Check ---")
print(df.isna().sum())

'hospital_readmissions.csv' loaded successfully. Shape: (25000, 17)

--- Missing Values Check ---
age                  0
time_in_hospital     0
n_lab_procedures     0
n_procedures         0
n_medications        0
n_outpatient         0
n_inpatient          0
n_emergency          0
medical_specialty    0
diag_1               0
diag_2               0
diag_3               0
glucose_test         0
A1Ctest              0
change               0
diabetes_med         0
readmitted           0
dtype: int64


In [16]:
# --- Target Variable Engineering ---
# Binary target: 1 when `readmitted` is 'yes', 0 otherwise.
# NOTE(review): the column only carries 'yes'/'no' in the recorded run, so any
# time window (e.g. "<30 days") has to come from the dataset's documentation;
# confirm it before quoting it in results.
print("\n--- Unique values in 'readmitted' column ---")
print(df['readmitted'].unique())

# Fail loudly on unexpected labels: the equality test below would otherwise
# turn anything that is not the exact string 'yes' (e.g. 'Yes', NaN) into 0.
label_is_known = df['readmitted'].isin(['yes', 'no'])
if not label_is_known.all():
    raise ValueError(
        "Unexpected values in 'readmitted': "
        f"{df.loc[~label_is_known, 'readmitted'].unique().tolist()}"
    )
df['readmitted_binary'] = (df['readmitted'] == 'yes').astype(int)

# --- Feature Selection ---
# The raw `readmitted` label is removed because `readmitted_binary` replaces it
# (leaving it in would leak the target into the features).
# NOTE(review): `medical_specialty` is excluded too, but the missing-value check
# reports 0 for it; confirm the reason for dropping this column.
cols_to_drop = ['medical_specialty', 'readmitted']

# --- Handle Missing Values ---
# Rows lacking any of the three diagnosis codes are removed (dropped, not
# imputed). Building df_clean in one chained expression leaves `df` untouched
# apart from the new target column and keeps this cell safe to re-run.
df_clean = (
    df.drop(columns=cols_to_drop)
      .dropna(subset=['diag_1', 'diag_2', 'diag_3'])
)

# --- Identify Feature Types ---
# Numeric columns (minus the target) are scaled; all remaining columns are
# treated as categorical and one-hot encoded.
numerical_features = (
    df_clean.select_dtypes(include=np.number)
            .columns.drop('readmitted_binary')
            .tolist()
)
categorical_features = df_clean.select_dtypes(exclude=np.number).columns.tolist()

# --- Create Preprocessing Pipeline ---
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
# handle_unknown='ignore' encodes categories unseen during fit as all-zero
# columns instead of raising when transforming new data.
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

print("Preprocessing pipeline created successfully.")


--- Unique values in 'readmitted' column ---
['no' 'yes']
Preprocessing pipeline created successfully.


In [17]:
# Define features (X) and target (y) for classification
X = df_clean.drop(columns='readmitted_binary')
y = df_clean['readmitted_binary']

# Hold out 20% of the rows for testing; stratifying on y keeps the class ratio
# the same in both splits, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Check the class distribution in the training and testing sets
print("\n--- Class Distribution in y_train ---")
print(y_train.value_counts())

print("\n--- Class Distribution in y_test ---")
print(y_test.value_counts())


# Candidate models, all seeded for reproducibility
classifiers = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Fitted pipelines keyed by model name, kept so later cells can reuse a trained
# model instead of only the last one left in `pipeline`.
fitted_classifiers = {}

# Train and evaluate each model on the same split
for name, classifier in classifiers.items():
    # Pipeline does not copy its steps, so passing `preprocessor` directly would
    # make every pipeline share (and refit) one transformer object. A clone
    # gives each model its own independent, unfitted preprocessor.
    pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                               ('classifier', classifier)])

    print(f"--- Training {name} ---")
    pipeline.fit(X_train, y_train)
    fitted_classifiers[name] = pipeline
    y_pred = pipeline.predict(X_test)

    print(f"\n--- {name} Evaluation ---")
    # target_names are listed in label order: 0 first, then 1.
    print(classification_report(y_test, y_pred, target_names=['Not Readmitted', 'Readmitted <30']))


--- Class Distribution in y_train ---
readmitted_binary
0    10597
1     9403
Name: count, dtype: int64

--- Class Distribution in y_test ---
readmitted_binary
0    2649
1    2351
Name: count, dtype: int64
--- Training Logistic Regression ---

--- Logistic Regression Evaluation ---
                precision    recall  f1-score   support

Not Readmitted       0.60      0.79      0.68      2649
Readmitted <30       0.64      0.41      0.50      2351

      accuracy                           0.61      5000
     macro avg       0.62      0.60      0.59      5000
  weighted avg       0.62      0.61      0.60      5000

--- Training Random Forest ---

--- Random Forest Evaluation ---
                precision    recall  f1-score   support

Not Readmitted       0.60      0.67      0.63      2649
Readmitted <30       0.57      0.50      0.53      2351

      accuracy                           0.59      5000
     macro avg       0.59      0.59      0.58      5000
  weighted avg       0.59     

In [18]:
# Regression task: predict `time_in_hospital` from the remaining feature columns.
reg_target = 'time_in_hospital'
X_reg = X.drop(columns=[reg_target])
y_reg = X[reg_target]

# The target must not appear among the scaled numeric inputs, so this task gets
# its own feature list and its own ColumnTransformer.
numerical_features_reg = [feature for feature in numerical_features if feature != reg_target]
preprocessor_reg = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_features_reg),
    ('cat', categorical_transformer, categorical_features),
])

# 80/20 split with a fixed seed
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Preprocessing followed by ordinary least squares
lin_reg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_reg),
    ('regressor', LinearRegression()),
])

print("--- Training Linear Regression ---")
# fit() returns the pipeline itself, so prediction can be chained onto it.
y_pred_reg = lin_reg_pipeline.fit(X_train_reg, y_train_reg).predict(X_test_reg)

# Error metrics on the held-out rows, in the same unit as the target
mae = mean_absolute_error(y_test_reg, y_pred_reg)
mse = mean_squared_error(y_test_reg, y_pred_reg)
rmse = np.sqrt(mse)

print("\n--- Linear Regression Evaluation ---")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

--- Training Linear Regression ---

--- Linear Regression Evaluation ---
Mean Absolute Error (MAE): 1.9378
Root Mean Squared Error (RMSE): 2.5370


In [19]:
# Fit the preprocessor on the training features and encode them
X_train_processed = preprocessor.fit_transform(X_train)

# Apply Isolation Forest
# `contamination` sets the share of rows that get labelled anomalous (5% here),
# so the number of flagged rows follows from this setting rather than being a
# finding in its own right.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
train_anomalies = iso_forest.fit_predict(X_train_processed)  # -1 for anomalies, 1 for inliers

# Attach the flags to a copy. Writing them into X_train itself would leave an
# extra 'anomaly' column in the feature frame that other cells train on.
X_train_flagged = X_train.assign(anomaly=train_anomalies)
is_anomaly = X_train_flagged['anomaly'] == -1
is_normal = X_train_flagged['anomaly'] == 1

# Analyze the results
print("--- Isolation Forest Anomaly Detection ---")
print(f"Number of anomalies detected in the training set: {is_anomaly.sum()}")

# Compare readmission rates between flagged and unflagged rows. The masks share
# X_train's index, which is also y_train's index, so they align row for row.
anomaly_readmission_rate = y_train[is_anomaly].mean()
normal_readmission_rate = y_train[is_normal].mean()

print(f"\nReadmission rate for detected anomalies: {anomaly_readmission_rate:.2%}")
print(f"Readmission rate for normal records: {normal_readmission_rate:.2%}")

--- Isolation Forest Anomaly Detection ---
Number of anomalies detected in the training set: 1000

Readmission rate for detected anomalies: 52.60%
Readmission rate for normal records: 46.72%


In [20]:
# NOTE(review): this cell repeats the anomaly-detection cell directly above it
# and produces the same output; consider deleting one of the two.

# Fit the preprocessor on the training features and encode them
X_train_processed = preprocessor.fit_transform(X_train)

# Apply Isolation Forest
# `contamination` sets the share of rows that get labelled anomalous (5% here),
# so the number of flagged rows follows from this setting rather than being a
# finding in its own right.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
train_anomalies = iso_forest.fit_predict(X_train_processed)  # -1 for anomalies, 1 for inliers

# Attach the flags to a copy. Writing them into X_train itself would leave an
# extra 'anomaly' column in the feature frame that other cells train on.
X_train_flagged = X_train.assign(anomaly=train_anomalies)
is_anomaly = X_train_flagged['anomaly'] == -1
is_normal = X_train_flagged['anomaly'] == 1

# Analyze the results
print("--- Isolation Forest Anomaly Detection ---")
print(f"Number of anomalies detected in the training set: {is_anomaly.sum()}")

# Compare readmission rates between flagged and unflagged rows. The masks share
# X_train's index, which is also y_train's index, so they align row for row.
anomaly_readmission_rate = y_train[is_anomaly].mean()
normal_readmission_rate = y_train[is_normal].mean()

print(f"\nReadmission rate for detected anomalies: {anomaly_readmission_rate:.2%}")
print(f"Readmission rate for normal records: {normal_readmission_rate:.2%}")

--- Isolation Forest Anomaly Detection ---
Number of anomalies detected in the training set: 1000

Readmission rate for detected anomalies: 52.60%
Readmission rate for normal records: 46.72%
