In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV



In [31]:
# Step 1: Load the Data
file_path = "Dataset/diabetic_data_training.csv"
df = pd.read_csv(file_path)

Temporal dataset saved as temporal_dataset.csv


In [36]:
# Step 2: Basic Data Inspection
#print(df.info())
#print(df.head())


In [24]:
# Step 3: Handle Missing Values
# Replace '?' with NaN for easier processing
df.replace('?', np.nan, inplace=True)

# Impute missing values for numerical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
imputer = SimpleImputer(strategy='mean')
df[num_cols] = imputer.fit_transform(df[num_cols])

# Fill categorical missing values with the mode
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])


In [25]:
# Step 4: Encode Categorical Variables
# Encode categorical columns using one-hot encoding and LabelEncoder
encode_cols = ['gender', 'race', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id']
df = pd.get_dummies(df, columns=encode_cols)

label_encoder = LabelEncoder()
df['readmitted'] = label_encoder.fit_transform(df['readmitted'])


In [35]:
# Step 5: Feature Engineering on Temporal Data
# Sort by patient and encounter
df.sort_values(by=['patient_nbr', 'encounter_id'], inplace=True)

# Aggregate features by patient
agg_features = df.groupby('patient_nbr').agg({
    'time_in_hospital': ['sum', 'mean', 'max'],
    'num_lab_procedures': ['sum', 'mean'],
    'num_medications': ['sum', 'mean'],
    'number_outpatient': ['sum'],
    'number_emergency': ['sum'],
    'number_inpatient': ['sum']
}).reset_index()
agg_features.columns = ['_'.join(col) for col in agg_features.columns]

# Create lag features
for col in ['time_in_hospital', 'num_lab_procedures', 'num_medications']:
    df[f'prev_{col}'] = df.groupby('patient_nbr')[col].shift(1).fillna(0)


# Create rolling features
#df['rolling_lab_procedures'] = df.groupby('patient_nbr')['num_lab_procedures'].rolling(3).mean().reset_index(0, drop=True)

# Create rolling features for multiple variables
rolling_features = ['time_in_hospital', 'num_lab_procedures', 'num_medications', 
                    'number_outpatient', 'number_emergency']
for col in rolling_features:
    df[f'rolling_mean_{col}'] = df.groupby('patient_nbr')[col].rolling(3).mean().reset_index(0, drop=True)
    df[f'rolling_sum_{col}'] = df.groupby('patient_nbr')[col].rolling(3).sum().reset_index(0, drop=True)
    df[f'rolling_std_{col}'] = df.groupby('patient_nbr')[col].rolling(3).std().reset_index(0, drop=True)

# Combine aggregated features back to original DataFrame
df = pd.merge(df, agg_features, left_on='patient_nbr', right_on='patient_nbr_')

# Drop redundant columns
df.drop(columns=['patient_nbr_'], inplace=True)

# Generate Timestep for Each Patient (Visit Number)
df['timestep'] = df.groupby('patient_nbr').cumcount() + 1

# Extract Temporal Features (Cumulative Features)
df['cumulative_time_in_hospital'] = df.groupby('patient_nbr')['time_in_hospital'].cumsum()
df['cumulative_lab_procedures'] = df.groupby('patient_nbr')['num_lab_procedures'].cumsum()
df['cumulative_medications'] = df.groupby('patient_nbr')['num_medications'].cumsum()

# Select Relevant Columns (Keep Timestep)
temporal_cols = [col for col in df.columns if col not in ['encounter_id']]
temporal_data = df[temporal_cols]

# Save the temporal data to CSV for external use
temporal_data.to_csv(r'C:\Users\vidur\Desktop\Temp\temporal_dataset(2).csv', index=False)

print("Temporal data saved as temporal_data.csv")


Temporal data saved as temporal_data.csv


In [None]:
# Step 6: Prepare Data for Modeling
# Drop unnecessary columns
X = df.drop(columns=['readmitted', 'encounter_id', 'patient_nbr'])
y = df['readmitted']

# Train-Test Split (Group by patient to avoid data leakage)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
'''
# Step 7: Train a Random Forest Model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Convert DataFrame to numerical format
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Fill any remaining NaN values after conversion
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

model.fit(X_train, y_train)

# Step 8: Evaluate the Model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Feature Importance
feature_importances = pd.Series(model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print("Top Features:")
print(feature_importances.head(10))
'''


Best Parameters: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}
Tuned Model Accuracy: 0.7306474505950431
              precision    recall  f1-score   support

           0       0.57      0.03      0.06      2106
           1       0.66      0.63      0.64      6329
           2       0.77      0.94      0.85      9883

    accuracy                           0.73     18318
   macro avg       0.66      0.54      0.52     18318
weighted avg       0.71      0.73      0.69     18318

