In [43]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb

In [44]:
# Load datasets
train_dir = "train.csv"
test_dir = "test.csv"

# Read the data
train_data = pd.read_csv(train_dir)
#test_data = pd.read_csv(test_dir)

In [45]:
if 'Bed Grade' in train_data.columns and train_data['Bed Grade'].isnull().sum() > 0:
    train_data['Bed Grade'].fillna(train_data['Bed Grade'].mode()[0], inplace=True)
if 'City_Code_Patient' in train_data.columns and train_data['City_Code_Patient'].isnull().sum() > 0:
    train_data['City_Code_Patient'].fillna(train_data['City_Code_Patient'].mode()[0], inplace=True)

# Ensure Bed Grade and City Code are integers
if 'Bed Grade' in train_data.columns:
    train_data['Bed Grade'] = train_data['Bed Grade'].astype(int)
if 'City_Code_Patient' in train_data.columns:
    train_data['City_Code_Patient'] = train_data['City_Code_Patient'].astype(int)

In [46]:
train_data.drop(columns=[col for col in ['case_id', 'patientid'] if col in train_data.columns], inplace=True)

In [47]:
stay_mapping = {
    '0-10': 0, '11-20': 1, '21-30': 2, '31-40': 3, '41-50': 4,
    '51-60': 5, '61-70': 6, '71-80': 7, '81-90': 8, '91-100': 9,
    '100 Days': 9, 'More than 100 Days': 10
}
train_data['Stay'] = train_data['Stay'].map(stay_mapping)

In [48]:
# Convert categorical variables using the specified mappings
if 'Hospital_type_code' in train_data.columns:
    train_data['Hospital_type_code'] = train_data['Hospital_type_code'].map(lambda x: ord(x.lower()) - ord('a') + 1)
if 'Ward_Facility_Code' in train_data.columns:
    train_data['Ward_Facility_Code'] = train_data['Ward_Facility_Code'].map(lambda x: ord(x) - ord('A') + 1)
if 'Hospital_region_code' in train_data.columns:
    region_mapping = {'X': 1, 'Y': 2, 'Z': 3}
    train_data['Hospital_region_code'] = train_data['Hospital_region_code'].map(region_mapping)
if 'Ward_Type' in train_data.columns:
    train_data['Ward_Type'] = train_data['Ward_Type'].map(lambda x: ord(x) - ord('P') + 1)
if 'Type of Admission' in train_data.columns:
    admission_mapping = {'Trauma': 1, 'Urgent': 2, 'Emergency': 3}
    train_data['Type of Admission'] = train_data['Type of Admission'].map(admission_mapping)
if 'Severity of Illness' in train_data.columns:
    severity_mapping = {'Minor': 1, 'Moderate': 2, 'Extreme': 3}
    train_data['Severity of Illness'] = train_data['Severity of Illness'].map(severity_mapping)

In [49]:
label_encoders = {}

# Label Encode Age by decade
if 'Age' in train_data.columns:
    le = LabelEncoder()
    train_data['Age'] = le.fit_transform(train_data['Age'])
    label_encoders['Age'] = le

# Label Encode Department
if 'Department' in train_data.columns:
    le = LabelEncoder()
    train_data['Department'] = le.fit_transform(train_data['Department'])
    label_encoders['Department'] = le

In [50]:
# Convert categorical variables using the specified mappings
categorical_codes = ['Hospital_code', 'Hospital_type_code', 'City_Code_Hospital', 'Department', 'Ward_Facility_Code', 'Hospital_region_code', 'Ward_Type', 'City_Code_Patient']
for col in categorical_codes:
    if col in train_data.columns:
        train_data[col] = train_data[col].astype(str)

In [51]:
categorical_codes = ['Hospital_code', 'Hospital_type_code', 'City_Code_Hospital', 'Department', 'Ward_Facility_Code', 'Hospital_region_code', 'Ward_Type', 'City_Code_Patient', 'Stay']
train_data_encoded = pd.get_dummies(train_data, columns=categorical_codes, drop_first=False)

In [52]:
X = train_data.drop(columns=['Stay'])
y = train_data['Stay']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


scaler = StandardScaler()


X_train_scaled = scaler.fit_transform(X_train)  
X_test_scaled = scaler.transform(X_test)       

In [53]:
xgb_clf = xgb.XGBClassifier(objective="multi:softmax", num_class=len(set(y_train)), eval_metric="mlogloss")

In [54]:
xgb_clf.fit(X_train_scaled, y_train)

In [55]:
y_pred = xgb_clf.predict(X_test_scaled)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.4249
Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.16      0.23      4721
           1       0.44      0.50      0.47     15628
           2       0.42      0.66      0.52     17498
           3       0.42      0.23      0.29     11032
           4       0.17      0.00      0.01      2349
           5       0.40      0.50      0.44      7004
           6       0.00      0.00      0.00       549
           7       0.39      0.04      0.07      2051
           8       0.38      0.23      0.29       967
           9       0.26      0.03      0.05       553
          10       0.56      0.42      0.48      1336

    accuracy                           0.42     63688
   macro avg       0.35      0.25      0.26     63688
weighted avg       0.41      0.42      0.39     63688

Confusion Matrix:
 [[  759  2289  1624    29     3    14     0     0     1     0     2]
 [  433  7877  6124   705     4   473     1     4     3   

In [56]:
import optuna
from sklearn.model_selection import StratifiedKFold

In [57]:
def objective(trial):
    params = {
        "objective": "multi:softmax",
        "num_class": len(set(y_train)),
        "eval_metric": "mlogloss",
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-5, 10.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-5, 10.0, log=True),
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    acc_scores = []

    for train_idx, valid_idx in skf.split(X_train_scaled, y_train):
        X_train_fold, X_valid_fold = X_train_scaled[train_idx], X_train_scaled[valid_idx]
        y_train_fold, y_valid_fold = y_train[train_idx], y_train[valid_idx]

        model = xgb.XGBClassifier(**params)
        model.fit(X_train_fold, y_train_fold)

        y_pred = model.predict(X_valid_fold)
        acc_scores.append(accuracy_score(y_valid_fold, y_pred))

    return sum(acc_scores) / len(acc_scores)

In [58]:
train_counts = np.bincount(y_train)
test_counts = np.bincount(y_test)

print("Class distribution in y_train:", train_counts)
print("Class distribution in y_test:", test_counts)

Class distribution in y_train: [18883 62511 69993 44127  9394 28014  2195  8203  3871  2212  5347]
Class distribution in y_test: [ 4721 15628 17498 11032  2349  7004   549  2051   967   553  1336]


In [59]:
print("y_train unique values:", np.unique(y_train))
print("y_test unique values:", np.unique(y_test))

y_train unique values: [ 0  1  2  3  4  5  6  7  8  9 10]
y_test unique values: [ 0  1  2  3  4  5  6  7  8  9 10]


In [60]:
y_train = np.array(y_train).ravel()
y_test = np.array(y_test).ravel()

In [61]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2025-02-25 13:36:28,152] A new study created in memory with name: no-name-b07025a5-2e62-41a5-85bb-fa9a1d406900
[I 2025-02-25 13:37:10,521] Trial 0 finished with value: 0.42037684398149405 and parameters: {'max_depth': 4, 'learning_rate': 0.2573659867403731, 'n_estimators': 191, 'subsample': 0.5158209719266409, 'colsample_bytree': 0.8625268089373075, 'reg_lambda': 0.0025593562741458124, 'reg_alpha': 0.23110363091252106}. Best is trial 0 with value: 0.42037684398149405.
[I 2025-02-25 13:37:37,055] Trial 1 finished with value: 0.41680864217104013 and parameters: {'max_depth': 10, 'learning_rate': 0.19722011848013055, 'n_estimators': 68, 'subsample': 0.8443036614876025, 'colsample_bytree': 0.7517293906480875, 'reg_lambda': 0.16292016553102137, 'reg_alpha': 0.015891032680446504}. Best is trial 0 with value: 0.42037684398149405.
[I 2025-02-25 13:38:15,567] Trial 2 finished with value: 0.419218854696456 and parameters: {'max_depth': 10, 'learning_rate': 0.010869606904122019, 'n_estimators'

In [62]:
best_params = study.best_params
print("Best Parameters:", best_params)

Best Parameters: {'max_depth': 8, 'learning_rate': 0.02494607598555621, 'n_estimators': 286, 'subsample': 0.7958802861665262, 'colsample_bytree': 0.7036807919606924, 'reg_lambda': 0.0014771209725641977, 'reg_alpha': 0.000132423801201526}


In [63]:
best_xgb_clf = xgb.XGBClassifier(**best_params)
best_xgb_clf.fit(X_train_scaled, y_train)

In [64]:
# Final evaluation
y_pred = best_xgb_clf.predict(X_test_scaled)
print(f"Final Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Final Accuracy: 0.4266
Classification Report:
               precision    recall  f1-score   support

           0       0.39      0.13      0.19      4721
           1       0.43      0.51      0.47     15628
           2       0.42      0.67      0.52     17498
           3       0.43      0.24      0.31     11032
           4       0.11      0.00      0.00      2349
           5       0.41      0.50      0.45      7004
           6       0.00      0.00      0.00       549
           7       0.45      0.03      0.05      2051
           8       0.43      0.18      0.26       967
           9       0.29      0.01      0.02       553
          10       0.57      0.41      0.48      1336

    accuracy                           0.43     63688
   macro avg       0.36      0.24      0.25     63688
weighted avg       0.41      0.43      0.39     63688

Confusion Matrix:
 [[  613  2364  1711    27     0     6     0     0     0     0     0]
 [  320  7926  6216   700     1   464     0     1   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
