In [1]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import pipeline

In [7]:
train_paths = ['training_setA/training', 'training_setB/training_setB']

In [8]:
def load_and_combine_psv_files(paths):
    all_data = []
    for folder in paths:
        for file in os.listdir(folder):
            if file.endswith(".psv"):
                df = pd.read_csv(os.path.join(folder, file), sep='|')
                df['patient_id'] = file.replace('.psv', '')
                all_data.append(df)
    combined_df = pd.concat(all_data, ignore_index=True)
    return combined_df

data = load_and_combine_psv_files(train_paths)

In [9]:
print(data.head())

      HR  O2Sat  Temp    SBP    MAP  DBP  Resp  EtCO2  BaseExcess  HCO3  ...  \
0    NaN    NaN   NaN    NaN    NaN  NaN   NaN    NaN         NaN   NaN  ...   
1   97.0   95.0   NaN   98.0  75.33  NaN  19.0    NaN         NaN   NaN  ...   
2   89.0   99.0   NaN  122.0  86.00  NaN  22.0    NaN         NaN   NaN  ...   
3   90.0   95.0   NaN    NaN    NaN  NaN  30.0    NaN        24.0   NaN  ...   
4  103.0   88.5   NaN  122.0  91.33  NaN  24.5    NaN         NaN   NaN  ...   

   Fibrinogen  Platelets    Age  Gender  Unit1  Unit2  HospAdmTime  ICULOS  \
0         NaN        NaN  83.14       0    NaN    NaN        -0.03       1   
1         NaN        NaN  83.14       0    NaN    NaN        -0.03       2   
2         NaN        NaN  83.14       0    NaN    NaN        -0.03       3   
3         NaN        NaN  83.14       0    NaN    NaN        -0.03       4   
4         NaN        NaN  83.14       0    NaN    NaN        -0.03       5   

   SepsisLabel  patient_id  
0            0     p0

In [10]:
# Keep last record per patient (to get final label)
latest_records = data.groupby('patient_id').last().reset_index()

# Features and Labels
features = ['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'Resp', 'EtCO2', 'WBC']
latest_records = latest_records.dropna(subset=features + ['SepsisLabel'])
X = latest_records[features]
y = latest_records['SepsisLabel']


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [13]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.95      0.91       533
           1       0.47      0.22      0.30        94

    accuracy                           0.85       627
   macro avg       0.67      0.59      0.61       627
weighted avg       0.81      0.85      0.82       627



In [15]:
generator = pipeline("text-generation", model="gpt2", framework="pt")


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [18]:
def generate_explanation(features_dict, risk_score):
    prompt = f"Patient has the following vitals: {features_dict}. The model predicts a sepsis risk score of {risk_score:.2f}. Provide a brief medical explanation."
    explanation = generator(prompt, max_new_tokens=60, num_return_sequences=1)
    return explanation[0]['generated_text']

In [19]:
example_features = X_test.iloc[0].to_dict()
example_risk = model.predict_proba([list(example_features.values())])[0][1]
print(generate_explanation(example_features, example_risk))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Patient has the following vitals: {'HR': 79.0, 'O2Sat': 100.0, 'Temp': 37.3, 'SBP': 128.0, 'MAP': 125.0, 'Resp': 16.0, 'EtCO2': 33.0, 'WBC': 7.7}. The model predicts a sepsis risk score of 0.06. Provide a brief medical explanation. The models are expected to predict that patients over this age may die if it were discovered they will not seek medical treatment.


In [20]:
import joblib

# Save model
joblib.dump(model, 'sepsis_xgb_model.pkl')

# To load it later
# model = joblib.load('sepsis_xgb_model.pkl')


['sepsis_xgb_model.pkl']

In [22]:
from transformers import pipeline

# Load the generator in your Streamlit app like this:
generator = pipeline("text-generation", model="gpt2", framework="pt")
generator.model.save_pretrained("./gpt2-sepsis")
generator.tokenizer.save_pretrained("./gpt2-sepsis")


('./gpt2-sepsis\\tokenizer_config.json',
 './gpt2-sepsis\\special_tokens_map.json',
 './gpt2-sepsis\\vocab.json',
 './gpt2-sepsis\\merges.txt',
 './gpt2-sepsis\\added_tokens.json',
 './gpt2-sepsis\\tokenizer.json')