In [None]:
import pandas as pd
import zipfile
import os

# Rozpakuj plik ZIP
zip_path = "archive.zip"
extract_dir = "output"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# Wczytaj dane
csv_path = os.path.join(extract_dir, "Dataset.csv")
df = pd.read_csv(csv_path)

# Podstawowa eksploracja danych
print(df.head())
print(df.isnull().sum().sort_values(ascending=False).head(20))


   Unnamed: 0  Hour    HR  O2Sat  Temp    SBP   MAP   DBP  Resp  EtCO2  ...  \
0           0     0   NaN    NaN   NaN    NaN   NaN   NaN   NaN    NaN  ...   
1           1     1  65.0  100.0   NaN    NaN  72.0   NaN  16.5    NaN  ...   
2           2     2  78.0  100.0   NaN    NaN  42.5   NaN   NaN    NaN  ...   
3           3     3  73.0  100.0   NaN    NaN   NaN   NaN  17.0    NaN  ...   
4           4     4  70.0  100.0   NaN  129.0  74.0  69.0  14.0    NaN  ...   

   Fibrinogen  Platelets    Age  Gender  Unit1  Unit2  HospAdmTime  ICULOS  \
0         NaN        NaN  68.54       0    NaN    NaN        -0.02       1   
1         NaN        NaN  68.54       0    NaN    NaN        -0.02       2   
2         NaN        NaN  68.54       0    NaN    NaN        -0.02       3   
3         NaN        NaN  68.54       0    NaN    NaN        -0.02       4   
4         NaN      330.0  68.54       0    NaN    NaN        -0.02       5   

   SepsisLabel  Patient_ID  
0            0       17072 

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE

# === Przygotowanie danych ===
X = df.drop(columns=['SepsisLabel'])
y = df['SepsisLabel']

# Imputacja braków
imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Skalowanie
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_imputed), columns=X.columns)

# Podział danych
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)

# Oversampling klasy 1
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# === Random Forest z tuningiem ===
model = RandomForestClassifier(
    n_estimators=60,
    max_depth=15,
    min_samples_leaf=4,
    class_weight='balanced',
    random_state=42
)
model.fit(X_resampled, y_resampled)

# === Predykcja z regulacją progu ===
y_proba = model.predict_proba(X_test)[:, 1]
threshold = 0.3  # niższy próg, lepszy recall
y_pred = (y_proba > threshold).astype(int)

# === Ewaluacja ===
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))

# === Ważność cech ===
importances = model.feature_importances_
important_features = pd.Series(importances, index=X.columns).sort_values(ascending=False)
print("\nTop 10 ważnych cech:")
print(important_features.head(10))


Confusion Matrix:
 [[186718 118141]
 [   938   4645]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.61      0.76    304859
           1       0.04      0.83      0.07      5583

    accuracy                           0.62    310442
   macro avg       0.52      0.72      0.42    310442
weighted avg       0.98      0.62      0.75    310442

ROC AUC: 0.8119459343941609

Top 10 ważnych cech:
Unnamed: 0     0.155702
ICULOS         0.145313
Hour           0.125260
Resp           0.091268
FiO2           0.070222
O2Sat          0.067557
HR             0.047594
HospAdmTime    0.037611
Patient_ID     0.037271
DBP            0.029287
dtype: float64


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

# === Dane wejściowe ===
X = df.drop(columns=['SepsisLabel'])

# Usuń kolumny nieistotne
X = X.loc[:, ~X.columns.str.contains("Unnamed")]
X = X.drop(columns=['Patient_ID'], errors='ignore')
y = df['SepsisLabel']

# === Imputacja braków ===
X_imputed = pd.DataFrame(SimpleImputer(strategy='median').fit_transform(X), columns=X.columns)

# === Skalowanie ===
X_scaled = pd.DataFrame(StandardScaler().fit_transform(X_imputed), columns=X.columns)

# === Wstępny model do wyboru cech ===
model_temp = RandomForestClassifier(n_estimators=40, random_state=42)
model_temp.fit(X_scaled, y)
importances = model_temp.feature_importances_
important_features = pd.Series(importances, index=X.columns).sort_values(ascending=False)
top_features = important_features.head(15).index

# === Redukcja do najlepszych cech ===
X_selected = X_scaled[top_features]

# === Podział danych ===
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, stratify=y, random_state=42
)

# === SMOTE ===
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# === XGBoost z dopasowanym scale_pos_weight ===
scale_weight = (y_resampled == 0).sum() / (y_resampled == 1).sum()
model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=scale_weight,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
model.fit(X_resampled, y_resampled)

# === Predykcja probabilistyczna ===
y_proba = model.predict_proba(X_test)[:, 1]

# === Dobór optymalnego progu ===
best_f1 = 0
best_thresh = 0.5
for t in np.arange(0.1, 0.9, 0.01):
    y_temp = (y_proba > t).astype(int)
    score = f1_score(y_test, y_temp)
    if score > best_f1:
        best_f1 = score
        best_thresh = t

# === Finalna predykcja i ocena ===
y_pred = (y_proba > best_thresh).astype(int)
print(f"Najlepszy próg: {best_thresh:.2f}, F1-score: {best_f1:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))

# === Top cechy ===
print("\nTop 15 cech użytych w modelu:")
print(important_features.head(15))



Parameters: { "use_label_encoder" } are not used.



Najlepszy próg: 0.60, F1-score: 0.1677
Confusion Matrix:
 [[295582   9277]
 [  4223   1360]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98    304859
           1       0.13      0.24      0.17      5583

    accuracy                           0.96    310442
   macro avg       0.56      0.61      0.57    310442
weighted avg       0.97      0.96      0.96    310442

ROC AUC: 0.7481231703409131

Top 15 cech użytych w modelu:
Age            0.123837
HospAdmTime    0.100413
ICULOS         0.085504
Hour           0.080849
HR             0.073240
SBP            0.072547
MAP            0.068405
Resp           0.059242
DBP            0.056375
O2Sat          0.048293
Temp           0.042164
Glucose        0.017817
Gender         0.016327
EtCO2          0.015135
FiO2           0.010993
dtype: float64
