In [62]:
import pandas as pd
import numpy as np
from sklearn.neighbors import LocalOutlierFactor
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler


from scipy.stats import shapiro, mannwhitneyu,ttest_ind
from collections import Counter

# Načtení datasetu

In [63]:
# Relative path to the semicolon-separated patient table
filepath = "./dataSepsis.csv"

# Read the whole table into a DataFrame
fullSepsis = pd.read_csv(
    filepath,
    sep=";",
)

# Odstranění nevyhovujících sloupců

In [64]:
# Columns excluded from further analysis
drop_list = [
    'Temp', 'EtCO2', 'BaseExcess', 'HCO3', 'FiO2', 'pH',
    'PaCO2', 'SaO2', 'AST', 'Alkalinephos', 'Chloride', 'Bilirubin_direct',
    'Lactate', 'Phosphate', 'Bilirubin_total', 'TroponinI', 'PTT', 'Fibrinogen',
    'Unit1', 'Unit2', 'SBP', 'DBP', 'Hct',
]

# Build a new frame without those columns; fullSepsis itself is left untouched
relevantSepsis = fullSepsis.drop(drop_list, axis="columns")


# Odstranění řádků, které mají více než polovinu hodnot NaN

In [65]:
# Keep only rows in which at least half of the columns hold a value.
# dropna documents `thresh` as an integer count of required non-NaN values, so
# half the column count is rounded up explicitly; for an integer count this is
# the same condition as comparing against shape[1] / 2.
min_non_nan = (relevantSepsis.shape[1] + 1) // 2
relevantSepsis = relevantSepsis.dropna(thresh=min_non_nan)

# Renumber the remaining rows 0..n-1 and discard the old index in one step
# (no temporary 'index' column and no in-place mutation).
relevantSepsis = relevantSepsis.reset_index(drop=True)

In [66]:
# Fill the remaining NaNs from the 1000 nearest rows, all neighbours weighted equally
# NOTE(review): the features are not scaled before the distance computation and the
# isSepsis column is still part of the frame at this point — confirm both are intended.
imputer = KNNImputer(weights="uniform", n_neighbors=1000)
imputedFullSepsis = imputer.fit_transform(relevantSepsis)

In [67]:
# Wrap the imputed matrix in a DataFrame again, reusing the original column names
imputedDF = pd.DataFrame(imputedFullSepsis, columns=relevantSepsis.columns)

Unnamed: 0,HR,O2Sat,MAP,Resp,BUN,Calcium,Creatinine,Glucose,Magnesium,Potassium,Hgb,WBC,Platelets,Age,Gender,HospAdmTime,ICULOS,isSepsis
0,103.0,90.0,83.09665,30.00000,14.0,9.30000,0.70,193.0,2.00000,3.8,12.5,5.7,317.0,83.14,0.0,-0.03,17.0,0.0
1,58.0,95.0,77.00000,11.00000,100.0,7.90000,2.50,78.0,2.50000,5.1,9.7,11.0,158.0,75.91,0.0,-98.60,10.0,0.0
2,91.0,94.0,74.00000,34.00000,30.0,10.90000,0.90,113.0,2.40000,3.8,8.8,8.3,465.0,45.82,0.0,-1195.71,11.0,0.0
3,92.0,100.0,86.35011,18.44800,9.0,8.38918,0.70,73.0,1.95320,3.8,12.2,12.0,298.0,52.01,1.0,-0.03,13.0,0.0
4,155.5,94.5,102.00000,33.00000,68.0,5.90000,3.80,263.0,1.90000,4.6,15.8,9.3,26.0,64.24,1.0,-0.05,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36120,59.0,95.0,110.00000,14.00000,9.0,8.80000,0.81,116.0,2.00000,3.5,13.1,7.0,154.0,76.00,1.0,-14.90,19.0,0.0
36121,81.0,99.0,71.00000,20.00000,8.0,8.80000,0.58,135.0,2.30000,4.0,13.8,12.6,238.0,84.00,0.0,-6.69,25.0,0.0
36122,76.0,96.0,79.00000,14.00000,6.0,9.80000,0.85,101.5,1.98515,3.1,16.1,10.8,201.0,30.00,1.0,-0.02,2.0,0.0
36123,76.0,89.0,102.00000,17.37875,49.0,7.80000,6.60,91.0,1.90000,4.2,8.9,12.5,188.0,60.00,0.0,-53.64,15.0,0.0


In [68]:
# Split the isSepsis label off from the feature columns
isSepsis = imputedDF.loc[:, 'isSepsis']
imputedDF = imputedDF.drop('isSepsis', axis="columns")

In [69]:
# One LocalOutlierFactor model, refitted for every feature in turn
lof_model = LocalOutlierFactor(n_neighbors=1000, contamination='auto')

# Outlier labels per value, computed column by column (-1 == outlier)
outlier_scores = pd.DataFrame(
    {
        feature: lof_model.fit_predict(imputedDF[[feature]])
        for feature in imputedDF.columns
    },
    index=imputedDF.index,
)

# Blank out every value flagged as a local outlier so it can be imputed again
is_outlier = outlier_scores == -1
imputedDF[is_outlier] = np.nan


In [70]:
# Re-impute the values that were blanked out as outliers, again from the 1000 nearest rows
secondImputer = KNNImputer(weights="uniform", n_neighbors=1000)
finalSepsis = secondImputer.fit_transform(imputedDF)

In [71]:
# Wrap the re-imputed matrix in a DataFrame, reusing the feature column names
finalSepsis = pd.DataFrame(finalSepsis, columns=imputedDF.columns)

Unnamed: 0,HR,O2Sat,MAP,Resp,BUN,Calcium,Creatinine,Glucose,Magnesium,Potassium,Hgb,WBC,Platelets,Age,Gender,HospAdmTime,ICULOS
0,103.000000,90.000000,83.09665,30.000000,14.0,9.300000,0.700000,193.0,2.000000,3.8,12.500000,5.7,317.000000,83.14,0.0,-0.03000,11.535
1,58.000000,95.000000,77.00000,17.409039,100.0,7.900000,2.500000,78.0,2.041298,5.1,9.700000,11.0,158.000000,75.91,0.0,-98.60000,10.000
2,91.000000,94.000000,74.00000,18.895090,30.0,8.279578,0.900000,113.0,2.400000,3.8,8.800000,8.3,465.000000,45.82,0.0,-71.26838,11.000
3,92.000000,100.000000,86.35011,18.130112,9.0,8.446196,0.700000,73.0,1.953200,3.8,12.200000,12.0,298.000000,52.01,1.0,-0.03000,13.000
4,84.778337,97.531698,102.00000,33.000000,68.0,8.333931,3.800000,263.0,1.900000,4.6,10.915671,9.3,213.084087,64.24,1.0,-0.05000,2.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36120,59.000000,95.000000,110.00000,14.000000,9.0,8.800000,1.158000,116.0,2.000000,3.5,13.100000,7.0,154.000000,76.00,1.0,-14.90000,19.000
36121,81.000000,99.000000,71.00000,20.000000,8.0,8.800000,1.055910,135.0,2.300000,4.0,13.800000,12.6,238.000000,84.00,0.0,-6.69000,25.000
36122,76.000000,96.000000,79.00000,14.000000,6.0,9.800000,0.850000,101.5,1.985150,3.1,11.703026,10.8,201.000000,30.00,1.0,-0.02000,2.000
36123,76.000000,97.504945,102.00000,17.378750,49.0,7.800000,6.600000,91.0,1.900000,4.2,8.900000,12.5,188.000000,60.00,0.0,-53.64000,15.000


In [87]:
# Redundancy check: decide for every feature whether it differs significantly between the two groups
def Redundance(sampleSepsis0, sampleSepsis1, alpha=0.05):
    """Return the names of the features that do NOT differ significantly between two samples.

    Columns are compared by position. Each pair of columns is first tested for
    normality with the Shapiro-Wilk test; if either sample rejects normality the
    groups are compared with the Mann-Whitney U test, otherwise with an
    independent two-sample t-test.

    Parameters
    ----------
    sampleSepsis0, sampleSepsis1 : pandas.DataFrame
        The two samples to compare. Both are indexed by column position; the
        reported names are taken from ``sampleSepsis0``.
    alpha : float, default 0.05
        Significance level used for the normality check and for the group
        comparison.

    Returns
    -------
    list
        Column names, in column order, whose comparison p-value is not below
        ``alpha``.
    """
    indices_out = []
    for i, name in enumerate(sampleSepsis0.columns):
        values0 = sampleSepsis0.iloc[:, i]
        values1 = sampleSepsis1.iloc[:, i]

        _, p_sepsis1 = shapiro(values1)
        _, p_sepsis0 = shapiro(values0)

        if p_sepsis1 < alpha or p_sepsis0 < alpha:
            # At least one sample is not normal -> rank-based test. The alternative
            # is spelled out so the result is two-sided (like ttest_ind) regardless
            # of the installed SciPy version's default.
            _, p = mannwhitneyu(values0, values1, alternative='two-sided')
        else:
            _, p = ttest_ind(values0, values1)

        # `not p < alpha` (rather than `p >= alpha`) keeps a NaN p-value in the
        # "not significant" list, as the original else-branch did.
        if not p < alpha:
            indices_out.append(name)

    return indices_out

# Zjištění, které příznaky nejsou významné

In [89]:
# Counter of how many times each feature was judged non-significant
string_counter = Counter()

# Seeded generator so that a fresh Restart & Run All reproduces the same counts
sampling_rng = np.random.default_rng(42)

# The group split does not change between iterations, so it is done once up front
sepsis_rows = finalSepsis[isSepsis==1]
non_sepsis_rows = finalSepsis[isSepsis==0]

# Repeatedly run the redundancy check on random patients with and without sepsis
for _ in range(1000):

    sampleSepsis1 = sepsis_rows.sample(2000, random_state=sampling_rng)
    sampleSepsis0 = non_sepsis_rows.sample(2000, random_state=sampling_rng)
    function_output = Redundance(sampleSepsis0, sampleSepsis1)
    string_counter.update(function_output)


# Print each non-significant feature with the number of runs in which it was judged non-significant
for string, count in string_counter.items():
    print(f"{string}: {count}")

Potassium: 317
Platelets: 971
Age: 907
Gender: 241
O2Sat: 3
HospAdmTime: 7


In [90]:
# Remove the two features that were non-significant in the vast majority of resampling runs
finalSepsis = finalSepsis.drop(['Age', 'Platelets'], axis="columns")