In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel, VarianceThreshold, mutual_info_classif
from sklearn.metrics import matthews_corrcoef, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Hiperparametri eksperimenta

In [2]:
# PreviousAdmissionDays
PREV_ADM_DAYS_MEDIAN = 1055

# Height_Discharge
HEIGHT_MEDIAN_MALE = 176
HEIGHT_MEDIAN_FEMALE = 162
HEIGHT_MEDIAN_GENDER_MISSING = 172
HEIGHT_LOWER_LIM = 130
HEIGHT_UPPER_LIM = 204

# Weigth_Discharge
WEIGHT_MEDIAN_MALE = 90
WEIGHT_MEDIAN_FEMALE = 75
WEIGHT_MEDIAN_GENDER_MISSING = 85
WEIGHT_LOWER_LIM = 33
WEIGHT_UPPER_LIM = 199

GRUPE_LJEKOVA = {
  
  "Severe_cases_Antibiotics": [
    "amoxicillin_and_betalactamase_inhibitor_count",
    "ampicillin_and_betalactamase_inhibitor_count",
    "cefepime_count",
    "ceftazidime_count",
    "ceftazidime_and_betalactamase_inhibitor_count",
    "ceftriaxone_count",
    "colistin_count",
    "dexamethasone_and_antiinfectives_count",
    "linezolid_count",
    "meropenem_count",
    "piperacillin_and_betalactamase_inhibitor_count",
    "vancomycin_count"
  ],


 "Routine_cases_Antibiotics": [
    "amoxicillin_count",
    "ampicillin_count",
    "azithromycin_count",
    "benzathine_benzylpenicillin_count",
    "cefalexin_count",
    "cefazolin_count",
    "cefuroxime_count",
    "chloramphenicol_count",
    "ciprofloxacin_count",
    "clindamycin_count",
    "erythromycin_count",
    "flucloxacillin_count",
    "fosfomycin_count",
    "gentamicin_count",
    "levofloxacin_count",
    "metronidazole_count",
    "moxifloxacin_count",
    "mupirocin_count",
    # "noroxin_count",
    "rifampicin_count",
    "silver_sulfadiazine_count",
    "sulfamethoxazole_and_trimethoprim_count",
    "tobramycin_count"
  ],

  "Antikoagulanti": [
    "apixaban_count",
    "dabigatran_etexilate_count",
    "enoxaparin_count",
    "fondaparinux_count",
    "heparin_count",
    "nadroparin_count",
    "organoheparinoid_count",
    "warfarin_count"
  ],

  "Coagulation_factors": [
  
    "coagulation_factor_ix_ii_vii_and_x_in_combination_count",
    "coagulation_factor_viia_count"
  ],

  "Imunosupresivi": [

    "dexamethasone_count"
  ],

"5-ASA": [
    "mesalazine_count", 
 ],


  "Kortikosteroidi": [
    "dexamethasone_count",
    "dexamethasone_and_antiinfectives_count",
    "hydrocortisone_count",
    "mometasone_count",
    "prednisone_count"
 ],  

 "Oncology_Medications": [
    "exemestane_count"
  ],

  "Antitrombotici": [
    "clopidogrel_count",
    "ticagrelor_count"
   ],



  "Antidijabetici_simplex": [
    "acarbose_count",
    "dapagliflozin_count",
    "empagliflozin_count",
    "gliclazide_count",
    "glimepiride_count",
    "gliquidone_count",
    "linagliptin_count",
    "metformin_count",
    # "nateglinide_count",
    # "pioglitazone_count",
    "repaglinide_count",
    "sitagliptin_count"
  ],

  "Antidijabetici_complex_gradus_1": [
 
    "metformin_and_alogliptin_count",
    "metformin_and_empagliflozin_count",
    "metformin_and_vildagliptin_count"
  ],

  "Antidijabetici_complex_gradus_2": [
    "insulin_aspart_count",
    "insulin_glargine_count",
    "insulin_human_count"
  ],

 

  "NSAID_analgetici": [
    "paracetamol_count",
    "acetylsalicylic_acid_count",
    "diclofenac_count",
    "ibuprofen_count",
    "ketoprofen_count",
    "metamizole_sodium_count",
    "tramadol_and_paracetamol_count"

 ],
  "Opioidni_analgetici": [
    "fentanyl_count",
    "morphine_count",
    "pethidine_count",
    "tramadol_count",
    "tramadol_and_paracetamol_count"
  ],
 "Antihipertenzivi": [
    "amlodipine_count",
    "diltiazem_count",
    "doxazosin_count",
    "losartan_count",
    "atenolol_count",
    "bisoprolol_count",
    "carvedilol_count",
    "indapamide_count",
    "metoprolol_count",
    "moxonidine_count",
    "nebivolol_count",
    "lercanidipine_count",
    "lisinopril_count",
    "perindopril_count",
    "ramipril_count",
    "valsartan_count",
    "verapamil_count",
    "trandolapril_count",
    "urapidil_count",
    "zofenopril_count"
   ], 

 "Antihipertenzivi_kompleksna_th.": [
 
    "losartan_and_diuretics_count",
    "perindopril_and_amlodipine_count",
    "perindopril_amlodipine_and_indapamide_count",
    "perindopril_and_diuretics_count",
    "ramipril_and_amlodipine_count",
    "ramipril_and_diuretics_count",
    "valsartan_amlodipine_and_hydrochlorothiazide_count",
    "valsartan_and_sacubitril_count"
  
   ], 


"Digitalis" : [
"metildigoxin_count"
  ], 



 "Inotropes": [
    "dobutamine_count"
  ],

"HCN_blokatori": [
    "ivabradine_count"
   ],


"Antianginal": [
    "glyceryl_trinitrate_count",
    "isosorbide_dinitrate_count",
    "isosorbide_mononitrate_count",
    "trimetazidine_count"

  ],

 "Vasoactive_Agents": [
    "epinephrine_count"
 ],

  "Diuretici": [
    "eplerenone_count",
    "furosemide_count",
    "indapamide_count",
    "mannitol_count",
    "spironolactone_count",
    "torasemide_count"
 ],
  "Statini": [
    "atorvastatin_count",
    "rosuvastatin_count",
    "rosuvastatin_and_ezetimibe_count"

  ],

 "Antiarrhythmic_Medications": [
    "amiodarone_count", 
    "propafenone_count",
    "sotalol_count"
  ],

  "Antipsihotici": [
    "haloperidol_count",
    "promazine_count",
    "quetiapine_count",
    "risperidone_count",
    "sulpiride_count"
  ],
  "Antiepileptici": [
    "carbamazepine_count",
    "lamotrigine_count",
    "levetiracetam_count",
    "valproic_acid_count" 
 ], 
  "Sedativi_i_anksiolitici": [
    "alprazolam_count",
    "bromazepam_count",
    "clonazepam_count",
    "diazepam_count",
    "dexmedetomidine_count",
    "lorazepam_count",
    "midazolam_count",
    "methylphenobarbital_count",
    "nitrazepam_count",
    "oxazepam_count",
    "phenobarbital_count",
    "zolpidem_count"
  ],

 "Antidepressants": [
    "escitalopram_count",
    "mirtazapine_count",
    "paroxetine_count",
    "venlafaxine_count",
    "tianeptine_count"
  ],

  "Antigout": [
    "allopurinol_count",
    "colchicine_count"
 ],

  "Proton_pump_inhibitors_and_H2_blockers": [
   "esomeprazole_count", 
   "pantoprazole_count",
   "rabeprazole_count",
   "ranitidine_count"
 
  ],

"Induction_Anesthetics": [
    "etomidate_count",
    "ketamine_count",
    "propofol_count", 
    "thiopental_count"
  ],

"Gas_Anesthetics": [
  "sevoflurane_count"
 ],

  "Antihistaminici_i_drugi_antialergijski": [
    "loratadine_count",
    "chloropyramine_count",
    "montelukast_count"
  ],

  "Antiasmatici": [
    "aminophylline_count",
    "ipratropium_bromide_count",
    "salbutamol_count",
    "salbutamol_and_ipratropium_bromide_count",
    "theophylline_count",
    "tiotropium_bromide_count"
  ], 

 "Antimuscarinics": [
    "atropine_count",
    "butylscopolamine_count",
    "trospium_count"
  ],

  "Antiviralni": [
    "aciclovir_count",
    "oseltamivir_count"
  ],
  "Vitamini_i_suplementi": [
    "colecalciferol_count",
    "ferric_oxide_polymaltose_complexes_count",
    "ferrous_fumarate_count",
    "folic_acid_count",
    "phytomenadione_count"
  ],

  "Laxatives": [
    "lactulose_count",
    "macrogol_combinations_count"
 ],

 # "Antimigraine": [
 #    "sumatriptan_count"
 # ],

  "Antiemetici": [
    "ondansetron_count"
 ],

 "Thyroid_therapy": [
    "levothyroxine_sodium_count",
    "thiamazole_count"
 
  ],
  "Antifungalni": [
    "fluconazole_count",
    "miconazole_count",
    "clotrimazole_count"
  ],

  "Antiseptics_and_Disinfectants": [
    "chlorhexidine_count"
  ], 

"Antiemetics": [
    "metoclopramide_count",
    "thiethylperazine_count"
],

  "Nutritional_Components": [
    "carbohydrates_count",
    "omega3triglycerides_incl_other_esters_and_acids_count"
  ],

 "Electrolyte_and_volume_Management": [
   "albumin_count",
   "electrolytes_count",
   "calcium_chloride_count",
   "potassium_chloride_count",
   "potassium_different_salts_in_combination_count"


  ],


"Local_Anesthetics": [
    "levobupivacaine_count",
    "lidocaine_count"
  ],


"Antidijarojici": [
   "loperamide_count"
  ],


"Antiparkinsonici": [
  "levodopa_and_decarboxylase_inhibitor_count"
  ],

"Anti_Alzheimer": [
  "memantine_count"
  ],

"BHP": [ 
  "tamsulosin_count",
    "tamsulosin_and_dutasteride_count"
],


 "Vasodilators_and_Urological_Treatments": [
    "alprostadil_count"
  ],

"Radiocontrast" : [
"iodixanol_count" 
  ],

"Tranexamic_acid" : [
"tranexamic_acid_count"
  ],

 "Others": [
    "combinations_count",
    "flumazenil_count",
    "naloxone_count"
  ],

"Protamin": [
"protamine_count",

  ]
}

GROUP_ICD10 = {
  "High_Rehospitalization_Risk": [
    "A", # "Certain infectious and parasitic diseases"
    "C", # "Neoplasms"
    "E", # "Endocrine, nutritional and metabolic diseases"
    "I", # "Diseases of the circulatory system"
    "J", # "Diseases of the respiratory system"
    "K", # "Diseases of the digestive system"
    "N", # "Diseases of the genitourinary system"
    "S", # "Injury, poisoning and certain other consequences of external causes"
    "T"  # "Injury, poisoning and certain other consequences of external causes"
  ],

  "Low_Rehospitalization_Risk": [
    "B", # "Certain infectious and parasitic diseases"
    "D", # "Neoplasms and diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism"
    "F", # "Mental and behavioural disorders"
    "G", # "Diseases of the nervous system"
    "H", # "Diseases of the eye and adnexa; Diseases of the ear and mastoid process"
    "L", # "Diseases of the skin and subcutaneous tissue"
    "M", # "Diseases of the musculoskeletal system and connective tissue"
    # NOT IN SYNTH DATA "O", # "Pregnancy, childbirth and the puerperium"
    # NOT IN SYNTH DATA "P", # "Certain conditions originating in the perinatal period"
    "Q", # "Congenital malformations, deformations and chromosomal abnormalities"
    "R", # "Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified"
    # NOT IN SYNTH DATA "V", # "External causes of morbidity and mortality"
    # NOT IN SYNTH DATA "W", # "External causes of morbidity and mortality"
    # NOT IN SYNTH DATA "X", # "External causes of morbidity and mortality"
    "Y", # "External causes of morbidity and mortality"
    "Z"  # "Factors influencing health status and contact with health services"
  ]
}



# Funkcije za ispravljanje dataset-a

In [3]:
df = pd.read_csv('../data/train.csv', index_col=0)
df_test = pd.read_csv('../data/test.csv', index_col=0)
all = pd.concat([df, df_test])

In [4]:
# Laurini ljekovi koji se ne pojavljuju u nasem datasetu
not_appearing = []
all_laura_ljek = []
for k, v in GRUPE_LJEKOVA.items():
    for ljek in v: 
        all_laura_ljek.append(ljek)
        if ljek not in all.columns: 
            not_appearing.append(ljek)
not_appearing

[]

Ovi lijekovi koje je laura navela se ne pojavljuju u nasem datasetu (prvi krug)

In [5]:
# Kolone iz naseg dataseta koje se ne pojavljuju u laurinim ljekovima
for l in all.columns[16:]:
    if l not in all_laura_ljek:
        print(l)

b03aa07_count
betamethasone_count
indometacin_count
iohexol_count
lacidipine_count
methylprednisolone_count
n02bf01_count
n02bf02_count
neostigmine_count
nitrofurantoin_count
norepinephrine_count
norfloxacin_count
octreotide_count
pholcodine_count
prasugrel_count
ranolazine_count
rivaroxaban_count
rocuronium_bromide_count
salmeterol_count
salmeterol_and_fluticasone_count
sertraline_count
sufentanil_count
x99nijenavedenowho_count
A
B
C
D
E
F
G
H
I
J
K
L
M
N
Q
R
S
T
U
Y
Z
Probability_0
Probability_1


['doxycycline_count',
 'acetaminophen_count',
 'hydrochlorothiazide_count',
 'azathioprine_count'] -> ovi ljekovi iz laurinog .txt filea se ne pojavljuju u skupu podataka pa sam ih maknuo iz gornjeg dicta

In [6]:
age_groups_sorted = sorted(list(all.Age_Group.unique()))
age_group_numeric_mapping = {ag: i for i, ag in enumerate(age_groups_sorted)}
age_group_numeric_mapping

{'18-22': 0,
 '23-27': 1,
 '28-32': 2,
 '33-37': 3,
 '38-42': 4,
 '43-47': 5,
 '48-52': 6,
 '53-57': 7,
 '58-62': 8,
 '63-67': 9,
 '68-72': 10,
 '73-77': 11,
 '78-82': 12,
 '83-87': 13,
 '88-92': 14,
 '93-97': 15}

In [7]:
def fix_PreviousAdmissionDays(df):
    df['PrevAdmDaysAvail'] = (df.PreviousAdmissionDays > 0) * 1.0 # Valuable information for prediction
    df.loc[df.PreviousAdmissionDays == -8, 'PreviousAdmissionDays'] = PREV_ADM_DAYS_MEDIAN
    return df

def fix_Weigth_Discharge(df):
    gender_w_medians = {
        'Ž': WEIGHT_MEDIAN_FEMALE,
        'M': WEIGHT_MEDIAN_MALE,
        None: WEIGHT_MEDIAN_GENDER_MISSING
    }
    row_indexer = (df.Weight_Discharge < WEIGHT_LOWER_LIM) | (df.Weight_Discharge > WEIGHT_UPPER_LIM)
    df.loc[row_indexer, 'Weight_Discharge'] = df[row_indexer].Gender.map(gender_w_medians)
    return df

def fix_Heigth_Discharge(df):
    gender_h_medians = {
        'Ž': HEIGHT_MEDIAN_FEMALE,
        'M': HEIGHT_MEDIAN_MALE,
        None: HEIGHT_MEDIAN_GENDER_MISSING
    }
    row_indexer = (df.Height_Discharge < HEIGHT_LOWER_LIM) | (df.Height_Discharge > HEIGHT_UPPER_LIM)
    df.loc[row_indexer, 'Height_Discharge'] = df[row_indexer].Gender.map(gender_h_medians)
    return df

def aggregate_mediactions(df):
    to_drop = []
    for med_group, list_of_meds in GRUPE_LJEKOVA.items():
        df[f"{med_group}_count"] = df[list_of_meds].sum(axis=1)
        to_drop.extend(list_of_meds)
    df.drop(columns=to_drop, inplace=True)
    return df

def aggregate_icd10_diagnosis(df):
    to_drop = []
    for icd10_group, list_of_diag in GROUP_ICD10.items():
        df[f"{icd10_group}_count"] = df[list_of_diag].sum(axis=1)
        to_drop.extend(list_of_diag)
    df.drop(columns=to_drop, inplace=True)
    return df

def convert_age_groups_to_numeric(df):
    df['Age_Group'] = df.Age_Group.map(age_group_numeric_mapping)
    return df

def fix_dataset(df):
    df = fix_PreviousAdmissionDays(df)
    df = fix_Weigth_Discharge(df)
    df = fix_Heigth_Discharge(df)
    df = aggregate_mediactions(df)
    df = aggregate_icd10_diagnosis(df)
    df = convert_age_groups_to_numeric(df)
    return df
    

In [8]:
all = fix_dataset(all)

In [9]:
all.shape

(35923, 100)

In [10]:
all.head()

Unnamed: 0,AdmissionDx,AdmissionType,PreviousAdmissionDays,LOS,Age_Group,Gender,Surgery_Count,LOS_ICU,Discharge_Specialty,Dx_Discharge,Discharge_Status,Weight_Discharge,Height_Discharge,Education,Current_Work_Status,Label,b03aa07_count,betamethasone_count,indometacin_count,iohexol_count,lacidipine_count,methylprednisolone_count,n02bf01_count,n02bf02_count,neostigmine_count,nitrofurantoin_count,norepinephrine_count,norfloxacin_count,octreotide_count,pholcodine_count,prasugrel_count,ranolazine_count,rivaroxaban_count,rocuronium_bromide_count,salmeterol_count,salmeterol_and_fluticasone_count,sertraline_count,sufentanil_count,x99nijenavedenowho_count,U,Probability_0,Probability_1,PrevAdmDaysAvail,Severe_cases_Antibiotics_count,Routine_cases_Antibiotics_count,Antikoagulanti_count,Coagulation_factors_count,Imunosupresivi_count,5-ASA_count,Kortikosteroidi_count,Oncology_Medications_count,Antitrombotici_count,Antidijabetici_simplex_count,Antidijabetici_complex_gradus_1_count,Antidijabetici_complex_gradus_2_count,NSAID_analgetici_count,Opioidni_analgetici_count,Antihipertenzivi_count,Antihipertenzivi_kompleksna_th._count,Digitalis_count,Inotropes_count,HCN_blokatori_count,Antianginal_count,Vasoactive_Agents_count,Diuretici_count,Statini_count,Antiarrhythmic_Medications_count,Antipsihotici_count,Antiepileptici_count,Sedativi_i_anksiolitici_count,Antidepressants_count,Antigout_count,Proton_pump_inhibitors_and_H2_blockers_count,Induction_Anesthetics_count,Gas_Anesthetics_count,Antihistaminici_i_drugi_antialergijski_count,Antiasmatici_count,Antimuscarinics_count,Antiviralni_count,Vitamini_i_suplementi_count,Laxatives_count,Antiemetici_count,Thyroid_therapy_count,Antifungalni_count,Antiseptics_and_Disinfectants_count,Antiemetics_count,Nutritional_Components_count,Electrolyte_and_volume_Management_count,Local_Anesthetics_count,Antidijarojici_count,Antiparkinsonici_count,Anti_Alzheimer_count,BHP_count,Vasodilators_and_Urological_Treatments_count,Radiocontrast_count,Tranexamic_acid_count,Others_count,Protamin_count,High_Rehospitalization_Risk_count,Low_Rehospitalization_Risk_count
1,I,Elektivni,1055,1,13,Ž,0,0,3010100,I,KUĆI,53.0,156.0,ZAVRŠENA OSNOVNA ŠKOLA,DOMAĆICA,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2
2,I,Elektivni,1055,16,10,M,1,0,3100400,I,KUĆI,96.0,167.0,NEPOZNAT OBRAZOVNI STATUS,UMIROVLJENIK,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,,,0.0,3,2,3,0,0,0,0,0,0,0,0,0,6,4,2,1,0,0,0,0,0,1,1,0,0,0,4,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,5,0,0,0,0,0,0,0,0,0,0,3,1
3,I,Elektivni,279,3,0,Ž,1,0,3010100,I,KUĆI,52.0,163.0,ZAVRŠENO VISOKO OBRAZOVANJE,STUDENT,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,,,1.0,0,2,2,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,2,0
4,I,Elektivni,1905,2,13,M,0,0,3010100,I,KUĆI,95.0,175.0,ZAVRŠENA SREDNJA ŠKOLA,UMIROVLJENIK,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,4,0
5,I,Elektivni,1055,2,11,M,0,0,3010100,I,KUĆI,82.0,174.0,ZAVRŠENO VISOKO OBRAZOVANJE,REDOVAN POSAO,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,,,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,5,0


In [11]:
list(all.columns)

['AdmissionDx',
 'AdmissionType',
 'PreviousAdmissionDays',
 'LOS',
 'Age_Group',
 'Gender',
 'Surgery_Count',
 'LOS_ICU',
 'Discharge_Specialty',
 'Dx_Discharge',
 'Discharge_Status',
 'Weight_Discharge',
 'Height_Discharge',
 'Education',
 'Current_Work_Status',
 'Label',
 'b03aa07_count',
 'betamethasone_count',
 'indometacin_count',
 'iohexol_count',
 'lacidipine_count',
 'methylprednisolone_count',
 'n02bf01_count',
 'n02bf02_count',
 'neostigmine_count',
 'nitrofurantoin_count',
 'norepinephrine_count',
 'norfloxacin_count',
 'octreotide_count',
 'pholcodine_count',
 'prasugrel_count',
 'ranolazine_count',
 'rivaroxaban_count',
 'rocuronium_bromide_count',
 'salmeterol_count',
 'salmeterol_and_fluticasone_count',
 'sertraline_count',
 'sufentanil_count',
 'x99nijenavedenowho_count',
 'U',
 'Probability_0',
 'Probability_1',
 'PrevAdmDaysAvail',
 'Severe_cases_Antibiotics_count',
 'Routine_cases_Antibiotics_count',
 'Antikoagulanti_count',
 'Coagulation_factors_count',
 'Imunosupr

In [12]:
all.isna().sum()[all.isna().sum() > 0]

AdmissionDx              498
Gender                     2
Education                280
Current_Work_Status       50
Label                   7336
Probability_0          35923
Probability_1          35923
dtype: int64

Counts of missing values

# OneHot encoding and Scaling functions

In [14]:
def oneHot_encode_feature(df, feature):
    ohe = OneHotEncoder(drop='if_binary', handle_unknown='ignore')
    ohe.fit(df[df[feature].notnull()][[feature]])
    X = ohe.transform(df[[feature]])
    df[ohe.get_feature_names_out()] = X.toarray()
    df.drop(columns=[feature], inplace=True)
    return df

def scale_numeric_features(df, numeric_cols):
    scaler = StandardScaler()
    scaler.fit(df[numeric_cols])
    df[numeric_cols] = scaler.transform(df[numeric_cols])
    return df

def encode_and_scale_features(df, enc_features, scale_features):
    for f in enc_features:
        df = oneHot_encode_feature(df, f)

    df = scale_numeric_features(df, scale_features)
    return df

# Obtaining Preprocessed dataset

In [15]:
cols_to_onehot_encode = [
    'AdmissionDx',
    'AdmissionType',
    'Gender',
    'Discharge_Specialty',
    'Dx_Discharge',
    'Discharge_Status',
    'Education',
    'Current_Work_Status'
]

cols_to_scale= [
    'PreviousAdmissionDays',
    'LOS',
    'Age_Group',
    'Surgery_Count',
    'LOS_ICU',
    'Weight_Discharge',
    'Height_Discharge'
]

In [16]:
df_train = pd.read_csv('../data/train.csv', index_col=0)
df_test = pd.read_csv('../data/test.csv', index_col=0)
df_train['Train'] = 1
df_test['Train'] = 0
all = pd.concat([df_train, df_test])

In [17]:
all = fix_dataset(all)
all = encode_and_scale_features(
    all,
    enc_features=cols_to_onehot_encode,
    scale_features=cols_to_scale
)

  df[ohe.get_feature_names_out()] = X.toarray()
  df[ohe.get_feature_names_out()] = X.toarray()
  df[ohe.get_feature_names_out()] = X.toarray()
  df[ohe.get_feature_names_out()] = X.toarray()
  df[ohe.get_feature_names_out()] = X.toarray()
  df[ohe.get_feature_names_out()] = X.toarray()
  df[ohe.get_feature_names_out()] = X.toarray()
  df[ohe.get_feature_names_out()] = X.toarray()
  df[ohe.get_feature_names_out()] = X.toarray()
  df[ohe.get_feature_names_out()] = X.toarray()
  df[ohe.get_feature_names_out()] = X.toarray()
  df[ohe.get_feature_names_out()] = X.toarray()
  df[ohe.get_feature_names_out()] = X.toarray()
  df[ohe.get_feature_names_out()] = X.toarray()
  df[ohe.get_feature_names_out()] = X.toarray()
  df[ohe.get_feature_names_out()] = X.toarray()
  df[ohe.get_feature_names_out()] = X.toarray()
  df[ohe.get_feature_names_out()] = X.toarray()
  df[ohe.get_feature_names_out()] = X.toarray()
  df[ohe.get_feature_names_out()] = X.toarray()
  df[ohe.get_feature_names_out()] = X.to

In [18]:
all.head()

Unnamed: 0,PreviousAdmissionDays,LOS,Age_Group,Surgery_Count,LOS_ICU,Weight_Discharge,Height_Discharge,Label,b03aa07_count,betamethasone_count,indometacin_count,iohexol_count,lacidipine_count,methylprednisolone_count,n02bf01_count,n02bf02_count,neostigmine_count,nitrofurantoin_count,norepinephrine_count,norfloxacin_count,octreotide_count,pholcodine_count,prasugrel_count,ranolazine_count,rivaroxaban_count,rocuronium_bromide_count,salmeterol_count,salmeterol_and_fluticasone_count,sertraline_count,sufentanil_count,x99nijenavedenowho_count,U,Train,Probability_0,Probability_1,PrevAdmDaysAvail,Severe_cases_Antibiotics_count,Routine_cases_Antibiotics_count,Antikoagulanti_count,Coagulation_factors_count,Imunosupresivi_count,5-ASA_count,Kortikosteroidi_count,Oncology_Medications_count,Antitrombotici_count,Antidijabetici_simplex_count,Antidijabetici_complex_gradus_1_count,Antidijabetici_complex_gradus_2_count,NSAID_analgetici_count,Opioidni_analgetici_count,Antihipertenzivi_count,Antihipertenzivi_kompleksna_th._count,Digitalis_count,Inotropes_count,HCN_blokatori_count,Antianginal_count,Vasoactive_Agents_count,Diuretici_count,Statini_count,Antiarrhythmic_Medications_count,Antipsihotici_count,Antiepileptici_count,Sedativi_i_anksiolitici_count,Antidepressants_count,Antigout_count,Proton_pump_inhibitors_and_H2_blockers_count,Induction_Anesthetics_count,Gas_Anesthetics_count,Antihistaminici_i_drugi_antialergijski_count,Antiasmatici_count,Antimuscarinics_count,Antiviralni_count,Vitamini_i_suplementi_count,Laxatives_count,Antiemetici_count,Thyroid_therapy_count,Antifungalni_count,Antiseptics_and_Disinfectants_count,Antiemetics_count,Nutritional_Components_count,Electrolyte_and_volume_Management_count,Local_Anesthetics_count,Antidijarojici_count,Antiparkinsonici_count,Anti_Alzheimer_count,BHP_count,Vasodilators_and_Urological_Treatments_count,Radiocontrast_count,Tranexamic_acid_count,Others_count,Protamin_count,High_Rehospitalization_Risk_count,Low_Rehospitalization_Risk_count,AdmissionDx_B,AdmissionDx_C,AdmissionDx_D,AdmissionDx_E,AdmissionDx_F,AdmissionDx_G,AdmissionDx_H,AdmissionDx_I,AdmissionDx_J,AdmissionDx_K,AdmissionDx_L,AdmissionDx_M,AdmissionDx_N,AdmissionDx_Q,AdmissionDx_R,AdmissionDx_S,AdmissionDx_T,AdmissionDx_Y,AdmissionDx_Z,AdmissionType_Hitni,Gender_Ž,Discharge_Specialty_3010100,Discharge_Specialty_3100400,Discharge_Specialty_3100600,Discharge_Specialty_3190100,Discharge_Specialty_3190200,Dx_Discharge_C,Dx_Discharge_D,Dx_Discharge_E,Dx_Discharge_G,Dx_Discharge_H,Dx_Discharge_I,Dx_Discharge_J,Dx_Discharge_K,Dx_Discharge_L,Dx_Discharge_N,Dx_Discharge_Q,Dx_Discharge_R,Dx_Discharge_T,Dx_Discharge_Z,Discharge_Status_KUĆI,Discharge_Status_LIJEČENJE OKONČANO PROTIVNO SAVJETU DOKTORA,Discharge_Status_OSTALO,Discharge_Status_U DRUGU STACIONARNU ZDRAVSTVENU USTANOVU,Discharge_Status_UMRO (NIJE OBDUCIRAN),Education_NEPOZNAT OBRAZOVNI STATUS,Education_NIJE POHAĐALA OSNOVNU ŠKOLU,Education_NIJE ZAVRŠILA OSNOVNU ŠKOLU,Education_ZAVRŠENA OSNOVNA ŠKOLA,Education_ZAVRŠENA SREDNJA ŠKOLA,Education_ZAVRŠENA VIŠA ŠKOLA,Education_ZAVRŠENO VISOKO OBRAZOVANJE,Current_Work_Status_DOMAĆICA,Current_Work_Status_NEPOZNATO,Current_Work_Status_NEZAPOSLEN/POVREMENI POSAO,Current_Work_Status_POLJOPRIVREDNIK,Current_Work_Status_REDOVAN POSAO,Current_Work_Status_SAMOSTALNA DJELATNOST,Current_Work_Status_STUDENT,Current_Work_Status_UMIROVLJENIK,Current_Work_Status_UČENIK
1,-0.289586,-0.482977,1.69258,-0.458037,-0.050516,-2.007681,-1.644906,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,,,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.289586,1.883533,0.379549,1.910477,-0.050516,0.652788,-0.468054,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,1,,,0.0,3,2,3,0,0,0,0,0,0,0,0,0,6,4,2,1,0,0,0,0,0,1,1,0,0,0,4,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,5,0,0,0,0,0,0,0,0,0,0,3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-0.800832,-0.167442,-3.997224,1.910477,-0.050516,-2.069552,-0.896,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,,,1.0,0,2,2,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.270413,-0.32521,1.69258,-0.458037,-0.050516,0.590916,0.387838,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,,,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,-0.289586,-0.32521,0.817226,-0.458037,-0.050516,-0.213411,0.280851,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,,,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,5,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [19]:
all.columns.values

array(['PreviousAdmissionDays', 'LOS', 'Age_Group', 'Surgery_Count',
       'LOS_ICU', 'Weight_Discharge', 'Height_Discharge', 'Label',
       'b03aa07_count', 'betamethasone_count', 'indometacin_count',
       'iohexol_count', 'lacidipine_count', 'methylprednisolone_count',
       'n02bf01_count', 'n02bf02_count', 'neostigmine_count',
       'nitrofurantoin_count', 'norepinephrine_count',
       'norfloxacin_count', 'octreotide_count', 'pholcodine_count',
       'prasugrel_count', 'ranolazine_count', 'rivaroxaban_count',
       'rocuronium_bromide_count', 'salmeterol_count',
       'salmeterol_and_fluticasone_count', 'sertraline_count',
       'sufentanil_count', 'x99nijenavedenowho_count', 'U', 'Train',
       'Probability_0', 'Probability_1', 'PrevAdmDaysAvail',
       'Severe_cases_Antibiotics_count',
       'Routine_cases_Antibiotics_count', 'Antikoagulanti_count',
       'Coagulation_factors_count', 'Imunosupresivi_count', '5-ASA_count',
       'Kortikosteroidi_count', 'Oncology_

In [20]:
# Grupirane dijagnoze za admission i discharge

AdmissionDx_ICD10 = {
  "High_Risk_Admissions": [
    # "AdmissionDx_A", # "Certain infectious and parasitic diseases"
    "AdmissionDx_C", # "Neoplasms"
    "AdmissionDx_E", # "Endocrine, nutritional and metabolic diseases"
    "AdmissionDx_I", # "Diseases of the circulatory system"
    "AdmissionDx_J", # "Diseases of the respiratory system"
    "AdmissionDx_K", # "Diseases of the digestive system"
    "AdmissionDx_N", # "Diseases of the genitourinary system"
    "AdmissionDx_S", # "Injury, poisoning and certain other consequences of external causes"
    "AdmissionDx_T"  # "Injury, poisoning and certain other consequences of external causes"
  ],

  "Low_Risk_Admissions": [
    "AdmissionDx_B", # "Certain infectious and parasitic diseases"
    "AdmissionDx_D", # "Neoplasms and diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism"
    "AdmissionDx_F", # "Mental and behavioural disorders"
    "AdmissionDx_G", # "Diseases of the nervous system"
    "AdmissionDx_H", # "Diseases of the eye and adnexa; Diseases of the ear and mastoid process"
    "AdmissionDx_L", # "Diseases of the skin and subcutaneous tissue"
    "AdmissionDx_M", # "Diseases of the musculoskeletal system and connective tissue"
    # NOT IN SYNTH DATA "O", # "Pregnancy, childbirth and the puerperium"
    # NOT IN SYNTH DATA "P", # "Certain conditions originating in the perinatal period"
    "AdmissionDx_Q", # "Congenital malformations, deformations and chromosomal abnormalities"
    "AdmissionDx_R", # "Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified"
    # NOT IN SYNTH DATA "V", # "External causes of morbidity and mortality"
    # NOT IN SYNTH DATA "W", # "External causes of morbidity and mortality"
    # NOT IN SYNTH DATA "X", # "External causes of morbidity and mortality"
    "AdmissionDx_Y", # "External causes of morbidity and mortality"
    "AdmissionDx_Z"  # "Factors influencing health status and contact with health services"
  ]
}

DischargeDx_ICD10 = {
  "High_Risk_Discharge": [
    # "Dx_Discharge_A", # "Certain infectious and parasitic diseases"
    "Dx_Discharge_C", # "Neoplasms"
    "Dx_Discharge_E", # "Endocrine, nutritional and metabolic diseases"
    "Dx_Discharge_I", # "Diseases of the circulatory system"
    "Dx_Discharge_J", # "Diseases of the respiratory system"
    "Dx_Discharge_K", # "Diseases of the digestive system"
    "Dx_Discharge_N", # "Diseases of the genitourinary system"
    # "Dx_Discharge_S", # "Injury, poisoning and certain other consequences of external causes"
    "Dx_Discharge_T"  # "Injury, poisoning and certain other consequences of external causes"
  ],

  "Low_Risk_Discharge": [
    # "Dx_Discharge_B", # "Certain infectious and parasitic diseases"
    "Dx_Discharge_D", # "Neoplasms and diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism"
    # "Dx_Discharge_F", # "Mental and behavioural disorders"
    "Dx_Discharge_G", # "Diseases of the nervous system"
    "Dx_Discharge_H", # "Diseases of the eye and adnexa; Diseases of the ear and mastoid process"
    "Dx_Discharge_L", # "Diseases of the skin and subcutaneous tissue"
    # "Dx_Discharge_M", # "Diseases of the musculoskeletal system and connective tissue"
    # NOT IN SYNTH DATA "O", # "Pregnancy, childbirth and the puerperium"
    # NOT IN SYNTH DATA "P", # "Certain conditions originating in the perinatal period"
    "Dx_Discharge_Q", # "Congenital malformations, deformations and chromosomal abnormalities"
    "Dx_Discharge_R", # "Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified"
    # NOT IN SYNTH DATA "V", # "External causes of morbidity and mortality"
    # NOT IN SYNTH DATA "W", # "External causes of morbidity and mortality"
    # NOT IN SYNTH DATA "X", # "External causes of morbidity and mortality"
    # "Dx_Discharge_Y", # "External causes of morbidity and mortality"
    "Dx_Discharge_Z"  # "Factors influencing health status and contact with health services"
  ]
}

In [21]:
def aggregate_admissions_diagnosis(df):
    to_drop = []
    for admissions_group, list_of_diag in AdmissionDx_ICD10.items():
        df[f"{admissions_group}_count"] = df[list_of_diag].sum(axis=1)
        to_drop.extend(list_of_diag)
    df.drop(columns=to_drop, inplace=True)
    return df

def aggregate_discharge_diagnosis(df):
    to_drop = []
    for discharge_group, list_of_diag in DischargeDx_ICD10.items():
        df[f"{discharge_group}_count"] = df[list_of_diag].sum(axis=1)
        to_drop.extend(list_of_diag)
    df.drop(columns=to_drop, inplace=True)
    return df
    
def fix_dataset2(df):
    df = aggregate_admissions_diagnosis(df)
    df = aggregate_discharge_diagnosis(df)
    return df

In [22]:
all.head()

Unnamed: 0,PreviousAdmissionDays,LOS,Age_Group,Surgery_Count,LOS_ICU,Weight_Discharge,Height_Discharge,Label,b03aa07_count,betamethasone_count,indometacin_count,iohexol_count,lacidipine_count,methylprednisolone_count,n02bf01_count,n02bf02_count,neostigmine_count,nitrofurantoin_count,norepinephrine_count,norfloxacin_count,octreotide_count,pholcodine_count,prasugrel_count,ranolazine_count,rivaroxaban_count,rocuronium_bromide_count,salmeterol_count,salmeterol_and_fluticasone_count,sertraline_count,sufentanil_count,x99nijenavedenowho_count,U,Train,Probability_0,Probability_1,PrevAdmDaysAvail,Severe_cases_Antibiotics_count,Routine_cases_Antibiotics_count,Antikoagulanti_count,Coagulation_factors_count,Imunosupresivi_count,5-ASA_count,Kortikosteroidi_count,Oncology_Medications_count,Antitrombotici_count,Antidijabetici_simplex_count,Antidijabetici_complex_gradus_1_count,Antidijabetici_complex_gradus_2_count,NSAID_analgetici_count,Opioidni_analgetici_count,Antihipertenzivi_count,Antihipertenzivi_kompleksna_th._count,Digitalis_count,Inotropes_count,HCN_blokatori_count,Antianginal_count,Vasoactive_Agents_count,Diuretici_count,Statini_count,Antiarrhythmic_Medications_count,Antipsihotici_count,Antiepileptici_count,Sedativi_i_anksiolitici_count,Antidepressants_count,Antigout_count,Proton_pump_inhibitors_and_H2_blockers_count,Induction_Anesthetics_count,Gas_Anesthetics_count,Antihistaminici_i_drugi_antialergijski_count,Antiasmatici_count,Antimuscarinics_count,Antiviralni_count,Vitamini_i_suplementi_count,Laxatives_count,Antiemetici_count,Thyroid_therapy_count,Antifungalni_count,Antiseptics_and_Disinfectants_count,Antiemetics_count,Nutritional_Components_count,Electrolyte_and_volume_Management_count,Local_Anesthetics_count,Antidijarojici_count,Antiparkinsonici_count,Anti_Alzheimer_count,BHP_count,Vasodilators_and_Urological_Treatments_count,Radiocontrast_count,Tranexamic_acid_count,Others_count,Protamin_count,High_Rehospitalization_Risk_count,Low_Rehospitalization_Risk_count,AdmissionDx_B,AdmissionDx_C,AdmissionDx_D,AdmissionDx_E,AdmissionDx_F,AdmissionDx_G,AdmissionDx_H,AdmissionDx_I,AdmissionDx_J,AdmissionDx_K,AdmissionDx_L,AdmissionDx_M,AdmissionDx_N,AdmissionDx_Q,AdmissionDx_R,AdmissionDx_S,AdmissionDx_T,AdmissionDx_Y,AdmissionDx_Z,AdmissionType_Hitni,Gender_Ž,Discharge_Specialty_3010100,Discharge_Specialty_3100400,Discharge_Specialty_3100600,Discharge_Specialty_3190100,Discharge_Specialty_3190200,Dx_Discharge_C,Dx_Discharge_D,Dx_Discharge_E,Dx_Discharge_G,Dx_Discharge_H,Dx_Discharge_I,Dx_Discharge_J,Dx_Discharge_K,Dx_Discharge_L,Dx_Discharge_N,Dx_Discharge_Q,Dx_Discharge_R,Dx_Discharge_T,Dx_Discharge_Z,Discharge_Status_KUĆI,Discharge_Status_LIJEČENJE OKONČANO PROTIVNO SAVJETU DOKTORA,Discharge_Status_OSTALO,Discharge_Status_U DRUGU STACIONARNU ZDRAVSTVENU USTANOVU,Discharge_Status_UMRO (NIJE OBDUCIRAN),Education_NEPOZNAT OBRAZOVNI STATUS,Education_NIJE POHAĐALA OSNOVNU ŠKOLU,Education_NIJE ZAVRŠILA OSNOVNU ŠKOLU,Education_ZAVRŠENA OSNOVNA ŠKOLA,Education_ZAVRŠENA SREDNJA ŠKOLA,Education_ZAVRŠENA VIŠA ŠKOLA,Education_ZAVRŠENO VISOKO OBRAZOVANJE,Current_Work_Status_DOMAĆICA,Current_Work_Status_NEPOZNATO,Current_Work_Status_NEZAPOSLEN/POVREMENI POSAO,Current_Work_Status_POLJOPRIVREDNIK,Current_Work_Status_REDOVAN POSAO,Current_Work_Status_SAMOSTALNA DJELATNOST,Current_Work_Status_STUDENT,Current_Work_Status_UMIROVLJENIK,Current_Work_Status_UČENIK
1,-0.289586,-0.482977,1.69258,-0.458037,-0.050516,-2.007681,-1.644906,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,,,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.289586,1.883533,0.379549,1.910477,-0.050516,0.652788,-0.468054,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,1,,,0.0,3,2,3,0,0,0,0,0,0,0,0,0,6,4,2,1,0,0,0,0,0,1,1,0,0,0,4,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,5,0,0,0,0,0,0,0,0,0,0,3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-0.800832,-0.167442,-3.997224,1.910477,-0.050516,-2.069552,-0.896,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,,,1.0,0,2,2,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.270413,-0.32521,1.69258,-0.458037,-0.050516,0.590916,0.387838,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,,,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,-0.289586,-0.32521,0.817226,-0.458037,-0.050516,-0.213411,0.280851,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,,,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,5,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [23]:
all = fix_dataset2(all)

  df[f"{admissions_group}_count"] = df[list_of_diag].sum(axis=1)
  df[f"{admissions_group}_count"] = df[list_of_diag].sum(axis=1)
  df[f"{discharge_group}_count"] = df[list_of_diag].sum(axis=1)
  df[f"{discharge_group}_count"] = df[list_of_diag].sum(axis=1)


In [25]:
df_train = all[all.Train == 1].copy()
df_test = all[all.Train == 0].copy()
df_train.drop(columns=['Train'], inplace=True)
df_train.drop(columns=['Probability_0'], inplace=True)
df_train.drop(columns=['Probability_1'], inplace=True)
df_test.drop(columns=['Train'], inplace=True)
df_test.drop(columns=['Probability_0'], inplace=True)
df_test.drop(columns=['Probability_1'], inplace=True)
df_test.drop(columns=['Label'], inplace=True)

In [26]:
df_train.shape

(28587, 122)

In [27]:
df_test.shape

(7336, 121)

In [28]:
feature_cols = list(df_train.columns)
feature_cols.remove('Label')

In [29]:
print(feature_cols)

['PreviousAdmissionDays', 'LOS', 'Age_Group', 'Surgery_Count', 'LOS_ICU', 'Weight_Discharge', 'Height_Discharge', 'b03aa07_count', 'betamethasone_count', 'indometacin_count', 'iohexol_count', 'lacidipine_count', 'methylprednisolone_count', 'n02bf01_count', 'n02bf02_count', 'neostigmine_count', 'nitrofurantoin_count', 'norepinephrine_count', 'norfloxacin_count', 'octreotide_count', 'pholcodine_count', 'prasugrel_count', 'ranolazine_count', 'rivaroxaban_count', 'rocuronium_bromide_count', 'salmeterol_count', 'salmeterol_and_fluticasone_count', 'sertraline_count', 'sufentanil_count', 'x99nijenavedenowho_count', 'U', 'PrevAdmDaysAvail', 'Severe_cases_Antibiotics_count', 'Routine_cases_Antibiotics_count', 'Antikoagulanti_count', 'Coagulation_factors_count', 'Imunosupresivi_count', '5-ASA_count', 'Kortikosteroidi_count', 'Oncology_Medications_count', 'Antitrombotici_count', 'Antidijabetici_simplex_count', 'Antidijabetici_complex_gradus_1_count', 'Antidijabetici_complex_gradus_2_count', 'NSAI

In [30]:
X_train = df_train[feature_cols].values
y_train = df_train['Label'].values
X_test = df_test[feature_cols].values

X_train, y_train, i X_test su numpy matrice spremne za uproabu sa sklearn library-om.

# Feature Selection

Na kraju ovog poglavlja trebamo imati X_train, y_train, X_test sa nebitnim kolonama izbačenim. Najočitije beskorisne feature se treba maknuti sa variance thresholdima, ostalim metodama itd., i onda izvrtiti backward wrapper feature selection sa lightweight RandomForestom, ili L1 log.reg. ako se RF pokaže da je jednostavno prespor. 

## Lasso

In [31]:
print(X_train.shape)

(28587, 121)


In [32]:
# Set the regularization parameter C=1
logistic = LogisticRegression(C=0.6, penalty='l1', class_weight='balanced', solver='liblinear', random_state=42).fit(X_train, y_train)
f_selector = SelectFromModel(logistic, prefit=True, threshold=1e-3)

X_train_new = f_selector.transform(X_train)
y_train_new = y_train
X_test_new = f_selector.transform(X_test)

print(X_train_new.shape)
print(y_train_new.shape)
print(X_test_new.shape)

(28587, 102)
(28587,)
(7336, 102)


In [33]:
l1_coefs = pd.DataFrame({'feature': feature_cols, 'L1_coef': logistic.coef_[0]})

In [34]:
len(l1_coefs[l1_coefs.L1_coef.abs() < 1e-3])

19

In [35]:
irrelevant_features = l1_coefs[l1_coefs['L1_coef'].abs() < 1e-3]['feature'].tolist()
df_train.drop(columns=irrelevant_features, inplace=True)


In [36]:
df_train.shape
feature_cols = list(df_train.columns)
feature_cols.remove('Label')

In [37]:
l1_coefs[l1_coefs.L1_coef.abs() < 1e-3]

Unnamed: 0,feature,L1_coef
7,b03aa07_count,0.0
19,octreotide_count,0.0
27,sertraline_count,0.0
30,U,0.0
39,Oncology_Medications_count,0.0
42,Antidijabetici_complex_gradus_1_count,0.0
56,Antipsihotici_count,0.0
80,Anti_Alzheimer_count,0.0
82,Vasodilators_and_Urological_Treatments_count,0.0
83,Radiocontrast_count,0.0


In [None]:
l1_coefs[l1_coefs.L1_coef.abs() > 1e-3]

In [38]:
df_train['Label'] = y_train

In [None]:
df_train.dobutamine_count.value_counts()

In [None]:
df_train[['dobutamine_count', 'Label']].groupby('dobutamine_count').value_counts()

In [None]:
matthews_corrcoef(y_train, logistic.predict(X_train_new))

In [None]:
logistic.predict(X_train_new).mean()

L1 feature selection might be an option...

## Variance filter methods

We can use this method to get rid of monotone features

In [40]:
v_threshold = VarianceThreshold(threshold=0.001)
v_threshold.fit(X_train_new)
meets_var_thresholds = pd.DataFrame({'feature': feature_cols, 'passes_var': v_threshold.get_support()})

X_train_filtered = v_threshold.transform(X_train_new)
feature_cols_filtered = [feature_cols[i] for i in range(len(feature_cols)) if v_threshold.get_support()[i]]
df_train = pd.DataFrame(X_train_filtered, columns=feature_cols_filtered)

In [41]:
X_test_filtered = v_threshold.transform(X_test_new)
df_test = pd.DataFrame(X_test_filtered, columns=feature_cols_filtered)

In [42]:
meets_var_thresholds.passes_var.mean()

0.9117647058823529

In [43]:
meets_var_thresholds[~meets_var_thresholds.passes_var]

Unnamed: 0,feature,passes_var
7,betamethasone_count,False
13,n02bf02_count,False
19,prasugrel_count,False
23,salmeterol_count,False
33,5-ASA_count,False
44,HCN_blokatori_count,False
60,Antiviralni_count,False
72,Antiparkinsonici_count,False
85,Discharge_Status_OSTALO,False


In [44]:
len(meets_var_thresholds[~meets_var_thresholds.passes_var])

9

In [None]:
df_test.sitagliptin_count.value_counts()

In [None]:
#df_train[['sitagliptin_count', 'Label']].groupby('sitagliptin_count').value_counts(normalize=True)

In [45]:
df_train.shape
df_train['Label'] = y_train
feature_cols = list(df_train.columns)
feature_cols.remove('Label')
X_train = df_train[feature_cols].values
y_train = df_train['Label'].values
X_test = df_test[feature_cols].values

## Information Gain

In [None]:
importances = mutual_info_classif(X_train, y_train)

In [None]:
mutual_info = pd.DataFrame({'feature': feature_cols, 'IG': importances})

In [None]:
mutual_info.plot(kind='bar')

In [None]:
mutual_info[mutual_info.IG > 0.001]

In [None]:
df_train[['A', 'Label']].groupby('A').value_counts(normalize=True)

In [None]:
df_train[['A', 'Label']].groupby('A').value_counts()

## Wrapper methods

In [None]:
clf = RandomForestClassifier(n_estimators=200, min_samples_split=9, random_state=42)
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_train)

In [None]:
matthews_corrcoef(y_train, y_pred)

In [None]:
(y_pred == y_train).mean()

In [None]:
y_pred.mean()

In [None]:
cfm=confusion_matrix(y_train, y_pred)
ax = sns.heatmap(cfm, annot=True, fmt='d')

ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")

In [None]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

In [None]:
# Kao sto se vidi ispod ovaj model bi mogli koristiti za backward wrapper method feature selection....
clf = RandomForestClassifier(n_estimators=200, min_samples_split=40, class_weight='balanced', oob_score=matthews_corrcoef, random_state=42)
clf.fit(X2_train, y2_train)
clf.oob_score_

In [None]:
y2_pred = clf.predict(X2_test)
matthews_corrcoef(y2_test, y2_pred)

In [None]:
y2_pred.mean()

In [None]:
cfm=confusion_matrix(y2_test, y2_pred)
ax = sns.heatmap(cfm, annot=True, fmt='d')

ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")

In [41]:
X_train.shape

(28587, 160)

In [42]:
y_train.shape

(28587,)

In [43]:
from sklearn.metrics import make_scorer, matthews_corrcoef
mcc_scorer = make_scorer(matthews_corrcoef)

In [None]:
estimator = RandomForestClassifier(n_estimators=200, min_samples_split=40, class_weight='balanced', random_state=42)
sfs = SequentialFeatureSelector(estimator, tol=-0.001, direction="backward", scoring=mcc_scorer)
sfs.fit(X_train, y_train)
wrapper_results = pd.DataFrame({'features': feature_cols, 'selected': sfs.get_support()})

Moramo dosta srezati poceti skup featurea prije nego sto ovo provertimo. Mozemo maknuti najocitije beskorisne feature iz svih proslih metoda (tipa variance threshold) i onda ovo izvrtit.

In [None]:
wrapper_results[wrapper_results.selected]

In [None]:
# MCC score if we just predict based on whether prev admission days was -8 or not
matthews_corrcoef(y_train, df_train.PrevAdmDaysAvail == 0)

Dakle najgluplji model kojeg mozemo smislit ima mcc 0.258.

# Model Selection

Jednom kada imamo X_train, i y_train sa nebitnim kolonama ukonjenima treba izvršiti stratified k-fold cross validation grid search hiperparametra za sve klasicne modele (log.reg., SVM, RandomForest, GradientBoostingClassifier, xgboost.XGBClassifier). Scoring metrika za cross validaciju treba biti mcc. Rezultat ovog poglavlja je izbor modela i njegovih hiperparametara koje ćemo koristiti za finalno rješenje.

Kros validaciju s ovim napravit: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html


Grid search + Cross validation: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [46]:
X_train.shape

(28587, 93)

In [47]:
X_test.shape

(7336, 93)

In [48]:
y_train.shape

(28587,)

In [50]:
param_grid = {
    'log_reg': {
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'class_weight': ['balanced']
    },
    'svm': {
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__gamma': [1, 0.1, 0.01, 0.001],
        'classifier__kernel': ['rbf'],
        'class_weight': ['balanced']
    },
    'random_forest': {
        'classifier__n_estimators': [10, 50, 100, 200],
        'classifier__max_features': ['auto', 'sqrt', 'log2'],
        'classifier__max_depth': [None, 10, 20, 30, 40, 50],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'class_weight': ['balanced']
    },
    'gradient_boosting': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.001, 0.01, 0.1, 1],
        'classifier__max_depth': [3, 4, 5, 6, 7],
        'class_weight': ['balanced']
    }
}

In [52]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, matthews_corrcoef
parameters = {'penalty':['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100],'class_weight': ['balanced'], 'verbose': [1], 'solver': ['liblinear']}
logreg = LogisticRegression()
mcc_scorer = make_scorer(matthews_corrcoef)
clf = GridSearchCV(logreg, parameters, cv=5, verbose=True, n_jobs=-1, scoring=mcc_scorer)
clf.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[LibLinear]

In [53]:
clf.cv_results_

{'mean_fit_time': array([ 0.15710649,  0.24010286,  0.25533581,  0.34998527,  2.63253889,
         0.59293609, 20.54669323,  1.05116854, 35.28371243,  2.27346115,
        40.5671772 ,  3.31403403]),
 'std_fit_time': array([1.40950846e-02, 1.01003845e-02, 1.25057106e-02, 7.22283015e-03,
        3.78636278e-01, 5.22059131e-02, 2.77073689e+00, 5.26785011e-02,
        1.57745589e+01, 2.71548979e-01, 2.78351702e+01, 1.97008006e-01]),
 'mean_score_time': array([0.00972614, 0.0094274 , 0.00899115, 0.00881147, 0.00852675,
        0.00698514, 0.01532865, 0.00743785, 0.01518974, 0.01296301,
        0.01190777, 0.01447678]),
 'std_score_time': array([0.00130434, 0.00087672, 0.0013527 , 0.00226186, 0.00133104,
        0.00101366, 0.00291486, 0.00122308, 0.0052283 , 0.00201015,
        0.00292528, 0.00104544]),
 'param_C': masked_array(data=[0.001, 0.001, 0.01, 0.01, 0.1, 0.1, 1, 1, 10, 10, 100,
                    100],
              mask=[False, False, False, False, False, False, False, False,
  

In [54]:
clf.best_estimator_

In [67]:
from sklearn.ensemble import RandomForestClassifier
parameters = {'n_estimators': [200], 
              'max_features': ['sqrt', 'log2'],
              'class_weight': ['balanced'], 
              'verbose': [1], 
              'max_depth': [None, 10, 20],
              # 'min_samples_split': [2, 5, 10],
              # 'min_samples_leaf': [1, 2, 4]
             }
random_forest = RandomForestClassifier()
clf_forest = GridSearchCV(random_forest, parameters, cv=5, verbose=True, n_jobs=-1, scoring=mcc_scorer)
clf_forest.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.6s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    7.1s


In [65]:
clf_forest.best_estimator_

In [70]:
from sklearn.ensemble import GradientBoostingClassifier
parameters = {'n_estimators': [100, 200, 300], 
              'learning_rate': [0.001, 0.01, 0.1, 1],
              # 'class_weight': ['balanced'], 
              'verbose': [1], 
              # 'max_depth': [5, 6, 7],
              # 'min_samples_split': [2, 5, 10],
              # 'min_samples_leaf': [1, 2, 4]
             }
gradient_boost = GradientBoostingClassifier()
clf_gb = GridSearchCV(gradient_boost, parameters, cv=5, verbose=True, n_jobs=-1, scoring=mcc_scorer)
clf_gb.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
      Iter       Train Loss   Remaining Time 
         1           0.4687           53.54s
         2           0.4554           50.56s
         3           0.4449           51.71s
         4           0.4363           50.86s
         5           0.4292           51.17s
         6           0.4232           50.52s
         7           0.4180           50.34s
         8           0.4129           50.01s
         9           0.4074           50.03s
        10           0.4037           50.00s
        20           0.3818           49.08s
        30           0.3726           47.36s
        40           0.3669           45.57s
        50           0.3619           43.75s
        60           0.3575           42.11s
        70           0.3526           39.93s
        80           0.3499           38.01s
        90           0.3468           36.29s
       100           0.3452           34.33s
       200           0.3250           

In [84]:
import lightgbm as lgb

parameters = {'num_leaves': [200, 400, 600, 800], 
              #'learning_rate': [0.001, 0.01, 0.1, 1],
              'class_weight': ['balanced'], 
              'verbose': [1], 
              'max_depth': [50, 75, 100],
              # 'min_samples_split': [2, 5, 10],
              # 'min_samples_leaf': [1, 2, 4]
             }
lgbm = lgb.LGBMClassifier()
clf_lgbm = GridSearchCV(lgbm, parameters, cv=5, verbose=True, n_jobs=-1, scoring=mcc_scorer)
clf_lgbm.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[LightGBM] [Info] Number of positive: 1890, number of negative: 26697
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021625 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 910
[LightGBM] [Info] Number of data points in the train set: 28587, number of used features: 93
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


In [86]:
clf_lgbm.best_estimator_

In [85]:
print(clf_lgbm.best_score_)
print(clf_gb.best_score_)
print(clf_forest.best_score_)
print(clf.best_score_)

0.3587851242519114
0.2208707668505565
0.27636640920031286
0.2713419526983987


# Test Set Prediction

U ovom poglavlju treba izgenerirati finalni model s hiperparametrima izabranim u Model Selection poglavlju nad čitavim train datasetom (s nebitnim featurima maknutima naravno). Ne trebamo hold-out test set. Generirati predikcije za test set u traženom formatu. Upload-ati ih na ai4health stranicu i cekat da vidimo jel prodemo u drugi krug :)

In [44]:
y_pred = clf.predict(X_test)

In [45]:
print(y_pred)

[0. 1. 0. ... 0. 0. 1.]


In [48]:
# Get the probability of each class
y_proba = clf.predict_proba(X_test)

# Create a DataFrame for the predictions and probabilities
df_predictions = pd.DataFrame({
    'Label': y_pred,
    'Probability_0': y_proba[:, 0],
    'Probability_1': y_proba[:, 1]
})

In [49]:
df_predictions.head()

Unnamed: 0,Label,Probability_0,Probability_1
0,0.0,0.878656,0.121344
1,1.0,0.272571,0.727429
2,0.0,0.958368,0.041632
3,0.0,0.897291,0.102709
4,0.0,0.909403,0.090597


In [51]:
df_predictions.shape

(7336, 3)

In [52]:
# Encoding defaults to UTF-8
df_predictions.to_csv('Kotao_1_09032024.csv', index=False)

In [53]:
df_predictions['Label'] = df_predictions['Label'].astype(int)

In [55]:
# Encoding defaults to UTF-8
df_predictions.to_csv('Kotao_2_09032024.csv', index=False)

In [59]:
clf.cv_results_

{'mean_fit_time': array([0.95744181]),
 'std_fit_time': array([0.06852031]),
 'mean_score_time': array([0.00661731]),
 'std_score_time': array([0.00146937]),
 'param_C': masked_array(data=[1],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_class_weight': masked_array(data=['balanced'],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_penalty': masked_array(data=['l2'],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_solver': masked_array(data=['liblinear'],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_verbose': masked_array(data=[1],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1,
   'class_weight': 'balanced',
   'penalty': 'l2',
   'solver': 'liblinear',
   'verbose': 1}],
 'split0_test_score': array([0.68293109]),
 'split1_test_score': array([0.68328087]),
 'split2_test_