In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel, VarianceThreshold, mutual_info_classif
from sklearn.metrics import matthews_corrcoef, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Hiperparametri eksperimenta

In [2]:
# PreviousAdmissionDays
PREV_ADM_DAYS_MEDIAN = 1055

# Height_Discharge
HEIGHT_MEDIAN_MALE = 176
HEIGHT_MEDIAN_FEMALE = 162
HEIGHT_MEDIAN_GENDER_MISSING = 172
HEIGHT_LOWER_LIM = 130
HEIGHT_UPPER_LIM = 204

# Weigth_Discharge
WEIGHT_MEDIAN_MALE = 90
WEIGHT_MEDIAN_FEMALE = 75
WEIGHT_MEDIAN_GENDER_MISSING = 85
WEIGHT_LOWER_LIM = 33
WEIGHT_UPPER_LIM = 199

GRUPE_LJEKOVA = {
  "Antibiotici": [
    "amoxicillin_count",
    "amoxicillin_and_betalactamase_inhibitor_count",
    "ampicillin_count",
    "ampicillin_and_betalactamase_inhibitor_count",
    "azithromycin_count",
    "cefalexin_count",
    "cefazolin_count",
    "cefepime_count",
    "ceftazidime_count",
    "ceftazidime_and_betalactamase_inhibitor_count",
    "ceftriaxone_count",
    "cefuroxime_count",
    "chloramphenicol_count",
    "ciprofloxacin_count",
    "clindamycin_count",
    "colistin_count",
    "erythromycin_count",
    "flucloxacillin_count",
    "gentamicin_count",
    "meropenem_count",
    "metronidazole_count",
    "moxifloxacin_count",
    "vancomycin_count"
  ],
  "Antikoagulanti": [
    "apixaban_count",
    "dabigatran_etexilate_count",
    "enoxaparin_count",
    "fondaparinux_count",
    "warfarin_count"
  ],

  "Antitrombotici": [
    "clopidogrel_count"
   ],

  "Antidijabetici": [
    "acarbose_count",
    "dapagliflozin_count",
    "empagliflozin_count",
    "gliclazide_count",
    "glimepiride_count",
    "insulin_aspart_count",
    "insulin_glargine_count",
    "insulin_human_count",
    "metformin_count",
    "metformin_and_alogliptin_count",
    "metformin_and_empagliflozin_count",
    "metformin_and_vildagliptin_count"
  ],
  "NSAID analgetici": [
    "diclofenac_count",
    "ibuprofen_count",
    "ketoprofen_count",
    "tramadol_and_paracetamol_count"

 ],
  "Opioidni analgetici": [
    "fentanyl_count",
    "morphine_count",
    "tramadol_count",
    "tramadol_and_paracetamol_count"
  ],
 "Antihipertenzivi": [
    "amlodipine_count",
    "diltiazem_count",
    "doxazosin_count",
    "losartan_count",
    "losartan_and_diuretics_count"
  ],
  "Beta blokatori": [
    "atenolol_count",
    "bisoprolol_count",
    "carvedilol_count",
    "metoprolol_count",
    "nebivolol_count"
  ],

 "ACE inhibitori": [
    "lisinopril_count",
    "perindopril_count",
    "ramipril_count"
 ],
  "Diuretici": [
    "furosemide_count",
    "spironolactone_count",
    "torasemide_count"
 ],
  "Statini": [
    "atorvastatin_count",
    "rosuvastatin_count"

  ],
  "Antipsihotici": [
    "haloperidol_count",
    "quetiapine_count",
    "risperidone_count"
  ],
  "Antiepileptici": [
    "carbamazepine_count",
    "lamotrigine_count",
    "levetiracetam_count",
    "valproic_acid_count" 
 ], 
  "Sedativi i anksiolitici": [
    "alprazolam_count",
    "bromazepam_count",
    "diazepam_count",
    "lorazepam_count",
    "midazolam_count",
    "zolpidem_count"
  ],
  "Antigiht": [
    "allopurinol_count",
    "colchicine_count"
 ],

  "Proton pump inhibitors": [
   "esomeprazole_count", 
   "pantoprazole_count"
 
  ],
  "Antihistaminici": [
    "loratadine_count"
  ],
  "Antiasmatici": [
    "aminophylline_count",
    "ipratropium_bromide_count",
    "salbutamol_count",
    "salbutamol_and_ipratropium_bromide_count"
  ],
  "Antiviralni": [
    "aciclovir_count",
    "oseltamivir_count"
  ],
  "Vitamini i Suplementi": [
    "colecalciferol_count"
  ],
  "Antiemetici": [
    "ondansetron_count"
 ],

 "Thyroid therapy": [
    "levothyroxine_sodium_count"
 
  ],
  "Antifungalni": [
    "fluconazole_count",
    "miconazole_count",
    "clotrimazole_count"
  ],
  "Antihipertenzivi": [
    "amlodipine_count",
    "doxazosin_count",
    "losartan_count",
    "losartan_and_diuretics_count"

  ],
  "Imunosupresivi": [
    "dexamethasone_count"
  ],
  "Kortikosteroidi": [
    "dexamethasone_count"
 ]
 
}

GROUP_ICD10 = {
  "High_Rehospitalization_Risk": [
    "A", # "Certain infectious and parasitic diseases"
    "C", # "Neoplasms"
    "E", # "Endocrine, nutritional and metabolic diseases"
    "I", # "Diseases of the circulatory system"
    "J", # "Diseases of the respiratory system"
    "K", # "Diseases of the digestive system"
    "N", # "Diseases of the genitourinary system"
    "S", # "Injury, poisoning and certain other consequences of external causes"
    "T"  # "Injury, poisoning and certain other consequences of external causes"
  ],

  "Low_Rehospitalization_Risk": [
    "B", # "Certain infectious and parasitic diseases"
    "D", # "Neoplasms and diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism"
    "F", # "Mental and behavioural disorders"
    "G", # "Diseases of the nervous system"
    "H", # "Diseases of the eye and adnexa; Diseases of the ear and mastoid process"
    "L", # "Diseases of the skin and subcutaneous tissue"
    "M", # "Diseases of the musculoskeletal system and connective tissue"
    # NOT IN SYNTH DATA "O", # "Pregnancy, childbirth and the puerperium"
    # NOT IN SYNTH DATA "P", # "Certain conditions originating in the perinatal period"
    "Q", # "Congenital malformations, deformations and chromosomal abnormalities"
    "R", # "Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified"
    # NOT IN SYNTH DATA "V", # "External causes of morbidity and mortality"
    # NOT IN SYNTH DATA "W", # "External causes of morbidity and mortality"
    # NOT IN SYNTH DATA "X", # "External causes of morbidity and mortality"
    "Y", # "External causes of morbidity and mortality"
    "Z"  # "Factors influencing health status and contact with health services"
  ]
}



# Funkcije za ispravljanje dataset-a

In [3]:
df = pd.read_csv('../data/train.csv', index_col=0)
df_test = pd.read_csv('../data/test.csv', index_col=0)
all = pd.concat([df, df_test])

In [4]:
# Laurini ljekovi koji se ne pojavljuju u nasem datasetu
not_appearing = []
all_laura_ljek = []
for k, v in GRUPE_LJEKOVA.items():
    for ljek in v: 
        all_laura_ljek.append(ljek)
        if ljek not in all.columns: 
            not_appearing.append(ljek)
not_appearing

[]

In [5]:
# Kolone iz naseg dataseta koje se ne pojavljuju u laurinim ljekovima
for l in all.columns[16:]:
    if l not in all_laura_ljek:
        print(l)

acetylsalicylic_acid_count
albumin_count
alprostadil_count
amiodarone_count
atropine_count
b03aa07_count
benzathine_benzylpenicillin_count
betamethasone_count
butylscopolamine_count
calcium_chloride_count
carbohydrates_count
chlorhexidine_count
chloropyramine_count
clonazepam_count
coagulation_factor_ix_ii_vii_and_x_in_combination_count
coagulation_factor_viia_count
combinations_count
dexamethasone_and_antiinfectives_count
dexmedetomidine_count
diltiazem_count
dobutamine_count
electrolytes_count
epinephrine_count
eplerenone_count
escitalopram_count
etomidate_count
exemestane_count
ferric_oxide_polymaltose_complexes_count
ferrous_fumarate_count
flumazenil_count
folic_acid_count
fosfomycin_count
gliquidone_count
glyceryl_trinitrate_count
heparin_count
hydrocortisone_count
indapamide_count
indometacin_count
iodixanol_count
iohexol_count
isosorbide_dinitrate_count
isosorbide_mononitrate_count
ivabradine_count
ketamine_count
lacidipine_count
lactulose_count
lercanidipine_count
levobupivacai

['doxycycline_count',
 'acetaminophen_count',
 'hydrochlorothiazide_count',
 'azathioprine_count'] -> ovi ljekovi iz laurinog .txt filea se ne pojavljuju u skupu podataka pa sam ih maknuo iz gornjeg dicta

In [6]:
age_groups_sorted = sorted(list(all.Age_Group.unique()))
age_group_numeric_mapping = {ag: i for i, ag in enumerate(age_groups_sorted)}
age_group_numeric_mapping

{'18-22': 0,
 '23-27': 1,
 '28-32': 2,
 '33-37': 3,
 '38-42': 4,
 '43-47': 5,
 '48-52': 6,
 '53-57': 7,
 '58-62': 8,
 '63-67': 9,
 '68-72': 10,
 '73-77': 11,
 '78-82': 12,
 '83-87': 13,
 '88-92': 14,
 '93-97': 15}

In [7]:
def fix_PreviousAdmissionDays(df):
    df['PrevAdmDaysAvail'] = (df.PreviousAdmissionDays > 0) * 1.0 # Valuable information for prediction
    df.loc[df.PreviousAdmissionDays == -8, 'PreviousAdmissionDays'] = PREV_ADM_DAYS_MEDIAN
    return df

def fix_Weigth_Discharge(df):
    gender_w_medians = {
        'Ž': WEIGHT_MEDIAN_FEMALE,
        'M': WEIGHT_MEDIAN_MALE,
        None: WEIGHT_MEDIAN_GENDER_MISSING
    }
    row_indexer = (df.Weight_Discharge < WEIGHT_LOWER_LIM) | (df.Weight_Discharge > WEIGHT_UPPER_LIM)
    df.loc[row_indexer, 'Weight_Discharge'] = df[row_indexer].Gender.map(gender_w_medians)
    return df

def fix_Heigth_Discharge(df):
    gender_h_medians = {
        'Ž': HEIGHT_MEDIAN_FEMALE,
        'M': HEIGHT_MEDIAN_MALE,
        None: HEIGHT_MEDIAN_GENDER_MISSING
    }
    row_indexer = (df.Height_Discharge < HEIGHT_LOWER_LIM) | (df.Height_Discharge > HEIGHT_UPPER_LIM)
    df.loc[row_indexer, 'Height_Discharge'] = df[row_indexer].Gender.map(gender_h_medians)
    return df

def aggregate_mediactions(df):
    to_drop = []
    for med_group, list_of_meds in GRUPE_LJEKOVA.items():
        df[f"{med_group}_count"] = df[list_of_meds].sum(axis=1)
        to_drop.extend(list_of_meds)
    df.drop(columns=to_drop, inplace=True)
    return df

def aggregate_icd10_diagnosis(df):
    to_drop = []
    for icd10_group, list_of_diag in GROUP_ICD10.items():
        df[f"{icd10_group}_count"] = df[list_of_diag].sum(axis=1)
        to_drop.extend(list_of_diag)
    df.drop(columns=to_drop, inplace=True)
    return df

def convert_age_groups_to_numeric(df):
    df['Age_Group'] = df.Age_Group.map(age_group_numeric_mapping)
    return df

def fix_dataset(df):
    df = fix_PreviousAdmissionDays(df)
    df = fix_Weigth_Discharge(df)
    df = fix_Heigth_Discharge(df)
    df = aggregate_mediactions(df)
    df = aggregate_icd10_diagnosis(df)
    df = convert_age_groups_to_numeric(df)
    return df
    

In [8]:
all = fix_dataset(all)

In [9]:
all.shape

(35923, 188)

In [10]:
all.head()

Unnamed: 0,AdmissionDx,AdmissionType,PreviousAdmissionDays,LOS,Age_Group,Gender,Surgery_Count,LOS_ICU,Discharge_Specialty,Dx_Discharge,Discharge_Status,Weight_Discharge,Height_Discharge,Education,Current_Work_Status,Label,acetylsalicylic_acid_count,albumin_count,alprostadil_count,amiodarone_count,atropine_count,b03aa07_count,benzathine_benzylpenicillin_count,betamethasone_count,butylscopolamine_count,calcium_chloride_count,carbohydrates_count,chlorhexidine_count,chloropyramine_count,clonazepam_count,coagulation_factor_ix_ii_vii_and_x_in_combination_count,coagulation_factor_viia_count,combinations_count,dexamethasone_and_antiinfectives_count,dexmedetomidine_count,diltiazem_count,dobutamine_count,electrolytes_count,epinephrine_count,eplerenone_count,escitalopram_count,etomidate_count,exemestane_count,ferric_oxide_polymaltose_complexes_count,ferrous_fumarate_count,flumazenil_count,folic_acid_count,fosfomycin_count,gliquidone_count,glyceryl_trinitrate_count,heparin_count,hydrocortisone_count,indapamide_count,indometacin_count,iodixanol_count,iohexol_count,isosorbide_dinitrate_count,isosorbide_mononitrate_count,ivabradine_count,ketamine_count,lacidipine_count,lactulose_count,lercanidipine_count,levobupivacaine_count,levodopa_and_decarboxylase_inhibitor_count,levofloxacin_count,lidocaine_count,linagliptin_count,linezolid_count,loperamide_count,macrogol_combinations_count,mannitol_count,memantine_count,mesalazine_count,metamizole_sodium_count,methylphenobarbital_count,methylprednisolone_count,metildigoxin_count,metoclopramide_count,mirtazapine_count,mometasone_count,montelukast_count,moxonidine_count,mupirocin_count,n02bf01_count,n02bf02_count,nadroparin_count,naloxone_count,neostigmine_count,nitrazepam_count,nitrofurantoin_count,norepinephrine_count,norfloxacin_count,octreotide_count,omega3triglycerides_incl_other_esters_and_acids_count,organoheparinoid_count,oxazepam_count,paracetamol_count,paroxetine_count,perindopril_amlodipine_and_indapamide_count,perindopril_and_amlodipine_count,perindopril_and_diuretics_count,pethidine_count,phenobarbital_count,pholcodine_count,phytomenadione_count,piperacillin_and_betalactamase_inhibitor_count,potassium_chloride_count,potassium_different_salts_in_combination_count,prasugrel_count,prednisone_count,promazine_count,propafenone_count,propofol_count,protamine_count,rabeprazole_count,ramipril_and_amlodipine_count,ramipril_and_diuretics_count,ranitidine_count,ranolazine_count,repaglinide_count,rifampicin_count,rivaroxaban_count,rocuronium_bromide_count,rosuvastatin_and_ezetimibe_count,salmeterol_count,salmeterol_and_fluticasone_count,sertraline_count,sevoflurane_count,silver_sulfadiazine_count,sitagliptin_count,sotalol_count,sufentanil_count,sulfamethoxazole_and_trimethoprim_count,sulpiride_count,tamsulosin_count,tamsulosin_and_dutasteride_count,theophylline_count,thiamazole_count,thiethylperazine_count,thiopental_count,tianeptine_count,ticagrelor_count,tiotropium_bromide_count,tobramycin_count,trandolapril_count,tranexamic_acid_count,trimetazidine_count,trospium_count,urapidil_count,valsartan_count,valsartan_amlodipine_and_hydrochlorothiazide_count,valsartan_and_sacubitril_count,venlafaxine_count,verapamil_count,x99nijenavedenowho_count,zofenopril_count,U,Probability_0,Probability_1,PrevAdmDaysAvail,Antibiotici_count,Antikoagulanti_count,Antitrombotici_count,Antidijabetici_count,NSAID analgetici_count,Opioidni analgetici_count,Antihipertenzivi_count,Beta blokatori_count,ACE inhibitori_count,Diuretici_count,Statini_count,Antipsihotici_count,Antiepileptici_count,Sedativi i anksiolitici_count,Antigiht_count,Proton pump inhibitors_count,Antihistaminici_count,Antiasmatici_count,Antiviralni_count,Vitamini i Suplementi_count,Antiemetici_count,Thyroid therapy_count,Antifungalni_count,Imunosupresivi_count,Kortikosteroidi_count,High_Rehospitalization_Risk_count,Low_Rehospitalization_Risk_count
1,I,Elektivni,1055,1,13,Ž,0,0,3010100,I,KUĆI,53.0,156.0,ZAVRŠENA OSNOVNA ŠKOLA,DOMAĆICA,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2
2,I,Elektivni,1055,16,10,M,1,0,3100400,I,KUĆI,96.0,167.0,NEPOZNAT OBRAZOVNI STATUS,UMIROVLJENIK,0.0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,2,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,,,0.0,5,2,0,0,3,3,0,0,1,0,1,0,0,3,0,1,0,0,0,0,0,0,0,0,0,3,1
3,I,Elektivni,279,3,0,Ž,1,0,3010100,I,KUĆI,52.0,163.0,ZAVRŠENO VISOKO OBRAZOVANJE,STUDENT,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,,,1.0,2,1,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2,0
4,I,Elektivni,1905,2,13,M,0,0,3010100,I,KUĆI,95.0,175.0,ZAVRŠENA SREDNJA ŠKOLA,UMIROVLJENIK,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0
5,I,Elektivni,1055,2,11,M,0,0,3010100,I,KUĆI,82.0,174.0,ZAVRŠENO VISOKO OBRAZOVANJE,REDOVAN POSAO,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,,,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0


In [None]:
all.isna().sum()[all.isna().sum() > 0]

Counts of missing values

# OneHot encoding and Scaling functions

In [11]:
def oneHot_encode_feature(df, feature):
    ohe = OneHotEncoder(drop='if_binary', handle_unknown='ignore')
    ohe.fit(df[df[feature].notnull()][[feature]])
    X = ohe.transform(df[[feature]])
    df[ohe.get_feature_names_out()] = X.toarray()
    df.drop(columns=[feature], inplace=True)
    return df

def scale_numeric_features(df, numeric_cols):
    scaler = StandardScaler()
    scaler.fit(df[numeric_cols])
    df[numeric_cols] = scaler.transform(df[numeric_cols])
    return df

def encode_and_scale_features(df, enc_features, scale_features):
    for f in enc_features:
        df = oneHot_encode_feature(df, f)

    df = scale_numeric_features(df, scale_features)
    return df

# Obtaining Preprocessed dataset

In [12]:
cols_to_onehot_encode = [
    'AdmissionDx',
    'AdmissionType',
    'Gender',
    'Discharge_Specialty',
    'Dx_Discharge',
    'Discharge_Status',
    'Education',
    'Current_Work_Status'
]

cols_to_scale= [
    'PreviousAdmissionDays',
    'LOS',
    'Age_Group',
    'Surgery_Count',
    'LOS_ICU',
    'Weight_Discharge',
    'Height_Discharge'
]

In [13]:
df_train = pd.read_csv('../data/train.csv', index_col=0)
df_test = pd.read_csv('../data/test.csv', index_col=0)
df_train['Train'] = 1
df_test['Train'] = 0
all = pd.concat([df_train, df_test])

In [14]:
all = fix_dataset(all)
all = encode_and_scale_features(
    all,
    enc_features=cols_to_onehot_encode,
    scale_features=cols_to_scale
)



In [15]:
all.head()

Unnamed: 0,PreviousAdmissionDays,LOS,Age_Group,Surgery_Count,LOS_ICU,Weight_Discharge,Height_Discharge,Label,acetylsalicylic_acid_count,albumin_count,alprostadil_count,amiodarone_count,atropine_count,b03aa07_count,benzathine_benzylpenicillin_count,betamethasone_count,butylscopolamine_count,calcium_chloride_count,carbohydrates_count,chlorhexidine_count,chloropyramine_count,clonazepam_count,coagulation_factor_ix_ii_vii_and_x_in_combination_count,coagulation_factor_viia_count,combinations_count,dexamethasone_and_antiinfectives_count,dexmedetomidine_count,diltiazem_count,dobutamine_count,electrolytes_count,epinephrine_count,eplerenone_count,escitalopram_count,etomidate_count,exemestane_count,ferric_oxide_polymaltose_complexes_count,ferrous_fumarate_count,flumazenil_count,folic_acid_count,fosfomycin_count,gliquidone_count,glyceryl_trinitrate_count,heparin_count,hydrocortisone_count,indapamide_count,indometacin_count,iodixanol_count,iohexol_count,isosorbide_dinitrate_count,isosorbide_mononitrate_count,ivabradine_count,ketamine_count,lacidipine_count,lactulose_count,lercanidipine_count,levobupivacaine_count,levodopa_and_decarboxylase_inhibitor_count,levofloxacin_count,lidocaine_count,linagliptin_count,linezolid_count,loperamide_count,macrogol_combinations_count,mannitol_count,memantine_count,mesalazine_count,metamizole_sodium_count,methylphenobarbital_count,methylprednisolone_count,metildigoxin_count,metoclopramide_count,mirtazapine_count,mometasone_count,montelukast_count,moxonidine_count,mupirocin_count,n02bf01_count,n02bf02_count,nadroparin_count,naloxone_count,neostigmine_count,nitrazepam_count,nitrofurantoin_count,norepinephrine_count,norfloxacin_count,octreotide_count,omega3triglycerides_incl_other_esters_and_acids_count,organoheparinoid_count,oxazepam_count,paracetamol_count,paroxetine_count,perindopril_amlodipine_and_indapamide_count,perindopril_and_amlodipine_count,perindopril_and_diuretics_count,pethidine_count,phenobarbital_count,pholcodine_count,phytomenadione_count,piperacillin_and_betalactamase_inhibitor_count,potassium_chloride_count,potassium_different_salts_in_combination_count,prasugrel_count,prednisone_count,promazine_count,propafenone_count,propofol_count,protamine_count,rabeprazole_count,ramipril_and_amlodipine_count,ramipril_and_diuretics_count,ranitidine_count,ranolazine_count,repaglinide_count,rifampicin_count,rivaroxaban_count,rocuronium_bromide_count,rosuvastatin_and_ezetimibe_count,salmeterol_count,salmeterol_and_fluticasone_count,sertraline_count,sevoflurane_count,silver_sulfadiazine_count,sitagliptin_count,sotalol_count,sufentanil_count,sulfamethoxazole_and_trimethoprim_count,sulpiride_count,tamsulosin_count,tamsulosin_and_dutasteride_count,theophylline_count,thiamazole_count,thiethylperazine_count,thiopental_count,tianeptine_count,ticagrelor_count,tiotropium_bromide_count,tobramycin_count,trandolapril_count,tranexamic_acid_count,trimetazidine_count,trospium_count,urapidil_count,valsartan_count,valsartan_amlodipine_and_hydrochlorothiazide_count,valsartan_and_sacubitril_count,venlafaxine_count,verapamil_count,x99nijenavedenowho_count,zofenopril_count,U,Train,Probability_0,Probability_1,PrevAdmDaysAvail,Antibiotici_count,Antikoagulanti_count,Antitrombotici_count,Antidijabetici_count,NSAID analgetici_count,Opioidni analgetici_count,Antihipertenzivi_count,Beta blokatori_count,ACE inhibitori_count,Diuretici_count,Statini_count,Antipsihotici_count,Antiepileptici_count,Sedativi i anksiolitici_count,Antigiht_count,Proton pump inhibitors_count,Antihistaminici_count,Antiasmatici_count,Antiviralni_count,Vitamini i Suplementi_count,Antiemetici_count,Thyroid therapy_count,Antifungalni_count,Imunosupresivi_count,Kortikosteroidi_count,High_Rehospitalization_Risk_count,Low_Rehospitalization_Risk_count,AdmissionDx_B,AdmissionDx_C,AdmissionDx_D,AdmissionDx_E,AdmissionDx_F,AdmissionDx_G,AdmissionDx_H,AdmissionDx_I,AdmissionDx_J,AdmissionDx_K,AdmissionDx_L,AdmissionDx_M,AdmissionDx_N,AdmissionDx_Q,AdmissionDx_R,AdmissionDx_S,AdmissionDx_T,AdmissionDx_Y,AdmissionDx_Z,AdmissionType_Hitni,Gender_Ž,Discharge_Specialty_3010100,Discharge_Specialty_3100400,Discharge_Specialty_3100600,Discharge_Specialty_3190100,Discharge_Specialty_3190200,Dx_Discharge_C,Dx_Discharge_D,Dx_Discharge_E,Dx_Discharge_G,Dx_Discharge_H,Dx_Discharge_I,Dx_Discharge_J,Dx_Discharge_K,Dx_Discharge_L,Dx_Discharge_N,Dx_Discharge_Q,Dx_Discharge_R,Dx_Discharge_T,Dx_Discharge_Z,Discharge_Status_KUĆI,Discharge_Status_LIJEČENJE OKONČANO PROTIVNO SAVJETU DOKTORA,Discharge_Status_OSTALO,Discharge_Status_U DRUGU STACIONARNU ZDRAVSTVENU USTANOVU,Discharge_Status_UMRO (NIJE OBDUCIRAN),Education_NEPOZNAT OBRAZOVNI STATUS,Education_NIJE POHAĐALA OSNOVNU ŠKOLU,Education_NIJE ZAVRŠILA OSNOVNU ŠKOLU,Education_ZAVRŠENA OSNOVNA ŠKOLA,Education_ZAVRŠENA SREDNJA ŠKOLA,Education_ZAVRŠENA VIŠA ŠKOLA,Education_ZAVRŠENO VISOKO OBRAZOVANJE,Current_Work_Status_DOMAĆICA,Current_Work_Status_NEPOZNATO,Current_Work_Status_NEZAPOSLEN/POVREMENI POSAO,Current_Work_Status_POLJOPRIVREDNIK,Current_Work_Status_REDOVAN POSAO,Current_Work_Status_SAMOSTALNA DJELATNOST,Current_Work_Status_STUDENT,Current_Work_Status_UMIROVLJENIK,Current_Work_Status_UČENIK
1,-0.289586,-0.482977,1.69258,-0.458037,-0.050516,-2.007681,-1.644906,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,,,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.289586,1.883533,0.379549,1.910477,-0.050516,0.652788,-0.468054,0.0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,2,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,1,,,0.0,5,2,0,0,3,3,0,0,1,0,1,0,0,3,0,1,0,0,0,0,0,0,0,0,0,3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-0.800832,-0.167442,-3.997224,1.910477,-0.050516,-2.069552,-0.896,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,,,1.0,2,1,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.270413,-0.32521,1.69258,-0.458037,-0.050516,0.590916,0.387838,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,,,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,-0.289586,-0.32521,0.817226,-0.458037,-0.050516,-0.213411,0.280851,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,,,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [16]:
df_train = all[all.Train == 1].copy()
df_test = all[all.Train == 0].copy()
df_train.drop(columns=['Train'], inplace=True)
df_train.drop(columns=['Probability_0'], inplace=True)
df_train.drop(columns=['Probability_1'], inplace=True)
df_test.drop(columns=['Train'], inplace=True)
df_test.drop(columns=['Probability_0'], inplace=True)
df_test.drop(columns=['Probability_1'], inplace=True)
df_test.drop(columns=['Label'], inplace=True)

In [17]:
df_train.shape

(28587, 239)

In [18]:
df_test.shape

(7336, 238)

In [19]:
feature_cols = list(df_train.columns)
feature_cols.remove('Label')

In [20]:
print(feature_cols)

['PreviousAdmissionDays', 'LOS', 'Age_Group', 'Surgery_Count', 'LOS_ICU', 'Weight_Discharge', 'Height_Discharge', 'acetylsalicylic_acid_count', 'albumin_count', 'alprostadil_count', 'amiodarone_count', 'atropine_count', 'b03aa07_count', 'benzathine_benzylpenicillin_count', 'betamethasone_count', 'butylscopolamine_count', 'calcium_chloride_count', 'carbohydrates_count', 'chlorhexidine_count', 'chloropyramine_count', 'clonazepam_count', 'coagulation_factor_ix_ii_vii_and_x_in_combination_count', 'coagulation_factor_viia_count', 'combinations_count', 'dexamethasone_and_antiinfectives_count', 'dexmedetomidine_count', 'diltiazem_count', 'dobutamine_count', 'electrolytes_count', 'epinephrine_count', 'eplerenone_count', 'escitalopram_count', 'etomidate_count', 'exemestane_count', 'ferric_oxide_polymaltose_complexes_count', 'ferrous_fumarate_count', 'flumazenil_count', 'folic_acid_count', 'fosfomycin_count', 'gliquidone_count', 'glyceryl_trinitrate_count', 'heparin_count', 'hydrocortisone_count

In [21]:
X_train = df_train[feature_cols].values
y_train = df_train['Label'].values
X_test = df_test[feature_cols].values

X_train, y_train, i X_test su numpy matrice spremne za uproabu sa sklearn library-om.

# Feature Selection

Na kraju ovog poglavlja trebamo imati X_train, y_train, X_test sa nebitnim kolonama izbačenim. Najočitije beskorisne feature se treba maknuti sa variance thresholdima, ostalim metodama itd., i onda izvrtiti backward wrapper feature selection sa lightweight RandomForestom, ili L1 log.reg. ako se RF pokaže da je jednostavno prespor. 

## Lasso

In [22]:
print(X_train.shape)

(28587, 238)


In [23]:
# Set the regularization parameter C=1
logistic = LogisticRegression(C=0.6, penalty='l1', class_weight='balanced', solver='liblinear', random_state=42).fit(X_train, y_train)
f_selector = SelectFromModel(logistic, prefit=True, threshold=1e-3)

X_train_new = f_selector.transform(X_train)
y_train_new = y_train
X_test_new = f_selector.transform(X_test)

print(X_train_new.shape)
print(y_train_new.shape)
print(X_test_new.shape)

(28587, 185)
(28587,)
(7336, 185)


In [24]:
l1_coefs = pd.DataFrame({'feature': feature_cols, 'L1_coef': logistic.coef_[0]})

In [25]:
len(l1_coefs[l1_coefs.L1_coef.abs() < 1e-3])

53

In [26]:
irrelevant_features = l1_coefs[l1_coefs['L1_coef'].abs() < 1e-3]['feature'].tolist()
df_train.drop(columns=irrelevant_features, inplace=True)


In [27]:
df_train.shape
feature_cols = list(df_train.columns)
feature_cols.remove('Label')

In [None]:
l1_coefs[l1_coefs.L1_coef.abs() < 1e-3]

In [None]:
l1_coefs[l1_coefs.L1_coef.abs() > 1e-3]

In [28]:
df_train['Label'] = y_train

In [None]:
df_train.dobutamine_count.value_counts()

In [None]:
df_train[['dobutamine_count', 'Label']].groupby('dobutamine_count').value_counts()

In [None]:
matthews_corrcoef(y_train, logistic.predict(X_train_new))

In [None]:
logistic.predict(X_train_new).mean()

L1 feature selection might be an option...

## Variance filter methods

We can use this method to get rid of monotone features

In [29]:
v_threshold = VarianceThreshold(threshold=0.001)
v_threshold.fit(X_train_new)
meets_var_thresholds = pd.DataFrame({'feature': feature_cols, 'passes_var': v_threshold.get_support()})

X_train_filtered = v_threshold.transform(X_train_new)
feature_cols_filtered = [feature_cols[i] for i in range(len(feature_cols)) if v_threshold.get_support()[i]]
df_train = pd.DataFrame(X_train_filtered, columns=feature_cols_filtered)

In [30]:
X_test_filtered = v_threshold.transform(X_test_new)
df_test = pd.DataFrame(X_test_filtered, columns=feature_cols_filtered)

In [30]:
meets_var_thresholds.passes_var.mean()

0.8648648648648649

In [31]:
meets_var_thresholds[~meets_var_thresholds.passes_var]

Unnamed: 0,feature,passes_var
10,betamethasone_count,False
16,coagulation_factor_viia_count,False
22,escitalopram_count,False
26,fosfomycin_count,False
36,ivabradine_count,False
42,levodopa_and_decarboxylase_inhibitor_count,False
43,levofloxacin_count,False
46,linezolid_count,False
49,mesalazine_count,False
51,methylphenobarbital_count,False


In [32]:
len(meets_var_thresholds[~meets_var_thresholds.passes_var])

25

In [None]:
df_test.sitagliptin_count.value_counts()

In [None]:
#df_train[['sitagliptin_count', 'Label']].groupby('sitagliptin_count').value_counts(normalize=True)

In [31]:
df_train.shape
df_train['Label'] = y_train
feature_cols = list(df_train.columns)
feature_cols.remove('Label')
X_train = df_train[feature_cols].values
y_train = df_train['Label'].values
X_test = df_test[feature_cols].values

## Information Gain

In [None]:
importances = mutual_info_classif(X_train, y_train)

In [None]:
mutual_info = pd.DataFrame({'feature': feature_cols, 'IG': importances})

In [None]:
mutual_info.plot(kind='bar')

In [None]:
mutual_info[mutual_info.IG > 0.001]

In [None]:
df_train[['A', 'Label']].groupby('A').value_counts(normalize=True)

In [None]:
df_train[['A', 'Label']].groupby('A').value_counts()

## Wrapper methods

In [None]:
clf = RandomForestClassifier(n_estimators=200, min_samples_split=9, random_state=42)
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_train)

In [None]:
matthews_corrcoef(y_train, y_pred)

In [None]:
(y_pred == y_train).mean()

In [None]:
y_pred.mean()

In [None]:
cfm=confusion_matrix(y_train, y_pred)
ax = sns.heatmap(cfm, annot=True, fmt='d')

ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")

In [None]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

In [None]:
# Kao sto se vidi ispod ovaj model bi mogli koristiti za backward wrapper method feature selection....
clf = RandomForestClassifier(n_estimators=200, min_samples_split=40, class_weight='balanced', oob_score=matthews_corrcoef, random_state=42)
clf.fit(X2_train, y2_train)
clf.oob_score_

In [None]:
y2_pred = clf.predict(X2_test)
matthews_corrcoef(y2_test, y2_pred)

In [None]:
y2_pred.mean()

In [None]:
cfm=confusion_matrix(y2_test, y2_pred)
ax = sns.heatmap(cfm, annot=True, fmt='d')

ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")

In [41]:
X_train.shape

(28587, 160)

In [42]:
y_train.shape

(28587,)

In [43]:
from sklearn.metrics import make_scorer, matthews_corrcoef
mcc_scorer = make_scorer(matthews_corrcoef)

In [None]:
estimator = RandomForestClassifier(n_estimators=200, min_samples_split=40, class_weight='balanced', random_state=42)
sfs = SequentialFeatureSelector(estimator, tol=-0.001, direction="backward", scoring=mcc_scorer)
sfs.fit(X_train, y_train)
wrapper_results = pd.DataFrame({'features': feature_cols, 'selected': sfs.get_support()})

Moramo dosta srezati poceti skup featurea prije nego sto ovo provertimo. Mozemo maknuti najocitije beskorisne feature iz svih proslih metoda (tipa variance threshold) i onda ovo izvrtit.

In [None]:
wrapper_results[wrapper_results.selected]

In [None]:
# MCC score if we just predict based on whether prev admission days was -8 or not
matthews_corrcoef(y_train, df_train.PrevAdmDaysAvail == 0)

Dakle najgluplji model kojeg mozemo smislit ima mcc 0.258.

# Model Selection

Jednom kada imamo X_train, i y_train sa nebitnim kolonama ukonjenima treba izvršiti stratified k-fold cross validation grid search hiperparametra za sve klasicne modele (log.reg., SVM, RandomForest, GradientBoostingClassifier, xgboost.XGBClassifier). Scoring metrika za cross validaciju treba biti mcc. Rezultat ovog poglavlja je izbor modela i njegovih hiperparametara koje ćemo koristiti za finalno rješenje.

Kros validaciju s ovim napravit: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html


Grid search + Cross validation: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [32]:
X_train.shape

(28587, 160)

In [33]:
X_test.shape

(7336, 160)

In [34]:
y_train.shape

(28587,)

In [43]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# from sklearn import svm
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from xgboost import XGBClassifier

# Define the parameter grid for each estimator
param_grid = {
    'log_reg': {
        'classifier': [LogisticRegression()],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'class_weight': ['balanced']
    },
    'svm': {
        'classifier': [SVC()],
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__gamma': [1, 0.1, 0.01, 0.001],
        'classifier__kernel': ['rbf'],
        'class_weight': ['balanced']
    },
    'random_forest': {
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [10, 50, 100, 200],
        'classifier__max_features': ['auto', 'sqrt', 'log2'],
        'classifier__max_depth': [None, 10, 20, 30, 40, 50],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'class_weight': ['balanced']
    },
    'gradient_boosting': {
        'classifier': [GradientBoostingClassifier()],
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.001, 0.01, 0.1, 1],
        'classifier__max_depth': [3, 4, 5, 6, 7],
        'class_weight': ['balanced']
    }
}

# Note: You should loop through param_grid to create individual GridSearchCV for each estimator
# For example, to create a GridSearch for Logistic Regression:
# from sklearn.pipeline import Pipeline

# You might need to adjust the following depending on your specific needs
# pipe = Pipeline([('classifier', LogisticRegression())])  # Starting with Logistic Regression as a placeholder
# log_reg_search = GridSearchCV(pipe, param_grid['log_reg'], cv=5, verbose=True, n_jobs=-1)

# parameters = {'C':[1], 'gamma': [1], 'class_weight': ['balanced'], 'probability': [True]}
# svc = svm.SVC()
parameters = {'penalty':['l2'], 'C': [1],'class_weight': ['balanced'], 'verbose': [1], 'solver': ['liblinear']}
logreg = LogisticRegression()
clf = GridSearchCV(logreg, parameters, cv=5, verbose=True, n_jobs=-1)
clf.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[LibLinear]

In [44]:
y_pred = clf.predict(X_test)

In [45]:
print(y_pred)

[0. 1. 0. ... 0. 0. 1.]


In [48]:
# Get the probability of each class
y_proba = clf.predict_proba(X_test)

# Create a DataFrame for the predictions and probabilities
df_predictions = pd.DataFrame({
    'Label': y_pred,
    'Probability_0': y_proba[:, 0],
    'Probability_1': y_proba[:, 1]
})

In [49]:
df_predictions.head()

Unnamed: 0,Label,Probability_0,Probability_1
0,0.0,0.878656,0.121344
1,1.0,0.272571,0.727429
2,0.0,0.958368,0.041632
3,0.0,0.897291,0.102709
4,0.0,0.909403,0.090597


In [51]:
df_predictions.shape

(7336, 3)

In [52]:
# Encoding defaults to UTF-8
df_predictions.to_csv('Kotao_1_09032024.csv', index=False)

In [53]:
df_predictions['Label'] = df_predictions['Label'].astype(int)

In [55]:
# Encoding defaults to UTF-8
df_predictions.to_csv('Kotao_2_09032024.csv', index=False)

In [59]:
clf.cv_results_

{'mean_fit_time': array([0.95744181]),
 'std_fit_time': array([0.06852031]),
 'mean_score_time': array([0.00661731]),
 'std_score_time': array([0.00146937]),
 'param_C': masked_array(data=[1],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_class_weight': masked_array(data=['balanced'],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_penalty': masked_array(data=['l2'],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_solver': masked_array(data=['liblinear'],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_verbose': masked_array(data=[1],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1,
   'class_weight': 'balanced',
   'penalty': 'l2',
   'solver': 'liblinear',
   'verbose': 1}],
 'split0_test_score': array([0.68293109]),
 'split1_test_score': array([0.68328087]),
 'split2_test_

In [64]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, matthews_corrcoef
parameters = {'penalty':['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100],'class_weight': ['balanced'], 'verbose': [1], 'solver': ['liblinear']}
logreg = LogisticRegression()
mcc_scorer = make_scorer(matthews_corrcoef)
clf = GridSearchCV(logreg, parameters, cv=5, verbose=True, n_jobs=-1, scoring=mcc_scorer)
clf.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[LibLinear]

In [65]:
clf.cv_results_

{'mean_fit_time': array([  0.34874635,   0.46056142,   0.51149559,   0.69426122,
          2.36387539,   1.14325519, 108.00460753,   2.20281324,
        103.95614815,   3.22067895, 146.47461472,   4.85919948]),
 'std_fit_time': array([2.03163130e-02, 1.23835028e-02, 4.05341674e-02, 2.06049424e-02,
        5.17334402e-01, 9.32117249e-02, 7.66189588e+00, 2.54294718e-01,
        2.17817093e+01, 3.11379526e-01, 5.27852040e+01, 3.73325554e-01]),
 'mean_score_time': array([0.0171452 , 0.01696095, 0.01670442, 0.01476636, 0.01574416,
        0.01502671, 0.01362681, 0.01281519, 0.01698389, 0.01541495,
        0.0160078 , 0.01492825]),
 'std_score_time': array([0.00198017, 0.00094313, 0.002714  , 0.00224781, 0.00371835,
        0.00225971, 0.0015266 , 0.00223062, 0.00366679, 0.00204084,
        0.0015625 , 0.00211354]),
 'param_C': masked_array(data=[0.001, 0.001, 0.01, 0.01, 0.1, 0.1, 1, 1, 10, 10, 100,
                    100],
              mask=[False, False, False, False, False, False, Fals

In [70]:
clf.best_estimator_

# Test Set Prediction

U ovom poglavlju treba izgenerirati finalni model s hiperparametrima izabranim u Model Selection poglavlju nad čitavim train datasetom (s nebitnim featurima maknutima naravno). Ne trebamo hold-out test set. Generirati predikcije za test set u traženom formatu. Upload-ati ih na ai4health stranicu i cekat da vidimo jel prodemo u drugi krug :)