In [8]:
import os
from os import path,listdir
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as patches

from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
import json 

import warnings

# Let pandas render up to 500 columns before truncating wide frames.
pd.set_option("display.max_columns", 500)
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

def _levenshtein(a, b):
    """Pure-Python Levenshtein distance, used when `editdistance` is unavailable."""
    # Keep the inner row as short as possible.
    if len(a) < len(b):
        a, b = b, a
    previous = list(range(len(b) + 1))
    for i, ch_a in enumerate(a, start=1):
        current = [i]
        for j, ch_b in enumerate(b, start=1):
            current.append(min(
                previous[j] + 1,                   # deletion
                current[j - 1] + 1,                # insertion
                previous[j - 1] + (ch_a != ch_b),  # substitution (free on match)
            ))
        previous = current
    return previous[-1]


def find_most_similar_string(source_string, target_strings):
    """Return the candidate closest to `source_string` by edit distance.

    Parameters
    ----------
    source_string : str
        String to compare against every candidate.
    target_strings : iterable of str
        Candidate strings.

    Returns
    -------
    tuple
        ``(candidate, distance)`` for the smallest distance; on ties the
        candidate that appears first in `target_strings` wins.

    Raises
    ------
    IndexError
        If `target_strings` is empty.
    """
    # The notebook never imports `editdistance`; import it here and fall back
    # to the pure-Python implementation so the function works either way.
    try:
        import editdistance
        distance = editdistance.eval
    except ImportError:
        distance = _levenshtein

    candidates = list(target_strings)
    if not candidates:
        raise IndexError("target_strings is empty; no candidate to compare against")

    scores = [(candidate, distance(source_string, candidate)) for candidate in candidates]
    # min() keeps the first of equal scores, like a stable sort followed by [0].
    return min(scores, key=lambda pair: pair[1])

In [2]:
# Folder that holds the raw exports as well as the preprocessed CSVs.
input_path = 'inputs'

# Raw train/test exports.
train_csv, test_csv = (
    path.join(input_path, file_name)
    for file_name in ('new1_traindata.swissnoso.csv', 'new1_testdata.swissnoso.csv')
)

# Destinations for the scaled/encoded train/test frames.
clean_train_csv, clean_test_csv = (
    path.join(input_path, file_name)
    for file_name in ('scaled2.train.csv', 'scaled2.test.csv')
)

# Auxiliary files.
groups_csv, mapping_json = (
    path.join(input_path, file_name)
    for file_name in ('groups9.csv', 'mapping.json')
)

## Prepare features

In [3]:
# Read the semicolon-separated train and test exports.
dfTR, dfTE = (pd.read_csv(csv_file, sep=';') for csv_file in (train_csv, test_csv))

# Label columns: every column from 'staphylococci' through 'other.bacteria' (inclusive).
first_label_pos = dfTR.columns.get_loc('staphylococci')
last_label_pos = dfTR.columns.get_loc('other.bacteria')
cols_bacteria = dfTR.columns[first_label_pos:last_label_pos + 1].tolist()

# Candidate feature groups; only `cols_sarah` feeds `ft_cols` below.
cols_fts_doubt = ['bmi', 'ht', 'atb_oui']
cols_fts_cat = ['asa', 'proc', 'secondary', 'implant', 'endo']
cols_fts = [
    'sex', 'wt', 'age', 'elective', 'class', 'duree', 'destination',
    'age.group', 'bmi.group', 'hosp_grp', 'hosp_size', 'death',
    'surg', 'infection.simp',
]
cols_sarah = [
    "age", "sex", "bmi", "surg", "prev.hosp.b", "hosp_size", "asa",
    "class", "elective", "endo_2L", "duree", "SAP.b", "cephalosporin.all",
    "nitroimidazole", "penicillin.all", "quinolone",
    "carbapenem", "glycopeptide", "clindamycine", "other.SAP2",
    'scoreT', 'implant',
]

# Alternative (disabled): union of all four groups
# ft_cols = sorted(set(cols_fts_doubt+cols_fts_cat+cols_fts+cols_sarah),key=lambda x:x.lower())
# Case-insensitive alphabetical order of the selected features.
ft_cols = sorted(set(cols_sarah), key=str.lower)
print(cols_bacteria)
print(ft_cols)

# Keep the selected features followed by the label columns, in both splits.
selected_cols = ft_cols + cols_bacteria
dfTR = dfTR[selected_cols]
dfTE = dfTE[selected_cols]

['staphylococci', 'streptococci', 'enterococci', 'e.coli', 'klebsiella', 'proteus', 'serratia', 'enterobacter', 'other.enterobacteriacea', 'pseudomonas', 'cutibacterium', 'clostridium', 'bacteroides', 'fungi', 'other.grampos', 'other.gramneg', 'other.anaerobic', 'other.bacteria']
['age', 'asa', 'bmi', 'carbapenem', 'cephalosporin.all', 'class', 'clindamycine', 'duree', 'elective', 'endo_2L', 'glycopeptide', 'hosp_size', 'implant', 'nitroimidazole', 'other.SAP2', 'penicillin.all', 'prev.hosp.b', 'quinolone', 'SAP.b', 'scoreT', 'sex', 'surg']


In [4]:
### Check NaNs
# Total rows vs. rows left after dropping any row with a missing value.
print('Sizes on training', len(dfTR), len(dfTR.dropna()))
print('Sizes on test', len(dfTE), len(dfTE.dropna()))

# For each split, list the features that have at least one missing value.
for banner, frame in (('>>> Training', dfTR), ('>>> Test', dfTE)):
    print(banner)
    for feature in ft_cols:
        n_missing = int(frame[feature].isnull().sum())
        if n_missing > 0:
            print(feature, n_missing)

Sizes on training 7442 5239
Sizes on test 1872 1363
>>> Training
asa 32
bmi 2176
duree 1
endo_2L 1
>>> Test
asa 8
bmi 503


In [5]:
# Keep only rows with no missing value in any selected column.
dfTR = dfTR.dropna()
dfTE = dfTE.dropna()
for split_name, frame in (('Training', dfTR), ('Test', dfTE)):
    print(split_name, len(frame))

Training 5239
Test 1363


In [6]:
### FIX COLUMN TYPES
def _to_float(value):
    """Convert a decimal-comma string such as '23,5' to float.

    Missing values become NaN; values that are already numeric pass through,
    so re-running the cell on converted data does not fail.
    """
    if pd.isnull(value):
        return np.nan
    if isinstance(value, str):
        return float(value.replace(',', '.'))
    return float(value)


def fix_column_types(df):
    """Return a copy of `df` with the feature columns cast to their working types.

    - 'bmi', 'age': decimal-comma strings -> float
    - 'asa': -> string of the integer value (NaN kept), so it is treated as categorical
    - 'elective', 'endo_2L', 'prev.hosp.b', 'SAP.b': 'No'/'Yes' -> 0/1
    - 'sex': 'Female'/'Male' -> 0/1

    Already-encoded 0/1 values map to themselves, which makes the conversion
    idempotent. Any other value maps to NaN, as with a plain dict `.map`.
    """
    # The identity entries (0: 0, 1: 1) are what make a second run harmless.
    yes_no = {'No': 0, 'Yes': 1, 0: 0, 1: 1}
    female_male = {'Female': 0, 'Male': 1, 0: 0, 1: 1}

    fixed = df.copy()
    for col in ('bmi', 'age'):
        fixed[col] = fixed[col].apply(_to_float)
    fixed['asa'] = fixed['asa'].apply(lambda x: np.nan if pd.isnull(x) else str(int(x)))
    for col in ('elective', 'endo_2L', 'prev.hosp.b', 'SAP.b'):
        fixed[col] = fixed[col].map(yes_no)
    fixed['sex'] = fixed['sex'].map(female_male)
    return fixed


# Same conversion for both splits, so train and test cannot drift apart.
dfTR = fix_column_types(dfTR)
dfTE = fix_column_types(dfTE)

### Map

In [10]:
# Function to detect pseudo-binary columns
def is_pseudo_binary(col):
    """Return True when `col` contains exactly the two distinct values 0 and 1."""
    distinct = col.unique()
    if len(distinct) != 2:
        return False
    return set(distinct) == {0, 1}

# Detect pseudo-binary columns (already 0/1; passed through untouched)
pseudo_binary_cols = [col for col in dfTR.columns if is_pseudo_binary(dfTR[col])]
# Identify categorical and numerical columns
cat_cols = dfTR.select_dtypes(include=['object']).columns
num_cols = dfTR.select_dtypes(exclude=['object']).columns
# Update numerical columns by excluding pseudo-binary columns
num_cols = num_cols.difference(pseudo_binary_cols)

# Min-max scale numeric columns, one-hot encode object columns, pass the rest through.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), num_cols),
        ('cat', OneHotEncoder(), cat_cols)
    ],
    remainder='passthrough',  # keep the pseudo-binary columns as they are
    # OneHotEncoder emits a sparse matrix; force a dense result so that
    # pd.DataFrame(..., columns=...) below always receives a 2-D array.
    sparse_threshold=0,
    # Unprefixed output names ('age', 'asa_1', ...) from get_feature_names_out().
    verbose_feature_names_out=False,
)

# Fit on the training split only, then apply the fitted transform to the test split
dfTR_transformed = preprocessor.fit_transform(dfTR)
dfTE_transformed = preprocessor.transform(dfTE)

# Record parameters for the transformations
num_scaler = preprocessor.named_transformers_.get('num')
cat_encoder = preprocessor.named_transformers_.get('cat')
transform_params = {
    'num_cols': num_cols.tolist(),
    'cat_cols': cat_cols.tolist(),
    'pseudo_binary_cols': pseudo_binary_cols,
    'scaler_min_': num_scaler.min_.tolist() if num_scaler is not None else None,
    'scaler_scale_': num_scaler.scale_.tolist() if num_scaler is not None else None,
    'encoder_categories': [cat.tolist() for cat in cat_encoder.categories_] if cat_encoder is not None else None
}

# Save parameters to a text file (create the output folder on a fresh checkout)
params_file = 'outputs_final_rev/transform_params.txt'
os.makedirs(path.dirname(params_file), exist_ok=True)
with open(params_file, 'w') as file:
    json.dump(transform_params, file, indent=4)

transformed_cat_cols = preprocessor.named_transformers_['cat'].get_feature_names_out(cat_cols)
# Take the output column names from the fitted transformer itself, so they
# always match the real column order of the transformed arrays.
transformed_cols = preprocessor.get_feature_names_out()

# Convert transformed datasets to DataFrames
dfTR_transformed_df = pd.DataFrame(dfTR_transformed, columns=transformed_cols)
dfTE_transformed_df = pd.DataFrame(dfTE_transformed, columns=transformed_cols)

# Everything that was not min-max scaled (one-hot and passthrough columns) is 0/1: store as int
scaled_cols = set(num_cols)
int_cols = [col for col in transformed_cols if col not in scaled_cols]
dfTR_transformed_df[int_cols] = dfTR_transformed_df[int_cols].astype(int)
dfTE_transformed_df[int_cols] = dfTE_transformed_df[int_cols].astype(int)

dfTR_transformed_df, dfTE_transformed_df

(           age       bmi     duree  asa_1  asa_2  asa_3  asa_4  asa_5   
 0     0.376885  0.309129  0.187324      0      0      0      1      0  \
 1     0.688981  0.408714  0.235211      0      0      0      1      0   
 2     0.791265  0.369295  0.095775      0      1      0      0      0   
 3     0.373682  0.101660  0.065493      0      0      1      0      0   
 4     0.726505  0.325726  0.160563      0      0      1      0      0   
 ...        ...       ...       ...    ...    ...    ...    ...    ...   
 5234  0.535455  0.174274  0.071831      0      1      0      0      0   
 5235  0.789678  0.715768  0.079577      0      1      0      0      0   
 5236  0.477497  0.323651  0.186620      0      1      0      0      0   
 5237  0.826295  0.234440  0.189437      0      1      0      0      0   
 5238  0.434786  0.230290  0.185211      0      1      0      0      0   
 
       class_Clean  class_Clean-Contaminated  class_Contaminated   
 0               1                        

In [11]:
# Feature columns are all transformed columns that are not bacteria labels.
ft_cols = sorted(set(dfTR_transformed_df.columns) - set(cols_bacteria))

# Prefix features with 'ft_' and labels with 'LABEL_' in both splits.
column_renames = {name: 'ft_' + name for name in ft_cols}
column_renames.update({name: 'LABEL_' + name for name in cols_bacteria})
for frame in (dfTR_transformed_df, dfTE_transformed_df):
    frame.rename(columns=column_renames, inplace=True)

In [12]:
# Write both preprocessed splits to disk without the index column.
for frame, destination in ((dfTR_transformed_df, clean_train_csv),
                           (dfTE_transformed_df, clean_test_csv)):
    frame.to_csv(destination, index=None)
    print('# Wrote', destination)

# Wrote inputs/scaled2.train.csv
# Wrote inputs/scaled2.test.csv


In [13]:
# Row counts of the cleaned train and test frames.
for frame in (dfTR, dfTE):
    print(len(frame))

5239
1363


In [None]:
# Row counts (train, test) before and after dropping rows with missing values.
# >>> Excluding surgeries
# Before cleaning: 7442, 1872
# After cleaning: 5239, 1363
# >>> All surgeries
# Before cleaning: 8492, 2140
# After cleaning: 5949, 1457
