In [1]:
%%capture capt
import numpy as np
import pandas as pd
from tqdm import tqdm

### Importation data

In [2]:
# Load the patient profile table from its CSV export (path is relative to the
# notebook's working directory).
base_patient = pd.read_csv("../data/profil_patient.csv")

In [3]:
%%capture capt
# Load the care-pathway table from its CSV export. Judging from the preview
# below it is wide: an identifier column followed by one column per date.
p_soins = pd.read_csv("../data/parcours_soins.csv")

In [4]:
# Preview the first three rows to inspect the layout of the care-pathway table.
p_soins.head(3)

Unnamed: 0,BEN_NIR_IDT,2008-01-15,2008-02-15,2008-03-15,2008-04-15,2008-05-15,2008-06-15,2008-07-15,2008-08-15,2008-09-15,...,2018-12-19,2018-12-20,2018-12-21,2018-12-22,2018-12-24,2018-12-25,2018-12-26,2018-12-27,2018-12-28,2018-12-29
0,P0,,,,,,,,,,...,,,,,,,,,,
1,P6,,,,,,,,,,...,,,,,,,,,,
2,P8,,,,,,,,,,...,,,,,,,,,,


In [5]:
# Preview the first three rows to inspect the columns of the patient table.
base_patient.head(3)

Unnamed: 0,CODE_PATIENT,date_h0,y_nais,BEN_RES_DPT,BEN_SEX_COD,dte_deces,Mort,Nb_survie,Nb_hospit,CHOC,Nb_jours_sej,cat_age,cat_nb_hospit,cat_nb_jours_sej,cluster
0,P0,2011-11-22,1949,11,1,,False,2596,9,False,23,1,1,1,0
1,P6,2013-12-05,1929,14,2,2015-03,True,456,8,False,51,2,1,1,1
2,P8,2015-07-02,1967,54,2,2015-08,True,41,3,True,43,0,1,1,2


### Data Pre-processing

In [6]:
# Cast the flag columns to integers and shift the sex code down by one.
# NOTE(review): the subtraction is not idempotent - executing this cell twice
# shifts BEN_SEX_COD again; restart the kernel before re-running it.
for flag_column in ("Mort", "CHOC"):
    base_patient[flag_column] = base_patient[flag_column].astype(int)
base_patient["BEN_SEX_COD"] = base_patient["BEN_SEX_COD"] - 1

In [7]:
# Keep only the year of the first hospitalisation.
#
# The conversion is guarded so the cell can be re-executed safely: once the
# column already holds integer years, parsing it again with pd.to_datetime
# would read those integers as nanoseconds since the epoch and silently turn
# every value into 1970.
if not pd.api.types.is_integer_dtype(base_patient["date_h0"]):
    # .dt.year is the vectorised equivalent of a per-row lambda; astype(int)
    # keeps a plain integer column and still fails loudly on unparsable or
    # missing dates instead of introducing NaN.
    base_patient["date_h0"] = (
        pd.to_datetime(base_patient["date_h0"]).dt.year.astype(int)
    )

### One-Hot Encoding des GHM

##### On recupere tous les differents GHM qui existent dans notre base

In [8]:
# Collect every distinct GHM code present in the care-pathway table.
all_ghm = set()
for i in range(len(p_soins)):
    # Empty cells are dropped first, then the first remaining value of the row
    # is skipped - presumably the patient identifier.
    # NOTE(review): if the frame has more than one leading non-GHM column
    # (e.g. an "Unnamed: 0" column in addition to BEN_NIR_IDT), the identifier
    # would end up in the GHM vocabulary - confirm against p_soins.columns.
    ghm_row = p_soins.iloc[i].dropna().tolist()[1:]
    # update() grows the set in place; rebinding the result of union() copied
    # the whole accumulated set on every row.
    all_ghm.update(ghm_row)

In [9]:
# Number of distinct GHM codes; used below as the length of each one-hot
# vector and as the divisor of the label encoding.
nb_ghm = len(all_ghm)
print(nb_ghm)

2346


##### On fait un One-Hot Encoding de chaque GHM, qu'on stocke dans un dictionnaire

In [10]:
# Build the one-hot dictionary: GHM code -> list of nb_ghm zeros with a single 1.
# The empty string is the padding token and maps to the all-zero vector.
encoding_ghm = {'': [0] * nb_ghm}
cur_index_to_encode_ghm = 0
# Iterate in sorted order so that each code always receives the same index.
# Iterating the raw set would make the index assignment depend on string hash
# randomisation, i.e. change from one kernel session to the next.
# key=str keeps the sort well-defined even if the codes are not all strings.
for ghm in sorted(all_ghm, key=str):
    if ghm in encoding_ghm:
        # Already encoded; in practice only the padding token can be hit here.
        continue
    encoding = [0] * nb_ghm
    encoding[cur_index_to_encode_ghm] = 1
    encoding_ghm[ghm] = encoding
    cur_index_to_encode_ghm += 1

In [11]:
# Sanity check of the one-hot dictionary: every GHM must own a distinct index.
# Entries are visited in insertion order, so the k-th real code is expected to
# carry its single 1 at position k.
# If the loop prints nothing, the encoding is consistent.
expected_index = 0
for ghm, vector in encoding_ghm.items():
    if ghm == '':
        # The padding token is all zeros by design. np.argmax would report 0
        # for it, so it has to be checked separately instead of being compared
        # with an index (the previous check always flagged it).
        if any(vector):
            print(ghm)
        continue
    # A valid vector contains exactly one 1, located at the expected position.
    if sum(vector) != 1 or np.argmax(vector) != expected_index:
        print(ghm)
    expected_index += 1




In [12]:
# For every patient, translate the sequence of GHM codes of the matching
# care-pathway row into the corresponding list of one-hot vectors.
# NOTE(review): rows of p_soins are addressed by position while the loop bound
# comes from base_patient - this assumes both tables are aligned row by row.
ghm_patient_encoded = []
for row_idx in tqdm(range(len(base_patient))):
    codes = p_soins.iloc[row_idx].dropna().tolist()[1:]
    ghm_patient_encoded.append([encoding_ghm[code] for code in codes])

100%|██████████| 10051/10051 [00:05<00:00, 1840.71it/s]


In [13]:
# Length of the longest encoded care pathway (0 when there is none).
max_len = max((len(sequence) for sequence in ghm_patient_encoded), default=0)

print(max_len)

254


In [14]:
# Right-pad every sequence with the padding vector until it reaches max_len.
# The same list object is appended each time, exactly as a repeated lookup of
# encoding_ghm[''] would do.
padding_vector = encoding_ghm['']
for sequence in ghm_patient_encoded:
    missing = max_len - len(sequence)
    if missing > 0:
        sequence.extend([padding_vector] * missing)

### Encoding des patients

In [15]:
# Columns of the patient table that are kept as model features.
items_to_keep = [
    "date_h0",
    "y_nais",
    "BEN_SEX_COD",
    "Mort",
    "Nb_survie",
    "Nb_hospit",
    "CHOC",
]

In [16]:
# Restrict the patient table to the selected feature columns.
patient = base_patient[items_to_keep]

In [17]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list.

    Only one level of nesting is removed; the order of the items is preserved.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [18]:
# Build one feature vector per patient: the patient's own columns followed by
# the flattened sequence of one-hot GHM vectors.
# NOTE(review): each vector holds max_len * nb_ghm encoding values, which is
# what makes this representation so heavy.
X_patient = []
for row_idx in tqdm(range(len(patient))):
    X_patient.append(
        list(patient.iloc[row_idx]) + flatten(ghm_patient_encoded[row_idx])
    )

100%|██████████| 10051/10051 [03:54<00:00, 42.91it/s]


Le One-Hot Encoding est trop lourd : pour chaque patient, on a un vecteur de dimension 254 x 2346 pour encoder les GHM

### Label Encoder

In [23]:
# Build the label dictionary: GHM code -> rank / nb_ghm, a float in (0, 1].
# The empty string is the padding token and maps to 0.
label_encoding_ghm = {'': 0}
cur_label = 1
# Iterate in sorted order so that each code always receives the same label.
# Iterating the raw set would make the labels depend on string hash
# randomisation, i.e. change from one kernel session to the next.
# key=str keeps the sort well-defined even if the codes are not all strings.
for ghm in sorted(all_ghm, key=str):
    if ghm in label_encoding_ghm:
        # Already encoded; in practice only the padding token can be hit here.
        continue
    label_encoding_ghm[ghm] = cur_label / nb_ghm
    cur_label += 1

In [24]:
# Sanity check of the label dictionary: every GHM must own a distinct label.
# Entries are visited in insertion order, so the k-th entry (the padding token
# being entry 0) is expected to carry the label k / nb_ghm.
# If the loop prints nothing, the encoding is consistent.
for expected_rank, (ghm, label) in enumerate(label_encoding_ghm.items()):
    if label != expected_rank / nb_ghm:
        print(ghm)

In [25]:
# For every patient, translate the sequence of GHM codes of the matching
# care-pathway row into the corresponding list of numeric labels.
# NOTE(review): rows of p_soins are addressed by position while the loop bound
# comes from base_patient - this assumes both tables are aligned row by row.
ghm_patient_label_encoded = []
for row_idx in tqdm(range(len(base_patient))):
    codes = p_soins.iloc[row_idx].dropna().tolist()[1:]
    ghm_patient_label_encoded.append([label_encoding_ghm[code] for code in codes])

100%|██████████| 10051/10051 [00:04<00:00, 2016.82it/s]


In [26]:
# Length of the longest label-encoded care pathway (0 when there is none).
max_len = max((len(sequence) for sequence in ghm_patient_label_encoded), default=0)

print(max_len)

254


In [27]:
# Right-pad every label sequence with the padding label up to max_len.
padding_label = label_encoding_ghm['']
for sequence in ghm_patient_label_encoded:
    missing = max_len - len(sequence)
    if missing > 0:
        sequence.extend([padding_label] * missing)

In [None]:
# Build one feature vector per patient: the patient's own columns followed by
# the padded sequence of GHM labels.
X_patient = []
for row_idx in tqdm(range(len(patient))):
    X_patient.append(
        list(patient.iloc[row_idx]) + ghm_patient_label_encoded[row_idx]
    )

 72%|███████▏  | 7213/10051 [00:00<00:00, 24325.84it/s]