# EDA - Bases de données patient, consommant et hospitalisation

### Installation de la librairie requise

In [1]:
!pip install pandas_profiling

Collecting pandas_profiling
  Downloading pandas_profiling-3.5.0-py2.py3-none-any.whl (325 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.0/325.0 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting phik<0.13,>=0.11.1
  Downloading phik-0.12.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (679 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m679.5/679.5 kB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<2.14,>=2.13.2
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Collecting multimethod<1.10,>=1.4
  Downloading multimethod-1.9-py3-none-any.whl (10 kB)
Collecting pydantic<1.11,>=1.8.1
  Downloading pydantic-1.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting htmlmin==0.1.12
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing

### Import des librairies

In [2]:
import numpy as np
import os
import pandas as pd
from pandas_profiling import ProfileReport

### Import des bases de données

In [13]:
# Load the three semicolon-separated source tables from the working directory.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The warning saved under this cell points at the
# hospit read and looks like a mixed-type DtypeWarning.
# NOTE(review): confirm the warning type; explicit dtype= per column would be stricter.
hospit = pd.read_csv("base_hospit.csv", sep=';', low_memory=False)
patient = pd.read_csv("base_patient.csv", sep=';')
consommant = pd.read_csv("consommant.csv", sep=';')

  hospit=pd.read_csv("base_hospit.csv", sep=';')


### Conversion des dates en datetime

In [14]:
# Parse EXE_SOI_DTD as day-first dates.
hospit['EXE_SOI_DTD'] = pd.to_datetime(hospit['EXE_SOI_DTD'], dayfirst=True)

# Store SEJ_NBJ as a nullable integer (Int64) so missing values stay missing.
# The intermediate float cast is kept from the original; the previous
# fillna(np.nan) replaced NaN with NaN and has been dropped as a no-op.
hospit['SEJ_NBJ'] = hospit['SEJ_NBJ'].astype(float).astype('Int64')

In [18]:
# Parse the datemax column as day-first dates (replaces the column in place).
consommant['datemax']=pd.to_datetime(consommant['datemax'], dayfirst=True)

In [20]:
# Parse both patient date columns as day-first dates, one column at a time.
for date_column in ('date_h0', 'dte_deces'):
    patient[date_column] = pd.to_datetime(patient[date_column], dayfirst=True)

## Suppression des patients dont la première hospitalisation a eu lieu en 2016 ou après

In [28]:
# Keep only patients whose date_h0 is strictly before 1 January 2016.
# - An explicit Timestamp removes any day/month ambiguity from the former
#   '01-01-2016' string cutoff.
# - .copy() makes the filtered frame independent of the original, so the
#   column assignments done on `patient` in later cells do not operate on a
#   slice (SettingWithCopyWarning).
# The original index labels are kept on purpose (no reset_index).
patient = patient[patient['date_h0'] < pd.Timestamp('2016-01-01')].copy()

In [29]:
# Re-applies the day-first datetime parsing to date_h0.
# NOTE(review): date_h0 was already parsed with pd.to_datetime in an earlier
# cell and row filtering does not change the dtype, so this looks redundant —
# confirm and consider removing.
patient['date_h0']=pd.to_datetime(patient['date_h0'], dayfirst=True)

In [None]:
patient.head()

In [34]:
patient['date_h0'].dt.year

0        2011
1        2008
2        2008
5        2008
6        2013
         ... 
24305    2009
24306    2008
24307    2008
24308    2011
24310    2010
Name: date_h0, Length: 17416, dtype: int64

In [36]:
# Keep only the consommant rows whose CODE_PATIENT is still present in the
# filtered patient table, then renumber the rows from 0.
consommant = (
    consommant
    .loc[consommant['CODE_PATIENT'].isin(patient['CODE_PATIENT'])]
    .reset_index(drop=True)
)

In [None]:
consommant.head()

In [None]:
hospit.head()

In [43]:
# Keep only the stays whose BEN_NIR_IDT matches a CODE_PATIENT of the filtered
# patient table, then renumber the rows from 0.
hospit = (
    hospit
    .loc[hospit['BEN_NIR_IDT'].isin(patient['CODE_PATIENT'])]
    .reset_index(drop=True)
)

In [44]:
hospit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158715 entries, 0 to 158714
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   BEN_NIR_IDT  158715 non-null  object        
 1   RSA_NUM      158715 non-null  object        
 2   ETA_NUM      158715 non-null  object        
 3   EXE_SOI_DTD  158713 non-null  datetime64[ns]
 4   SEJ_NBJ      155779 non-null  Int64         
 5   GRG_GHM      158715 non-null  object        
 6   DGN_PAL      158715 non-null  object        
 7   DGN_REL      68448 non-null   object        
dtypes: Int64(1), datetime64[ns](1), object(6)
memory usage: 9.8+ MB


## On conserve uniquement le mois et l'année du décès

In [46]:
# Truncate dte_deces to a monthly period; the column dtype becomes period[M].
# NOTE(review): expects dte_deces to still be datetime64 — re-running this cell
# on the already-converted column will likely fail.
patient['dte_deces']=patient['dte_deces'].dt.to_period('M')

In [None]:
patient.head()

## Ajout d'un indicateur des insuffisances cardiaques avec état de choc cardiogénique (colonne CHOC)

In [86]:
# Add a CHOC column: 1 when the stay has a GRG_GHM starting with '05M09'
# (heart failure and circulatory shock states, per the original notes) AND a
# DGN_PAL starting with 'R57' (labelled cardiogenic shock in the original
# notes), 0 otherwise.
#
# The boolean mask is used directly instead of selecting the matching rows and
# testing index membership; na=False makes a missing code count as "no match"
# rather than propagating NaN into the flag.
#
# NOTE(review): the 'R57' prefix matches every R57x code. In ICD-10 only R57.0
# is cardiogenic shock (R57.1 is hypovolemic), and the row inspected in the
# next cell is flagged with DGN_PAL 'R571' — confirm whether the prefix should
# be narrowed to 'R570'.
hospit['CHOC'] = (
    hospit['GRG_GHM'].str.startswith('05M09', na=False)
    & hospit['DGN_PAL'].str.startswith('R57', na=False)
).astype('int')

In [87]:
# Spot check of the row at position 18; the saved output shows a stay with
# GRG_GHM '05M09T' and DGN_PAL 'R571' flagged CHOC = 1.
# NOTE(review): the saved output displays a BEN_NIR_IDT value — confirm it may
# be shared before distributing the notebook.
hospit.iloc[18]

BEN_NIR_IDT      006X0JKEB0JEXXH2B
RSA_NUM                       5707
ETA_NUM                  370000093
EXE_SOI_DTD    2015-07-02 00:00:00
SEJ_NBJ                          0
GRG_GHM                     05M09T
DGN_PAL                       R571
DGN_REL                        NaN
CHOC                             1
Name: 18, dtype: object

## Statistiques descriptives sur les datasets

### Hospitalisation

In [88]:
hospit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158715 entries, 0 to 158714
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   BEN_NIR_IDT  158715 non-null  object        
 1   RSA_NUM      158715 non-null  object        
 2   ETA_NUM      158715 non-null  object        
 3   EXE_SOI_DTD  158713 non-null  datetime64[ns]
 4   SEJ_NBJ      155779 non-null  Int64         
 5   GRG_GHM      158715 non-null  object        
 6   DGN_PAL      158715 non-null  object        
 7   DGN_REL      68448 non-null   object        
 8   CHOC         158715 non-null  int64         
dtypes: Int64(1), datetime64[ns](1), int64(1), object(6)
memory usage: 11.0+ MB


In [89]:
# Build the profiling report object for the filtered hospit table (now
# including the CHOC column).
profile_hospit = ProfileReport(hospit, title="Hospit Profile Report")

In [None]:
profile_hospit.to_notebook_iframe()

In [91]:
# Write the hospit report to profile_hospit.html in the working directory.
# NOTE(review): the report may embed sample rows, including identifiers —
# confirm before sharing the HTML file.
profile_hospit.to_file("profile_hospit.html")

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Consommant

In [None]:
consommant.head()

In [93]:
consommant.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17340 entries, 0 to 17339
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   CODE_PATIENT  17340 non-null  object        
 1   _TEMG001      17340 non-null  int64         
 2   datemax       17340 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 406.5+ KB


In [94]:
# Build the profiling report object for the filtered consommant table.
profile_conso = ProfileReport(consommant, title="Consommant Profile Report")

In [None]:
profile_conso.to_notebook_iframe()

In [96]:
# Write the consommant report to profile_conso.html in the working directory.
# NOTE(review): the report may embed sample rows, including identifiers —
# confirm before sharing the HTML file.
profile_conso.to_file("profile_conso.html")

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Patient

In [None]:
patient.head()

In [98]:
patient.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17416 entries, 0 to 24310
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   CODE_PATIENT  17416 non-null  object        
 1   date_h0       17416 non-null  datetime64[ns]
 2   ALD_before    17416 non-null  int64         
 3   pop           17416 non-null  int64         
 4   y_nais        17416 non-null  int64         
 5   BEN_RES_DPT   17416 non-null  object        
 6   BEN_SEX_COD   17416 non-null  int64         
 7   dte_deces     11831 non-null  period[M]     
dtypes: datetime64[ns](1), int64(4), object(2), period[M](1)
memory usage: 1.2+ MB


In [99]:
# Build the profiling report object for the filtered patient table; at this
# point dte_deces is a period[M] column (see the info() output above).
profile_patient = ProfileReport(patient, title="Patient Profile Report")

In [None]:
profile_patient.to_notebook_iframe()

In [101]:
# Write the patient report to profile_patient.html in the working directory.
# NOTE(review): the report may embed sample rows, including identifiers —
# confirm before sharing the HTML file.
profile_patient.to_file("profile_patient.html")

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]