In [90]:
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.stats.weightstats as sms
from scipy import stats
import math
from scipy.stats import norm
from warnings import filterwarnings
filterwarnings('ignore')

## Average cost, all years, stay focused and according to system

#### NAN statistics issue

In [114]:
df = pd.read_csv('pmsi_analysis_v14.csv')
df.drop('Unnamed: 0',axis=1,inplace=True)

In [131]:
ame_cost = df[df['raison']=='AME']['cost'].to_numpy()
suv_cost = df[df['raison']=='SUV']['cost'].to_numpy()
cmuc_cost = df[df['raison']=='CMU-C']['cost'].to_numpy()
amo_cost = df[df['raison']=='AMO hors CMU-C']['cost'].to_numpy()

### 2-sample t-tests (two-sided)

#### AME vs SUV

In [132]:
stats.ttest_ind(ame_cost,suv_cost)

Ttest_indResult(statistic=nan, pvalue=nan)

#### AME vs CMU-C

In [119]:
stats.ttest_ind(ame_cost,cmuc_cost)

Ttest_indResult(statistic=nan, pvalue=nan)

#### AME vs AMO

In [120]:
stats.ttest_ind(ame_cost,amo_cost)

Ttest_indResult(statistic=nan, pvalue=nan)

#### SUV vs CMU-C

In [121]:
stats.ttest_ind(suv_cost,cmuc_cost)

Ttest_indResult(statistic=nan, pvalue=nan)

#### SUV vs AMO

In [122]:
stats.ttest_ind(suv_cost,amo_cost)

Ttest_indResult(statistic=nan, pvalue=nan)

#### CMU-C vs AMO

In [123]:
stats.ttest_ind(cmuc_cost,amo_cost)

Ttest_indResult(statistic=nan, pvalue=nan)

## Total Cost, all years, stay focused and according to system

Let's model the total cost per system $Y$ as a compound Poisson random variable.

$$
Y=\sum_{i=1}^{N} X_{i}
$$
* $X_i$ refers to the cost of the admisssion $i$, $X_1,X_2,..$ are assumed to be $iid$ for the same law as $X$. Furthermore, they are deemed as independent from $N$.
* The number of admission $N \approx \mathcal{P}(\lambda)$, with $\lambda$ unknown.
* $\mu_{1} = E(X)$, estimated through the empirical moment estimator $\hat{\mu_{1}}$
* $\mu_{2} = E(X^{2})$, estimated through the empirical moment estimator $\hat{\mu_{2}}$

Some calculus gives us the following quadratic equation in $\theta = E(Y)$ :

$$
\theta^{2}-\left(2 Y+z_{*}^{2} \frac{\hat{\mu}_{2}}{\hat{\mu}_{1}}\right) \theta+Y^{2}=0
$$

* Solving this equation for $z_{*} = 1,96$ gives the two extremeties of a 95% confidence interval for $\theta$

The idea is to check whether or not the CIs intersect across systems.



In [70]:
def generate_ic(Y,N,mu_1,mu_2,conf):
    
    z = norm.ppf(conf)
    
    a = 1
    b = -(2*Y + (z**2)*(mu_2/mu_1))
    c = Y**2
    
    delta = b**2 - 4*a*c
    
    l_ci = (-b - math.sqrt(delta))/(2*a)
    h_ci = (-b + math.sqrt(delta))/(2*a)
    
    return l_ci, h_ci
        

### Computing $Y$, $N$,$\hat{\mu_{1}}$,$\hat{\mu_{2}}$ and generating $\theta$ confidence intervalls for each system


In [None]:
df = pd.read_csv('pmsi_analysis_v14.csv')
df.drop('Unnamed: 0',axis=1,inplace=True)

#### AME

In [73]:
Y= df[df['raison']=='AME']['cost'].sum()
N = len(df[df['raison']=='AME']['cost'])
mu_1_ame = df[df['raison']=='AME']['cost'].mean()
mu_2_ame = (df[df['raison']=='AME']['cost']**2).mean()

In [76]:
Y

2663752063.874427

In [77]:
generate_ic(Y,N,mu_1_ame,mu_2_ame,conf=0.95)

(2652017594.1798105, 2675538455.4640245)

#### SUV

In [78]:
Y= df[df['raison']=='SUV']['cost'].sum()
N = len(df[df['raison']=='SUV']['cost'])
mu_1_suv = df[df['raison']=='SUV']['cost'].mean()
mu_2_suv = (df[df['raison']=='SUV']['cost']**2).mean()

In [79]:
Y

480335826.8353447

In [80]:
generate_ic(Y,N,mu_1_suv,mu_2_suv,conf=0.95)

(475057362.79221976, 485672941.0222981)

#### CMU-C

In [81]:
Y= df[df['raison']=='CMU-C']['cost'].sum()
N = len(df[df['raison']=='CMU-C']['cost'])
mu_1_cmuc = df[df['raison']=='CMU-C']['cost'].mean()
mu_2_cmuc = (df[df['raison']=='CMU-C']['cost']**2).mean()

In [82]:
Y

1932703266.0760472

In [83]:
generate_ic(Y,N,mu_1_cmuc,mu_2_cmuc,conf=0.95)

(1924248317.8414445, 1941195364.479361)

#### AMO

In [84]:
Y= df[df['raison']=='AMO hors CMU-C']['cost'].sum()
N = len(df[df['raison']=='AMO hors CMU-C']['cost'])
mu_1_amo = df[df['raison']=='AMO hors CMU-C']['cost'].mean()
mu_2_amo = (df[df['raison']=='AMO hors CMU-C']['cost']**2).mean()

In [85]:
Y

2841914529.19382

In [86]:
generate_ic(Y,N,mu_1_amo,mu_2_amo,conf=0.95)

(2831787074.2416954, 2852078203.445319)

## Total Cost across groups and systems, all years, stay focused

We aim to perform the following $\chi^{2}$ test :

$$
\begin{cases}H_{0}: & \text { grp_cln and raison are independant } \\ H_{1}: & \text { grp_cln and raison are dependant }\end{cases}
$$

In [1]:
df = pd.read_csv('pmsi_analysis_v14.csv')
df.drop('Unnamed: 0',axis=1,inplace=True)
df['grp_cln'] = df['grp_cln'].astype('str')
df = df[df['grp_cln'] != 'no match']

NameError: name 'pd' is not defined

In [88]:
df_cg = df.groupby(['grp_cln','raison']).agg({'cost':'sum'})
df_cg = df_cg.reset_index()
df_cg = df_cg[df_cg['grp_cln']!='no match']

In [89]:
df_cg

Unnamed: 0,grp_cln,raison,cost
0,1,AME,1.982820e+08
1,1,AMO hors CMU-C,6.996243e+07
2,1,CMU-C,4.703201e+07
3,1,SUV,2.995112e+07
4,10,AME,5.136779e+06
...,...,...,...
75,8,SUV,6.132987e+06
76,9,AME,8.023549e+07
77,9,AMO hors CMU-C,4.289026e+07
78,9,CMU-C,2.071299e+07


In [90]:
df_r = df_cg.groupby(['raison']).agg({'cost':'sum'})
df_r = df_r.reset_index()

In [91]:
df_r

Unnamed: 0,raison,cost
0,AME,961721300.0
1,AMO hors CMU-C,844557800.0
2,CMU-C,408147500.0
3,SUV,115860500.0


In [92]:
df_cg['system_total'] = list(df_r['cost'])*20

In [93]:
df_cg['% of system'] = df_cg['cost']/df_cg['system_total']

In [94]:
df_cg[df_cg['raison']=='AME']

Unnamed: 0,grp_cln,raison,cost,system_total,% of system
0,1,AME,198282000.0,961721300.0,0.206174
4,10,AME,5136779.0,961721300.0,0.005341
8,11,AME,1242286.0,961721300.0,0.001292
12,12,AME,12662240.0,961721300.0,0.013166
16,13,AME,13866800.0,961721300.0,0.014419
20,14,AME,193394100.0,961721300.0,0.201092
24,15,AME,17321630.0,961721300.0,0.018011
28,16,AME,17605910.0,961721300.0,0.018307
32,17,AME,13489930.0,961721300.0,0.014027
36,18,AME,10335430.0,961721300.0,0.010747


In [95]:
df_cg['% of system'] = (df_cg['% of system']*1e+08).astype('int')
df_cg = df_cg.pivot_table('% of system', ['grp_cln'], 'raison')
df_cg = df_cg.reset_index()
df_cg = df_cg.rename_axis(None, axis=1)
df_cg['grp_cln'] = df_cg['grp_cln'].astype('int')
df_cg = df_cg.sort_values(by='grp_cln')
df_cg.index = range(1,len(df_cg)+1)

In [96]:
df_cg

Unnamed: 0,grp_cln,AME,AMO hors CMU-C,CMU-C,SUV
1,1,20617406,8283913,11523288,25851013
2,2,3199503,1023030,2580257,2394628
3,3,9884287,2878115,4131890,11603098
4,4,318548,395499,889649,2113306
5,5,2171955,548964,1640605,3634645
6,6,5874335,1391767,2627791,10330676
7,7,1163681,3246447,3571831,240225
8,8,3551417,5742425,4433755,5293423
9,9,8342904,5078428,5074877,5907470
10,10,534123,649248,1243098,85157


In [97]:
cont = df_cg[df_cg.columns[1:]].to_numpy()

In [98]:
cont

array([[20617406,  8283913, 11523288, 25851013],
       [ 3199503,  1023030,  2580257,  2394628],
       [ 9884287,  2878115,  4131890, 11603098],
       [  318548,   395499,   889649,  2113306],
       [ 2171955,   548964,  1640605,  3634645],
       [ 5874335,  1391767,  2627791, 10330676],
       [ 1163681,  3246447,  3571831,   240225],
       [ 3551417,  5742425,  4433755,  5293423],
       [ 8342904,  5078428,  5074877,  5907470],
       [  534123,   649248,  1243098,    85157],
       [  129173,   202317,   334838,   167372],
       [ 1316622,  1890336, 10362040,  1746760],
       [ 1441873,   919977,  2669133,  2337897],
       [20109160, 34275631, 26708121, 10480819],
       [ 1801107,  2815473,  4584839,  2128017],
       [ 1830666,  2880101,  2501565,   419613],
       [ 1402686,   510655,  1001866,  3238994],
       [ 1074680,   856892,  3454465,  1006871],
       [ 4950357,  1022694,  1951735,  8667138],
       [10285507, 25388078,  8714347,  2352868]])

In [99]:
chi2, p, dof, expec = stats.chi2_contingency(cont)

In [100]:
print(f'chi2 stat: {chi2}, p_value: {p}')

chi2 stat: 113052966.77863106, p_value: 0.0


We reject $H_{0}$

## Hemodialysis Total Cost, all years, stay focused and according to system

Let's model the total cost per system $Y$ as a compound Poisson random variable.

$$
Y=\sum_{i=1}^{N} X_{i}
$$
* $X_i$ refers to the cost of the admisssion $i$, $X_1,X_2,..$ are assumed to be $iid$ for the same law as $X$. Furthermore, they are deemed as independent from $N$.
* The number of admission $N \approx \mathcal{P}(\lambda)$, with $\lambda$ unknown.
* $\mu_{1} = E(X)$, estimated through the empirical moment estimator $\hat{\mu_{1}}$
* $\mu_{2} = E(X^{2})$, estimated through the empirical moment estimator $\hat{\mu_{2}}$

Some calculus gives us the following quadratic equation in $\theta = E(Y)$ :

$$
\theta^{2}-\left(2 Y+z_{*}^{2} \frac{\hat{\mu}_{2}}{\hat{\mu}_{1}}\right) \theta+Y^{2}=0
$$

* Solving this equation for $z_{*} = 1,96$ gives the two extremeties of a 95% confidence interval for $\theta$

The idea is to check whether or not the CIs intersect across systems.



In [101]:
def generate_ic(Y,N,mu_1,mu_2,conf):
    
    z = norm.ppf(conf)
    
    a = 1
    b = -(2*Y + (z**2)*(mu_2/mu_1))
    c = Y**2
    
    delta = b**2 - 4*a*c
    
    l_ci = (-b - math.sqrt(delta))/(2*a)
    h_ci = (-b + math.sqrt(delta))/(2*a)
    
    return l_ci, h_ci
        

### Computing $Y$, $N$,$\hat{\mu_{1}}$,$\hat{\mu_{2}}$ and generating $\theta$ confidence intervalls for each system


In [102]:
df = pd.read_csv('pmsi_analysis_v14.csv')
df.drop('Unnamed: 0',axis=1,inplace=True)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


#### AME

In [103]:
df['Libellé GHM'].nunique()

2474

In [104]:
hem = (df['raison']=='AME')&(df['Libellé GHM']=='Hémodialyse, en séances')
Y= df[hem]['cost'].sum()
N = len(df[hem]['cost'])
mu_1_ame = df[hem]['cost'].mean()
mu_2_ame = (df[hem]['cost']**2).mean()

In [109]:
Y

6962406.870203313

In [105]:
generate_ic(Y,N,mu_1_ame,mu_2_ame,conf=0.95)

(64353999.20010129, 65445884.196268015)

#### SUV

In [106]:
hem = (df['raison']=='SUV')&(df['Libellé GHM']=='Hémodialyse, en séances')
Y= df[hem]['cost'].sum()
N = len(df[hem]['cost'])
mu_1_suv = df[hem]['cost'].mean()
mu_2_suv = (df[hem]['cost']**2).mean()

In [107]:
Y

6962406.870203313

In [108]:
generate_ic(Y,N,mu_1_suv,mu_2_suv,conf=0.95)

(6814081.672534358, 7113960.729535103)

#### CMU-C

In [110]:
hem = (df['raison']=='CMU-C')&(df['Libellé GHM']=='Hémodialyse, en séances')
Y= df[hem]['cost'].sum()
N = len(df[hem]['cost'])
mu_1_cmuc = df[hem]['cost'].mean()
mu_2_cmuc = (df[hem]['cost']**2).mean()

In [111]:
Y

15365557.262149628

In [112]:
generate_ic(Y,N,mu_1_cmuc,mu_2_cmuc,conf=0.95)

(15031198.904664442, 15707353.18412513)

#### AMO

In [113]:
hem = (df['raison']=='AMO hors CMU-C')&(df['Libellé GHM']=='Hémodialyse, en séances')
Y= df[hem]['cost'].sum()
N = len(df[hem]['cost'])
mu_1_amo = df[hem]['cost'].mean()
mu_2_amo = (df[hem]['cost']**2).mean()

In [114]:
Y

46162809.090261616

In [115]:
generate_ic(Y,N,mu_1_amo,mu_2_amo,conf=0.95)

(45603002.8781383, 46729487.28395092)

## Severity repartition across systems, all years, stay focused

We aim to perform the following $\chi^{2}$ test :

$$
\begin{cases}H_{0}: & \text { grp_cln and raison are independant } \\ H_{1}: & \text { grp_cln and raison are dependant }\end{cases}
$$

In [91]:
df = pd.read_csv('pmsi_analysis_v14.csv')
df.drop('Unnamed: 0',axis=1,inplace=True)

In [92]:
df['severity'].unique()

array(['2', '3', '1', 'Pas de niveau de sévérité', '4', 1, 3, 2, 4],
      dtype=object)

In [93]:
df['severity'] = df['severity'].astype(str)
df = df[df['severity']!='Pas de niveau de sévérité']

In [94]:
df.columns

Index(['finess', 'mois', 'annee', 'sexe', 'ghm2', 'GHS', 'age', 'duree',
       'supp_rea', 'supp_si', 'supp_stf', 'supp_src', 'supp_nn1', 'supp_nn2',
       'supp_nn3', 'supp_rep', 'ano_date', 'anonyme', 'nbActe', 'nbRum',
       'modeEntree', 'provenance', 'modeSortie', 'motif', 'dp', 'dr', 'cost',
       'raison', 'hp_type', 'severity', 'ghm_racine', 'cmd', 'departement',
       'id_dep', 'region_label', 'population_region', 'Libellé GHM', 'racine',
       'Libellé GHM Racine', 'label_cmd', 'lib_dp', 'region',
       'effectif_region_2020', 'grp_cln'],
      dtype='object')

In [95]:
df_cg = df.groupby(['raison','severity']).agg({'anonyme':'count'})
df_cg = df_cg.reset_index()

In [96]:
df_cg

Unnamed: 0,raison,severity,anonyme
0,AME,1,168161
1,AME,2,65255
2,AME,3,34220
3,AME,4,14911
4,AMO hors CMU-C,1,181076
5,AMO hors CMU-C,2,87864
6,AMO hors CMU-C,3,72421
7,AMO hors CMU-C,4,25149
8,CMU-C,1,202990
9,CMU-C,2,69932


In [97]:
df_r = df_cg.groupby(['raison']).agg({'anonyme':'sum'})
df_r = df_r.reset_index()

In [98]:
df_r

Unnamed: 0,raison,anonyme
0,AME,282547
1,AMO hors CMU-C,366510
2,CMU-C,328505
3,SUV,51339


In [99]:
sys_tot = [x_ar for x_ar in df_r['anonyme'].values for i in range(4)]

In [100]:
df_cg['system_total'] = sys_tot

In [101]:
df_cg['% of system'] = df_cg['anonyme']/df_cg['system_total'] *100

In [102]:
df_cg[df_cg['raison']=='AME']

Unnamed: 0,raison,severity,anonyme,system_total,% of system
0,AME,1,168161,282547,59.516116
1,AME,2,65255,282547,23.095273
2,AME,3,34220,282547,12.111259
3,AME,4,14911,282547,5.277352


In [103]:
df_cg

Unnamed: 0,raison,severity,anonyme,system_total,% of system
0,AME,1,168161,282547,59.516116
1,AME,2,65255,282547,23.095273
2,AME,3,34220,282547,12.111259
3,AME,4,14911,282547,5.277352
4,AMO hors CMU-C,1,181076,366510,49.405473
5,AMO hors CMU-C,2,87864,366510,23.973152
6,AMO hors CMU-C,3,72421,366510,19.759625
7,AMO hors CMU-C,4,25149,366510,6.86175
8,CMU-C,1,202990,328505,61.792058
9,CMU-C,2,69932,328505,21.287956


In [104]:
df_cg = df_cg.pivot_table('anonyme', ['severity'], 'raison')
df_cg = df_cg.reset_index()
df_cg = df_cg.rename_axis(None, axis=1)
df_cg['severity'] = df_cg['severity'].astype('int')
df_cg = df_cg.sort_values(by='severity')
df_cg.index = range(1,len(df_cg)+1)

In [105]:
df_cg

Unnamed: 0,severity,AME,AMO hors CMU-C,CMU-C,SUV
1,1,168161,181076,202990,27904
2,2,65255,87864,69932,11805
3,3,34220,72421,40060,7902
4,4,14911,25149,15523,3728


In [106]:
cont = df_cg[df_cg.columns[1:]].to_numpy()

In [107]:
cont

array([[168161, 181076, 202990,  27904],
       [ 65255,  87864,  69932,  11805],
       [ 34220,  72421,  40060,   7902],
       [ 14911,  25149,  15523,   3728]], dtype=int64)

In [108]:
chi2, p, dof, expec = stats.chi2_contingency(cont)

In [109]:
print(f'chi2 stat: {chi2}, p_value: {p}')

chi2 stat: 16477.162049222956, p_value: 0.0


We reject $H_{0}$

## Average supplement, all years, stay focused and according to system

In [133]:
df = pd.read_csv('pmsi_analysis_v14.csv')
df.drop('Unnamed: 0',axis=1,inplace=True)

In [134]:
df['sup'] = df['supp_rea'] + df['supp_si'] + df['supp_stf'] + df['supp_src']

In [135]:
ame_sup = df[df['raison']=='AME']['sup']
suv_sup = df[df['raison']=='SUV']['sup']
cmuc_sup = df[df['raison']=='CMU-C']['sup']
amo_sup = df[df['raison']=='AMO hors CMU-C']['sup']

### 2-sample t-tests (two-sided)

#### AME vs SUV

In [136]:
stats.ttest_ind(ame_sup,suv_sup)

Ttest_indResult(statistic=-23.560127025660165, pvalue=1.0535892500217554e-122)

#### AME vs CMU-C

In [137]:
stats.ttest_ind(ame_sup,cmuc_sup)

Ttest_indResult(statistic=-14.364955913162875, pvalue=8.635694858398821e-47)

#### AME vs AMO

In [138]:
stats.ttest_ind(ame_sup,amo_sup)

Ttest_indResult(statistic=6.868818672619909, pvalue=6.475196375738712e-12)

#### SUV vs CMU-C

In [139]:
stats.ttest_ind(suv_sup,cmuc_sup)

Ttest_indResult(statistic=17.183990242083798, pvalue=3.584539962692694e-66)

#### SUV vs AMO

In [140]:
stats.ttest_ind(suv_sup,amo_sup)

Ttest_indResult(statistic=33.53288167141703, pvalue=2.0174966027687074e-246)

#### CMU-C vs AMO

In [141]:
stats.ttest_ind(cmuc_sup,amo_sup)

Ttest_indResult(statistic=24.234309472329652, pvalue=1.010262307158509e-129)

## Patients with supplement repartition across systems, all years, stay focused

We aim to perform the following $\chi^{2}$ test :

$$
\begin{cases}H_{0}: & \text { grp_cln and raison are independant } \\ H_{1}: & \text { grp_cln and raison are dependant }\end{cases}
$$

In [179]:
df = pd.read_csv('pmsi_analysis_v14.csv')
df.drop('Unnamed: 0',axis=1,inplace=True)

In [180]:
df['sup'] = df['supp_rea'] + df['supp_si'] + df['supp_stf'] + df['supp_src']

In [181]:
df['sup_1'] = df['sup'] >= 1

In [182]:
df_cg = df.groupby(['raison','sup_1']).agg({'anonyme':'nunique'})
df_cg = df_cg.reset_index()

In [183]:
df_cg

Unnamed: 0,raison,sup_1,anonyme
0,AME,False,335648
1,AME,True,26939
2,AMO hors CMU-C,False,509330
3,AMO hors CMU-C,True,42650
4,CMU-C,False,442191
5,CMU-C,True,34893
6,SUV,False,54084
7,SUV,True,6610


In [184]:
df_r = df.groupby(['raison']).agg({'anonyme':'nunique'})
df_r = df_r.reset_index()

In [185]:
df_r

Unnamed: 0,raison,anonyme
0,AME,343392
1,AMO hors CMU-C,527849
2,CMU-C,460333
3,SUV,57276


In [186]:
df_cg = df_cg.pivot_table('anonyme', ['sup_1'], 'raison')
df_cg = df_cg.reset_index()
df_cg = df_cg.rename_axis(None, axis=1)
df_cg.index = range(1,len(df_cg)+1)

In [187]:
df_cg

Unnamed: 0,sup_1,AME,AMO hors CMU-C,CMU-C,SUV
1,False,335648,509330,442191,54084
2,True,26939,42650,34893,6610


In [188]:
cont = df_cg[df_cg.columns[1:]].to_numpy()

In [189]:
cont

array([[335648, 509330, 442191,  54084],
       [ 26939,  42650,  34893,   6610]], dtype=int64)

In [190]:
chi2, p, dof, expec = stats.chi2_contingency(cont)

In [191]:
print(f'chi2 stat: {chi2}, p_value: {p}')

chi2 stat: 1008.1890639528796, p_value: 3.010700713323296e-218


We reject $H_{0}$

## Patients coming from emergency services repartition across systems, all years, stay focused

We aim to perform the following $\chi^{2}$ test :

$$
\begin{cases}H_{0}: & \text { grp_cln and raison are independant } \\ H_{1}: & \text { grp_cln and raison are dependant }\end{cases}
$$

In [193]:
def provenance_proc(prov):
    if prov == 5 or prov==5.0 or prov=='5' or prov=='5.0':
        return 'Urgences'
    else:
        return 'Autres'

In [194]:
df = pd.read_csv('pmsi_analysis_v14.csv')
df.drop('Unnamed: 0',axis=1,inplace=True)

In [195]:
df['provenance'] = df['provenance'].apply(provenance_proc)

In [196]:
df_cg = df.groupby(['raison','provenance']).agg({'anonyme':'nunique'})
df_cg = df_cg.reset_index()

In [197]:
df_cg

Unnamed: 0,raison,provenance,anonyme
0,AME,Autres,246543
1,AME,Urgences,166108
2,AMO hors CMU-C,Autres,373995
3,AMO hors CMU-C,Urgences,217723
4,CMU-C,Autres,280024
5,CMU-C,Urgences,227209
6,SUV,Autres,29993
7,SUV,Urgences,38756


In [198]:
df_cg = df_cg.pivot_table('anonyme', ['provenance'], 'raison')
df_cg = df_cg.reset_index()
df_cg = df_cg.rename_axis(None, axis=1)
df_cg.index = range(1,len(df_cg)+1)

In [199]:
df_cg

Unnamed: 0,provenance,AME,AMO hors CMU-C,CMU-C,SUV
1,Autres,246543,373995,280024,29993
2,Urgences,166108,217723,227209,38756


In [200]:
cont = df_cg[df_cg.columns[1:]].to_numpy()

In [201]:
cont

array([[246543, 373995, 280024,  29993],
       [166108, 217723, 227209,  38756]], dtype=int64)

In [202]:
chi2, p, dof, expec = stats.chi2_contingency(cont)

In [203]:
print(f'chi2 stat: {chi2}, p_value: {p}')

chi2 stat: 14133.633600968273, p_value: 0.0


We reject $H_{0}$