# Inferential statistics

## Importing useful libraries

In [1]:
# Importing the pandas library for data manipulation
import pandas as pd

# Import numpy library for efficient numeric operations
import numpy as np

# Import statistics library for basic statistical functions
import statistics as st

# Import the chisquare function to perform a chi-square test.
from scipy.stats import chisquare

## Importing the data used

In [2]:
# Load the spreadsheet, using its 'Número' column as the row index
df = pd.read_excel(
    'dados_cancerpositivo.xlsx',
    index_col='Número',
)
df

Unnamed: 0_level_0,Medical record,Date of collection,Histopathological,Estrogen receptor expression,Progesterone receptor expression,HER2-amplified,KI67,Molecular subtype,Tumor size,Grade,...,Menopausal status,Weight,Height,BMI,Exposure to pesticides,Chemoresistance,Recurrence,Death,Municipality,HER
Número,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,26214,2015-05-27,1.0,1.0,1.0,0.0,1.0,2.0,20.0,1.0,...,1.0,57.0,1.60,22.265625,1.0,0.0,0.0,0.0,Dois Vizinhos,
3,24773,2015-04-06,1.0,0.0,0.0,0.0,1.0,5.0,14.0,2.0,...,1.0,84.0,1.62,32.007316,1.0,0.0,0.0,0.0,Capanema,
5,26248,2015-06-08,1.0,0.0,0.0,0.0,1.0,5.0,30.0,2.0,...,1.0,64.0,1.52,27.700831,1.0,1.0,1.0,0.0,Planalto,
7,25778,2015-06-10,1.0,1.0,1.0,1.0,1.0,4.0,25.0,1.0,...,0.0,52.0,1.55,21.644121,1.0,1.0,0.0,0.0,Dois Vizinhos,
8,15847,2015-06-10,1.0,1.0,1.0,0.0,1.0,2.0,16.0,2.0,...,1.0,52.0,1.55,21.644121,1.0,1.0,1.0,1.0,Dois Vizinhos,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
933,44730,2022-12-06,1.0,1.0,1.0,1.0,1.0,4.0,15.0,2.0,...,,,,,,0.0,0.0,0.0,,
942,46377,2022-12-23,1.0,1.0,1.0,0.0,0.0,1.0,17.0,1.0,...,,,,,,0.0,0.0,0.0,,
946,44770,2023-02-08,1.0,1.0,0.0,0.0,1.0,2.0,8.0,3.0,...,,,,,,0.0,0.0,0.0,,
948,45672,2023-02-09,1.0,1.0,1.0,0.0,1.0,1.0,15.0,2.0,...,,,,,,0.0,0.0,0.0,,


Recoding the molecular subtypes with descriptive labels before performing the chi-square test.
For this analysis, the molecular subtypes are labelled Luminal A, Luminal B, HER2 and Triple negative; in the raw data they are coded as 1, 2, 4 and 5, respectively.

In [3]:
# Recode the numeric subtype codes of column 7 as text labels.
# The result is assigned back to the column rather than calling
# `df[col].replace(..., inplace=True)`: that form mutates a temporary Series
# and leaves `df` unchanged when pandas Copy-on-Write is active.
df[df.columns[7]] = df[df.columns[7]].replace(
    {
        1: 'A Luminal',
        2: 'B Luminal',
        4: 'HER2-amplified',
        5: 'Triple-negative',
    }
)
df[df.columns[7]].value_counts()

B Luminal          120
A Luminal          118
HER2-amplified      59
Triple-negative     58
Name: Molecular subtype, dtype: int64

## Defining the populations of interest

### Separating the population exposed to pesticides from the total population

In [4]:
# Rows whose "Exposure to pesticides" value equals 1
exposto = df.loc[df["Exposure to pesticides"].eq(1)]

### Separating the population not exposed to pesticides from the total population

In [5]:
# Rows whose "Exposure to pesticides" value equals 0
n_exposto = df.loc[df["Exposure to pesticides"].eq(0)]

## Defining the functions for calculating chi-square

In [6]:
def print_do_p_value(p: np.ndarray) -> list:
    """Format p-values as display strings.

    Values strictly below 0.01 become ``"<0.01"``; every other value is
    rounded to three decimals and converted with ``str``. NaN does not
    compare below 0.01, so it is rendered as ``"nan"``.

    Parameters
    ----------
    p : numpy.ndarray, sequence of floats, or scalar
        One p-value or a one-dimensional collection of p-values.

    Returns
    -------
    list of str
        One formatted string per p-value, in input order.
    """
    # atleast_1d lets a single p-value or a plain list of Python floats be
    # handled like an array (elements become numpy scalars with `.round`).
    valores = np.atleast_1d(p)
    return ["<0.01" if valor < 0.01 else str(valor.round(3)) for valor in valores]

In [7]:
def teste_estatistico(df_retorno: pd.DataFrame, index_teste: list) -> list:
    """Run ``scipy.stats.chisquare`` over selected columns, one test per row.

    Parameters
    ----------
    df_retorno : pd.DataFrame
        Table whose columns at the positions in ``index_teste`` hold the
        values to compare.
    index_teste : list
        Positional indices of the columns to compare.

    Returns
    -------
    list
        Formatted p-values as returned by ``print_do_p_value``, one per row
        of ``df_retorno``.
    """
    selecionadas = df_retorno.iloc[:, index_teste]
    # chisquare works along axis 0 by default, so the compared columns are
    # moved to the rows; no f_exp is passed, so scipy's default expected
    # frequencies are used.
    observado = np.array(selecionadas).transpose()
    resultado = chisquare(observado)
    return print_do_p_value(resultado.pvalue)

### Functions for the mean-based and frequency-based calculations by RISK STRATIFICATION

In [8]:
def tabela_freq_med(df: pd.DataFrame, coluna_grupo: list) -> pd.DataFrame:
    """Compare risk-stratification groups on parameters summarised by their mean.

    Parameters
    ----------
    df : pd.DataFrame
        Population to analyse.
    coluna_grupo : list
        Column names to select from ``df``; must include the grouping column
        named by the module-level ``coluna_filter_1``.

    Returns
    -------
    pd.DataFrame
        One row per numeric parameter and one column per group holding the
        group mean rounded to 2 decimals, followed by three columns of
        formatted p-values for the group pairs (0,1), (1,2) and (0,2). Each
        p-value column is labelled with the first character of the two group
        names joined by "/".

    Raises
    ------
    IndexError
        If the grouping yields fewer than three groups.

    Notes
    -----
    NOTE(review): the p-values come from ``teste_estatistico``, which applies
    a chi-square test to these rounded means rather than to counts - confirm
    this is the intended analysis.
    """
    # Use the caller's column list; the previous body ignored `coluna_grupo`
    # and silently read the global `coluna_grupo_med` instead.
    selecionado = df.loc[:, coluna_grupo]
    # numeric_only=True drops non-numeric columns (e.g. text-labelled
    # subtypes); with the pandas >= 2.0 default they make mean() raise.
    medias = selecionado.groupby([coluna_filter_1]).mean(numeric_only=True)
    df_retorno = medias.T.copy().round(2)

    # Group names are captured before the p-value columns are appended.
    grupos = list(df_retorno.columns)
    for a, b in ((0, 1), (1, 2), (0, 2)):
        rotulo = grupos[a][0] + "/" + grupos[b][0]
        df_retorno[rotulo] = teste_estatistico(df_retorno, [a, b])

    return df_retorno

In [9]:
def tabela_freq_soma(df: pd.DataFrame, coluna_grupo: list) -> pd.DataFrame:
    """Compare risk-stratification groups on parameters summarised by their sum.

    Parameters
    ----------
    df : pd.DataFrame
        Population to analyse.
    coluna_grupo : list
        Column names to select from ``df``; must include the grouping column
        named by the module-level ``coluna_filter_1``.

    Returns
    -------
    pd.DataFrame
        One row per numeric parameter and one column per group holding the
        group sum (for 0/1-coded columns, the number of ones), followed by
        three columns of formatted p-values for the group pairs (0,1), (1,2)
        and (0,2). Each p-value column is labelled with the first character
        of the two group names joined by "/".

    Raises
    ------
    IndexError
        If the grouping yields fewer than three groups.
    """
    # Use the caller's column list; the previous body ignored `coluna_grupo`
    # and silently read the global `coluna_grupo_sum` instead.
    selecionado = df.loc[:, coluna_grupo]
    # numeric_only=True drops non-numeric columns (e.g. text-labelled
    # subtypes); with the pandas >= 2.0 default they are no longer dropped
    # and would end up as non-numeric rows in the table.
    somas = selecionado.groupby([coluna_filter_1]).sum(numeric_only=True)
    df_retorno = somas.T.copy()

    # Group names are captured before the p-value columns are appended.
    grupos = list(df_retorno.columns)
    for a, b in ((0, 1), (1, 2), (0, 2)):
        rotulo = grupos[a][0] + "/" + grupos[b][0]
        df_retorno[rotulo] = teste_estatistico(df_retorno, [a, b])

    return df_retorno

### Functions for the mean-based and frequency-based calculations by MOLECULAR SUBTYPE

In [10]:
def tabela_freq_med_sub(df: pd.DataFrame, coluna_grupo_med: list) -> pd.DataFrame:
    """Compare molecular-subtype groups on parameters summarised by their mean.

    Parameters
    ----------
    df : pd.DataFrame
        Population to analyse.
    coluna_grupo_med : list
        Column names to select from ``df``; must include the grouping column
        named by the module-level ``coluna_filter_2``.

    Returns
    -------
    pd.DataFrame
        One row per numeric parameter and one column per group holding the
        group mean rounded to 2 decimals, followed by six columns of
        formatted p-values for the group pairs (0,1), (1,2), (2,3), (3,0),
        (1,3) and (2,0). Each p-value column is labelled with the first
        character of the two group names joined by "/".

    Raises
    ------
    IndexError
        If the grouping yields fewer than four groups.

    Notes
    -----
    NOTE(review): the p-values come from ``teste_estatistico``, which applies
    a chi-square test to these rounded means rather than to counts - confirm
    this is the intended analysis.
    """
    selecionado = df.loc[:, coluna_grupo_med]
    # numeric_only=True drops non-numeric columns (e.g. a text-labelled
    # risk column); with the pandas >= 2.0 default they make mean() raise.
    medias = selecionado.groupby([coluna_filter_2]).mean(numeric_only=True)
    df_retorno = medias.T.copy().round(2)

    # Group names are captured before the p-value columns are appended.
    grupos = list(df_retorno.columns)
    for a, b in ((0, 1), (1, 2), (2, 3), (3, 0), (1, 3), (2, 0)):
        rotulo = grupos[a][0] + "/" + grupos[b][0]
        df_retorno[rotulo] = teste_estatistico(df_retorno, [a, b])

    return df_retorno

In [11]:
def tabela_freq_soma_sub(df: pd.DataFrame, coluna_grupo_sum: list) -> pd.DataFrame:
    """Compare molecular-subtype groups on parameters summarised by their sum.

    Parameters
    ----------
    df : pd.DataFrame
        Population to analyse.
    coluna_grupo_sum : list
        Column names to select from ``df``; must include the grouping column
        named by the module-level ``coluna_filter_2``.

    Returns
    -------
    pd.DataFrame
        One row per numeric parameter and one column per group holding the
        group sum (for 0/1-coded columns, the number of ones), followed by
        six columns of formatted p-values for the group pairs (0,1), (1,2),
        (2,3), (3,0), (1,3) and (2,0). Each p-value column is labelled with
        the first character of the two group names joined by "/".

    Raises
    ------
    IndexError
        If the grouping yields fewer than four groups.
    """
    selecionado = df.loc[:, coluna_grupo_sum]
    # numeric_only=True drops non-numeric columns (e.g. a text-labelled
    # risk column); with the pandas >= 2.0 default they are no longer
    # dropped and would end up as non-numeric rows in the table.
    somas = selecionado.groupby([coluna_filter_2]).sum(numeric_only=True)
    df_retorno = somas.T.copy()

    # Group names are captured before the p-value columns are appended.
    grupos = list(df_retorno.columns)
    for a, b in ((0, 1), (1, 2), (2, 3), (3, 0), (1, 3), (2, 0)):
        rotulo = grupos[a][0] + "/" + grupos[b][0]
        df_retorno[rotulo] = teste_estatistico(df_retorno, [a, b])

    return df_retorno

### Separating the filters used in the function

In [12]:
# Grouping columns read by the table functions
coluna_filter_1 = "Risk stratification"
coluna_filter_2 = "Molecular subtype"

# Columns picked by position for the MEAN-based and SUM-based tables.
# Position 7 is "Molecular subtype"; position 13 is presumably
# "Risk stratification" - TODO confirm against the spreadsheet layout.
coluna_grupo_med = df.columns[[8, 14, 16, 17, 18, 7, 13]].tolist()
coluna_grupo_sum = df.columns[[3, 4, 5, 6, 10, 11, 12, 15, 19, 20, 21, 22, 7, 13]].tolist()

## Calculating the Chi-square test

The chi-square test (`scipy.stats.chisquare`) was applied, for exploratory purposes, to the group means of the quantitative parameters and to the group frequencies of the binary parameters. Only results with clinical significance were reported in the article.

### Calculating for the population exposed to pesticides

#### Are patients exposed to pesticides different from each other according to RISK STRATIFICATION?

In [13]:
# Sum-based table by risk stratification, exposed population
comp_exposto_soma = tabela_freq_soma(df=exposto, coluna_grupo=coluna_grupo_sum)
comp_exposto_soma

  terms = (f_obs_float - f_exp)**2 / f_exp


Risk stratification,High,Low,Medium,H/L,L/M,H/M
Estrogen receptor expression,25.0,15.0,83.0,0.114,<0.01,<0.01
Progesterone receptor expression,16.0,10.0,54.0,0.239,<0.01,<0.01
HER2-amplified,23.0,0.0,0.0,<0.01,,<0.01
KI67,46.0,2.0,62.0,<0.01,<0.01,0.124
Angiolymphatic emboli,24.0,1.0,24.0,<0.01,<0.01,1.0
Lymph node,33.0,0.0,31.0,<0.01,<0.01,0.803
Distant metastasis,37.0,0.0,35.0,<0.01,<0.01,0.814
Menopausal status,41.0,11.0,66.0,<0.01,<0.01,0.016
Exposure to pesticides,72.0,15.0,95.0,<0.01,<0.01,0.075
Chemoresistance,21.0,1.0,18.0,<0.01,<0.01,0.631


In [14]:
# Mean-based table by risk stratification, exposed population
comp_exposto_med = tabela_freq_med(df=exposto, coluna_grupo=coluna_grupo_med)
comp_exposto_med

Risk stratification,High,Low,Medium,H/L,L/M,H/M
Tumor size,38.44,12.07,27.35,<0.01,0.015,0.172
Age at diagnosis,54.1,58.8,56.74,0.658,0.848,0.802
Weight,72.88,66.55,72.13,0.592,0.636,0.95
Height,1.6,1.61,1.59,0.996,0.991,0.996
BMI,28.0,26.21,28.11,0.808,0.797,0.988


#### Do patients exposed to pesticides differ from each other according to MOLECULAR SUBTYPE?

In [15]:
# Sum-based table by molecular subtype, exposed population
comp_exposto_soma_sm = tabela_freq_soma_sub(df=exposto, coluna_grupo_sum=coluna_grupo_sum)
comp_exposto_soma_sm

  terms = (f_obs_float - f_exp)**2 / f_exp


Molecular subtype,A Luminal,B Luminal,HER2-amplified,Triple-negative,A/B,B/H,H/T,T/A,B/T,H/A
Estrogen receptor expression,58.0,64.0,10.0,1.0,0.587,<0.01,<0.01,<0.01,<0.01,<0.01
Progesterone receptor expression,43.0,38.0,8.0,0.0,0.579,<0.01,<0.01,<0.01,<0.01,<0.01
HER2-amplified,0.0,0.0,24.0,0.0,,<0.01,<0.01,,,<0.01
KI67,8.0,65.0,27.0,20.0,<0.01,<0.01,0.307,0.023,<0.01,<0.01
Angiolymphatic emboli,15.0,18.0,6.0,13.0,0.602,0.014,0.108,0.705,0.369,0.05
Lymph node,19.0,30.0,9.0,10.0,0.116,<0.01,0.819,0.095,<0.01,0.059
Distant metastasis,22.0,30.0,13.0,11.0,0.267,<0.01,0.683,0.056,<0.01,0.128
Menopausal status,42.0,46.0,14.0,25.0,0.67,<0.01,0.078,0.038,0.013,<0.01
Exposure to pesticides,61.0,65.0,33.0,39.0,0.722,<0.01,0.48,0.028,0.011,<0.01
Chemoresistance,8.0,15.0,10.0,11.0,0.144,0.317,0.827,0.491,0.433,0.637


In [16]:
# Saving the tables obtained for the population exposed to pesticides
#tabela_exp = comp_exposto_soma_sm.to_excel("tabela_qquadrado_expost.xlsx")

In [17]:
# Mean-based table by molecular subtype, exposed population
comp_exposto_med_sm = tabela_freq_med_sub(df=exposto, coluna_grupo_med=coluna_grupo_med)
comp_exposto_med_sm

Molecular subtype,A Luminal,B Luminal,HER2-amplified,Triple-negative,A/B,B/H,H/T,T/A,B/T,H/A
Tumor size,23.85,33.05,34.24,33.26,0.223,0.885,0.905,0.213,0.979,0.173
Age at diagnosis,56.92,57.87,51.72,54.92,0.929,0.557,0.757,0.85,0.781,0.618
Weight,68.38,71.39,75.98,72.9,0.799,0.705,0.801,0.704,0.9,0.527
Height,1.59,1.6,1.62,1.6,0.996,0.991,0.991,0.996,1.0,0.987
BMI,27.27,27.25,28.81,28.15,0.998,0.835,0.93,0.906,0.904,0.837


### Calculating for the population not exposed to pesticides

#### Are patients not exposed to pesticides different from each other according to RISK STRATIFICATION?

In [18]:
# Sum-based table by risk stratification, non-exposed population
comp_n_exposto_soma = tabela_freq_soma(df=n_exposto, coluna_grupo=coluna_grupo_sum)
comp_n_exposto_soma

  terms = (f_obs_float - f_exp)**2 / f_exp


Risk stratification,High,Low,Medium,H/L,L/M,H/M
Estrogen receptor expression,15.0,10.0,70.0,0.317,<0.01,<0.01
Progesterone receptor expression,10.0,9.0,51.0,0.819,<0.01,<0.01
HER2-amplified,14.0,0.0,2.0,<0.01,0.157,<0.01
KI67,31.0,1.0,42.0,<0.01,<0.01,0.198
Angiolymphatic emboli,14.0,1.0,15.0,<0.01,<0.01,0.853
Lymph node,16.0,0.0,17.0,<0.01,<0.01,0.862
Distant metastasis,17.0,0.0,19.0,<0.01,<0.01,0.739
Menopausal status,24.0,7.0,48.0,<0.01,<0.01,<0.01
Exposure to pesticides,0.0,0.0,0.0,,,
Chemoresistance,9.0,0.0,14.0,<0.01,<0.01,0.297


In [19]:
# Mean-based table by risk stratification, non-exposed population
comp_n_exposto_med = tabela_freq_med(df=n_exposto, coluna_grupo=coluna_grupo_med)
comp_n_exposto_med

Risk stratification,High,Low,Medium,H/L,L/M,H/M
Tumor size,40.34,18.2,24.48,<0.01,0.336,0.049
Age at diagnosis,53.97,60.89,56.73,0.518,0.701,0.793
Weight,70.24,74.25,73.31,0.739,0.938,0.798
Height,1.61,1.63,1.6,0.991,0.987,0.996
BMI,27.5,26.86,28.3,0.931,0.846,0.915


#### Do patients not exposed to pesticides differ from each other according to MOLECULAR SUBTYPE?

In [20]:
# Sum-based table by molecular subtype, non-exposed population
comp_n_exposto_soma_sm = tabela_freq_soma_sub(df=n_exposto, coluna_grupo_sum=coluna_grupo_sum)
comp_n_exposto_soma_sm

  terms = (f_obs_float - f_exp)**2 / f_exp


Molecular subtype,A Luminal,B Luminal,HER2-amplified,Triple-negative,A/B,B/H,H/T,T/A,B/T,H/A
Estrogen receptor expression,51.0,46.0,8.0,1.0,0.612,<0.01,0.02,<0.01,<0.01,<0.01
Progesterone receptor expression,35.0,35.0,7.0,1.0,1.0,<0.01,0.034,<0.01,<0.01,<0.01
HER2-amplified,0.0,0.0,18.0,0.0,,<0.01,<0.01,,,<0.01
KI67,6.0,45.0,18.0,15.0,<0.01,<0.01,0.602,0.05,<0.01,0.014
Angiolymphatic emboli,7.0,16.0,4.0,4.0,0.061,<0.01,1.0,0.366,<0.01,0.366
Lymph node,9.0,15.0,3.0,5.0,0.221,<0.01,0.48,0.285,0.025,0.083
Distant metastasis,10.0,16.0,4.0,5.0,0.239,<0.01,0.739,0.197,0.016,0.109
Menopausal status,39.0,25.0,9.0,12.0,0.08,<0.01,0.513,<0.01,0.033,<0.01
Exposure to pesticides,0.0,0.0,0.0,0.0,,,,,,
Chemoresistance,6.0,11.0,3.0,3.0,0.225,0.033,1.0,0.317,0.033,0.317


In [21]:
# Saving the tables obtained for the population not exposed to pesticides
#tabela_n_exp = comp_n_exposto_soma_sm.to_excel("tabela_qquadrado_naoexpost.xlsx")

In [22]:
# Mean-based table by molecular subtype, non-exposed population
comp_n_exposto_med_sm = tabela_freq_med_sub(df=n_exposto, coluna_grupo_med=coluna_grupo_med)
comp_n_exposto_med_sm

Molecular subtype,A Luminal,B Luminal,HER2-amplified,Triple-negative,A/B,B/H,H/T,T/A,B/T,H/A
Tumor size,21.96,27.27,28.95,43.82,0.449,0.823,0.081,<0.01,0.05,0.327
Age at diagnosis,58.46,54.8,50.89,55.35,0.731,0.704,0.665,0.771,0.958,0.469
Weight,74.87,73.15,63.94,74.14,0.888,0.432,0.385,0.952,0.935,0.354
Height,1.62,1.59,1.58,1.64,0.987,0.996,0.973,0.991,0.978,0.982
BMI,28.43,28.07,25.58,28.15,0.962,0.734,0.726,0.97,0.991,0.698
