# Descriptive statistics

## Importing useful libraries

In [1]:
# Importing the pandas library for data manipulation
import pandas as pd

# Import numpy library for efficient numeric operations
import numpy as np

# Import statistics library for basic statistical functions
import statistics as st

## Importing the data used

In [2]:
# Load the patient spreadsheet, using its 'Número' column as the row index
df = pd.read_excel(
    'dados_cancerpositivo.xlsx',
    index_col='Número',
)
df

Unnamed: 0_level_0,Medical record,Date of collection,Histopathological,Estrogen receptor expression,Progesterone receptor expression,HER2-amplified,KI67,Molecular subtype,Tumor size,Grade,...,Menopausal status,Weight,Height,BMI,Exposure to pesticides,Chemoresistance,Recurrence,Death,Municipality,HER
Número,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,26214,2015-05-27,1.0,1.0,1.0,0.0,1.0,2.0,20.0,1.0,...,1.0,57.0,1.60,22.265625,1.0,0.0,0.0,0.0,Dois Vizinhos,
3,24773,2015-04-06,1.0,0.0,0.0,0.0,1.0,5.0,14.0,2.0,...,1.0,84.0,1.62,32.007316,1.0,0.0,0.0,0.0,Capanema,
5,26248,2015-06-08,1.0,0.0,0.0,0.0,1.0,5.0,30.0,2.0,...,1.0,64.0,1.52,27.700831,1.0,1.0,1.0,0.0,Planalto,
7,25778,2015-06-10,1.0,1.0,1.0,1.0,1.0,4.0,25.0,1.0,...,0.0,52.0,1.55,21.644121,1.0,1.0,0.0,0.0,Dois Vizinhos,
8,15847,2015-06-10,1.0,1.0,1.0,0.0,1.0,2.0,16.0,2.0,...,1.0,52.0,1.55,21.644121,1.0,1.0,1.0,1.0,Dois Vizinhos,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
933,44730,2022-12-06,1.0,1.0,1.0,1.0,1.0,4.0,15.0,2.0,...,,,,,,0.0,0.0,0.0,,
942,46377,2022-12-23,1.0,1.0,1.0,0.0,0.0,1.0,17.0,1.0,...,,,,,,0.0,0.0,0.0,,
946,44770,2023-02-08,1.0,1.0,0.0,0.0,1.0,2.0,8.0,3.0,...,,,,,,0.0,0.0,0.0,,
948,45672,2023-02-09,1.0,1.0,1.0,0.0,1.0,1.0,15.0,2.0,...,,,,,,0.0,0.0,0.0,,


## Defining functions for the population description

In [3]:
def media_idade(dados):
    ''' Function for calculating the average age at diagnosis of the population

    Missing values in 'Age at diagnosis' are dropped before the calculation.
    The age range (min/max) is not part of the result.

    Parameters:
        dados: table (e.g. pandas DataFrame) with an 'Age at diagnosis' column.

    Returns:
        Tuple (mean age rounded to a whole number with round(), number of
        non-missing ages used in the mean).

    Raises:
        statistics.StatisticsError: if there is no non-missing age.
    '''
    idade = dados['Age at diagnosis'].dropna()
    contag_idade = len(idade)
    idade_med = round(st.mean(idade))
    return idade_med, contag_idade

In [4]:
def imc_media(dados):
    ''' Function for calculating the average BMI of the population

    Missing values in 'BMI' are dropped before the calculation.
    The BMI range (min/max) is not part of the result.

    Parameters:
        dados: table (e.g. pandas DataFrame) with a 'BMI' column.

    Returns:
        Tuple (mean BMI rounded to 2 decimals, number of non-missing BMI
        values used in the mean).

    Raises:
        statistics.StatisticsError: if there is no non-missing BMI value.
    '''
    imc = dados['BMI'].dropna()
    cont_imc = len(imc)
    imc_medio = round(st.mean(imc), 2)
    return imc_medio, cont_imc

In [5]:
def tamanho_tumor(dados):
    ''' Function for calculating the average tumor size of the population

    Missing values in 'Tumor size' are dropped before the calculation.
    The size range (min/max) is not part of the result.

    Parameters:
        dados: table (e.g. pandas DataFrame) with a 'Tumor size' column.

    Returns:
        Tuple (mean tumor size rounded to 2 decimals, number of non-missing
        sizes used in the mean).

    Raises:
        statistics.StatisticsError: if there is no non-missing tumor size.
    '''
    tam_tumor = dados['Tumor size'].dropna()
    cont_tamanho = len(tam_tumor)
    tam_medio = round(st.mean(tam_tumor), 2)
    return tam_medio, cont_tamanho

In [6]:
def exposicao_agro(dados):
    ''' Function to calculate the percentage of patients exposed to pesticides in the population

    Rows with a missing 'Exposure to pesticides' value are left out of both
    the count and the denominator.

    Parameters:
        dados: table (e.g. pandas DataFrame) with an 'Exposure to pesticides'
            column in which exposed patients are coded as 1.0.

    Returns:
        Tuple (percentage rounded to 2 decimals as a string ending in '%',
        number of exposed patients).
    '''
    exp = dados['Exposure to pesticides'].dropna()
    cont_exp = len(exp)
    pacientes_exp = (exp == 1.0).sum()
    porc_exp = round(pacientes_exp/cont_exp * 100, 2)
    return "{}%".format(porc_exp), pacientes_exp

In [7]:
def estratificacao_baixo(dados):
    ''' Function to calculate the percentage of patients classified as Low risk in the population

    Rows with a missing 'Risk stratification' value are left out of both the
    count and the denominator.

    Parameters:
        dados: table (e.g. pandas DataFrame) with a 'Risk stratification'
            column in which low-risk patients are labelled 'Low'.

    Returns:
        Tuple (percentage rounded to 2 decimals as a string ending in '%',
        number of low-risk patients).
    '''
    estratifica = dados['Risk stratification'].dropna()
    cont_estrat = len(estratifica)
    pacientes_baixo = (estratifica == 'Low').sum()
    porc_baixo = round(pacientes_baixo/cont_estrat * 100, 2)
    return "{}%".format(porc_baixo), pacientes_baixo

In [8]:
def estratificacao_interm(dados):
    ''' Function to calculate the percentage of patients classified as Medium risk in the population

    Rows with a missing 'Risk stratification' value are left out of both the
    count and the denominator.

    Parameters:
        dados: table (e.g. pandas DataFrame) with a 'Risk stratification'
            column in which medium-risk patients are labelled 'Medium'.

    Returns:
        Tuple (percentage rounded to 2 decimals as a string ending in '%',
        number of medium-risk patients).
    '''
    estratifica = dados['Risk stratification'].dropna()
    cont_estrat = len(estratifica)
    pacientes_inter = (estratifica == 'Medium').sum()
    porc_inter = round(pacientes_inter/cont_estrat * 100, 2)
    return "{}%".format(porc_inter), pacientes_inter

In [9]:
def estratificacao_alto(dados):
    ''' Function to calculate the percentage of patients classified as High risk in the population

    Rows with a missing 'Risk stratification' value are left out of both the
    count and the denominator.

    Parameters:
        dados: table (e.g. pandas DataFrame) with a 'Risk stratification'
            column in which high-risk patients are labelled 'High'.

    Returns:
        Tuple (percentage rounded to 2 decimals as a string ending in '%',
        number of high-risk patients).
    '''
    estratifica = dados['Risk stratification'].dropna()
    cont_estrat = len(estratifica)
    pacientes_alto = (estratifica == 'High').sum()
    porc_alto = round(pacientes_alto/cont_estrat * 100, 2)
    return "{}%".format(porc_alto), pacientes_alto

In [10]:
def subtipo_luminala(dados):
    ''' Function to calculate the percentage of patients classified as Luminal A in the population

    Rows with a missing 'Molecular subtype' value are left out of both the
    count and the denominator.

    Parameters:
        dados: table (e.g. pandas DataFrame) with a 'Molecular subtype'
            column in which Luminal A is coded as 1.

    Returns:
        Tuple (percentage rounded to 2 decimals as a string ending in '%',
        number of Luminal A patients).
    '''
    subtipo = dados['Molecular subtype'].dropna()
    cont_subtipo = len(subtipo)
    pacientes_luminala = (subtipo == 1).sum()
    porc_luminala = round(pacientes_luminala/cont_subtipo * 100, 2)
    return "{}%".format(porc_luminala), pacientes_luminala

In [11]:
def subtipo_luminalb(dados):
    ''' Function to calculate the percentage of patients classified as Luminal B in the population

    Rows with a missing 'Molecular subtype' value are left out of both the
    count and the denominator.

    Parameters:
        dados: table (e.g. pandas DataFrame) with a 'Molecular subtype'
            column in which Luminal B is coded as 2.

    Returns:
        Tuple (percentage rounded to 2 decimals as a string ending in '%',
        number of Luminal B patients).
    '''
    subtipo = dados['Molecular subtype'].dropna()
    cont_subtipo = len(subtipo)
    pacientes_luminalb = (subtipo == 2).sum()
    porc_luminalb = round(pacientes_luminalb/cont_subtipo * 100, 2)
    return "{}%".format(porc_luminalb), pacientes_luminalb

In [12]:
def subtipo_her(dados):
    ''' Function to calculate the percentage of patients classified as HER2 in the population

    Rows with a missing 'Molecular subtype' value are left out of both the
    count and the denominator.

    Parameters:
        dados: table (e.g. pandas DataFrame) with a 'Molecular subtype'
            column in which HER2 is coded as 4.

    Returns:
        Tuple (percentage rounded to 2 decimals as a string ending in '%',
        number of HER2 patients).
    '''
    subtipo = dados['Molecular subtype'].dropna()
    cont_subtipo = len(subtipo)
    pacientes_her = (subtipo == 4).sum()
    porc_her = round(pacientes_her/cont_subtipo * 100, 2)
    return "{}%".format(porc_her), pacientes_her

In [13]:
def subtipo_triplo(dados):
    ''' Function to calculate the percentage of patients classified as triple-negative in the population

    Rows with a missing 'Molecular subtype' value are left out of both the
    count and the denominator.

    Parameters:
        dados: table (e.g. pandas DataFrame) with a 'Molecular subtype'
            column in which triple-negative is coded as 5.

    Returns:
        Tuple (percentage rounded to 2 decimals as a string ending in '%',
        number of triple-negative patients).
    '''
    subtipo = dados['Molecular subtype'].dropna()
    cont_subtipo = len(subtipo)
    pacientes_triplo = (subtipo == 5).sum()
    porc_triplo = round(pacientes_triplo/cont_subtipo * 100, 2)
    return "{}%".format(porc_triplo), pacientes_triplo

In [14]:
def grau_1(dados):
    ''' Function to calculate the percentage of patients classified as grade 1 in the population

    Rows with a missing 'Grade' value are left out of both the count and the
    denominator.

    Parameters:
        dados: table (e.g. pandas DataFrame) with a 'Grade' column.

    Returns:
        Tuple (percentage rounded to 2 decimals as a string ending in '%',
        number of patients whose grade equals 1).
    '''
    grau = dados['Grade'].dropna()
    cont_grau = len(grau)
    pacientes_1 = (grau == 1).sum()
    porc_1 = round(pacientes_1/cont_grau * 100, 2)
    return "{}%".format(porc_1), pacientes_1

In [15]:
def grau_2(dados):
    ''' Function to calculate the percentage of patients classified as grade 2 in the population

    Rows with a missing 'Grade' value are left out of both the count and the
    denominator.

    Parameters:
        dados: table (e.g. pandas DataFrame) with a 'Grade' column.

    Returns:
        Tuple (percentage rounded to 2 decimals as a string ending in '%',
        number of patients whose grade equals 2).
    '''
    grau = dados['Grade'].dropna()
    cont_grau = len(grau)
    pacientes_2 = (grau == 2).sum()
    porc_2 = round(pacientes_2/cont_grau * 100, 2)
    return "{}%".format(porc_2), pacientes_2

In [16]:
def grau_3(dados):
    ''' Function to calculate the percentage of patients classified as grade 3 in the population

    Rows with a missing 'Grade' value are left out of both the count and the
    denominator.

    Parameters:
        dados: table (e.g. pandas DataFrame) with a 'Grade' column.

    Returns:
        Tuple (percentage rounded to 2 decimals as a string ending in '%',
        number of patients whose grade equals 3).
    '''
    grau = dados['Grade'].dropna()
    cont_grau = len(grau)
    pacientes_3 = (grau == 3).sum()
    porc_3 = round(pacientes_3/cont_grau * 100, 2)
    return "{}%".format(porc_3), pacientes_3

In [17]:
def incidencia_recorr(dados):
    ''' Function to calculate the percentage of patients who had breast cancer recurrence in the population

    Rows with a missing 'Recurrence' value are left out of both the count and
    the denominator.

    Parameters:
        dados: table (e.g. pandas DataFrame) with a 'Recurrence' column in
            which patients with recurrence are coded as 1.0.

    Returns:
        Tuple (percentage rounded to 2 decimals as a string ending in '%',
        number of patients with recurrence).
    '''
    recor = dados['Recurrence'].dropna()
    cont_recor = len(recor)
    pacientes_recor = (recor == 1.0).sum()
    porc_recor = round(pacientes_recor/cont_recor * 100, 2)
    return "{}%".format(porc_recor), pacientes_recor

In [18]:
def incidencia_quimio(dados):
    ''' Function to calculate the percentage of patients who had chemoresistance to treatment in the population

    Rows with a missing 'Chemoresistance' value are left out of both the count
    and the denominator.

    Parameters:
        dados: table (e.g. pandas DataFrame) with a 'Chemoresistance' column
            in which chemoresistant patients are coded as 1.0.

    Returns:
        Tuple (percentage rounded to 2 decimals as a string ending in '%',
        number of chemoresistant patients).
    '''
    quimio = dados['Chemoresistance'].dropna()
    cont_quimio = len(quimio)
    pacientes_quimio = (quimio == 1.0).sum()
    porc_quimio = round(pacientes_quimio/cont_quimio * 100, 2)
    return "{}%".format(porc_quimio), pacientes_quimio

In [19]:
def incidencia_obito(dados):
    ''' Function to calculate the percentage of patients who died in the population

    Rows with a missing 'Death' value are left out of both the count and the
    denominator.

    Parameters:
        dados: table (e.g. pandas DataFrame) with a 'Death' column in which
            deceased patients are coded as 1.0.

    Returns:
        Tuple (percentage rounded to 2 decimals as a string ending in '%',
        number of deceased patients).
    '''
    obito = dados['Death'].dropna()
    cont_obito = len(obito)
    pac_obito = (obito == 1.0).sum()
    porc_obito = round(pac_obito/cont_obito * 100, 2)
    return "{}%".format(porc_obito), pac_obito

In [20]:
def estruturacao(dados) -> pd.DataFrame:
    ''' Builds the summary table by applying every descriptive function to the data

    Each row of the returned table is one indicator. The first column holds the
    value returned by the indicator's function (a mean or a percentage string)
    and the second column holds the count returned alongside it.
    '''
    # (row label, function) pairs, in the order the rows appear in the table
    indicadores = (
        ("Mean age at diagnosis (years)", media_idade),
        ("BMI (kg/m²)", imc_media),
        ("Occupational pesticide exposure", exposicao_agro),
        ("Average tumor size (mm)", tamanho_tumor),
        ("Luminal A", subtipo_luminala),
        ("Luminal B", subtipo_luminalb),
        ("HER2-amplified", subtipo_her),
        ("Triple-negative", subtipo_triplo),
        ("Grade 1", grau_1),
        ("Grade 2", grau_2),
        ("Grade 3", grau_3),
        ("Low risk", estratificacao_baixo),
        ("Medium risk", estratificacao_interm),
        ("High risk", estratificacao_alto),
        ("Recurrence", incidencia_recorr),
        ("Chemoresistance", incidencia_quimio),
        ("Death", incidencia_obito),
    )
    resultados = {rotulo: funcao(dados) for rotulo, funcao in indicadores}

    # Each dict entry becomes a column; transposing turns indicators into rows
    resumo = pd.DataFrame(data = resultados).T
    resumo.columns = ['Total population', 'Number of population']

    return resumo

## Applying the functions

### Total population (n=386)

In [21]:
# Number of rows in the full table (the n reported in the section title)
len(df)

386

In [22]:
# Build the descriptive-statistics summary table for the whole table and display it
pop_geral = estruturacao(df)
pop_geral

Unnamed: 0,Total population,Number of population
Mean age at diagnosis (years),56,353.0
BMI (kg/m²),27.95,259.0
Occupational pesticide exposure,59.6%,208.0
Average tumor size (mm),29.21,357.0
Luminal A,33.24%,118.0
Luminal B,34.37%,122.0
HER2-amplified,16.62%,59.0
Triple-negative,15.77%,56.0
Grade 1,28.21%,101.0
Grade 2,51.68%,185.0


In [23]:
# Saving the table obtained for the general population (uncomment to export;
# to_excel writes the file and returns None, so there is nothing to assign)
#pop_geral.to_excel("tabela_análise_exploratória.xlsx")

### Patients exposed to pesticides (n=208)

In [24]:
# Keep only the rows flagged as exposed to pesticides (value 1.0);
# rows with a missing exposure value do not match and are left out
pac_agrot = df.loc[df['Exposure to pesticides'].eq(1.0)]

In [25]:
# Number of rows in the exposed subset (the n reported in the section title)
len(pac_agrot)

208

In [26]:
# Build the descriptive-statistics summary table for the exposed subset and display it
pop_exp = estruturacao(pac_agrot)
pop_exp

Unnamed: 0,Total population,Number of population
Mean age at diagnosis (years),56,197.0
BMI (kg/m²),27.82,150.0
Occupational pesticide exposure,100.0%,208.0
Average tumor size (mm),30.82,201.0
Luminal A,30.81%,61.0
Luminal B,33.33%,66.0
HER2-amplified,16.67%,33.0
Triple-negative,19.19%,38.0
Grade 1,29.15%,58.0
Grade 2,52.26%,104.0


In [27]:
# Saving the table obtained for the population exposed to pesticides (uncomment to export;
# to_excel writes the file and returns None, so there is nothing to assign)
#pop_exp.to_excel("tabela_análise_exploratória_exposto.xlsx")

### Patients without exposure to pesticides (n=141)

In [28]:
# Keep only the rows flagged as not exposed to pesticides (value 0.0);
# rows with a missing exposure value do not match and are left out
pac_nao_agrot = df.loc[df['Exposure to pesticides'].eq(0.0)]
len(pac_nao_agrot)

141

In [29]:
# Build the descriptive-statistics summary table for the unexposed subset and display it
pop_nao_exp = estruturacao(pac_nao_agrot)
pop_nao_exp

Unnamed: 0,Total population,Number of population
Mean age at diagnosis (years),56,134.0
BMI (kg/m²),27.92,97.0
Occupational pesticide exposure,0.0%,0.0
Average tumor size (mm),28.11,134.0
Luminal A,37.78%,51.0
Luminal B,34.81%,47.0
HER2-amplified,15.56%,21.0
Triple-negative,11.85%,16.0
Grade 1,29.2%,40.0
Grade 2,48.91%,67.0


In [30]:
# Saving the table obtained for the population not exposed to pesticides (uncomment to export;
# to_excel writes the file and returns None, so there is nothing to assign)
#pop_nao_exp.to_excel("tabela_análise_exploratória_nao_exposto.xlsx")

## Combining the tables of total population, population exposed to pesticides and population unexposed to pesticides

In [31]:
# Align the two summaries on their shared row labels. Passing the left index
# as a key array (on = pop_geral.index) pairs rows by position, which silently
# depends on both tables having the same row order and length.
# Overlapping column names receive the default suffixes: '_x' for the total
# population and '_y' for the exposed population.
tabela_geral = pd.merge(pop_geral, pop_exp, left_index = True, right_index = True)
# Keep the 'key_0' index name that the key-array merge used to produce
tabela_geral = tabela_geral.rename_axis('key_0')
tabela_geral

Unnamed: 0_level_0,Total population_x,Number of population_x,Total population_y,Number of population_y
key_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mean age at diagnosis (years),56,353.0,56,197.0
BMI (kg/m²),27.95,259.0,27.82,150.0
Occupational pesticide exposure,59.6%,208.0,100.0%,208.0
Average tumor size (mm),29.21,357.0,30.82,201.0
Luminal A,33.24%,118.0,30.81%,61.0
Luminal B,34.37%,122.0,33.33%,66.0
HER2-amplified,16.62%,59.0,16.67%,33.0
Triple-negative,15.77%,56.0,19.19%,38.0
Grade 1,28.21%,101.0,29.15%,58.0
Grade 2,51.68%,185.0,52.26%,104.0


In [32]:
# Append the unexposed-population columns. They keep their original names because
# they no longer clash with the suffixed columns already in tabela_geral; the row
# labels come back as a 'key_0' column.
tabela_geral_completa = tabela_geral.merge(pop_nao_exp, on = tabela_geral.index)
tabela_geral_completa

Unnamed: 0,key_0,Total population_x,Number of population_x,Total population_y,Number of population_y,Total population,Number of population
0,Mean age at diagnosis (years),56,353.0,56,197.0,56,134.0
1,BMI (kg/m²),27.95,259.0,27.82,150.0,27.92,97.0
2,Occupational pesticide exposure,59.6%,208.0,100.0%,208.0,0.0%,0.0
3,Average tumor size (mm),29.21,357.0,30.82,201.0,28.11,134.0
4,Luminal A,33.24%,118.0,30.81%,61.0,37.78%,51.0
5,Luminal B,34.37%,122.0,33.33%,66.0,34.81%,47.0
6,HER2-amplified,16.62%,59.0,16.67%,33.0,15.56%,21.0
7,Triple-negative,15.77%,56.0,19.19%,38.0,11.85%,16.0
8,Grade 1,28.21%,101.0,29.15%,58.0,29.2%,40.0
9,Grade 2,51.68%,185.0,52.26%,104.0,48.91%,67.0


In [33]:
# Restore the indicator labels as the index and give every column a descriptive name
tabela_geral_completa = (
    tabela_geral_completa
    .set_index('key_0')
    .rename(columns={
        'Total population_x': 'Total population',
        'Number of population_x': 'Number of patients',
        'Total population_y': 'Population exposed to pesticides',
        'Number of population_y': 'Number of patients exposed to pesticides',
        'Total population': 'Population unexposed to pesticides',
        'Number of population': 'Number of patients unexposed to pesticides',
    })
)
tabela_geral_completa

Unnamed: 0_level_0,Total population,Number of patients,Population exposed to pesticides,Number of patients exposed to pesticides,Population unexposed to pesticides,Number of patients unexposed to pesticides
key_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Mean age at diagnosis (years),56,353.0,56,197.0,56,134.0
BMI (kg/m²),27.95,259.0,27.82,150.0,27.92,97.0
Occupational pesticide exposure,59.6%,208.0,100.0%,208.0,0.0%,0.0
Average tumor size (mm),29.21,357.0,30.82,201.0,28.11,134.0
Luminal A,33.24%,118.0,30.81%,61.0,37.78%,51.0
Luminal B,34.37%,122.0,33.33%,66.0,34.81%,47.0
HER2-amplified,16.62%,59.0,16.67%,33.0,15.56%,21.0
Triple-negative,15.77%,56.0,19.19%,38.0,11.85%,16.0
Grade 1,28.21%,101.0,29.15%,58.0,29.2%,40.0
Grade 2,51.68%,185.0,52.26%,104.0,48.91%,67.0


In [34]:
# Saving the combined table (uncomment to export;
# to_excel writes the file and returns None, so there is nothing to assign)
#tabela_geral_completa.to_excel("tabela_análise_exploratória_completa.xlsx")