Usando a base de dados da [Pesquisa Nacional de Saúde do Escolar - PeNSE](https://www.ibge.gov.br/estatisticas/sociais/educacao/9134-pesquisa-nacional-de-saude-do-escolar.html) do IBGE.  
Base de dados: [PENSE_2015_AMOSTRA2](https://www.ibge.gov.br/estatisticas/downloads-estatisticas.html?caminho=pense/2015/microdados/).

In [1]:
# imports
import pandas as pd
from zipfile import ZipFile
import requests
import io

In [2]:
# Download the zipped dataset and unpack it into the relative "datasets" folder.
url = 'https://github.com/LucasGabrielB/Alura-Bootcamp-Data-Science-Aplicada/raw/main/Modulo-03/datasets/PeNSE_2015_AMOSTRA2.zip'

# Bound the wait, and fail loudly on an HTTP error instead of handing an
# error page to ZipFile (which would surface as a confusing BadZipFile).
req = requests.get(url, timeout=60)
req.raise_for_status()

# The context manager closes the in-memory archive once extraction is done.
with ZipFile(io.BytesIO(req.content)) as zip_file:
    zip_file.extractall("datasets")

In [3]:
# Load the PENSE_AMOSTRA2_ALUNO CSV; the file uses ';' as field separator and
# ',' as decimal mark.
# The archive is extracted into the relative folder "datasets", so read from
# that same relative location rather than from the Colab-only '/content' prefix.
df = pd.read_csv('datasets/arquivos csv/PENSE_AMOSTRA2_ALUNO.CSV', sep=';', decimal=',')

df.head()

Unnamed: 0,ANOPESQ,PAIS,REGEOGR,VB00004,VB01001,VB01002,VB01003,VB01004,VB01005,VB01006,VB01007,VB01008A,VB01010A,VB01011,VB01012,VB01013,VB01014,VB01015A,VB01016,VB01017,VB01018,VB01019,VB01020A,VB01021,VB01022,VB01023,VB01024,VB01025,VB01026,VB02001,VB02002,VB02004A,VB02010,VB02011,VB02013,VB02017A,VB02018A,VB02019A,VB02020A,VB02021,...,VB11005,VB11006,VB11007,VB12001,VB12002,VB12003,VB13001,VB13002A,VB13004A,VB13005,VB13006,VB13007,VB13008,VB13009,VB14001,VB14002,VB16001A01,VB16001A02,VB16001A03,VB16001A04,VB16001A05,VB16001A06,VB16001A07,VB16001A08,VB17001,VB17002,VB17003,VB17004,VB17005,VB17006,ESTRATO_EXP,ESTRATOGEOREG,PESO,V0006,V0007,V0008,V0041,aluno,escola,turma
0,2015,76,1,1,2,1,13,7,7,1,2,7,3,2,-1,1,1,1,1,1,2,4,2,4,1,2,2,5,1,8,3,4,3,6,6,1,6,1,-1,2,...,2,1,2,1,1,4,2,-1,3,1,1,-1,1,1,2,2,1,2,2,2,2,2,2,2,371.0,1570.0,37.1,157.0,2,1,1223,1,299.735235,1,2,4,1,1,1,296
1,2015,76,1,1,2,1,14,4,7,1,1,7,3,2,-1,1,1,1,1,1,2,2,2,4,1,2,2,5,3,8,2,1,8,8,6,1,1,5,-1,2,...,2,3,2,4,4,4,1,2,3,3,1,-1,1,1,2,2,2,2,2,1,2,2,2,2,502.0,1520.0,50.2,152.0,2,2,1223,1,355.170081,1,2,4,1,2,1,296
2,2015,76,1,1,1,4,13,6,7,1,1,5,5,2,-1,2,1,1,1,1,2,3,1,4,1,2,2,5,3,2,3,1,8,2,8,1,1,5,5,1,...,2,1,2,1,2,3,1,2,1,2,5,99,1,-1,2,2,1,1,1,1,1,1,1,1,482.0,1614.0,48.2,161.4,2,2,1223,1,299.735235,1,2,4,1,3,1,296
3,2015,76,1,1,1,1,14,6,7,1,1,7,4,2,-1,1,1,1,1,1,2,4,2,4,1,2,2,6,3,2,4,1,3,2,7,1,5,1,-1,2,...,2,1,2,2,2,4,1,10,3,1,2,-1,1,-1,2,2,1,2,2,2,1,2,2,2,694.0,1725.0,69.4,172.5,2,3,1223,1,355.170081,1,2,4,1,4,1,296
4,2015,76,1,1,1,1,13,9,7,1,2,5,4,2,-1,2,1,1,1,1,2,5,1,4,1,2,2,6,5,6,3,5,2,6,2,1,5,1,5,1,...,2,1,4,2,2,4,1,3,2,2,2,13,1,-1,1,2,1,1,1,1,1,1,1,1,789.0,1675.0,78.9,167.5,2,4,1223,1,299.735235,1,2,4,1,5,1,296


In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(16556, 181)

In [5]:
# Summary of index range, dtype counts and memory footprint; 'deep' asks pandas
# for a real memory measurement instead of a quick estimate.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16556 entries, 0 to 16555
Columns: 181 entries, ANOPESQ to turma
dtypes: float64(10), int64(171)
memory usage: 22.9 MB


# Analisando os dados

In [13]:
def generate_frequency_table(dataframe: pd.DataFrame, colum_name,
                             convert_index_names: dict = None, index_name=None) -> pd.DataFrame:
    """Build a frequency table for one column of a dataframe.

    Parameters
    ----------
    dataframe : pandas DataFrame holding the data.
    colum_name : label of the column whose values are counted.
    convert_index_names : optional mapping used to relabel the rows of the
        resulting table (passed to ``rename(index=...)``); skipped when falsy.
    index_name : optional label for the table; it is set on the columns axis
        (``rename_axis(..., axis=1)``); skipped when falsy.

    Returns
    -------
    A new DataFrame with one row per distinct value, in the order given by
    ``value_counts`` (most frequent first), and four columns: absolute
    frequency, relative frequency (percent), cumulative absolute frequency
    and cumulative relative frequency (percent).
    """
    values = dataframe[colum_name]

    # Count once and derive the cumulative columns from the same series.
    absolute = values.value_counts()
    relative = values.value_counts(normalize=True) * 100

    table = pd.DataFrame({
        'Frequencia absoluta': absolute,
        'Frequencia relativa': relative,
        'Frequencia acumulada': absolute.cumsum(),
        'Frequencia relativa acumulada': relative.cumsum(),
    })

    if convert_index_names:
        table = table.rename(index=convert_index_names)

    if index_name:
        # axis=1: the label is attached to the columns axis, not the row index.
        table = table.rename_axis(index_name, axis=1)

    return table

Qual seu sexo?

In [14]:
# Answer codes of column VB01001 ("Qual seu sexo?") mapped to display labels.
sex_responses_conversion = {1: 'Masculino', 2: 'Feminino'}

df_sex = generate_frequency_table(
    df,
    'VB01001',
    convert_index_names=sex_responses_conversion,
    index_name='Sexo',
)

display(df_sex)

Sexo,Frequencia absoluta,Frequencia relativa,Frequencia acumulada,Frequencia relativa acumulada
Masculino,8287,50.054361,8287,50.054361
Feminino,8269,49.945639,16556,100.0


Como você se sente em relação ao seu corpo?

In [15]:
# Answer codes of column VB11007 ("Como você se sente em relação ao seu corpo?")
# mapped to display labels.
body_satisfaction_responses_conversion = dict([
    (1, 'Muito satisfeito'),
    (2, 'Satisfeito'),
    (3, 'Indiferente'),
    (4, 'Insatisfeito'),
    (5, 'Muito insatisfeito'),
    (99, 'Não informado'),
])

df_body_satisfaction = generate_frequency_table(
    df,
    'VB11007',
    convert_index_names=body_satisfaction_responses_conversion,
    index_name='Sentimento em relação ao corpo',
)

display(df_body_satisfaction)

Sentimento em relação ao corpo,Frequencia absoluta,Frequencia relativa,Frequencia acumulada,Frequencia relativa acumulada
Satisfeito,6920,41.797536,6920,41.797536
Muito satisfeito,4608,27.83281,11528,69.630345
Insatisfeito,2326,14.049287,13854,83.679633
Indiferente,1807,10.914472,15661,94.594105
Muito insatisfeito,714,4.312636,16375,98.906741
Não informado,181,1.093259,16556,100.0
