Usando a base de dados da [Pesquisa Nacional de Saúde do Escolar - PeNSE](https://www.ibge.gov.br/estatisticas/sociais/educacao/9134-pesquisa-nacional-de-saude-do-escolar.html) do IBGE.  
Base de dados: [PENSE_2015_AMOSTRA2](https://www.ibge.gov.br/estatisticas/downloads-estatisticas.html?caminho=pense/2015/microdados/).

# Importação dos dados

In [1]:
# imports
import numpy as np
import pandas as pd
from zipfile import ZipFile
import requests
import io
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's 'whitegrid' style for every plot drawn after this point
sns.set_style('whitegrid')

In [2]:
# Download the zipped PeNSE 2015 (sample 2) dataset and unpack it into ./datasets
url = 'https://github.com/LucasGabrielB/Alura-Bootcamp-Data-Science-Aplicada/raw/main/Modulo-03/datasets/PeNSE_2015_AMOSTRA2.zip'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being handed to ZipFile.
req = requests.get(url, timeout=60)
req.raise_for_status()

# The context manager closes the archive handle once extraction is done.
with ZipFile(io.BytesIO(req.content)) as zip_file:
    zip_file.extractall("datasets")

In [3]:
# Read the student (ALUNO) microdata; the file is ';'-separated and uses ','
# as the decimal mark. The path is relative so that it points at the same
# "datasets" folder the archive was extracted into, whatever the working
# directory happens to be.
df = pd.read_csv('datasets/arquivos csv/PENSE_AMOSTRA2_ALUNO.CSV', sep=';', decimal=',')

df.head()

Unnamed: 0,ANOPESQ,PAIS,REGEOGR,VB00004,VB01001,VB01002,VB01003,VB01004,VB01005,VB01006,VB01007,VB01008A,VB01010A,VB01011,VB01012,VB01013,VB01014,VB01015A,VB01016,VB01017,VB01018,VB01019,VB01020A,VB01021,VB01022,VB01023,VB01024,VB01025,VB01026,VB02001,VB02002,VB02004A,VB02010,VB02011,VB02013,VB02017A,VB02018A,VB02019A,VB02020A,VB02021,...,VB11005,VB11006,VB11007,VB12001,VB12002,VB12003,VB13001,VB13002A,VB13004A,VB13005,VB13006,VB13007,VB13008,VB13009,VB14001,VB14002,VB16001A01,VB16001A02,VB16001A03,VB16001A04,VB16001A05,VB16001A06,VB16001A07,VB16001A08,VB17001,VB17002,VB17003,VB17004,VB17005,VB17006,ESTRATO_EXP,ESTRATOGEOREG,PESO,V0006,V0007,V0008,V0041,aluno,escola,turma
0,2015,76,1,1,2,1,13,7,7,1,2,7,3,2,-1,1,1,1,1,1,2,4,2,4,1,2,2,5,1,8,3,4,3,6,6,1,6,1,-1,2,...,2,1,2,1,1,4,2,-1,3,1,1,-1,1,1,2,2,1,2,2,2,2,2,2,2,371.0,1570.0,37.1,157.0,2,1,1223,1,299.735235,1,2,4,1,1,1,296
1,2015,76,1,1,2,1,14,4,7,1,1,7,3,2,-1,1,1,1,1,1,2,2,2,4,1,2,2,5,3,8,2,1,8,8,6,1,1,5,-1,2,...,2,3,2,4,4,4,1,2,3,3,1,-1,1,1,2,2,2,2,2,1,2,2,2,2,502.0,1520.0,50.2,152.0,2,2,1223,1,355.170081,1,2,4,1,2,1,296
2,2015,76,1,1,1,4,13,6,7,1,1,5,5,2,-1,2,1,1,1,1,2,3,1,4,1,2,2,5,3,2,3,1,8,2,8,1,1,5,5,1,...,2,1,2,1,2,3,1,2,1,2,5,99,1,-1,2,2,1,1,1,1,1,1,1,1,482.0,1614.0,48.2,161.4,2,2,1223,1,299.735235,1,2,4,1,3,1,296
3,2015,76,1,1,1,1,14,6,7,1,1,7,4,2,-1,1,1,1,1,1,2,4,2,4,1,2,2,6,3,2,4,1,3,2,7,1,5,1,-1,2,...,2,1,2,2,2,4,1,10,3,1,2,-1,1,-1,2,2,1,2,2,2,1,2,2,2,694.0,1725.0,69.4,172.5,2,3,1223,1,355.170081,1,2,4,1,4,1,296
4,2015,76,1,1,1,1,13,9,7,1,2,5,4,2,-1,2,1,1,1,1,2,5,1,4,1,2,2,6,5,6,3,5,2,6,2,1,5,1,5,1,...,2,1,4,2,2,4,1,3,2,2,2,13,1,-1,1,2,1,1,1,1,1,1,1,1,789.0,1675.0,78.9,167.5,2,4,1223,1,299.735235,1,2,4,1,5,1,296


In [4]:
# Dimensions of the loaded table as (rows, columns)
df.shape

(16556, 181)

In [5]:
# Summary of dtypes and memory use; 'deep' also measures the contents of object columns
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16556 entries, 0 to 16555
Columns: 181 entries, ANOPESQ to turma
dtypes: float64(10), int64(171)
memory usage: 22.9 MB


# Analisando os dados

VB01001: Sexo do aluno  
VB01003: Idade do aluno  
VB17004: Altura do aluno

In [6]:
# Average height (VB17004) for each sex code (VB01001)
df['VB17004'].groupby(df['VB01001']).mean()

VB01001
1    163.860758
2    157.558617
Name: VB17004, dtype: float64

In [7]:
# Average height (VB17004) split first by sex (VB01001), then by VB01003
df['VB17004'].groupby([df['VB01001'], df['VB01003']]).mean()

VB01001  VB01003
1        11         147.617753
         12         153.696764
         13         160.719801
         14         166.406922
         15         170.336297
         16         172.424933
         17         173.814341
         18         173.024561
         19         172.143158
2        11         149.522512
         12         154.965021
         13         157.673105
         14         159.340320
         15         160.759848
         16         160.386527
         17         160.907937
         18         159.438462
         19         158.977193
Name: VB17004, dtype: float64

In [8]:
# Mean height (VB17004) for every combination of sex (VB01001) and VB01003
# (presumably age, given the 11-19 values shown in the output).
# columns='' is only a placeholder so the table ends up with a single value column.
df_crosstab = pd.crosstab(index=[df['VB01001'], df['VB01003']],
                            values=df['VB17004'],
                            columns='',
                            aggfunc= 'mean')

# Relabel only the sex level: without `level`, the mapping is applied to every
# level of the MultiIndex and would also rewrite a 1 or 2 found in VB01003.
df_crosstab.rename(index={1: 'Masculino', 2: 'Feminino'}, level='VB01001')

VB01001    VB01003
Masculino  11         147.617753
           12         153.696764
           13         160.719801
           14         166.406922
           15         170.336297
           16         172.424933
           17         173.814341
           18         173.024561
           19         172.143158
Feminino   11         149.522512
           12         154.965021
           13         157.673105
           14         159.340320
           15         160.759848
           16         160.386527
           17         160.907937
           18         159.438462
           19         158.977193
Name: __dummy__, dtype: float64

## Analisando medidas de dispersão

In [9]:
# Store the overall mean height in a new column (the scalar is repeated on
# every row) so the later cells can compare each height against it.
df['MEDIA_ALTURA'] = df.loc[:, 'VB17004'].mean()
df.loc[:, ['VB17004', 'MEDIA_ALTURA']].head()

Unnamed: 0,VB17004,MEDIA_ALTURA
0,157.0,160.713113
1,152.0,160.713113
2,161.4,160.713113
3,172.5,160.713113
4,167.5,160.713113


### Desvio médio absoluto

In [13]:
# Distance of each height from the mean, sign dropped. Despite the column
# name, this holds one absolute deviation per row, not their average.
df['DESVIO_MEDIO_ABSOLUTO'] = (df['MEDIA_ALTURA'] - df['VB17004']).abs()
df.loc[:, ['VB17004', 'MEDIA_ALTURA', 'DESVIO_MEDIO_ABSOLUTO']].head()

Unnamed: 0,VB17004,MEDIA_ALTURA,DESVIO_MEDIO_ABSOLUTO
0,157.0,160.713113,3.713113
1,152.0,160.713113,8.713113
2,161.4,160.713113,0.686887
3,172.5,160.713113,11.786887
4,167.5,160.713113,6.786887


### Quadrado da diferença

In [14]:
# Square each row's deviation from the mean; these squares feed the variance cells below.
df['DIFERENCA_QUADRADO'] = df['DESVIO_MEDIO_ABSOLUTO'].pow(2)
df.loc[:, ['VB17004', 'MEDIA_ALTURA', 'DIFERENCA_QUADRADO']].head()

Unnamed: 0,VB17004,MEDIA_ALTURA,DIFERENCA_QUADRADO
0,157.0,160.713113,13.787209
1,152.0,160.713113,75.918339
2,161.4,160.713113,0.471814
3,172.5,160.713113,138.930703
4,167.5,160.713113,46.061834


### Variância

Populacional:

In [20]:
# Population variance: the mean of the squared differences stored in DIFERENCA_QUADRADO
variance_populational = df['DIFERENCA_QUADRADO'].mean()
variance_populational

106.9105945368652

Amostral:

In [21]:
# Sample variance: sum of squared differences divided by (n - 1).
# n is the number of non-missing squared differences rather than len(df):
# .sum() skips NaN, so counting rows with a missing height would shrink the
# result and disagree with the population variance, whose .mean() also skips NaN.
variance_sample = (1/(df['DIFERENCA_QUADRADO'].count()-1)) * df['DIFERENCA_QUADRADO'].sum()
variance_sample

106.91705244049119

### Desvio padrão

Populacional:

In [22]:
# Population standard deviation: square root of the population variance
variance_populational ** 0.5

10.339757953495102

Amostral:

In [23]:
# Sample standard deviation: square root of the sample variance
variance_sample ** 0.5

10.340070233827776

## Função que cria um dicionário para facilitar a busca da pergunta

In [11]:
# Load the PeNSE variable dictionary that get_question_description() searches.
# The path is relative so that it points at the same "datasets" folder the
# archive was extracted into, whatever the working directory happens to be.
df_dict = pd.read_excel('datasets/Dicionario_PENSE_Microdados_AMOSTRA2.xls')

def _is_answer_code(value) -> bool:
    ''' Tell whether a 'VARIÁVEL' cell holds an answer code instead of a variable name.

        Integers and numeric strings are codes, and so are ranges written as
        '0 a 2999'. Anything else (variable names, empty/NaN cells) is not.
    '''
    if isinstance(value, (int, np.integer)) and not isinstance(value, bool):
        return True
    # Some codes are ranges such as '0 a 2999'; dropping the ' a ' separator
    # leaves only digits, which isnumeric() accepts.
    return isinstance(value, str) and value.replace(' a ', '').isnumeric()


def get_question_description(colum_name: str) -> dict:
    ''' Return the question text and the answer codes of a PeNSE dataset column.

        The information is read from the module-level `df_dict` table. The
        result follows this layout (example for colum_name='VB01001'):
            {
            'Descrição': {
                '1': 'Masculino',
                '2': 'Feminino'},
            'Pergunta': 'Qual é o seu sexo?'
            }

        The answer codes are the rows that come right after the variable's own
        row; collection stops at the first row whose 'VARIÁVEL' cell is not a
        code (next variable name, empty cell) or at the end of the table.

        Parameters:
            colum_name: name of the DataFrame column to look up

        Returns:
            dict with the keys 'Pergunta' (question text) and 'Descrição'
            (mapping of answer code -> answer text).

        Raises:
            IndexError: if colum_name is not present in the dictionary table.
    '''
    variables = df_dict['VARIÁVEL']
    descriptions = df_dict['QUESTIONÁRIO DO ALUNO']

    # A plain equality mask avoids building a query string out of the argument,
    # and row positions (not index labels) keep the iloc lookups below correct
    # even when df_dict does not carry a default 0..n-1 index.
    matches = np.flatnonzero((variables == colum_name).to_numpy())
    if len(matches) == 0:
        raise IndexError(f'column {colum_name!r} not found in the PeNSE dictionary')
    question_pos = int(matches[0])

    return_dict = dict()
    return_dict['Pergunta'] = descriptions.iloc[question_pos]

    # Bounding the scan by the table length keeps the last variable of the
    # dictionary from reading past the final row.
    answers = dict()
    for pos in range(question_pos + 1, len(df_dict)):
        code = variables.iloc[pos]
        if not _is_answer_code(code):
            break
        answers[code] = descriptions.iloc[pos]
    return_dict['Descrição'] = answers

    return return_dict

In [12]:
# Example lookup: question text and answer codes for column VB01001
get_question_description('VB01001')

{'Descrição': {'1': 'Masculino', '2': 'Feminino'},
 'Pergunta': 'Qual é o seu sexo?'}