# Exploração - Censo Escolar - Gestor

**OBJETIVOS**: Explorar a base de Gestor do Censo Escolar e fazer o rascunho do processamento dos dados.

---

## Cabeçalho 

### Imports 

In [1]:
import os
import pyunpack
import rarfile
import shutil
import zipfile
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm import tqdm

### Caminhos 

In [2]:
# Project root. The PASTA_PROJETO environment variable, when set, overrides the
# original machine-specific default so the notebook can run on other checkouts
# without editing this cell. With the variable unset, behavior is unchanged.
PASTA_PROJETO = Path(
    os.environ.get(
        "PASTA_PROJETO",
        "C:/Users/pedro.forli/PycharmProjects/curso-ciencia-dados",
    )
)
# Standard sub-folders, all derived from the project root.
PASTA_DADOS = PASTA_PROJETO / "dados"
PASTA_SAIDA = PASTA_PROJETO / "saidas"
PASTA_NOTEBOOK = PASTA_PROJETO / "notebooks"
# Later cells use paths relative to the project root (and import `src.*`),
# so make the root the current working directory.
os.chdir(PASTA_PROJETO)

### Variáveis 

### Configurações 

In [3]:
# Set the IPython Completer's `use_jedi` option to False.
%config Completer.use_jedi = False
# Load the autoreload extension and select mode 2, so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Código Próprio 

In [4]:
import src.io.le_dados as le_dados
import src.io.caminho as caminho
import src.io.data_store as data_store

---

## Carregamento de Dados 

In [34]:
# Work from the project root so the relative data paths below resolve.
os.chdir(PASTA_PROJETO)

# Pair every entry of the census folder with whatever the project reader
# returns for it. `padrao_comp` is a regex matching "gestor" file names with
# csv/rar/zip extensions — presumably it selects the member to read inside
# each archive; confirm against `le_dados.le_dados_comprimidos`.
leituras = (
    (
        nome,
        le_dados.le_dados_comprimidos(
            f"dados/completo/externo/censo_escolar/{nome}",
            ext="zip",
            como_df=True,
            padrao_comp="(gestor|GESTOR|Gestor)[.](csv|CSV|rar|RAR|zip|ZIP)",
            sep="|",
            encoding="latin-1",
        ),
    )
    for nome in os.listdir("dados/completo/externo/censo_escolar")
)
# Keep only the entries for which the reader returned something other than None.
dados = {nome: base for nome, base in leituras if base is not None}

---

## Exploração 

Visualiza como os dados aparecem na base

In [19]:
# First five rows of the 2020 table (row count made explicit).
dados["2020.zip"].head(n=5)

Unnamed: 0,NU_ANO_CENSO,ID_GESTOR,NU_MES,NU_ANO,NU_IDADE_REFERENCIA,NU_IDADE,TP_SEXO,TP_COR_RACA,TP_NACIONALIDADE,CO_PAIS_ORIGEM,...,IN_MANT_ESCOLA_PRIVADA_EMP,IN_MANT_ESCOLA_PRIVADA_ONG,IN_MANT_ESCOLA_PRIV_ONG_OSCIP,IN_MANT_ESCOLA_PRIVADA_OSCIP,IN_MANT_ESCOLA_PRIVADA_SIND,IN_MANT_ESCOLA_PRIVADA_SIST_S,IN_MANT_ESCOLA_PRIVADA_S_FINS,TP_REGULAMENTACAO,TP_LOCALIZACAO_DIFERENCIADA,IN_EDUCACAO_INDIGENA
0,2020,0806A47682ED9B2623545A093A78E251,6,1963,56,57,2,0,1,76,...,,,,,,,,0,0,0
1,2020,EAB8B462AF980DD0573716BA48E4CF65,2,1968,52,52,2,4,1,76,...,,,,,,,,1,0,0
2,2020,524EBEBA473F580794138450E3D14280,12,1976,43,44,2,1,1,76,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
3,2020,5CA8772535CE419ED1C8A3562D5836A4,12,1945,74,75,2,0,1,76,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
4,2020,EF19AAEE073A092AB527A0F324BDFEFD,1,1976,44,44,1,1,1,76,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0


In [20]:
# Last five rows of the 2020 table (row count made explicit).
dados["2020.zip"].tail(n=5)

Unnamed: 0,NU_ANO_CENSO,ID_GESTOR,NU_MES,NU_ANO,NU_IDADE_REFERENCIA,NU_IDADE,TP_SEXO,TP_COR_RACA,TP_NACIONALIDADE,CO_PAIS_ORIGEM,...,IN_MANT_ESCOLA_PRIVADA_EMP,IN_MANT_ESCOLA_PRIVADA_ONG,IN_MANT_ESCOLA_PRIV_ONG_OSCIP,IN_MANT_ESCOLA_PRIVADA_OSCIP,IN_MANT_ESCOLA_PRIVADA_SIND,IN_MANT_ESCOLA_PRIVADA_SIST_S,IN_MANT_ESCOLA_PRIVADA_S_FINS,TP_REGULAMENTACAO,TP_LOCALIZACAO_DIFERENCIADA,IN_EDUCACAO_INDIGENA
188356,2020,C56EA63CD1C53FEEB5B986F1EBF71DDA,1,1979,41,41,2,1,1,76,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
188357,2020,48F3D91B9A349C8770FAF0A2C4042DD0,6,1981,38,39,1,0,1,76,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
188358,2020,19AE62541E495EC2E5B049CFC0347B3A,2,1959,61,61,2,1,1,76,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
188359,2020,1A9A32E31AC49FC739A41AF1C2B74943,7,1975,44,45,1,0,1,76,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
188360,2020,9803FFD807940C340616AD4B095ADBEF,5,1967,53,53,2,3,1,76,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0


In [21]:
# Random peek at five rows. The seed is fixed so the same rows are shown on
# every Restart & Run All, keeping the saved output reproducible.
dados["2020.zip"].sample(5, random_state=42)

Unnamed: 0,NU_ANO_CENSO,ID_GESTOR,NU_MES,NU_ANO,NU_IDADE_REFERENCIA,NU_IDADE,TP_SEXO,TP_COR_RACA,TP_NACIONALIDADE,CO_PAIS_ORIGEM,...,IN_MANT_ESCOLA_PRIVADA_EMP,IN_MANT_ESCOLA_PRIVADA_ONG,IN_MANT_ESCOLA_PRIV_ONG_OSCIP,IN_MANT_ESCOLA_PRIVADA_OSCIP,IN_MANT_ESCOLA_PRIVADA_SIND,IN_MANT_ESCOLA_PRIVADA_SIST_S,IN_MANT_ESCOLA_PRIVADA_S_FINS,TP_REGULAMENTACAO,TP_LOCALIZACAO_DIFERENCIADA,IN_EDUCACAO_INDIGENA
23002,2020,C3ABC5F16A61C170C596F05CAE96BDB3,5,1975,45,45,2,1,1,76,...,,,,,,,,1,0,0
114966,2020,40F7FA44F6DC257BFADB077018101D77,7,1976,43,44,2,1,1,76,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
38834,2020,D29C8E6E211FD78B2A8950995C0E85CA,2,1985,35,35,1,3,1,76,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0,0
172431,2020,B2EB080119C15903A41F95029CDCAA09,7,1964,55,56,2,0,1,76,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0,0
175591,2020,D5EEF10D35C995A3245F016B84429B7E,9,1972,47,48,2,1,1,76,...,,,,,,,,1,0,0


Verifica informações básicas de tamanho

In [22]:
# (rows, columns) of the 2020 table.
dados["2020.zip"].shape

(188361, 83)

Verifica a tipagem das colunas

In [None]:
# Column names jotted down while planning the typing step. The original cell
# held them as bare identifiers, which raises NameError when executed; they
# are kept here as strings so the cell runs on a fresh kernel.
# NOTE(review): presumably the first target columns of the processed table — confirm.
COLUNAS_TIPAGEM = [
    "ID_GESTOR",
    "ANO",
    "DT_NASCIMENTO",
    "NU_IDADE",
    "TP_SEXO",
]
COLUNAS_TIPAGEM

In [33]:
# Smallest value found in the CO_IES_1 column of the 2020 table.
dados["2020.zip"]["CO_IES_1"].min()

1.0

In [31]:
# Distinct values found in CO_AREA_CURSO_1 (missing values show up as nan).
dados["2020.zip"]["CO_AREA_CURSO_1"].unique()

array([ 0., nan,  1.])

In [23]:
# Per-column summary of the 2020 table: non-null counts and dtypes.
dados["2020.zip"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188361 entries, 0 to 188360
Data columns (total 83 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   NU_ANO_CENSO                   188361 non-null  int64  
 1   ID_GESTOR                      188361 non-null  object 
 2   NU_MES                         188361 non-null  int64  
 3   NU_ANO                         188361 non-null  int64  
 4   NU_IDADE_REFERENCIA            188361 non-null  int64  
 5   NU_IDADE                       188361 non-null  int64  
 6   TP_SEXO                        188361 non-null  int64  
 7   TP_COR_RACA                    188361 non-null  int64  
 8   TP_NACIONALIDADE               188361 non-null  int64  
 9   CO_PAIS_ORIGEM                 188361 non-null  int64  
 10  CO_UF_NASC                     188106 non-null  float64
 11  CO_MUNICIPIO_NASC              188106 non-null  float64
 12  IN_NECESSIDADE_ESPECIAL       

In [48]:
# Non-null count per column of the processed gestor table.
# The path is anchored on PASTA_DADOS (project root / "dados") so the cell does
# not depend on the kernel's current working directory.
pd.read_parquet(
    PASTA_DADOS / "completo" / "aquisicao" / "censo_gestor.parquet"
).count()

ID_GESTOR                      376101
CO_ENTIDADE                    376101
CO_PAIS_ORIGEM                 376101
CO_IES_1                       376101
CO_CURSO_1                     321089
CO_IES_2                       376101
CO_CURSO_2                      29084
CO_MUNICIPIO_NASC              376101
DT_NASCIMENTO                  376101
NU_IDADE                       376101
IN_NECESSIDADE_ESPECIAL        376101
IN_BAIXA_VISAO                 376101
IN_CEGUEIRA                    376101
IN_DEF_AUDITIVA                376101
IN_DEF_FISICA                  376101
IN_DEF_INTELECTUAL             376101
IN_SURDEZ                      376101
IN_SURDOCEGUEIRA               376101
IN_DEF_MULTIPLA                376101
IN_AUTISMO                     376101
IN_SUPERDOTACAO                376101
IN_ESPECIALIZACAO              376101
IN_MESTRADO                    376101
IN_DOUTORADO                   376101
IN_POS_NENHUM                  376101
IN_ESPECIFICO_CRECHE           376101
IN_ESPECIFIC

---