# Download Microdados ENEM 2021-2023

Notebook responsável por baixar os microdados do ENEM (2021, 2022 e 2023), armazenar os arquivos ZIP em `zips/` e extrair apenas a pasta `DADOS` de cada arquivo para `microdados/enem_<ano>/`.

In [None]:
from __future__ import annotations
import shutil
from pathlib import Path, PurePosixPath
from typing import Dict, List
from urllib.parse import urlparse
from urllib.request import urlopen, Request
from zipfile import ZipFile
import requests

import pandas as pd

# Local working folders: downloaded archives go to RAW_DATA_DIR, extracted
# files go to EXTRACT_DIR. Both are created up front, relative to the
# current working directory.
RAW_DATA_DIR = Path("zips")
EXTRACT_DIR = Path("microdados")
for _directory in (RAW_DATA_DIR, EXTRACT_DIR):
    _directory.mkdir(parents=True, exist_ok=True)

# Exam year -> URL of that year's microdata archive.
MICRODATA_URLS: Dict[int, str] = {
    year: f"https://download.inep.gov.br/microdados/microdados_enem_{year}.zip"
    for year in (2021, 2022, 2023)
}

# Browser-like User-Agent sent with every download request.
USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/121.0.0.0 Safari/537.36",
    )
)
DEFAULT_HEADERS = {"User-Agent": USER_AGENT}


In [None]:
def download_microdata(year: int, url: str) -> Path:
    """Download the microdata ZIP for *year* into ``RAW_DATA_DIR``.

    The file name is taken from the last path segment of *url*, falling back
    to ``microdados_enem_<year>.zip`` when the URL path has no file name. If
    the target file already exists the download is skipped.

    The response is streamed into a sibling ``<name>.part`` file that is moved
    onto the final path only after the transfer completes, so a failed or
    interrupted download never leaves a truncated ZIP that a later run would
    mistake for a finished one.

    Args:
        year: Exam year, used for log messages and the fallback file name.
        url: Address of the ZIP archive.

    Returns:
        Path of the ZIP file inside ``RAW_DATA_DIR``.

    Raises:
        requests.exceptions.RequestException: If the third attempt also fails.
    """
    zip_filename = Path(urlparse(url).path).name or f"microdados_enem_{year}.zip"
    zip_path = RAW_DATA_DIR / zip_filename

    if zip_path.exists():
        print(f"{year}: arquivo ZIP já existe em {zip_path}, pulando download.")
        return zip_path

    print(f"{year}: baixando {url}...")

    partial_path = zip_path.with_name(zip_path.name + ".part")
    max_attempts = 3

    for attempt in range(1, max_attempts + 1):
        try:
            with requests.get(url, headers=DEFAULT_HEADERS, stream=True, timeout=60) as response:
                response.raise_for_status()

                with open(partial_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
        except requests.exceptions.RequestException as e:
            print(f"{year}: erro no download ({e}). Tentativa {attempt}/{max_attempts}.")
            if attempt == max_attempts:
                raise
        else:
            # Publish the file under its final name only once it is complete.
            partial_path.replace(zip_path)
            print(f"{year}: download concluído ({zip_path.stat().st_size / (1024 ** 2):.2f} MB).")
            return zip_path
        finally:
            # After a successful move the partial file is gone; otherwise
            # (request error, disk error, interrupt) drop the leftovers.
            if partial_path.exists():
                partial_path.unlink()

    # The final attempt either returns or re-raises, so this is never reached.
    raise AssertionError("unreachable")

def extract_microdata(year: int, zip_path: Path) -> Path:
    """Extract the ``DADOS`` folder of *zip_path* into ``EXTRACT_DIR/enem_<year>``.

    Only archive members whose path contains a ``DADOS`` component are
    extracted; their paths inside the archive are preserved under the target
    directory. If the target directory already contains anything, extraction
    is skipped.

    If extraction fails or is interrupted, the target directory is removed
    before the error propagates. Otherwise the partial contents would make
    the next run skip extraction and silently work with incomplete data.

    Args:
        year: Exam year, used for the target directory name and log messages.
        zip_path: Path of the downloaded ZIP archive.

    Returns:
        Path of the directory the files were extracted into.

    Raises:
        zipfile.BadZipFile: If *zip_path* is not a valid ZIP archive.
        OSError: If the archive cannot be read or files cannot be written.
    """
    target_dir = EXTRACT_DIR / f"enem_{year}"
    if target_dir.exists() and any(target_dir.iterdir()):
        print(f"{year}: diretório de extração já contém arquivos em {target_dir}, pulando extração.")
        return target_dir
    target_dir.mkdir(parents=True, exist_ok=True)
    print(f"{year}: extraindo para {target_dir}...")
    try:
        with ZipFile(zip_path, "r") as archive:
            members = [info for info in archive.infolist() if "DADOS" in PurePosixPath(info.filename).parts]
            for info in members:
                archive.extract(info, target_dir)
    except BaseException:
        # Reaching this point means target_dir was absent or empty before,
        # so everything in it came from this failed run and is safe to drop.
        shutil.rmtree(target_dir, ignore_errors=True)
        raise
    print(f"{year}: extração concluída ({len(members)} itens da pasta DADOS).")
    return target_dir

def process_year(year: int, url: str) -> Dict[str, str]:
    """Download and extract one year's microdata and summarize where it went.

    Args:
        year: Exam year.
        url: Address of that year's ZIP archive.

    Returns:
        A dict with the keys ``year``, ``zip_path`` and ``extracted_dir``;
        the two paths are resolved to absolute form and given as strings.
    """
    archive_file = download_microdata(year, url)
    output_dir = extract_microdata(year, archive_file)

    summary: Dict[str, str] = {"year": str(year)}
    summary["zip_path"] = str(archive_file.resolve())
    summary["extracted_dir"] = str(output_dir.resolve())
    return summary


In [None]:
# Process every configured year. A failure in one year is reported and does
# not stop the remaining years from being processed.
summaries: List[Dict[str, str]] = []
for year, url in MICRODATA_URLS.items():
    try:
        summary = process_year(year, url)
    except Exception as exc:
        print(f"{year}: erro - {exc}")
    else:
        summaries.append(summary)

# Last expression of the cell: shows one row per successfully processed year.
pd.DataFrame(summaries)