In [1]:
import csv
import os
import re
from datetime import datetime

import pandas as pd
import requests
import unidecode
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta
import zipfile

from requests.packages.urllib3.exceptions import InsecureRequestWarning
# Silence the InsecureRequestWarning that urllib3 would otherwise emit for
# requests sent with verify=False (download_file issues such a request).
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

import boto3

In [3]:
# Page that get_files() scans for links to the microdata archives.
url = "https://www.gov.br/inep/pt-br/acesso-a-informacao/dados-abertos/microdados/censo-da-educacao-superior"

# Destination bucket name; only referenced by the upload call in run(),
# which is currently commented out.
bucket_path = "aws-censo-ed-superior-censo-ed-superior-landing-741358071637"

# Shared S3 client used by upload_directory().
s3 = boto3.client("s3")

In [4]:
def download_file(url, full_path, chunk_size=128):
    """
    Stream the content served at ``url`` into the local file ``full_path``.

    Parameters
    ----------
    url : str
        Address of the file to download.

    full_path: str
        Destination file path. It is opened in binary write mode, so an
        existing file at that path is overwritten.

    chunk_size: int
        Number of bytes requested from the response stream per iteration.

    Returns
    -------
        None

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    # NOTE(review): verify=False disables TLS certificate validation for this
    # request; confirm this is still required for the target host.
    # Using the response as a context manager guarantees the streamed
    # connection is released even if writing fails midway.
    with requests.get(url, stream=True, verify=False) as r:
        # Fail before touching the disk so an HTTP error page is never saved
        # under the name of the expected file.
        r.raise_for_status()
        with open(full_path, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                fd.write(chunk)

In [5]:
def download_and_unzip(url, save_path):
    """
    Download the archive at ``url`` into ``save_path`` and extract it.

    The archive is saved as ``<save_path>/<last segment of url>`` and its
    content is extracted into a sibling directory named after the archive
    without its ``.zip`` extension.

    Parameters
    ----------
    url : str
        Address of the zip archive to download.

    save_path: str
        Directory that receives the archive; created if it does not exist.

    Returns
    -------
        str:
        Path of the directory the archive was extracted into.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(save_path, exist_ok=True)
    tail = os.path.split(url)[1]
    full_path = f'{save_path}/{tail}'
    download_file(url, full_path)

    # Strip only the trailing extension. A plain str.replace('.zip', '') would
    # also rewrite any '.zip' that happens to appear earlier in the path.
    stem, extension = os.path.splitext(full_path)
    if extension.lower() == '.zip':
        dir_path = stem
    else:
        # Without a '.zip' suffix the target would collide with the archive
        # file itself, so extract next to it under a distinct name.
        dir_path = f'{full_path}_extracted'

    with zipfile.ZipFile(full_path, "r") as zip_ref:
        zip_ref.extractall(dir_path)
    return dir_path


In [6]:
def get_files(url, file_format='zip', years_interval=6):
    """
    Collect the archive links published on the page at ``url``.

    Every ``<a>`` element whose ``href`` ends with ``.<file_format>`` and
    contains ``microdados_censo_da_educacao_superior`` is kept; the result is
    then narrowed by ``filter_files_by_year``.

    Parameters
    ----------
    url : str
        Address of the HTML page to scan for links.

    file_format: str
        File extension (without the leading dot) a link must end with.

    years_interval: int
        Passed through to ``filter_files_by_year``.

    Returns
    -------
        - list[str]:
        List of file URL's

    Raises
    ------
    requests.HTTPError
        If the page request answers with a 4xx/5xx status code.
    """
    from urllib.parse import urljoin

    r = requests.get(url)
    # Surface HTTP failures here instead of parsing an error page and
    # silently returning no links.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")
    pattern_to_find = 'microdados_censo_da_educacao_superior'
    suffix = f".{file_format}"

    files = []
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if href and href.endswith(suffix) and pattern_to_find in href:
            # Absolute links pass through unchanged; relative ones are
            # resolved against the page address so they can be downloaded.
            files.append(urljoin(url, href))
    return filter_files_by_year(files, years_interval)

In [7]:
def filter_files_by_year(files, years_interval):
    """
    Keep only the files that belong to the most recent years.

    The first run of four digits found in each entry is read as its year.
    The window covers the ``years_interval`` years ending at the highest year
    found, and an entry is kept when any year of that window appears
    anywhere in its text.

    Parameters
    ----------
    files : list[str]
        File names or URLs to filter.

    years_interval: int
        Number of consecutive years, counted back from the latest one found,
        to keep.

    Returns
    -------
        list[str]:
        List of latest URL files, in their original order. Empty when no
        entry contains a four-digit number.
    """
    year_pattern = re.compile("[0-9]{4}")
    years = [int(match.group(0)) for match in map(year_pattern.search, files) if match]
    if not years:
        # Nothing to anchor the window on; max() would raise on an empty list.
        return []

    latest_year = max(years)
    range_years = [
        str(year)
        for year in range(latest_year - (years_interval - 1), latest_year + 1)
    ]
    return [file for file in files if any(year in file for year in range_years)]

In [8]:
def upload_directory(path, bucketname):
    """
    Upload every file found under ``path`` to the S3 bucket ``bucketname``.

    The object key is the file's directory (spaces replaced by underscores,
    then transliterated with ``unidecode``) joined to the file name with
    ``/``. The file name itself is left unchanged.

    Parameters
    ----------
    path : str
        Local directory to walk recursively.

    bucketname: str
        Name of the destination bucket.

    Returns
    -------
        None
    """
    for root, _dirs, files in os.walk(path):
        # The key prefix only depends on the directory, so build it once.
        # S3 keys use '/' regardless of the local path separator.
        key_prefix = unidecode.unidecode(root.replace(' ', '_'))
        key_prefix = key_prefix.replace(os.sep, '/').rstrip('/')
        for file in files:
            # The separator is required: plain concatenation would fuse the
            # last directory name with the file name.
            key = f'{key_prefix}/{file}' if key_prefix else file
            s3.upload_file(os.path.join(root, file), bucketname, key)

In [9]:
def run():
    """
    Download and extract every archive listed on the page at ``url``.

    Archives are stored under a directory named ``<year>_<month>`` taken from
    the current date. A failure on one archive is reported and does not stop
    the remaining downloads.

    Returns
    -------
        None
    """
    files = get_files(url)
    # Resolve the target folder once so that a run crossing a month boundary
    # does not scatter its files over two directories.
    today = datetime.today()
    save_path = f"{today.year}_{today.month}"
    for file in files:
        try:
            # full_path is kept for the upload step below, which is disabled.
            full_path = download_and_unzip(file, save_path)
            # upload_directory(full_path, bucket_path)
        except Exception as e:
            # Best-effort loop: report which archive failed and carry on.
            print(f"Failed to process {file}: {e}")

In [10]:
# Execute the pipeline: list the archive links, then download and extract each.
run()