Modified from https://github.com/simonloew/mlops_on_gcp/blob/main/part1/download_data.py

The bucket name must match the one used in the setup.sh script.

notes:
- Month/year parsing can fail when a downloaded file is a duplicate, because " (1)" is appended to the end of its name. Before running, make sure /data/raw/ does not already contain a file with the same name as the file about to be created.

In [1]:
import subprocess
from tempfile import TemporaryDirectory
from shutil import unpack_archive, move
from pathlib import Path
import wget
import ssl

# WARNING: this replaces the default HTTPS context factory with one that does
# not verify server certificates, for every caller in this process that relies
# on the default context.
# NOTE(review): presumably a workaround for certificate errors from the
# download host -- confirm it is still needed before reusing this code.
ssl._create_default_https_context = ssl._create_unverified_context


# Base URL that download_monthly_data prefixes to the archive file name.
BTS_ROOT_URL = "https://transtats.bts.gov/PREZIP"
# Year and month handed to download_monthly_data by the script entry point.
Year = 2020
Month = 5

In [2]:
def download_monthly_data(year: int, month: int, ouput_dir: Path) -> Path:
    """Download one monthly archive from ``BTS_ROOT_URL`` into ``ouput_dir``.

    Args:
        year: Year of the archive to fetch.
        month: Month of the archive to fetch. It is used unpadded in the URL
            and zero-padded to two digits in the local file name.
        ouput_dir: Directory that receives the archive; created (with
            parents) when missing. The misspelt name is kept so existing
            keyword callers keep working.

    Returns:
        Path of the downloaded archive: ``<ouput_dir>/<year>_<month:02>.zip``.
    """
    file_download_url = f"{BTS_ROOT_URL}/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_{year}_{month}.zip"

    output_file_path = ouput_dir / f"{year}_{month:02}.zip"
    output_file_path.parent.mkdir(exist_ok=True, parents=True)

    # wget does not overwrite an existing file: it stores the download under a
    # " (1)"-style name instead and returns the name it actually used.
    saved_path = Path(wget.download(file_download_url, output_file_path.as_posix()))

    # Move a renamed download onto the intended path so the returned path is
    # always the fresh archive and no " (1)" duplicate is left behind.
    if saved_path != output_file_path:
        saved_path.replace(output_file_path)

    return output_file_path


def extract_zipfile(zip_file: Path, output_dir: Path) -> Path:
    """Unpack an archive and store its single CSV under a year/month name.

    The year and month are read from the last two ``_``-separated fields of
    the archive name (the part before the first ``.``). A trailing
    `` (1)``-style duplicate marker is ignored.

    Args:
        zip_file: Archive named like ``<year>_<month>.zip``.
        output_dir: Root directory for extracted files; the year
            sub-directory is created when missing.

    Returns:
        Path of the extracted file:
        ``<output_dir>/<year>/<year>-<month:02>.csv``.

    Raises:
        ValueError: If the year/month cannot be parsed from the archive name,
            or the archive does not contain exactly one ``*.csv`` file at its
            top level.
    """
    base_name = zip_file.name.split(".")[0]
    # Duplicate downloads are saved as "<name> (1).zip"; drop that marker so
    # the month field stays numeric.
    base_name = base_name.partition(" (")[0]
    fields = base_name.split("_")
    # Parse before unpacking so a badly named archive fails fast.
    try:
        year, month = int(fields[-2]), int(fields[-1])
    except (IndexError, ValueError) as err:
        raise ValueError(
            f"Cannot parse year/month from archive name: {zip_file.name!r}"
        ) from err

    with TemporaryDirectory() as tmpdir:
        unpack_archive(zip_file, extract_dir=tmpdir)

        csv_files = list(Path(tmpdir).glob("*.csv"))
        # Explicit raise instead of assert: asserts are stripped under -O.
        if len(csv_files) != 1:
            raise ValueError(
                "There should be exactly one *.csv file per zip, "
                f"found {len(csv_files)} in {zip_file}"
            )

        output_file = output_dir / f"{year}" / f"{year}-{month:02}.csv"
        output_file.parent.mkdir(exist_ok=True, parents=True)

        # Move the CSV out before the temporary directory is removed.
        move(csv_files[0], output_file)

    return output_file


def extract_data(input_dir: Path, output_dir: Path):
    """Extract every ``*.zip`` directly inside ``input_dir`` into ``output_dir``.

    Each archive is handed to ``extract_zipfile`` and the resulting path is
    printed. Sub-directories of ``input_dir`` are not searched.
    """
    archives = input_dir.glob("*.zip")
    for archive in archives:
        extracted = extract_zipfile(archive, output_dir)
        print("Extracted:", extracted)


if __name__ == "__main__":
    # Local staging directories: downloaded archives and extracted CSV files.
    INPUT_DIR = Path("./data/raw")
    OUTPUT_DIR = Path("./data/processed")
    BUCKET = "mpg3-testflights-polished-vault-379315"  # TODO: Replace with your bucket name

    # Download the configured month, then extract every archive in INPUT_DIR
    # (including archives left there by earlier runs).
    download_monthly_data(Year, Month, INPUT_DIR)
    extract_data(INPUT_DIR, OUTPUT_DIR)

    # Recursively copy the extracted files to the bucket; check_call raises
    # if gsutil exits with a non-zero status.
    upload_command = ["gsutil", "cp", "-r", OUTPUT_DIR.absolute(), f"gs://{BUCKET}/data"]
    subprocess.check_call(upload_command)

Extracted: data/processed/2020/2020-05.csv
Extracted: data/processed/2021/2021-03.csv


Copying file:///home/jupyter/flights_pipeline/data/processed/2020/2020-05.csv [Content-Type=text/csv]...
Copying file:///home/jupyter/flights_pipeline/data/processed/2021/2021-03.csv [Content-Type=text/csv]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

| [2 files][267.2 MiB/267.2 MiB]                                                
Operation completed ove

In [3]:
# subsample a file:

