Modified from https://github.com/simonloew/mlops_on_gcp/blob/main/part1/download_data.py

The bucket name must match the one used in the setup.sh script.

notes:
- Month/year parsing can fail when a duplicated download gets " (1)" appended to its file name. Before downloading, make sure /data/raw/ contains no file with the same name as the file that is about to be created.

In [25]:
import subprocess
from tempfile import TemporaryDirectory
from shutil import unpack_archive, move
from pathlib import Path
import wget
import ssl
import pandas as pd

# Replace the default HTTPS context factory with the unverified one, so HTTPS
# requests that go through the standard library skip certificate verification.
# NOTE(review): presumably a workaround for certificate errors when downloading
# from the BTS server; it applies to every HTTPS request in this process, so
# confirm it is still needed.
ssl._create_default_https_context = ssl._create_unverified_context


# Base URL of the BTS "PREZIP" archives; the monthly zip file name is appended.
BTS_ROOT_URL = "https://transtats.bts.gov/PREZIP"
# Year and month of the data set to download.
Year = 2020
Month = 5

In [26]:
def download_monthly_data(year: int, month: int, ouput_dir: Path) -> Path:
    """Download one month of BTS on-time performance data as a zip archive.

    The archive is fetched from ``BTS_ROOT_URL`` and stored as
    ``<ouput_dir>/<year>_<month:02>.zip``. The target directory is created
    when it does not exist yet.

    Args:
        year: Year of the data set, e.g. ``2020``.
        month: Month number; inserted into the URL without zero padding.
        ouput_dir: Directory that receives the zip file. (The misspelled
            parameter name is kept so keyword callers keep working.)

    Returns:
        Path of the downloaded zip file.
    """
    file_download_url = f"{BTS_ROOT_URL}/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_{year}_{month}.zip"

    output_file_path = ouput_dir / f"{year}_{month:02}.zip"
    output_file_path.parent.mkdir(exist_ok=True, parents=True)

    # wget does not overwrite an existing target: it stores the new download
    # under a " (1)"-style name instead, which leaves a stale archive at the
    # returned path and a duplicate whose name breaks month/year parsing.
    # Removing the old archive first makes re-runs idempotent.
    if output_file_path.exists():
        output_file_path.unlink()

    wget.download(file_download_url, output_file_path.as_posix())

    return output_file_path


def extract_zipfile(zip_file: Path, output_dir: Path) -> Path:
    """Extract the single CSV of a monthly archive into ``output_dir``.

    The archive name must end in ``<year>_<month>`` before its extension
    (e.g. ``2020_05.zip``). A trailing " (N)" duplicate marker such as in
    ``2020_05 (1).zip`` is ignored when parsing the month. The CSV is moved to
    ``<output_dir>/<year>/<year>-<month:02>.csv``.

    Args:
        zip_file: Path of the zip archive to unpack.
        output_dir: Root directory for the extracted CSV files.

    Returns:
        Path of the extracted CSV file.

    Raises:
        AssertionError: If the archive does not contain exactly one ``*.csv``
            file at its top level.
        ValueError: If year or month cannot be parsed from the archive name.
    """
    with TemporaryDirectory() as tmpdir:
        unpack_archive(zip_file, extract_dir=tmpdir)

        csv_files = list(Path(tmpdir).glob("*.csv"))
        # Raised explicitly (instead of using `assert`) so the check survives
        # `python -O`, while keeping the exception type callers already see.
        if len(csv_files) != 1:
            raise AssertionError("There should be exactly one *.csv file per zip")

        name_tokens = zip_file.name.split(".")[0].split("_")
        year = int(name_tokens[-2])
        # Drop a " (1)"-style suffix that download tools append to duplicates.
        month = int(name_tokens[-1].split(" (")[0])
        output_file = output_dir / f"{year}" / f"{year}-{month:02}.csv"

        output_file.parent.mkdir(exist_ok=True, parents=True)

        move(csv_files[0], output_file)

        return output_file


def extract_data(input_dir: Path, output_dir: Path) -> None:
    """Extract every zip archive in ``input_dir`` and write a small sample.

    For each ``*.zip`` file the CSV is extracted below ``output_dir`` and a
    second file holding the first 5000 rows is written next to it, with
    ``small.csv`` replacing the ``.csv`` suffix.

    Args:
        input_dir: Directory that is searched (non-recursively) for archives.
        output_dir: Root directory for the extracted CSV files.
    """
    # Use the function arguments: the previous version read the module-level
    # INPUT_DIR/OUTPUT_DIR, which only exist when the file is run as a script.
    for zip_filepath in input_dir.glob("*.zip"):
        output_file = extract_zipfile(zip_filepath, output_dir)
        smallfile_path = output_file.with_name(f"{output_file.stem}small.csv")
        # to_csv is called with its defaults, so the DataFrame index is
        # written as the first column of the sample file.
        pd.read_csv(output_file).head(5000).to_csv(smallfile_path)
        print("Extracted:", output_file, smallfile_path)


if __name__ == "__main__":
    # Script entry point: download one month, extract it, upload the result.
    INPUT_DIR = Path("./data/raw")
    OUTPUT_DIR = Path("./data/processed")
    # Target bucket; has to be the same bucket the setup.sh script created.
    BUCKET = "mpg3-testflights-polished-vault-379315"

    download_monthly_data(Year, Month, INPUT_DIR)
    extract_data(INPUT_DIR, OUTPUT_DIR)

    # Recursively copy the processed directory into the bucket's data folder;
    # check_call raises if gsutil exits with a non-zero status.
    upload_target = f"gs://{BUCKET}/data"
    upload_command = ["gsutil", "cp", "-r", OUTPUT_DIR.absolute(), upload_target]
    subprocess.check_call(upload_command)



Extracted: data/processed/2020/2020-05.csv data/processed/2020/2020-05small.csv


Copying file:///home/jupyter/flights_pipeline/data/processed/2020/2020-05small.csv [Content-Type=text/csv]...
Copying file:///home/jupyter/flights_pipeline/data/processed/2020/2020-05.csv [Content-Type=text/csv]...
- [2 files][ 78.4 MiB/ 78.4 MiB]                                                
Operation completed over 2 objects/78.4 MiB.                                     


In [22]:
# subsample a file:
from pathlib import Path
from tempfile import TemporaryDirectory
from shutil import unpack_archive, move
import pandas as pd


# Base URL of the BTS "PREZIP" archives and the year/month of interest.
# NOTE(review): these three values are not referenced by the code visible in
# this cell; confirm whether they are still needed here.
BTS_ROOT_URL = "https://transtats.bts.gov/PREZIP"
Year = 2020
Month = 5
# Directory that is scanned for downloaded *.zip archives.
INPUT_DIR = Path("./data/raw")
# Root directory below which the extracted CSV files are written.
OUTPUT_DIR = Path("./data/processed")

def extract_zipfile(zip_file: Path, output_dir: Path) -> Path:
    """Unpack a monthly archive and move its only CSV below ``output_dir``.

    Year and month are read from the end of the archive's base name
    (``..._<year>_<month>.zip``). A " (N)" marker after the month, as in
    ``2020_05 (1).zip``, is tolerated. The CSV ends up at
    ``<output_dir>/<year>/<year>-<month:02>.csv``.

    Args:
        zip_file: Path of the zip archive to unpack.
        output_dir: Root directory for the extracted CSV files.

    Returns:
        Path of the extracted CSV file.

    Raises:
        AssertionError: If the archive does not contain exactly one ``*.csv``
            file at its top level.
        ValueError: If year or month cannot be parsed from the archive name.
    """
    with TemporaryDirectory() as tmpdir:
        unpack_archive(zip_file, extract_dir=tmpdir)

        csv_files = list(Path(tmpdir).glob("*.csv"))
        # An explicit raise keeps this check active under `python -O`, which
        # strips `assert` statements; the exception type stays the same.
        if len(csv_files) != 1:
            raise AssertionError("There should be exactly one *.csv file per zip")

        name_tokens = zip_file.name.split(".")[0].split("_")
        year = int(name_tokens[-2])
        # Ignore a " (1)"-style duplicate marker that follows the month.
        month = int(name_tokens[-1].split(" (")[0])
        output_file = output_dir / f"{year}" / f"{year}-{month:02}.csv"

        output_file.parent.mkdir(exist_ok=True, parents=True)

        move(csv_files[0], output_file)

        return output_file

# Extract each archive found in INPUT_DIR and store its first 5000 rows in a
# sibling "<name>small.csv" file.
for zip_filepath in INPUT_DIR.glob("*.zip"):
    output_file = extract_zipfile(zip_filepath, OUTPUT_DIR)
    # Cut the 4-character ".csv" suffix before appending the new ending.
    smallfile_path = f"{str(output_file)[:-4]}small.csv"
    subsampled_file = pd.read_csv(output_file)
    subsampled_file.head(5000).to_csv(smallfile_path)
    print("Extracted:", output_file, smallfile_path)



Extracted: data/processed/2020/2020-05.csv data/processed/2020/2020-05small.csv


In [11]:
# Display the archive path last bound by the extraction loop.
zip_filepath

PosixPath('data/raw/2020_05.zip')

In [12]:
# Display the CSV path returned by the last extract_zipfile call.
output_file

PosixPath('data/processed/2020/2020-05.csv')

In [19]:
# Build the sample file name: drop the 4-character ".csv" suffix and append
# "small.csv", then display the resulting string.
smallfile_path = str(output_file)[:-4]+'small.csv'
smallfile_path

'data/processed/2020/2020-05small.csv'