In [10]:
import requests
from pyquery import PyQuery as pq
from tqdm import tqdm

import gzip
import csv
import os
from collections import OrderedDict
import io
import codecs

In [11]:
# --- Configuration -----------------------------------------------------------
# Prefix (directory) prepended to every locally cached archive name.
datadir = "data/"
# Local archives are named datadir + filepre + <remote name minus filespec>.
filepre = "Annual"
# Remote directory listing; also the base that each matching href is appended to.
baseurl = "https://www1.ncdc.noaa.gov/pub/data/swdi/stormevents/csvfiles/"
# Substring that marks a link as one to download; stripped from the local name.
filespec = "StormEvents_details-ftp_v1.0_d"
# Path of the combined CSV built from all downloaded archives.
destination = "stormeventshistorical.csv"
# Scratch file that holds one decompressed archive at a time; removed at the end.
tempfile = "this-is-a-annual-report-tempfile"

In [12]:
# Fetch the HTML directory listing that links to every archive.
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops the run on an HTTP error instead of letting an
# error page be parsed as if it were the listing.
listing_response = requests.get(baseurl, timeout=60)
listing_response.raise_for_status()
html = listing_response.content

In [13]:
# Collect the href of every link in the listing whose target contains `filespec`.
todownload = []
for link in pq(html)("a"):
    # `link` is already the <a> element, so its href can be read directly.
    # Anchors without an href yield None; skip them instead of crashing on .strip().
    href = pq(link).attr("href")
    if href is None:
        continue
    href = href.strip()
    if filespec in href:
        todownload.append(href)

In [14]:
# Download every archive that is not already cached locally.
#   gzipfiles -- local paths of all archives (cached earlier or fetched now)
#   WorkToDo  -- True once at least one new archive has been fetched
gzipfiles = []
WorkToDo = False

# Local names are built by string concatenation, so create whatever directory
# part the prefix contains; a fresh checkout otherwise fails on the first open().
cache_dir = os.path.dirname(datadir + filepre)
if cache_dir:
    os.makedirs(cache_dir, exist_ok=True)

for item in tqdm(todownload):
    filename = datadir + filepre + item.replace(filespec, "")
    gzipfiles.append(filename)
    if not os.path.exists(filename):   # If we haven't already downloaded this ...
        remoteurl = baseurl + item
        # Fetch and validate before touching the disk, so an HTTP error body is
        # never saved under the archive's name.
        response = requests.get(remoteurl, timeout=300)
        response.raise_for_status()
        # Write to a side file and rename it into place: an interrupted write
        # must not leave a truncated archive that later runs would treat as
        # already downloaded.
        partial = filename + ".part"
        with open(partial, "wb") as archive:
            archive.write(response.content)
        os.replace(partial, filename)
        WorkToDo = True

100%|██████████████████████████████████████████████████████████████████████████████████| 70/70 [00:01<00:00, 52.09it/s]


In [15]:
# With no fresh downloads, a rebuild is only needed when the combined CSV is absent.
if not WorkToDo:
    print("Nothing new was downloaded.")
    # WorkToDo is False on this path, so this flips it to True only when the
    # target CSV is missing -- we still want to build it out in that case.
    WorkToDo = not os.path.exists(destination)

In [16]:
# Merge every downloaded archive into one CSV at `destination`.
# The header row is taken from the first non-empty archive and written once;
# data rows from every archive follow in `gzipfiles` order.
if WorkToDo:
    print("Building out report. This will take a while.")
    headers = None
    with open(destination, "w", newline="") as outfile:
        writer = csv.writer(outfile)
        # GZIP processing in text mode seemed horribly slow (it appeared to
        # buffer line by line), so each archive is inflated in binary mode into
        # a scratch file and that file is reopened as text for the csv module.
        for gzipfile in tqdm(gzipfiles):
            with gzip.open(gzipfile, mode="rb") as compressed:
                with open(tempfile, "wb") as scratch:
                    scratch.write(compressed.read())
            # newline="" lets the csv module do its own line-ending handling, so
            # quoted fields containing line breaks are parsed correctly.
            with open(tempfile, "r", newline="") as textfilehandle:
                reader = csv.DictReader(textfilehandle)
                # Take the column names from the reader itself rather than from
                # a leftover loop variable, and emit them only once -- not once
                # per archive. An empty archive has no fieldnames and is skipped.
                if headers is None and reader.fieldnames:
                    headers = list(reader.fieldnames)
                    writer.writerow(headers)
                # Stream rows straight to the output instead of buffering the
                # whole archive in memory first.
                for row in reader:
                    writer.writerow(list(row.values()))

Building out report. This will take a while.


100%|██████████████████████████████████████████████████████████████████████████████████| 70/70 [01:51<00:00,  3.59s/it]


In [17]:
# Remove the decompression scratch file, if the build step left one behind.
scratch_left_behind = os.path.exists(tempfile)
if scratch_left_behind:
    os.remove(tempfile)