In [1]:
import os
import re
from pathlib import Path

import arrow
import dns.resolver
import requests
from dask.distributed import Client, LocalCluster
from requests_html import HTMLSession

# Toggle for the CI configuration of the Dask cluster below.
CI = True

# With CI set: one single-threaded worker running in-process (no worker
# processes). Otherwise: 16 single-threaded worker processes.
if CI:
    cluster = LocalCluster(n_workers=1, threads_per_worker=1, processes=False)
else:
    cluster = LocalCluster(n_workers=16, threads_per_worker=1, processes=True)
client = Client(cluster)

# Shared HTTP session used to fetch and parse the listing pages.
session = HTMLSession()

# Downloaded PDFs are cached in ./pdfs relative to the working directory.
CWD = Path.cwd()
PDFS_DIR = CWD.joinpath("pdfs")
PDFS_DIR.mkdir(exist_ok=True)

In [2]:
# Resolve travel.state.gov manually because the default DNS server on GitHub
# Actions cannot resolve it, see
# https://github.community/t/cannot-resolve-travel-state-gov-hostname-in-github-actions-with-default-dns-server/180625
resolver = dns.resolver.Resolver()
resolver.nameservers = ["1.1.1.1", "8.8.8.8"]
ip_address = resolver.resolve("travel.state.gov", "A")[0].to_text()

# NOTE(review): verify=False disables TLS certificate verification. It is
# presumably required because the request targets the bare IP address (the
# real hostname is only sent in the Host header) -- confirm, and do not reuse
# this pattern for anything that handles sensitive data.

# Immigrant-visa (IV) listing page.
pdf_list_r = session.get(
    f"https://{ip_address}/content/travel/en/legal/visa-law0/visa-statistics/immigrant-visa-statistics/monthly-immigrant-visa-issuances.html",
    headers={"Host": "travel.state.gov"},
    verify=False,
    timeout=60,  # fail instead of hanging forever on a stalled connection
)
# Stop here on an HTTP error rather than trying to parse an error page.
pdf_list_r.raise_for_status()
pdf_list_content = pdf_list_r.html.find(".contentbody", first=True)
pdf_links = [
    link
    for link in pdf_list_content.find("a")
    # Anchors without an href attribute are skipped instead of raising KeyError.
    if link.attrs.get("href", "").endswith(".pdf")
]

# Nonimmigrant-visa (NIV) listing page, fetched the same way.
niv_pdf_list_r = session.get(
    f"https://{ip_address}/content/travel/en/legal/visa-law0/visa-statistics/nonimmigrant-visa-statistics/monthly-nonimmigrant-visa-issuances.html",
    headers={"Host": "travel.state.gov"},
    verify=False,
    timeout=60,
)
niv_pdf_list_r.raise_for_status()
niv_pdf_list_content = niv_pdf_list_r.html.find(".contentbody", first=True)
niv_pdf_links = [
    link
    for link in niv_pdf_list_content.find("a")
    if link.attrs.get("href", "").endswith(".pdf")
]



In [3]:
# Map "<KIND>-<YYYY-MM>" (KIND is "NIV" or "IV") to the absolute URL of that
# month's "Issuances by Post and Visa Class" PDF.
urls_by_month = {}
for link in [*pdf_links, *niv_pdf_links]:
    # Link texts look like "<Month YYYY> - <category>", but the separator is
    # not spelled consistently. Normalise any dash variant (ASCII hyphen or
    # U+2010..U+2015) surrounded by any Unicode whitespace to " - ".
    link_text = re.sub(r"\s+[-\u2010-\u2015]\s+", " - ", link.text)

    month, _, category = link_text.partition(" - ")
    month = month.strip()
    category = category.replace("Visa Cass", "Visa Class")  # typo seen on the site

    # "NIV ..." must be tested first: the IV phrase is a substring of it.
    if "NIV Issuances by Post and Visa Class" in category:
        kind = "NIV"
    elif "IV Issuances by Post and Visa Class" in category:
        kind = "IV"
    else:
        # Some other report type; not needed here.
        continue

    key = f"{kind}-{arrow.get(month, 'MMMM YYYY').format('YYYY-MM')}"
    urls_by_month[key] = list(link.absolute_links)[0]

In [4]:
from itertools import product

# Download every monthly PDF (from 2017-03 up to the current month) that is
# listed in urls_by_month and not already cached in PDFS_DIR.
for kind, month in product(
    ["NIV", "IV"], arrow.Arrow.range("month", arrow.get("2017-03-01"), arrow.get())
):
    key = f"{kind}-{month.format('YYYY-MM')}"
    outfile = PDFS_DIR / f"{key}.pdf"
    if outfile.exists():
        print(f"skippy {outfile}")
        continue
    print(f"doin {outfile}")
    try:
        url = urls_by_month[key]
    except KeyError:
        # No link for this month was found on the listing pages.
        print(f"but no {outfile}!")
        continue
    # Connect to the manually resolved IP and send the real hostname in the
    # Host header. NOTE(review): verify=False disables TLS verification.
    r = requests.get(
        url.replace("travel.state.gov", ip_address),
        headers={"Host": "travel.state.gov"},
        verify=False,
        timeout=60,  # do not hang forever on a stalled connection
    )
    if not r.ok:
        # Never cache an error page as a ".pdf": the exists() check above
        # would skip it on every later run and the parser would choke on it.
        print(f"failed {outfile}: HTTP {r.status_code}")
        continue
    outfile.write_bytes(r.content)

doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2017-03.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2017-04.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2017-05.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2017-06.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2017-07.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2017-08.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2017-09.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2017-10.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2017-11.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2017-12.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2018-01.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2018-02.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2018-03.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2018-04.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2018-05.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2018-06.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2018-07.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2018-08.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2018-09.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2018-10.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2018-11.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2018-12.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2019-01.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2019-02.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2019-03.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2019-04.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2019-05.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2019-06.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2019-07.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2019-08.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2019-09.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2019-10.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2019-11.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2019-12.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2020-01.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2020-02.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2020-03.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2020-04.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2020-05.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2020-06.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2020-07.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2020-08.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2020-09.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2020-10.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2020-11.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2020-12.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2021-01.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2021-02.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2021-03.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2021-04.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2021-05.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2021-06.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2021-07.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2021-08.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2021-09.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2021-10.pdf
but no /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2021-10.pdf!
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2021-11.pdf
but no /home/runner/work/visawhen/visawhen/data/consulates/pdfs/NIV-2021-11.pdf!
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2017-03.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2017-04.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2017-05.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2017-06.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2017-07.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2017-08.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2017-09.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2017-10.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2017-11.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2017-12.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2018-01.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2018-02.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2018-03.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2018-04.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2018-05.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2018-06.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2018-07.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2018-08.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2018-09.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2018-10.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2018-11.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2018-12.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2019-01.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2019-02.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2019-03.pdf
but no /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2019-03.pdf!
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2019-04.pdf
but no /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2019-04.pdf!
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2019-05.pdf
but no /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2019-05.pdf!
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2019-06.pdf
but no /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2019-06.pdf!
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2019-07.pdf
but no /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2019-07.pdf!
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2019-08.pdf
but no /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2019-08.pdf!
doin /home/runner/work/visawhen/visawhen/data/consulates/p



doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2019-12.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2020-01.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2020-02.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2020-03.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2020-04.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2020-05.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2020-06.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2020-07.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2020-08.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2020-09.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2020-10.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2020-11.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2020-12.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2021-01.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2021-02.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2021-03.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2021-04.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2021-05.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2021-06.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2021-07.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2021-08.pdf
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2021-09.pdf




doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2021-10.pdf
but no /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2021-10.pdf!
doin /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2021-11.pdf
but no /home/runner/work/visawhen/visawhen/data/consulates/pdfs/IV-2021-11.pdf!


In [5]:
import camelot
import pandas as pd
import PyPDF2

# Restart the Dask workers before the parsing tasks below are submitted.
# NOTE(review): presumably to start from a clean worker state -- confirm.
client.restart()


def process_path(path: Path, page_no: int):
    """Extract the issuance table from one page of a monthly visa PDF.

    Args:
        path: PDF file named ``NIV-YYYY-MM.pdf`` or ``IV-YYYY-MM.pdf``; the
            month is taken from the file name.
        page_no: 1-based page number to parse.

    Returns:
        A DataFrame with columns ``Post``, ``Visa Class`` (both ``string``),
        ``Issuances`` (``uint32``) and ``Month`` (first day of the month), or
        ``None`` when camelot finds no table on the page.

    Raises:
        ValueError: if the extracted table does not have exactly three
            columns (raised by the column assignment).
    """
    parsed = camelot.read_pdf(str(path), pages=str(page_no))
    if len(parsed) == 0:
        return None

    table = parsed[0].df
    table.columns = ["Post", "Visa Class", "Issuances"]

    # Strip thousands separators and any other non-digits; cells that end up
    # empty (e.g. the header row) become NaN and are dropped below.
    table["Issuances"] = pd.to_numeric(
        table["Issuances"].replace(r"\D", "", regex=True),
        errors="coerce",
        downcast="integer",
    )
    # Drop every row that has an empty cell in any column.
    table = table.replace("", pd.NA).dropna().reset_index(drop=True)

    table["Post"] = pd.Series(table["Post"], dtype="string")
    table["Visa Class"] = pd.Series(table["Visa Class"], dtype="string")

    # "NIV-2017-03.pdf" / "IV-2017-03.pdf" -> "2017-03" -> Timestamp(2017-03-01)
    table["Month"] = pd.Timestamp(
        arrow.get(
            path.name.replace("NIV-", "").replace("IV-", "").replace(".pdf", "")
        ).format("YYYYMMDD")
    )

    # uint32 rather than uint16: counts above 65535 would silently wrap
    # around in a 16-bit column.
    table["Issuances"] = table["Issuances"].astype("uint32")
    return table


# Submit one parsing task per page of every cached PDF. Sorting the paths
# makes the submission order (and so the concat order) deterministic.
futures = []
for path in sorted(PDFS_DIR.glob("*.pdf")):
    # Close the file handle as soon as the page count is known.
    with path.open("rb") as pdf_file:
        page_count = PyPDF2.PdfFileReader(pdf_file).getNumPages()
    for page_no in range(1, page_count + 1):
        futures.append(client.submit(process_path, path, page_no))

# Pages without a table come back as None and are dropped before concatenation.
results = [result for result in client.gather(futures) if result is not None]
all_months = pd.concat(results).sort_values("Month").reset_index(drop=True)

tornado.application - ERROR - Exception in callback <bound method Worker.trigger_profile of <Worker: 'inproc://10.1.0.125/4597/4', 0, Status.running, stored: 48, running: 1/1, ready: 6957, comm: 0, waiting: 0>>
Traceback (most recent call last):
  File "/home/runner/.cache/pypoetry/virtualenvs/consulates-YiFvjhQe-py3.10/lib/python3.10/site-packages/distributed/profile.py", line 115, in process
    d = state["children"][ident]
KeyError: 'text_in_bbox;/home/runner/.cache/pypoetry/virtualenvs/consulates-YiFvjhQe-py3.10/lib/python3.10/site-packages/camelot/utils.py;342'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/runner/.cache/pypoetry/virtualenvs/consulates-YiFvjhQe-py3.10/lib/python3.10/site-packages/tornado/ioloop.py", line 905, in _run
    return self.callback()
  File "/home/runner/.cache/pypoetry/virtualenvs/consulates-YiFvjhQe-py3.10/lib/python3.10/site-packages/distributed/worker.py", line 3623, in trigger_p

In [6]:
# Show the cluster summary. Scaling the cluster down to zero workers is left
# disabled here: cluster.scale(0)
cluster

0,1
Dashboard: http://10.1.0.125:8787/status,Workers: 1
Total threads: 1,Total memory: 6.79 GiB
Status: running,Using processes: False

0,1
Comm: inproc://10.1.0.125/4597/1,Workers: 1
Dashboard: http://10.1.0.125:8787/status,Total threads: 1
Started: 1 hour ago,Total memory: 6.79 GiB

0,1
Comm: inproc://10.1.0.125/4597/4,Total threads: 1
Dashboard: http://10.1.0.125:44651/status,Memory: 6.79 GiB
Nanny: None,
Local directory: /home/runner/work/visawhen/visawhen/data/consulates/dask-worker-space/worker-od3hde07,Local directory: /home/runner/work/visawhen/visawhen/data/consulates/dask-worker-space/worker-od3hde07


In [7]:
# Preview the first rows whose issuance count is greater than 1000.
all_months.loc[all_months["Issuances"].gt(1000)].head()

Unnamed: 0,Post,Visa Class,Issuances,Month
56,Colombo,B1/B2,1297,2017-03-01
103,Doha,B1/B2,1221,2017-03-01
151,Dhahran,B1/B2,1406,2017-03-01
169,Casablanca,B1/B2,1048,2017-03-01
197,Chengdu,B1/B2,13972,2017-03-01


In [8]:
# Persist the combined table as a pickle file in the current working directory.
all_months.to_pickle("all_months.pkl")

In [9]:
# Monthly total of CR1 and IR1 issuances at the Budapest post.
# Both conditions are combined into a single mask, and only the numeric
# "Issuances" column is summed: summing the whole frame depends on pandas
# silently dropping the string/datetime columns, which newer versions no
# longer do.
bud_marriage = (
    all_months.loc[
        (all_months["Post"] == "Budapest")
        & all_months["Visa Class"].isin(["CR1", "IR1"])
    ]
    .groupby("Month")[["Issuances"]]
    .sum()
)
bud_marriage

Unnamed: 0_level_0,Issuances
Month,Unnamed: 1_level_1
2017-03-01,3
2017-04-01,2
2017-05-01,6
2017-06-01,1
2017-07-01,1
2017-08-01,3
2017-09-01,2
2017-10-01,3
2017-11-01,3
2017-12-01,4
