In [1]:
%reload_ext autoreload
%autoreload 2


#This is needed to add the repo dir to the path so jupyter
# can load the republic modules directly from the notebooks
import os
import sys
import re
# sys.path.append('/Users/rikhoekstra/develop/republic_clean/')

from collections import defaultdict
import pandas as pd
from collections import Counter, defaultdict
import json
# import republic.model.republic_document_model as rdm


# repo_dir = os.path.split(os.getcwd())[0]

# if repo_dir not in sys.path:
#     sys.path.append(repo_dir)
    
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
pip install recordsearch-data-scraper

Note: you may need to restart the kernel to use updated packages.


In [3]:
from recordsearch_data_scraper.scrapers import *


In [4]:
import json
import string
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests
from IPython.display import HTML, FileLink, display
from recordsearch_data_scraper.scrapers import RSItemSearch
from slugify import slugify
from tqdm.auto import tqdm

# This is a workaround for a problem with tqdm adding space to cells
HTML(
    """
    <style>
    .p-Widget.jp-OutputPrompt.jp-OutputArea-prompt:empty {
      padding: 0;
      border: 0;
    }
    </style>
"""
)

In [5]:
# This is basically a list of letters and numbers that we can use to build up control symbol values.
control_range = (
    [str(number) for number in range(0, 10)]
    + [letter for letter in string.ascii_uppercase]
    + ["/"]
)


def get_results(data_dir, **kwargs):
    """
    Save all the results from a search using the given parameters.
    If there are more than 20,000 results, return False.
    Otherwise, return the harvested items.
    """
    s = RSItemSearch(**kwargs)
    if s.total_results == "20,000+":
        return False
    else:
        with tqdm(total=s.total_results, leave=False) as pbar:
            more = True
            while more:
                data = s.get_results()
                if data["results"]:
                    save_to_ndjson(data_dir, data["results"])
                    pbar.update(len(data["results"]))
                    time.sleep(0.5)
                else:
                    more = False
        return True


def refine_controls(current_control, data_dir, **kwargs):
    """
    Add additional letters/numbers to the control symbol wildcard search
    until the number of results is less than 20,000.
    Then harvest the results.
    Returns:
        * the RSItemSearch object (containing the search params, total results etc)
        * a list containing the harvested items
    """
    for control in control_range:
        new_control = current_control.strip("*") + control + "*"
        # print(new_control)
        kwargs["control"] = new_control
        results = get_results(data_dir, **kwargs)
        # print(total)
        if results is False:
            refine_controls(new_control, data_dir, **kwargs)


def create_data_dir(search, today):
    """
    Create a directory for the harvested data -- using the date and search parameters.
    """
    params = search.params.copy()
    params.update(search.kwargs)
    search_param_str = slugify(
        "_".join(
            sorted(
                [
                    f"{k}_{v}"
                    for k, v in params.items()
                    if v is not None and k not in ["results_per_page", "sort"]
                ]
            )
        )
    )
    data_dir = Path("harvests", f'{today.strftime("%Y%m%d_%H%M%S")}_{search_param_str}')
    data_dir.mkdir(exist_ok=True, parents=True)
    return data_dir


def save_to_ndjson(data_dir, results):
    """
    Save results into a single, newline delimited JSON file.
    """
    output_file = Path(data_dir, "results.ndjson")
    with output_file.open("a") as ndjson_file:
        for result in results:
            ndjson_file.write(json.dumps(result) + "\n")


def save_metadata(search, data_dir, today, totals):
    """
    Save information about the harvest to a JSON file.
    """
    metadata = {
        "date_harvested": today.isoformat(),
        "search_params": search.params,
        "search_kwargs": search.kwargs,
        "total_results": search.total_results,
        "total_harvested": totals["harvested"],
        "total_after_deduplication": totals["deduped"],
    }

    with Path(data_dir, "metadata.json").open("w") as md_file:
        json.dump(metadata, md_file)


def save_csv(data_dir):
    """
    Save the harvested results as a CSV file, removing any duplicates.
    """
    output_file = Path(data_dir, "results.csv")
    input_file = Path(data_dir, "results.ndjson")
    df = pd.read_json(input_file, lines=True)
    harvested = df.shape[0]
    # Flatten list
    try:
        df["access_decision_reasons"] = (
            df["access_decision_reasons"].dropna().apply(lambda l: " | ".join(l))
        )
    except KeyError:
        pass
    # Remove any duplicates
    df.drop_duplicates(inplace=True)
    df.to_csv(output_file, index=False)
    deduped = df.shape[0]
    return {"harvested": harvested, "deduped": deduped}


def harvest_search(**kwargs):
    """
    Harvest all the items from a search using the supplied parameters.
    If there are more than 20,000 results, it will use control symbol
    wildcard values to try and split the results into harvestable chunks.
    """
    # Initialise the search
    search = RSItemSearch(**kwargs)
    today = datetime.now()
    data_dir = create_data_dir(search, today)
    # If there are more than 20,000 results, try chunking using control symbols
    if search.total_results == "20,000+":
        # Loop through the letters and numbers
        for control in control_range:
            # print(control)
            # Add letter/number as a wildcard value
            kwargs["control"] = f"{control}*"
            # Try getting the results
            results = get_results(data_dir, **kwargs)
            # print(results)
            if results is False:
                # If there's still more than 20,000, add more letters/numbers to the control symbol!
                refine_controls(control, data_dir, **kwargs)
    # If there's less than 20,000 results, save them all
    else:
        get_results(data_dir, **kwargs)
    totals = save_csv(data_dir)
    save_metadata(search, data_dir, today, totals)
    print(f"Harvest directory: {data_dir}")
    display(FileLink(Path(data_dir, "metadata.json")))
    display(FileLink(Path(data_dir, "results.ndjson")))
    display(FileLink(Path(data_dir, "results.csv")))
    return data_dir


def save_images(harvest_dir):
    df = pd.read_csv(Path(harvest_dir, "results.csv"))
    with tqdm(
        total=df.loc[df["digitised_status"] == True].shape[0], desc="Files"
    ) as pbar:
        for item in df.loc[df["digitised_status"] == True].itertuples():
            image_dir = Path(
                f"{harvest_dir}/images/{slugify(item.series)}-{slugify(str(item.control_symbol))}-{item.identifier}"
            )

            # Create the folder (and parent if necessary)
            image_dir.mkdir(exist_ok=True, parents=True)

            # Loop through the page numbers
            for page in tqdm(
                range(1, int(item.digitised_pages) + 1), desc="Images", leave=False
            ):

                # Define the image filename using the barcode and page number
                filename = Path(f"{image_dir}/{item.identifier}-{page}.jpg")

                # Check to see if the image already exists (useful if rerunning a failed harvest)
                if not filename.exists():
                    # If it doens't already exist then download it
                    img_url = f"https://recordsearch.naa.gov.au/NaaMedia/ShowImage.asp?B={item.identifier}&S={page}&T=P"
                    response = requests.get(img_url)
                    try:
                        response.raise_for_status()
                    except requests.exceptions.HTTPError:
                        pass
                    else:
                        filename.write_bytes(response.content)

                    time.sleep(0.5)
            pbar.update(1)


def save_pdfs(harvest_dir):
    df = pd.read_csv(Path(harvest_dir, "results.csv"))
    pdf_dir = Path(harvest_dir, "pdfs")
    pdf_dir.mkdir(exist_ok=True, parents=True)
    with tqdm(
        total=df.loc[df["digitised_status"] == True].shape[0], desc="Files"
    ) as pbar:
        for item in df.loc[df["digitised_status"] == True].itertuples():
            pdf_file = Path(
                pdf_dir,
                f"{slugify(item.series)}-{slugify(str(item.control_symbol))}-{item.identifier}.pdf",
            )
            if not pdf_file.exists():
                pdf_url = f"https://recordsearch.naa.gov.au/SearchNRetrieve/NAAMedia/ViewPDF.aspx?B={item.identifier}&D=D"
                response = requests.get(pdf_url)
                try:
                    response.raise_for_status()
                except requests.exceptions.HTTPError:
                    pass
                else:
                    pdf_file.write_bytes(response.content)
                time.sleep(0.5)
            pbar.update(1)

In [6]:
from slugify import slugify

In [8]:
search, items = harvest_search(kw='DUTCH', series='A2571', digital=True)

  0%|          | 0/14832 [00:00<?, ?it/s]

Harvest directory: harvests/20231025_155130_b'digital_true_kw_dutch_record_detail_brief_series_a2571'


TypeError: cannot unpack non-iterable PosixPath object

In [40]:
csv_results = pd.read_csv("harvests/20231025_155130_b'digital_true_kw_dutch_record_detail_brief_series_a2571'/results.csv")

In [41]:
csv_results

Unnamed: 0,series,control_symbol,title,identifier,access_status,location,contents_date_str,contents_start_date,contents_end_date,digitised_status,retrieved
0,A2571,ACHTEREN V TEUNIS,"ACHTEREN V, Teunis : Year of Birth - 1917 : Na...",8119360,Open,Canberra,1947 - 1956,1947,1956,True,2023-10-26T00:43:57.273658+11:00
1,A2571,ACHTEREN V ELISABETH,"ACHTEREN V, V Elisabeth : Year of Birth - 1927...",8119361,Open,Canberra,1947 - 1956,1947,1956,True,2023-10-26T00:43:57.273658+11:00
2,A2571,ACHTEREN V PAULUS,"ACHTEREN V, V Paulus : Year of Birth - 1950 : ...",8119362,Open,Canberra,1947 - 1956,1947,1956,True,2023-10-26T00:43:57.273658+11:00
3,A2571,ACHTEREN V ELEISABETH,"ACHTEREN V, V Eleisabeth : Year of Birth - 195...",8119363,Open,Canberra,1947 - 1956,1947,1956,True,2023-10-26T00:43:57.273658+11:00
4,A2571,ACKER VAN WALTER AUGUSTUS,"ACKER VAN, Walter Augustus : Year of Birth - 1...",8119375,Open,Canberra,1947 - 1956,1947,1956,True,2023-10-26T00:43:57.273658+11:00
...,...,...,...,...,...,...,...,...,...,...,...
14827,A2571,PANHUYZEN ANDREAS,"PANHUYZEN, Andreas : Year of Birth - 1941 : Na...",203932862,Open,Canberra,1947 - 1956,1947,1956,True,2023-10-26T01:06:29.873897+11:00
14828,A2571,PANHUYZEN GODEFRIDA,"PANHUYZEN, Godefrida : Year of Birth - 1940 : ...",203932863,Open,Canberra,1947 - 1956,1947,1956,True,2023-10-26T01:06:29.873897+11:00
14829,A2571,PANHUYZEN PENIONELLA,"PANHUYZEN, Penionella : Year of Birth - 1919 :...",203932864,Open,Canberra,1947 - 1956,1947,1956,True,2023-10-26T01:06:29.873897+11:00
14830,A2571,PANHUYZEN MARTINES,"PANHUYZEN, Martines : Year of Birth - 1948 : N...",203932865,Open,Canberra,1947 - 1956,1947,1956,True,2023-10-26T01:06:29.873897+11:00


In [42]:
import re

In [43]:
 pat = "[A-Z].*: Year of Birth - 1917 : Nationality - DUTCH : Travelled per - 7SEAS2 : Number -"

In [44]:
with pd.option_context('display.max_colwidth', 800): # for displaying large columns
    display(csv_results.title)

0                        ACHTEREN V, Teunis : Year of Birth - 1917 : Nationality - DUTCH : Travelled per - 7SEAS2 : Number -
1                   ACHTEREN V, V Elisabeth : Year of Birth - 1927 : Nationality - DUTCH : Travelled per - 7SEAS2 : Number -
2                      ACHTEREN V, V Paulus : Year of Birth - 1950 : Nationality - DUTCH : Travelled per - 7SEAS2 : Number -
3                  ACHTEREN V, V Eleisabeth : Year of Birth - 1953 : Nationality - DUTCH : Travelled per - 7SEAS2 : Number -
4        ACKER VAN, Walter Augustus : Year of Birth - 1915 : Nationality - DUTCH : Travelled per - F.SEA X : Number - 329105
                                                                ...                                                         
14827         PANHUYZEN, Andreas : Year of Birth - 1941 : Nationality - DUTCH : Travelled per - NELLY 8 : Number - [UNKNOWN]
14828       PANHUYZEN, Godefrida : Year of Birth - 1940 : Nationality - DUTCH : Travelled per - NELLY 8 : Number - [UNKNOWN]


In [62]:
csv_extra = csv_results.title.str.extractall('(?P<name>[A_Z]+.*) 
                                    : Year of Birth - (?P<birthyear>\d+) : Nationality - DUTCH : Travelled per - (?P<ship>.*) : Number - (?P<number>.*)')[['birthyear','ship','number']]

In [63]:
csv_extra

Unnamed: 0_level_0,Unnamed: 1_level_0,birthyear,ship,number
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4,0,1915,F.SEA X,329105
5,0,1916,F. SEA X,329106
9,0,1925,F. SEA 9,324322
12,0,1914,F. SEA X,329502
13,0,1920,F. SEA X,329501
...,...,...,...,...
14827,0,1941,NELLY 8,[UNKNOWN]
14828,0,1940,NELLY 8,[UNKNOWN]
14829,0,1919,NELLY 8,349535
14830,0,1948,NELLY 8,[UNKNOWN]


In [33]:
csv_results[['birth_year','ship', 'number']] = 

In [34]:
csv_results

Unnamed: 0,series,control_symbol,title,identifier,access_status,location,contents_date_str,contents_start_date,contents_end_date,digitised_status,retrieved,birth_year,ship,number
0,A2571,ACHTEREN V TEUNIS,"ACHTEREN V, Teunis : Year of Birth - 1917 : Na...",8119360,Open,Canberra,1947 - 1956,1947,1956,True,2023-10-26T00:43:57.273658+11:00,,,
1,A2571,ACHTEREN V ELISABETH,"ACHTEREN V, V Elisabeth : Year of Birth - 1927...",8119361,Open,Canberra,1947 - 1956,1947,1956,True,2023-10-26T00:43:57.273658+11:00,,,
2,A2571,ACHTEREN V PAULUS,"ACHTEREN V, V Paulus : Year of Birth - 1950 : ...",8119362,Open,Canberra,1947 - 1956,1947,1956,True,2023-10-26T00:43:57.273658+11:00,,,
3,A2571,ACHTEREN V ELEISABETH,"ACHTEREN V, V Eleisabeth : Year of Birth - 195...",8119363,Open,Canberra,1947 - 1956,1947,1956,True,2023-10-26T00:43:57.273658+11:00,,,
4,A2571,ACKER VAN WALTER AUGUSTUS,"ACKER VAN, Walter Augustus : Year of Birth - 1...",8119375,Open,Canberra,1947 - 1956,1947,1956,True,2023-10-26T00:43:57.273658+11:00,1915,F.SEA X,329105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14827,A2571,PANHUYZEN ANDREAS,"PANHUYZEN, Andreas : Year of Birth - 1941 : Na...",203932862,Open,Canberra,1947 - 1956,1947,1956,True,2023-10-26T01:06:29.873897+11:00,1941,NELLY 8,[UNKNOWN]
14828,A2571,PANHUYZEN GODEFRIDA,"PANHUYZEN, Godefrida : Year of Birth - 1940 : ...",203932863,Open,Canberra,1947 - 1956,1947,1956,True,2023-10-26T01:06:29.873897+11:00,1940,NELLY 8,[UNKNOWN]
14829,A2571,PANHUYZEN PENIONELLA,"PANHUYZEN, Penionella : Year of Birth - 1919 :...",203932864,Open,Canberra,1947 - 1956,1947,1956,True,2023-10-26T01:06:29.873897+11:00,1919,NELLY 8,349535
14830,A2571,PANHUYZEN MARTINES,"PANHUYZEN, Martines : Year of Birth - 1948 : N...",203932865,Open,Canberra,1947 - 1956,1947,1956,True,2023-10-26T01:06:29.873897+11:00,1948,NELLY 8,[UNKNOWN]


In [22]:
csv_results.merge(exres)

MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False