Import poem texts from reocr files

In [1]:
import pathlib
import sys
import pandas as pd
from norn.poems import Poem
from norn.alto_tools import get_text
import glob
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup

OCR_FILES = pathlib.Path("/mnt/md1/new_2023/bok")

NORN_DATA = pathlib.Path("/mnt/md1/new_2023/norn")

done = OCR_FILES / "done"
failed = OCR_FILES / "failed"
poems = pathlib.Path("poems.csv")


In [2]:
# Import list of poems
df = pd.read_csv(poems)
poem_list = df.apply(lambda x: Poem(*x), axis=1).to_list()
urns = set([p.urn for p in poem_list]) # get unique URNs, references to books

In [3]:
# Get list of files that should be processed
done_urn = [x.split("/")[-1]
 for x in     
    glob.glob(str(done / '*'))]

failed_urn = [x.split("/")[-1] for x in glob.glob(str(failed / '*'))]

In [4]:
urns_in_done = [x for x in urns if x.split(":")[-1] in done_urn] # Get the URNs that are in the done folder
urns_in_failed = [x for x in urns if x.split(":")[-1] in failed_urn] # Get the URNs that are in the failed folder

In [16]:
len(urns), len(urns_in_done), len(urns_in_failed), len(urns_in_done) + len(urns_in_failed) #  Compare the number of URNs in the list of poems and the number of URNs in the done and failed folders

(71, 67, 4, 71)

In [8]:
urns_in_failed

['URN:NBN:no-nb_digibok_2013080706153',
 'URN:NBN:no-nb_digibok_2014041406092',
 'URN:NBN:no-nb_digibok_2013041105048',
 'URN:NBN:no-nb_digibok_2013060306081']

In [24]:
type(nan)

NameError: name 'nan' is not defined

In [25]:
poem_list[1].pages??

[0;31mType:[0m        float
[0;31mString form:[0m nan
[0;31mDocstring:[0m   Convert a string or number to a floating point number, if possible.

In [22]:
type(poem_list[1].pages)

float

In [26]:
from bs4 import BeautifulSoup

def get_alto_path(urn, page):
    
    
    filename = urn.split(":")[-1]
    
    if urn in urns_in_failed:
        subfolder = "failed"
    elif urn in urns_in_done:
        subfolder = "done"    
    else:
        raise ValueError("URN {} not in done or failed".format(urn))
    
    path = OCR_FILES / subfolder / filename / "ocr"
    page = str(page).zfill(4)
    alto_path = path / f"{filename}_{page}.xml"
    return alto_path

def get_alto(urn, page):
    alto_path = get_alto_path(urn, page)
    with open(alto_path) as f:
        soup = BeautifulSoup(f, "lxml-xml")
    return soup


alto_list = []
errors = []
for poem in tqdm(poem_list):
    if poem.urn in errors:
        continue
    
    for r in range(poem.page_start-1, poem.page_end):
        try: 
            alto = get_alto(poem.urn, r)
            
            if isinstance(poem.pages, float):
                poem.pages = []
            
            poem.pages.append(get_text(alto))
            
        except Exception as e:
            print("Error with", poem.urn, poem.title, r, e)
            errors.append(poem.urn)
            break
            
        # alto_list.append(alto)

  0%|          | 0/2940 [00:00<?, ?it/s]

In [29]:
from typing import Iterable

def show_start_page(poem: Poem) -> str:    
    return f"https://urn.nb.no/{poem.urn}?searchText=&page={poem.page_start}"

def to_txt(poem: Poem) -> str:
    return "\n".join(poem.pages)

def collection_to_txt(poems: Iterable[Poem], path: str):  
    
    for poem in poems:
        poem_path = pathlib.Path(path) / poem.urn
        poem_path.mkdir(exist_ok=True, parents=True)  
        with open(poem_path / f"{poem.dhlabid}.txt", "w") as f:
            for page in poem.pages:
                f.write(page)
                f.write("\n\n")   
    
    

## Export poems

In [30]:
# collection_to_txt(poem_list, NORN_DATA / "reocr_poems2") # Write the poems to disk

In [10]:
urns_in_failed

['URN:NBN:no-nb_digibok_2013080706153',
 'URN:NBN:no-nb_digibok_2014041406092',
 'URN:NBN:no-nb_digibok_2013041105048',
 'URN:NBN:no-nb_digibok_2013060306081']

# Compare old and new results

In [11]:
urns_in_failed

['URN:NBN:no-nb_digibok_2013080706153',
 'URN:NBN:no-nb_digibok_2014041406092',
 'URN:NBN:no-nb_digibok_2013041105048',
 'URN:NBN:no-nb_digibok_2013060306081']

In [12]:
new_files = NORN_DATA / "reocr_poems"
old_files = NORN_DATA / "texts_joined"


In [13]:
poem_dct = {}

for poem in poem_list:
    if poem.urn not in poem_dct:
        poem_dct[poem.urn] = {}
    poem_dct[poem.urn][poem.title] = poem

In [14]:
target_poems = [poem for poem in poem_list if poem.urn in urns_in_failed]


urn = urns_in_failed[0]

def get_list():
    return list(poem_dct[urn].keys())

# def get_old_and_new_path(urn: str, poem_i: int = 0):
    
#     old_file_path = old_files / urn / f'{get_list()[poem_i]}.txt'
#     new_file_path = new_files / urn / f"{get_list()[poem_i]}.txt"
    
#     return old_file_path, new_file_path
    

def get_old_and_new_path(poem: Poem):
    
    old_file_path = old_files / poem.urn / f'{poem.title}.txt'
    new_file_path = new_files / poem.urn / f'{poem.title}.txt'
    
    return old_file_path, new_file_path
    

    

In [15]:
import subprocess

poem = target_poems[70]

old, new = get_old_and_new_path(poem)

subprocess.run(["code" , old])
subprocess.run(["code" ,"--add", new, "--goto"])


show_start_page(poem)

'https://urn.nb.no/URN:NBN:no-nb_digibok_2014041406092?searchText=&page=36'