In [200]:
import requests
from bs4 import BeautifulSoup
from typing import NamedTuple, Optional, Iterable, TypedDict
import pandas as pd
import os


def format_number(number):
    return "{:04d}".format(number)


def get_alto(urn: str, page: int = 1):
    """Get alto from a page in a book from the National Library of Norway.
    Args:
        urn (str): URN number for the book
        page (int): page number
    Returns:
        str: text from the page
    """
    r = requests.get(api_str.format(urn=urn, page=format_number(page)))
    
    if r.status_code != 200:
        print(r.status_code, api_str.format(urn=urn, page=format_number(page)))
    return r.text

def check_alto_style(soup: BeautifulSoup) -> str:
    if soup.find("ComposedBlock"):
        return "alto_2"
    elif soup.find("composedblock"):
        return "alto_3"
    elif soup.find("PrintSpace"):
        return "alto_1"
    else:
        raise ValueError("Could not find alto style")
    


def get_text(soup: BeautifulSoup) -> str:
    """Get text from a alto soup object.

    Args:
        soup (BeautifulSoup): soup object from alto xml

    Returns:
        str: text from the page
    """
    alto_dict : dict = {
        "composed_block": {
            "alto_2" : "ComposedBlock",
            "alto_3" : "composedblock",
            "alto_1" : "PrintSpace"
            },
        "text_block": {
            "alto_2" : "TextBlock",
            "alto_3" : "textblock",
            "alto_1" : "TextBlock"
            },
        "text_line": {
            "alto_2" : "TextLine",
            "alto_3" : "textline",
            "alto_1" : "TextLine"
            },
        "string": {
            "alto_2" : "String",
            "alto_3" : "string",
            "alto_1" : "String"
            },
        "content": {
            "alto_2" : "CONTENT",
            "alto_3" : "content",
            "alto_1" : "CONTENT"
        }
    }   
    
    alto_style = check_alto_style(soup)
    
    text = ""
    for composed_block in soup.find_all(alto_dict["composed_block"][alto_style]):
        for text_block in composed_block.find_all(alto_dict["text_block"][alto_style]):
            for text_line in text_block.find_all(alto_dict["text_line"][alto_style]):
                for string in text_line.find_all(alto_dict["string"][alto_style]):
                    # print(string["CONTENT"])
                    text += string[alto_dict["content"][alto_style]] + " "
                text += "\n"
            text += "\n"
        text += "\n"

    return text


urn = "URN:NBN:no-nb_digibok_2014110308039"
api_str = "https://api.nb.no/catalog/v1/metadata/{urn}/altos/{urn}_{page}"

In [223]:
alto = get_alto("URN:NBN:no-nb_digibok_2012103106111", 17)

In [231]:
soup = BeautifulSoup(alto, "lxml")



In [232]:
soup.find("PrintSpace")

In [230]:
get_text(soup)

'£)g nåar eg fjetrer ben gjenta talar — \nb\'er fom be Utfar baab\' berg og balar; \nog nåar eg fjøtirer ben gjenta føe — \nbe tittar alt, fom paa jort e\\ \n\n«Soffefionbt. \n\n9tte foarte fpenne, teffje fring fjatten, \nme foarte buffur og plagg fring [jalfert, \nme totte foffar og tøpebonb**), — \nfo mett\' eg guten min mang et gong. \n\n$bft at eg ljøbrbe, (jan paa fjoren robbe, \neg tjøPrbe maate, eg fjenbe tjoe. \n\n@g fjøprbe maate ao guten min — \neg dart foranbra i mit fjeite lio. \n\n©ei foffebonbi, eg dit \'om jlengje, \nbei flat l)an tjaoa, fo eg fan f)an fjernte. \n$)uf!ann f!at t>era ao fttfetraa; \n\nfo fet eg mit namn og f)an3 uppaa. \n\n•) \n\nfoffebonb au „føt)e", fint utatanbS ullgarn. 2; ei ajefl \nti, fan bu fjaa. \n\nmc fneßrof \n\nben \n\n•?G \n\n29 \n\n2 \n\n\n'

In [205]:
df.loc[df.urn == "URN:NBN:no-nb_digibok_2012103106111"]

Unnamed: 0,urn,title,page_start,page_end,overlapp,digital_visning,comment
38,URN:NBN:no-nb_digibok_2012103106111,[introdikt],11,11,,,obs: dramatisk fortelling; fotnoter s. 12
39,URN:NBN:no-nb_digibok_2012103106111,Fyrestev,12,13,,,
40,URN:NBN:no-nb_digibok_2012103106111,Vaaren. Sigri syng,14,14,,,
41,URN:NBN:no-nb_digibok_2012103106111,Gunnar køyrer Sigri i joli,15,15,x,,fotn
42,URN:NBN:no-nb_digibok_2012103106111,Gunnar høyrer Sigri hjala,15,16,,,fotn
43,URN:NBN:no-nb_digibok_2012103106111,Sigri,16,17,x,,fotn
44,URN:NBN:no-nb_digibok_2012103106111,Gunnar,17,18,x,,fotn
45,URN:NBN:no-nb_digibok_2012103106111,Sokkebondi,18,19,x,,fotn
46,URN:NBN:no-nb_digibok_2012103106111,Bryllaupsmorgonen,19,19,x,,fotn
47,URN:NBN:no-nb_digibok_2012103106111,I bryllaupe paa Nes,20,21,x,,


In [204]:
alto

'<?xml version="1.0" encoding="UTF-8"?>\n<alto xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://schema.ccs-gmbh.com/metae/alto-1-2.xsd" xmlns:xlink="http://www.w3.org/1999/xlink">\n\t<Description>\n\t\t<MeasurementUnit>mm10</MeasurementUnit>\n\t\t<sourceImageInformation>\n\t\t\t<fileName>//Produksjon5.nb.no/ocr-output/Monografi_Default_Antiqua_Auto/digibok_2012103106111/images/digibok_2012103106111_0025.TIF</fileName>\n\t\t</sourceImageInformation>\n\t\t<OCRProcessing ID="OCRPROCESSING_1">\n\t\t\t<preProcessingStep>\n\t\t\t\t<processingSoftware>\n\t\t\t\t\t<softwareCreator>CCS Content Conversion Specialists GmbH, Germany</softwareCreator>\n\t\t\t\t\t<softwareName>CCS docWORKS</softwareName>\n\t\t\t\t\t<softwareVersion>6.5-1.28</softwareVersion>\n\t\t\t\t</processingSoftware>\n\t\t\t</preProcessingStep>\n\t\t\t<ocrProcessingStep>\n\t\t\t\t<processingSoftware>\n\t\t\t\t\t<softwareCreator>ABBYY (BIT Software), Russia</softwareCreator>\n\t\t\t\t\t

In [201]:
class Poem(NamedTuple):
    urn: str
    title: str
    page_start: int
    page_end: int
    overlapp: str
    digital_visning: str
    comment: str
    pages : Optional[Iterable[str]] = None


df = pd.read_csv("poems.csv")

# Get poem objects
poem_list = []
for ind, row in df.iterrows():
    poem = Poem(*row)
    poem_list.append(poem)

In [241]:

poem_list_w_pages = []
for poem in poem_list:
    txts = []
    
    for r in range(poem.page_start-1, poem.page_end):
            #print(r)
            #print(p.urn, p.title)
        try:    
            alto = get_alto(poem.urn, r)
            txt = get_text(BeautifulSoup(alto, "lxml-xml"))
            txts.append(txt)

        except Exception as e:
            print("Error with", poem.urn, poem.title, r, e)
            continue
        
    poem_list_w_pages.append(Poem(*poem[:-1], txts))
        # poem["pages"] = txts
        #poem_list_w_pages.append(poem)
    # except Exception as e:
    #     print("Error with", poem.urn, poem.title, e)
    #     # p.pages = None

401 https://api.nb.no/catalog/v1/metadata/URN:NBN:no-nb_digibok_2016070808151/altos/URN:NBN:no-nb_digibok_2016070808151_0007
Error with URN:NBN:no-nb_digibok_2016070808151 Syttende mai 7 Could not find alto style
401 https://api.nb.no/catalog/v1/metadata/URN:NBN:no-nb_digibok_2016070808151/altos/URN:NBN:no-nb_digibok_2016070808151_0008
Error with URN:NBN:no-nb_digibok_2016070808151 En fremmed fugl 8 Could not find alto style


  txt = get_text(BeautifulSoup(alto, "lxml-xml"))


401 https://api.nb.no/catalog/v1/metadata/URN:NBN:no-nb_digibok_2016070808151/altos/URN:NBN:no-nb_digibok_2016070808151_0009
Error with URN:NBN:no-nb_digibok_2016070808151 En fremmed fugl 9 Could not find alto style
401 https://api.nb.no/catalog/v1/metadata/URN:NBN:no-nb_digibok_2016070808151/altos/URN:NBN:no-nb_digibok_2016070808151_0009
Error with URN:NBN:no-nb_digibok_2016070808151 Da rosentræet døde 9 Could not find alto style
401 https://api.nb.no/catalog/v1/metadata/URN:NBN:no-nb_digibok_2016070808151/altos/URN:NBN:no-nb_digibok_2016070808151_0010
Error with URN:NBN:no-nb_digibok_2016070808151 Gjöken 10 Could not find alto style
401 https://api.nb.no/catalog/v1/metadata/URN:NBN:no-nb_digibok_2016070808151/altos/URN:NBN:no-nb_digibok_2016070808151_0011
Error with URN:NBN:no-nb_digibok_2016070808151 Gjöken 11 Could not find alto style
401 https://api.nb.no/catalog/v1/metadata/URN:NBN:no-nb_digibok_2016070808151/altos/URN:NBN:no-nb_digibok_2016070808151_0011
Error with URN:NBN:no-nb

In [242]:
for poem in poem_list_w_pages:
    base_path = path = os.path.join("texts", poem.urn, poem.title)
    count = 1
    for page in poem.pages:
        path = os.path.join(base_path, str(count) + ".txt")
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "w") as f:
            f.write(page)
        count += 1

In [243]:
df.urn.value_counts()

urn
URN:NBN:no-nb_digibok_2006112300015    93
URN:NBN:no-nb_digibok_2013092407013    43
URN:NBN:no-nb_digibok_2014030707024    41
URN:NBN:no-nb_digibok_2016070808151    38
URN:NBN:no-nb_digibok_2006082400061    28
URN:NBN:no-nb_digibok_2012103106111    26
URN:NBN:no-nb_digibok_2017102348005    20
URN:NBN:no-nb_digibok_2014110308039    16
Name: count, dtype: int64

In [245]:
pd.DataFrame(poem_list_w_pages).urn.value_counts()

urn
URN:NBN:no-nb_digibok_2013092407013    143
URN:NBN:no-nb_digibok_2014030707024    142
URN:NBN:no-nb_digibok_2006082400061    130
URN:NBN:no-nb_digibok_2006112300015     94
URN:NBN:no-nb_digibok_2012103106111     76
URN:NBN:no-nb_digibok_2017102348005     69
URN:NBN:no-nb_digibok_2014110308039     39
Name: count, dtype: int64

In [246]:
df

Unnamed: 0,urn,title,page_start,page_end,overlapp,digital_visning,comment
0,URN:NBN:no-nb_digibok_2016070808151,Syttende mai,8,8,,x,
1,URN:NBN:no-nb_digibok_2016070808151,En fremmed fugl,9,10,x,,
2,URN:NBN:no-nb_digibok_2016070808151,Da rosentræet døde,10,10,x,,
3,URN:NBN:no-nb_digibok_2016070808151,Gjöken,11,12,x,,
4,URN:NBN:no-nb_digibok_2016070808151,Maaltrost,12,12,x,,
...,...,...,...,...,...,...,...
300,URN:NBN:no-nb_digibok_2014110308039,Gul-Spurven,25,27,x,,
301,URN:NBN:no-nb_digibok_2014110308039,Fagerfuglen i Frommerede,27,30,x,,
302,URN:NBN:no-nb_digibok_2014110308039,Ægte Perler,30,32,x,,
303,URN:NBN:no-nb_digibok_2014110308039,En Dag paa Hope,32,35,x,,


In [247]:
target_df = pd.DataFrame(poem_list_w_pages)

In [248]:
len(set(df.title) - set(target_df.title))

38

In [253]:
poem_list_w_pages

[Poem(urn='URN:NBN:no-nb_digibok_2012103106111', title='[introdikt]', page_start=11, page_end=11, overlapp=nan, digital_visning=nan, comment='obs: dramatisk fortelling; fotnoter s. 12', pages=['\n\n']),
 Poem(urn='URN:NBN:no-nb_digibok_2012103106111', title='Fyrestev', page_start=12, page_end=13, overlapp=nan, digital_visning=nan, comment=nan, pages=["JFpefteu. \n\nafttne fict» bei flal albri trjott*); \n\nfer fom ein tof beim paa Slaraféfjoren. \n■iUcine ftcD gjet>cr albri traut; \n\nfer Kfforø regne or ff ljom raut**). \n\nOg mine difur bei er fo mange; \nfom tøegajebtomann bei br&3 i fangje. \n©o rjeo be bore i al fi ti: \n\nbefS meir bei foe bei, bcfS meir be bli'. \n\nOg tor 3We futtar i forg l)elb glea, \nog fofs be gjengjer, fo lat 3Ke foea. \nDg fo ftier ftetri paa belemaal, \nlifiom fulur trillar i tøtøerjlaal. \n\nDg fo er ftebi og fo er maale, \nfom fljote***) heftar paa ifen fjaale. \n<3o lb er ftetri paa belebté, \n\nliffom fljote tjeftar paa fjaale té \n\nenbe (berao: ein

In [252]:
len(poem_list_w_pages)

693

In [249]:
df.loc[df.title.isin(set(df.title) - set(target_df.title))]

Unnamed: 0,urn,title,page_start,page_end,overlapp,digital_visning,comment
0,URN:NBN:no-nb_digibok_2016070808151,Syttende mai,8,8,,x,
1,URN:NBN:no-nb_digibok_2016070808151,En fremmed fugl,9,10,x,,
2,URN:NBN:no-nb_digibok_2016070808151,Da rosentræet døde,10,10,x,,
3,URN:NBN:no-nb_digibok_2016070808151,Gjöken,11,12,x,,
4,URN:NBN:no-nb_digibok_2016070808151,Maaltrost,12,12,x,,
5,URN:NBN:no-nb_digibok_2016070808151,En livslod,13,14,x,,
6,URN:NBN:no-nb_digibok_2016070808151,Tröst og haab,14,14,x,,
7,URN:NBN:no-nb_digibok_2016070808151,En sommerkveld ved graven,15,15,x,,
8,URN:NBN:no-nb_digibok_2016070808151,Taaren,15,15,x,,
9,URN:NBN:no-nb_digibok_2016070808151,Paa Vagt,16,16,x,,


In [250]:
target_df.title.count()

693

In [251]:
target_df

Unnamed: 0,urn,title,page_start,page_end,overlapp,digital_visning,comment,pages
0,URN:NBN:no-nb_digibok_2012103106111,[introdikt],11,11,,,obs: dramatisk fortelling; fotnoter s. 12,[\n\n]
1,URN:NBN:no-nb_digibok_2012103106111,Fyrestev,12,13,,,,[JFpefteu. \n\nafttne fict» bei flal albri trj...
2,URN:NBN:no-nb_digibok_2012103106111,Fyrestev,12,13,,,,[JFpefteu. \n\nafttne fict» bei flal albri trj...
3,URN:NBN:no-nb_digibok_2012103106111,Vaaren. Sigri syng,14,14,,,,"[$aareit, «Sigri tøng. \n\n$ [fog og marf og i..."
4,URN:NBN:no-nb_digibok_2012103106111,Gunnar køyrer Sigri i joli,15,15,x,,fotn,"[*) fengja, „HUn"", \n\n\n]"
...,...,...,...,...,...,...,...,...
688,URN:NBN:no-nb_digibok_2014110308039,En Dag paa Hope,32,35,x,,,[23 \n\n\nMen hør Du liden Gunhild min og agt ...
689,URN:NBN:no-nb_digibok_2014110308039,En Dag paa Hope,32,35,x,,,[23 \n\n\nMen hør Du liden Gunhild min og agt ...
690,URN:NBN:no-nb_digibok_2014110308039,En Aften paa Sætre,35,37,x,,,"[26 \n\n\nHøsten staar i dybe Eftertanker, \n\..."
691,URN:NBN:no-nb_digibok_2014110308039,En Aften paa Sætre,35,37,x,,,"[26 \n\n\nHøsten staar i dybe Eftertanker, \n\..."


urn
URN:NBN:no-nb_digibok_2006112300015    93
URN:NBN:no-nb_digibok_2016070808151    38
URN:NBN:no-nb_digibok_2014030707024    29
URN:NBN:no-nb_digibok_2012103106111    26
URN:NBN:no-nb_digibok_2013092407013    23
URN:NBN:no-nb_digibok_2017102348005    20
URN:NBN:no-nb_digibok_2006082400061    17
URN:NBN:no-nb_digibok_2014110308039    16
Name: count, dtype: int64

In [117]:
len(txts)

1

In [91]:
soup = BeautifulSoup(alto, "lxml")

In [97]:
soup

<html><body><alto xmlns="http://www.loc.gov/standards/alto/ns-v3#" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemalocation="http://www.loc.gov/standards/alto/ns-v3# http://www.loc.gov/alto/v3/alto-3-0.xsd">
<description>
<measurementunit>mm10</measurementunit>
<sourceimageinformation>
<filename>https://www.nb.no/services/image/resolver/URN:NBN:no-nb_digibok_2006112300015_0040/full/pct:50/0/native.jpg</filename>
</sourceimageinformation>
<ocrprocessing id="OCR_0">
<ocrprocessingstep>
<processingsoftware>
<softwarename>tesseract 5.2.0</softwarename>
</processingsoftware>
</ocrprocessingstep>
</ocrprocessing>
</description>
<styles><textstyle fontfamily="Times Roman" fontsize="32" id="TXT_0"></textstyle><paragraphstyle align="Block" id="PAR_BLOCK"></paragraphstyle></styles><layout>
<page height="1686" id="page_0" physical_img_nr="0" width="1192">
<printspace height="1686" hpos="0" vpos="0" width="1192">
<graphicalelement height="

In [104]:
bool(soup.find("ComposedBlock"))

False

In [99]:
soup.find_all("composedblock")

[<composedblock height="30" hpos="640" id="cblock_1" vpos="256" width="40">
 <textblock height="30" hpos="640" id="block_0" stylerefs="TXT_0 PAR_BLOCK" vpos="256" width="40">
 <textline height="30" hpos="640" id="line_0" vpos="256" width="40">
 <string content="36." height="30" hpos="640" id="string_0" stylerefs="TXT_0" vpos="256" wc="0.92" width="40"></string>
 </textline>
 </textblock>
 </composedblock>,
 <composedblock height="154" hpos="300" id="cblock_2" vpos="330" width="692">
 <textblock height="154" hpos="300" id="block_1" stylerefs="TXT_0 PAR_BLOCK" vpos="330" width="692">
 <textline height="38" hpos="302" id="line_1" vpos="330" width="692">
 <string content="Hvor" height="26" hpos="302" id="string_1" stylerefs="TXT_0" vpos="330" wc="0.92" width="69"></string><sp hpos="371" vpos="330" width="17"></sp>
 <string content="længe" height="34" hpos="388" id="string_2" stylerefs="TXT_0" vpos="330" wc="0.93" width="86"></string><sp hpos="475" vpos="330" width="17"></sp>
 <string conte

In [92]:
get_text(soup)

''

In [87]:
print(alto)

<alto xmlns="http://www.loc.gov/standards/alto/ns-v3#" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/standards/alto/ns-v3# http://www.loc.gov/alto/v3/alto-3-0.xsd">
	<Description>
		<MeasurementUnit>mm10</MeasurementUnit>
		<sourceImageInformation>
			<fileName>https://www.nb.no/services/image/resolver/URN:NBN:no-nb_digibok_2006112300015_0040/full/pct:50/0/native.jpg</fileName>
		</sourceImageInformation>
		<OCRProcessing ID="OCR_0">
			<ocrProcessingStep>
				<processingSoftware>
					<softwareName>tesseract 5.2.0</softwareName>
				</processingSoftware>
			</ocrProcessingStep>
		</OCRProcessing>
	</Description>
	<Styles><TextStyle ID="TXT_0" FONTSIZE="32" FONTFAMILY="Times Roman"/><ParagraphStyle ID="PAR_BLOCK" ALIGN="Block"/></Styles><Layout>
		<Page WIDTH="1192" HEIGHT="1686" PHYSICAL_IMG_NR="0" ID="page_0">
			<PrintSpace HPOS="0" VPOS="0" WIDTH="1192" HEIGHT="1686">
				<GraphicalElement ID=

In [82]:
txt

''

In [76]:
p

Poem(urn='URN:NBN:no-nb_digibok_2006112300015', title='36.', page_start=41, page_end=41, overlapp=nan, digital_visning=nan, comment=nan)

In [61]:
soup = BeautifulSoup(altos, "lxml-xml")

In [66]:
txt = get_text(soup)

In [68]:
print(txt)

37 


Nei «fram og atter* er ei Veien lige; 
thi den, der ei gaar frem, han gaar tilbage, 
og den, der ikke giver, han maa tage, 


naar Trinet slutter paa hans Himmelstige. 


Thi ,udenom* formaar han ei at vige, 
saalidtsom tage sine Tvivl afdage, 
og derfor maa hån rundtom Lyset jage 


og stadig brænde sine Vingeflige. 


Men der, hvor Lyset kun formaar at svide, 
er Skylden tit en Safternes Forarmen, 


en Tidsmarasmes Peg mod det Senile. 


Man oparbeider sig og tror at tvivle, 
men væk er Viljekraft og Hjertevarmen, 
og Tvivlen glattet til en Tankesklide. 


41 





In [52]:
p

Poem(urn='URN:NBN:no-nb_digibok_2016070808151', title='Sigrdrifa', page_start=30, page_end=31, overlapp='x', digital_visning=nan, comment=nan)

In [7]:
res = get_alto(urn, 25)

In [11]:
soup = BeautifulSoup(res, "lxml-xml")

In [27]:
print(text)

17 


Ingen Rullader, ingen høie Sving, 
Med sænkte Vinger 

Sidder hun i Ro 

I Herrens Bo 

Og sagte nynner. 

Fast som et Suk, næsten som en Bøn 
Ingen, slet Ingen 

Ved hvad som rører sig i Løn 
Indenfor Vingen. 


I Vintermorgen aarle og trist 

Som Graven derude 

Utænkt hun kommer saa som med Iist 
Og pikker paa min Rude. 

Ol velkommen mumler jeg igjen: 

Er vi alt vaagen? 

Aarle af Seng, min udvalgte Ven, 
Angrer ei Nogen! 


Himmel! hvad Lyst i tidligste Gry 
At høre din Vise, 

Da vil Tanken høit over Sky, 
Herren at prise. 

O, den, der kunde saa ha' i Stel 
Strax paa Stunden 

Bøn og Suk i Morgen og Kvel 
Med Sang i Munden. 


Tysl hvad var det?  En Rovfugleflok, 
Skjærende Luften, 

Liden Spurv over Sten, over Stok 

Flyr til Kirke-Tuften, 

Kjendt i hver Grind 

Smutter hun ind 



