In [2]:
import pandas as pd
import sys
from norn.config import DATA_BASEPATH
from pymongo import MongoClient
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from glob import glob
import dhlab as dh
from pathlib import Path
from tqdm.notebook import tqdm

In [3]:
re_ocr_path = Path("/mnt/md1/new_2023/bok_txt")

In [4]:
c1800 = pd.read_excel(DATA_BASEPATH / '1800-1839.xlsx')
c1840 = pd.read_excel(DATA_BASEPATH /'1840-1869.xlsx')
kk = pd.read_excel( DATA_BASEPATH / "konsensus-korpus.xlsx")
kka = pd.read_excel( DATA_BASEPATH / "konsensus_korpus_annotert.xlsx")

In [5]:
db = MongoClient()["norn"]

In [6]:
@dataclass
class NornText:
    urn: str
    path: str
    korpus: str
    metadata: Dict[str, Any]
    files: list
    text: Optional[str] = None
    
def norn_text_from_korpus_row(row: pd.Series, korpus: str = "c1800") -> NornText:
    urn = row['urn']
    base_urn = urn.split(":")[-1]
    path = str(re_ocr_path / f"{base_urn}")
    #path = row['path']
    #korpus = row['korpus']
    files = sorted(glob(f"{path}/*"))
    metadata = row.to_dict()
    
    if len(files) == 0:
        raise ValueError(f"No files found for {urn} at {path}")
    return NornText(urn, path, korpus, metadata, files)


def get_texts(files: List[str]) -> str:
    txt = ""
    for x in sorted(files):
        with open(x) as f:
            txt += f.read()
            txt += "\n\n"
    return txt

In [7]:
dhc1800 = dh.Corpus()
dhc1800.extend_from_identifiers(c1800.urn)
assert len(dhc1800.frame) == len(c1800)


dhc1840 = dh.Corpus()
dhc1840.extend_from_identifiers(c1840.urn)
assert len(dhc1840.frame) == len(c1840)

AssertionError: 

In [14]:
dhkka = dh.Corpus()
dhkka.extend_from_identifiers(kka.urn)
assert len(dhkka.frame) == len(kka)

AssertionError: 

In [9]:
kka.loc[kka.urn == "URN:NBN:no-nb_digibok_2017020148100"].head(5)

Unnamed: 0,Kontekst-nasjonalromantisk,Realistisk,Ikke nasjonalromantisk,Nasjonalromantisk,Kontekst-nasjonalromantisk-realisme,Nasjonalromantisk-realisme,Verk,Forfatter,urn,Ikke angitt i Lit.Hist.,Samling
15,2,0,0,1,1,0,Blomster-Ole,"Moe, J.",URN:NBN:no-nb_digibok_2017020148100,0,0
39,2,0,0,0,1,0,Slædefart,"Moe, J.",URN:NBN:no-nb_digibok_2017020148100,0,0
41,0,0,0,1,0,0,Knud og Birgit,"Moe, J.",URN:NBN:no-nb_digibok_2017020148100,0,0
101,1,0,1,0,0,0,Sæterjentens søndag,"Moe, J.",URN:NBN:no-nb_digibok_2017020148100,0,0
124,0,0,0,1,1,0,Det lysned i Skoven,"Moe, J.",URN:NBN:no-nb_digibok_2017020148100,0,0


In [10]:
c1840.loc[c1840.duplicated(subset=['urn'], keep=False)]

Unnamed: 0,urn
3,URN:NBN:no-nb_digibok_2008102912002
12,URN:NBN:no-nb_digibok_2008102912002


In [11]:
dhc1840.frame.loc[dhc1840.frame.urn == "URN:NBN:no-nb_digibok_2008102912002"]

Unnamed: 0,urn,dhlabid,title,authors,oaiid,sesamid,isbn10,city,timestamp,year,publisher,langs,subjects,ddc,genres,literaryform,doctype,ocr_creator,ocr_timestamp
35,URN:NBN:no-nb_digibok_2008102912002,100628905.0,"De tre Bjørne , eller Besøget i Bjørnestuen : ...","Asbjørnsen , P. Chr . ( Peter Christen )",oai:nb.bibsys.no:990315083834702202,ab433a503c2bdf18e77d17ed7e08cc3c,,Christiania,18640101.0,1864.0,,nob,,,,Uklassifisert,digibok,dhlab,20221209.0


In [15]:
def update_db(df : pd.DataFrame, korpus: str):
    urns_not_in_reocr = []
    
    for _, row in df.iterrows():
        try:
            norn_text = norn_text_from_korpus_row(row, korpus)
            if len(norn_text.files) == 0:
                print(f"No files found for {norn_text.urn}")
            else:
                norn_text.text = get_texts(norn_text.files)
                
                
                db.norn_texts_temp.insert_one(norn_text.__dict__)
        except ValueError as e:
            print(e)
            print(row['urn'])
            urns_not_in_reocr.append(row['urn'])
            continue
        
    return urns_not_in_reocr

In [16]:
korpus = {
    "c1800": dhc1800,
    "c1840": dhc1840,
    "kk": dhkka
}

urns_not_in_reocr = []
for key, value in korpus.items():
    e = update_db(value.frame, key)
    urns_not_in_reocr.extend(e)

No files found for URN:NBN:no-nb_digibok_2020022048003 at /mnt/md1/new_2023/bok_txt/no-nb_digibok_2020022048003
URN:NBN:no-nb_digibok_2020022048003
No files found for URN:NBN:no-nb_digibok_2006120501037 at /mnt/md1/new_2023/bok_txt/no-nb_digibok_2006120501037
URN:NBN:no-nb_digibok_2006120501037
No files found for URN:NBN:no-nb_digibok_2007110600049 at /mnt/md1/new_2023/bok_txt/no-nb_digibok_2007110600049
URN:NBN:no-nb_digibok_2007110600049
No files found for URN:NBN:no-nb_digibok_2008040800072 at /mnt/md1/new_2023/bok_txt/no-nb_digibok_2008040800072
URN:NBN:no-nb_digibok_2008040800072
No files found for URN:NBN:no-nb_digibok_2010061806033 at /mnt/md1/new_2023/bok_txt/no-nb_digibok_2010061806033
URN:NBN:no-nb_digibok_2010061806033
No files found for URN:NBN:no-nb_digibok_2011040508147 at /mnt/md1/new_2023/bok_txt/no-nb_digibok_2011040508147
URN:NBN:no-nb_digibok_2011040508147
No files found for URN:NBN:no-nb_digibok_2011040508148 at /mnt/md1/new_2023/bok_txt/no-nb_digibok_2011040508148


In [30]:
dhc1840.frame.loc[dhc1840.frame.urn.isin(urns_not_in_reocr)].urn.nunique()

1

In [31]:
dhc1840.frame.loc[dhc1840.frame.urn.isin(urns_not_in_reocr)]

Unnamed: 0,urn,dhlabid,title,authors,oaiid,sesamid,isbn10,city,timestamp,year,publisher,langs,subjects,ddc,genres,literaryform,doctype,ocr_creator,ocr_timestamp
226,URN:NBN:no-nb_digibok_2020022048003,100484948.0,Foraarsgave for Damer,"Monsen , Chr . / Borg , Thorkild",oai:nb.bibsys.no:990313612184702202,bef7a63f78fb1b98e6accd652b601c53,,,18490101.0,1849.0,Trykt hos Thorkild Borg,dan,Norsk litteratur,,fiction,Skjønnlitteratur,digibok,nb,20060101.0


In [28]:
dhkka.frame.loc[dhkka.frame.urn.isin(urns_not_in_reocr)].urn.nunique()

22

In [19]:
with open("konsensus_urns_not_in_reocr.txt", "w") as f:
    f.write("\n".join(urns_not_in_reocr))

In [34]:
c = dh.Corpus()
c.extend_from_identifiers(urns_not_in_reocr)
c.sort("year")

Unnamed: 0,urn,dhlabid,title,authors,oaiid,sesamid,isbn10,city,timestamp,year,publisher,langs,subjects,ddc,genres,literaryform,doctype,ocr_creator,ocr_timestamp
20,URN:NBN:no-nb_digibok_2021121448574,100604100.0,Salmer,"Landstad , M. B. / Baklid , Herleik / Haavik ,...",oai:nb.bibsys.no:999920079479002202,ebcbd7c60f2d7e8d66cfb35d4b8d2019,,,2020.0,2020.0,Novus forlag,nob,Salmer / Kulturhistorie / Kulturhistorie / Fol...,264.23,bibliography / biography / tekst,Faglitteratur,digibok,nb,20060101.0
12,URN:NBN:no-nb_digibok_2012051805111,100071871.0,Henrik Ibsens skrifter . 16 : Sakprosa,"Ibsen , Henrik / Janss , Christian / Ystad , V...",oai:nb.bibsys.no:991119062104702202,71f9e6f983531e8293c9eff8a34ec28e,,Oslo,20100101.0,2010.0,[Universitetet i Oslo],nob,norske / skuespill / 1800-tallet / drama,839.82s / 839.822,fiction,Skjønnlitteratur,digibok,nb,20060101.0
9,URN:NBN:no-nb_digibok_2011062708033,100544647.0,Henrik Ibsens skrifter . 11 : Dikt,"Ibsen , Henrik / Ystad , Vigdis / Janss , Chri...",oai:nb.bibsys.no:991002464354702202,0b00adf4dbd448568e5874ae5f9d81ae,,Oslo,20090101.0,2009.0,[Universitetet i Oslo],nob,norske / skuespill / 1800-tallet / drama,829.822,poetry,Skjønnlitteratur,digibok,nb,20060101.0
6,URN:NBN:no-nb_digibok_2011061708062,100542693.0,Henrik Ibsens skrifter . 4 : Svanhild ; Kjærli...,"Ibsen , Henrik / Ystad , Vigdis",oai:nb.bibsys.no:990801754144702202,ef963a4d7ad20d96d8a881f2d6c70108,8203189547 / 8203189636 / 8203189814,Oslo,20080101.0,2008.0,[Universitetet i Oslo],nob,norske / skuespill / 1800-tallet / drama,839.82s / 839.822,drama,Skjønnlitteratur,digibok,nb,20060101.0
10,URN:NBN:no-nb_digibok_2011091208031,100055374.0,Henrik Ibsens skrifter . 5 : Episk Brand ( 186...,"Ibsen , Henrik / Ystad , Vigdis",oai:nb.bibsys.no:990707950454702202,7397db0c8a04ba3a83a34f063fc0ba1b,8203189881 / 8203190022 / 8203189814,Oslo,20070101.0,2007.0,[Universitetet i Oslo],nob,norske / skuespill / 1800-tallet / drama,839.822,fiction,Skjønnlitteratur,digibok,nb,20060101.0
8,URN:NBN:no-nb_digibok_2011061708156,100542769.0,Henrik Ibsens skrifter . 3 : Fru Inger til Øst...,"Ibsen , Henrik / Ystad , Vigdis / Janss , Chri...",oai:nb.bibsys.no:990610088004702202,108b42e66b6a6f96ee06404a2e91b75c,8203189598 / 8203190014 / 8203189814,Oslo,20060101.0,2006.0,[Universitetet i Oslo],nob,norske / skuespill / 1850-tallet,839.822,fiction,Skjønnlitteratur,digibok,nb,20060101.0
7,URN:NBN:no-nb_digibok_2011061708155,100542768.0,Henrik Ibsens skrifter . 2 : Gildet paa Solhou...,"Ibsen , Henrik / Ystad , Vigdis",oai:nb.bibsys.no:990609739194702202,9bde2e808695a8a6c04a74dccef77a88,8203189571 / 8203190006 / 8203189814,Oslo,20060101.0,2006.0,[Universitetet i Oslo],nob,norske / skuespill,839.822,fiction,Skjønnlitteratur,digibok,nb,20060101.0
5,URN:NBN:no-nb_digibok_2011040508148,100538545.0,Henrik Ibsens skrifter . 12 : Brev 1844-1871,"Ibsen , Henrik / Ystad , Vigdis / Fulsås , Nar...",oai:nb.bibsys.no:990601518034702202,f3b9cddaf8a1837af19e04e5dbe34fb0,8203190103 / 8203189458 / 8203189814,Oslo,20050101.0,2005.0,[Universitetet i Oslo],nob,norske / skuespill / 1800-tallet / drama,839.822,letter,Faglitteratur,digibok,nb,20060101.0
4,URN:NBN:no-nb_digibok_2011040508147,100535717.0,Henrik Ibsens skrifter . 1 : Catilina ( 1850 )...,"Ibsen , Henrik / Ystad , Vigdis / Janss , Chri...",oai:nb.bibsys.no:990600822164702202,b7ad880c0370c04e8229b732c1544f26,8203189555 / 8203189849 / 8203189814,Oslo,20050101.0,2005.0,[Universitetet i Oslo],nob,norske / skuespill / 1800-tallet / drama,839.822,fiction,Skjønnlitteratur,digibok,nb,20060101.0
19,URN:NBN:no-nb_digibok_2021042148807,100440317.0,Norsk tro og tanke . B. 2 : 1800-1940,,oai:nb.bibsys.no:999825325634702202,a51aed33dd72398bf66315f6def736d2,8215001726 / 8251838118 / 8251838096,,19980101.0,1998.0,Tano Aschehoug,mul / nob / nno / dan,Mentalitet / Historisk framstilling / Religiøs...,948.1,,Faglitteratur,digibok,nb,20060101.0


In [35]:
c.to_csv("konsensus_urns_not_in_reocr.csv")