# Get texts

Tasks
- [X] Import metadata
- [X] Assign dhlabid
- [X] Assign dhlab metadata
- [X] Import texts
- [X] Check if any are copyrighted


## Kommentarer
URNen URN:NBN:no-nb_digibok_2006092000005 (konsensuskorpus) gir problemer -- virker å være borte fra nettbiblioteket og dhlab

URN:NBN:no-nb_digibok_2013050224022 er slettet fra nettbiblioteket. Den finnes i dhlab og nettbiblioteket. Den kan kanskje erstattes med URN:NBN:no-nb_digibok_2014101308083. Dette er en annen utgave av samme Ibsen-stykke.

## Konsensus korpus

In [27]:
import requests

# URL template for the api.nb.no catalog: the item identifier is substituted
# for "{}", and only the accessInfo field of the item is requested.
api_base_path = "https://api.nb.no/catalog/v1/items/{}?fields=accessInfo&expand=false"


def check_urn(urn_to_check: str) -> bool | dict:
    """Check if a text is free to use.

    Args:
        urn_to_check (str): National library text id (urn) to check

    Returns:
        bool | dict: The ``accessInfo.isPublicDomain`` value of the catalog
        item when the request succeeds (HTTP 200). For any other status code
        a dict of the form ``{"error": "Could not find urn <urn>"}``.

    Raises:
        requests.Timeout: If the catalog does not answer within the timeout.
    """
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(api_base_path.format(urn_to_check), timeout=30)
    if response.status_code != 200:
        return {"error" : f"Could not find urn {urn_to_check}"}
    return response.json()["accessInfo"]["isPublicDomain"]


In [2]:
from norn.config import DATA_BASEPATH
import pandas as pd
import dhlab as dh

In [None]:
# Load the annotated consensus-corpus spreadsheet
source_metadata_file = DATA_BASEPATH / "consensus_corpus_annotated.xlsx"
df = pd.read_excel(source_metadata_file)
# Some URNs are wrapped in square brackets and spaces; strip those characters
df["urn"] = df.urn.str.strip("[] ")
# Number of distinct values in the urn column
df.urn.nunique()

In [65]:
# Build a dhlab corpus from the spreadsheet URNs and keep its frame, which is
# joined with the spreadsheet on "urn" further down.
c = dh.Corpus()
c.extend_from_identifiers(df.urn)
c_df = c.frame

In [None]:
# Outer-join the spreadsheet rows with the corpus frame on "urn"; the "_merge"
# indicator column records which side(s) each row came from.
merged = df.merge(c_df, on="urn", how="outer", indicator=True)
merged._merge.value_counts()

In [69]:
# Rows found only in the spreadsheet (no match in the corpus frame), first 19 columns
merged[merged._merge == "left_only"].iloc[:, :19]

Unnamed: 0,x,y,z,w,x/y,w/y,Work,Author,urn,I,C,dhlabid,title,authors,oaiid,sesamid,isbn10,city,timestamp
0,1,0,0,0,0,0,"Frygt ei, min Sjæl! Guds Ord staar fast","Jensen, P. A.",-,0,0,,,,,,,,
1,1,0,0,0,0,0,Samlinger til det norske Folks Sprog og Histor...,"Berg, J. K.",Tidsskrift,0,0,,,,,,,,
2,1,0,0,0,0,0,Urda [tidsskrift],"Christie, W. F. K. m.fl.",Tidsskrift,0,0,,,,,,,,
3,1,0,0,1,0,0,Norsk Tidsskrift for Videnskab og Litteratur,"Lange, C.",Tidsskrift,0,0,,,,,,,,
4,1,0,0,0,0,0,Juletræ,"Asbjørnsen, P. C.",Tidsskrift,0,0,,,,,,,,
5,1,0,0,0,0,0,Ydal. Et vinterskrift,"Asbjørnsen, P. C.",Tidsskrift,0,0,,,,,,,,
6,2,0,0,0,0,0,Folkevennen [tidsskrift],"Vig, O. [red.]",Tidsskrift,0,0,,,,,,,,
7,2,0,0,0,0,0,Budstikken [tidsskrift om landbruk],"Asbjørnsen, P. C. [medredaktør]",Tidsskrift,0,0,,,,,,,,
14,1,0,0,0,0,0,Norsk Landmandsbog [naturvitenskapelig tidsskr...,"Asbjørnsen, P. C.",URN:NBN:no-nb_digibok_2006092000005,0,0,,,,,,,,
338,0,0,0,1,0,0,Rypen i Justedal [utrykt],"Ibsen, H.",Utrykt,0,0,,,,,,,,


In [74]:
# Remove the merge indicator column from `merged` in place
merged.drop(columns="_merge", inplace=True)

## Add public domain info

In [None]:
import numpy as np
from time import sleep
from tqdm.notebook import tqdm


# Check urns for public domain.
# Each distinct value is looked up once: the urn column repeats placeholder
# values such as "Tidsskrift", and repeated rows in `lst` would multiply when
# the results are joined back on "urn".
lst = []
for urn in tqdm(merged.urn.drop_duplicates()):
    # The isinstance guard keeps missing values (float NaN) from raising
    # TypeError in the substring test.
    if isinstance(urn, str) and "URN" in urn:
        res = check_urn(urn)
        # Short pause between requests; only needed when a request was made
        sleep(0.001)
    else:
        res = np.nan  # No URN (np.NAN was removed in NumPy 2.0)

    lst.append((urn, res))
    

In [None]:
# Tabulate the lookup results and show how they are distributed (NaN included).
# NOTE: this rebinds `df`, which until here held the spreadsheet DataFrame.
df = pd.DataFrame(lst, columns=["urn", "public_domain"])
df.public_domain.value_counts(dropna=False)

In [103]:
# Attach the public-domain status to the metadata. The lookup table can hold
# the same urn on several rows; keep one row per urn so that the join cannot
# multiply rows for values that occur more than once in `merged`.
merged = merged.merge(df.drop_duplicates(subset="urn"), on="urn", how="outer")

In [118]:
from glob import glob

# dhlab ids of all texts explicitly flagged as not public domain
flagged = merged.public_domain == False
lst = merged.loc[flagged, ["dhlabid"]].drop_duplicates().astype(int).dhlabid.to_list()

# Collect the files one level below texts/ whose name contains one of those ids
files = []
for dhlabid in lst:
    files.extend(glob("texts/*/*{}*".format(dhlabid)))

files

['texts/konsensus_korpus/dhlab_100616406.txt',
 'texts/konsensus_korpus/dhlab_100637801.txt']

In [123]:
# Save the combined metadata, without the DataFrame index, as a spreadsheet
merged.to_excel("metadata/konsensus_korpus_annotated_with_metadata.xlsx", index=False)

## Get texts

In [154]:
import pathlib

In [159]:
# Output directory for the consensus-corpus texts; created (with parents) if missing
destination = pathlib.Path("texts") / "konsensus_korpus"
destination.mkdir(parents=True, exist_ok=True)

In [24]:
def get_text_dhlab(urn: str, dhlabid: str, destination: str = destination):
    """Download the text of one document from the dhlab text endpoint.

    The text is written to ``<destination>/<dhlabid>.txt``. Nothing is written
    when the request fails or when the service answers with a JSON object
    instead of a string; in both cases a message is printed.

    Args:
        urn (str): National library text id (urn) to fetch.
        dhlabid (str): Identifier used as the file name stem of the output.
        destination (str, optional): Directory to write to. Defaults to the
            value of the module-level ``destination`` at definition time.

    Raises:
        requests.Timeout: If the endpoint does not answer within the timeout.
    """
    text_endpoint = "http://sprakbankdb2.lx.nb.no:5002/get_text"

    # Without a timeout a stalled connection would block the download loop.
    res = requests.get(text_endpoint, params={"urn": urn}, timeout=60)
    if res.status_code != 200:
        print(f"Could not get text for urn {urn}")
        return

    txt = res.json()
    if isinstance(txt, dict):
        # A JSON object carries a message rather than the text; show it.
        print(urn, txt)
    elif isinstance(txt, str):
        # Explicit UTF-8 so the output does not depend on the platform locale.
        with open(f"{destination}/{dhlabid}.txt", "w", encoding="utf-8") as f:
            f.write(txt)
 
def get_text_alto(
    urn: str,
    dhlabid: str,
    destination: str = destination,
    source_root: str = "/mnt/md1/new_2023/bok_txt",
):
    """Assemble the text of one document from text files on local disk.

    All ``*.txt`` files in ``<source_root>/<last ':'-separated part of urn>``
    are read in sorted file-name order, each followed by a newline, and the
    result is written to ``<destination>/<dhlabid>.txt``. If the source
    directory does not exist a message is printed and nothing is written.

    Args:
        urn (str): National library text id (urn); its last ``:``-separated
            part is the name of the source directory.
        dhlabid (str): Identifier used as the file name stem of the output.
        destination (str, optional): Directory to write to. Defaults to the
            value of the module-level ``destination`` at definition time.
        source_root (str, optional): Directory holding one sub-directory per
            document. Defaults to "/mnt/md1/new_2023/bok_txt".
    """
    source = pathlib.Path(source_root) / urn.split(":")[-1]
    if not source.exists():
        print(f"Could not find {source}")
        return

    # NOTE(review): the source files are assumed to be UTF-8 encoded; confirm.
    txt = "".join(
        part.read_text(encoding="utf-8") + "\n"
        for part in sorted(source.glob("*.txt"))
    )

    # Explicit UTF-8 so the output does not depend on the platform locale.
    with open(f"{destination}/{dhlabid}.txt", "w", encoding="utf-8") as f:
        f.write(txt)
                          
    

In [167]:
# Distinct (urn, dhlabid) pairs of the public-domain texts; rows with a missing
# value are dropped and dhlabid is converted to int.
target_urns = (
    merged.loc[merged["public_domain"] == True, ["urn", "dhlabid"]]
    .drop_duplicates()
    .dropna()
    .astype({"dhlabid": int})
)

In [22]:
# Fetch the text of every selected document.
# NOTE(review): both helpers write to "<destination>/<dhlabid>.txt", so when
# both succeed the second call replaces the file written by the first;
# confirm that this is intended.
for urn, dhlabid in tqdm(target_urns.values):
    get_text_dhlab(urn, dhlabid)
    get_text_alto(urn, dhlabid)

NameError: name 'target_urns' is not defined

## Get texts for 1800 and 1840 corpora

In [None]:
import dhlab as dh
import pandas as pd
import pathlib
from tqdm.notebook import tqdm
import requests
from time import sleep

# Load the metadata spreadsheets for the 1800-1839 and 1840-1869 corpora
k1800_path = pathlib.Path("metadata/1800-1839_metadata.xlsx")
k1800 = pd.read_excel(k1800_path)

k1840 = pd.read_excel("metadata/1840-1869_metadata.xlsx")


# Check urns for public domain
def check_all_urn(df: pd.DataFrame):
    if "urn" not in df.columns:
        print("No urn column")
        return
    
    lst = []
    for urn in tqdm(df.urn):
        if "URN" in urn:
            res = check_urn(urn)
        else:
            res = np.NAN # No URN
            
        lst.append((urn, res))
        sleep(0.001)
        
    pd_df = pd.DataFrame(lst, columns=["urn", "public_domain"])
    
    return df.merge(pd_df, on="urn", how="outer")

# Check urns for public domain
k1800_pd = check_all_urn(k1800)
k1840_pd = check_all_urn(k1840)

# Export metadata.
# NOTE(review): these are the same paths the metadata was read from, so the
# input files are overwritten with the extended tables. Presumably a re-run
# would then find a public_domain column already present; verify how the
# merge in check_all_urn behaves in that case.
k1800_pd.to_excel("metadata/1800-1839_metadata.xlsx", index=False)
k1840_pd.to_excel("metadata/1840-1869_metadata.xlsx", index=False)

In [41]:
# Distribution of the lookup results for both corpora (NaN included)
k1840_pd.public_domain.value_counts(dropna=False), k1800_pd.public_domain.value_counts(dropna=False)

(public_domain
 True                                                                   240
 {'error': 'Could not find urn URN:NBN:no-nb_digibok_2013050224022'}      1
 Name: count, dtype: int64,
 public_domain
 True    101
 Name: count, dtype: int64)

In [42]:
# Show the metadata row(s) for one specific URN in the 1840-1869 corpus
k1840_pd.loc[k1840_pd.urn == "URN:NBN:no-nb_digibok_2013050224022"]


Unnamed: 0.1,Unnamed: 0,urn,dhlabid,title,authors,oaiid,sesamid,isbn10,city,timestamp,...,langs,subjects,ddc,genres,literaryform,doctype,ocr_creator,ocr_timestamp,category,public_domain
118,95,URN:NBN:no-nb_digibok_2013050224022,100622091,Hærmændene paa Helgeland : Skuespil i fire Akter,"Ibsen , Henrik",oai:nb.bibsys.no:999405420484702202,77a228a6cbe227e297c7f2253e81237b,,Christiania,18580101,...,nob,,,drama,Skjønnlitteratur,digibok,dhlab,20221201,Diktning: Dramatikk,{'error': 'Could not find urn URN:NBN:no-nb_di...


In [18]:
# Fetch the texts of the 1800-1839 corpus into their own directory
destination = pathlib.Path("texts") / "1800-1839"
destination.mkdir(parents=True, exist_ok=True)

for urn, dhlabid in tqdm(k1800[["urn", "dhlabid"]].to_numpy()):
    get_text_dhlab(urn, dhlabid, destination=destination)
    get_text_alto(urn, dhlabid, destination=destination)

  0%|          | 0/101 [00:00<?, ?it/s]

In [25]:
# Fetch the texts of the 1840-1869 corpus into their own directory
destination = pathlib.Path("texts") / "1840-1869"
destination.mkdir(parents=True, exist_ok=True)

for urn, dhlabid in tqdm(k1840[["urn", "dhlabid"]].to_numpy()):
    get_text_dhlab(urn, dhlabid, destination=destination)
    get_text_alto(urn, dhlabid, destination=destination)

  0%|          | 0/239 [00:00<?, ?it/s]

URN:NBN:no-nb_digibok_2013050224022 {'error': 'Text is not free to use'}
Could not find /mnt/md1/new_2023/bok_txt/no-nb_digibok_2020022048003
