# Parse Excel files from norn collaborators

In [2]:
import pandas as pd
import numpy as np
import re
from typing import List, Iterable, Tuple, NamedTuple, TypedDict

from norn.poems import Poem, PoemCollection, create_list_of_books, PoemsTester
from norn.utils import is_valid_digibok_urn

In [3]:
# Import Ranveig's data: two Excel workbooks are listed, but only the one
# bound to `data` is read in this cell.
data1 = "pabegynt_1880_dikt__sidetall.xlsx"  # not read in this cell
data2 = "1890 enkeltdikt.xlsx"
data = data2  # the workbook that is read below


# header=None: no spreadsheet row is used for column names, so the columns are
# numbered positionally and every row (sheet title, per-book rows, column
# heading rows) is kept as data.
df = pd.read_excel(data, header=None)


In [4]:
# Display the raw sheet as read (pandas truncates the view to the first and last rows)
df

Unnamed: 0,0,1,2,3,4,5,6
0,1890 enkeltdikt,,,,,,
1,URN:NBN:no-nb_digibok_2009032303011,"Andersen, Tryggve",Digte,1898,,,
2,,Tittel på dikt,Sidetall START,Sidetall SLUTT,Overlapp,Mangler digital visning,Kommentarer
3,,Variation,8,9,,,
4,,Pigen med fuglefælden,10,12,,,
...,...,...,...,...,...,...,...
3222,,I kvelden,91,92,x,,
3223,,Ljose netter,92,93,x,,
3224,,17de mai 1892,93,95,x,,
3225,,Matpoesi,95,96,x,,


In [5]:
# Create a list of poems using the parser script. Despite its name,
# create_list_of_books yields Poem records here, as the preview shows.
poems = create_list_of_books(df)
poems[:2]  # preview the first two parsed poems

[Poem(urn='URN:NBN:no-nb_digibok_2009032303011', title='Variation', page_start=8, page_end=9, overlapp=nan, digital_visning=nan, comment=nan, dhlabid=None, pages=None),
 Poem(urn='URN:NBN:no-nb_digibok_2009032303011', title='Pigen med fuglefælden', page_start=10, page_end=12, overlapp=nan, digital_visning=nan, comment=nan, dhlabid=None, pages=None)]

In [6]:
# Tabulate the parsed poems, one row per Poem.
# NOTE: this rebinds `df`, replacing the raw spreadsheet frame read earlier;
# re-running the parsing cell after this one would hand it the wrong frame.
df = pd.DataFrame(poems)

In [7]:
# Find the poems whose start page is a string rather than a number
df.loc[df.page_start.apply(lambda x: isinstance(x, str))]

Unnamed: 0,urn,title,page_start,page_end,overlapp,digital_visning,comment,dhlabid,pages
2592,URN:NBN:no-nb_digibok_2009032303010,Ein Grust,droppe,,,,På tysk!,,


In [8]:
# Test the freshly parsed poems. The raw spreadsheet data fails validation at
# this point (a non-numeric start page was found above), so the failure is
# caught and reported instead of aborting a top-to-bottom run of the notebook.
try:
    PoemsTester(poems)
except AssertionError as err:
    print(f"AssertionError: {err}")

AssertionError: Page start not valid

In [9]:
# Check the URNs: this selects the rows for which is_valid_digibok_urn is
# truthy (presumably it returns a bool per URN; confirm in norn.utils), so a
# row with an invalid URN would be absent from the displayed frame.
df.loc[df.urn.apply(is_valid_digibok_urn)]

Unnamed: 0,urn,title,page_start,page_end,overlapp,digital_visning,comment,dhlabid,pages
0,URN:NBN:no-nb_digibok_2009032303011,Variation,8,9.0,,,,,
1,URN:NBN:no-nb_digibok_2009032303011,Pigen med fuglefælden,10,12.0,,,,,
2,URN:NBN:no-nb_digibok_2009032303011,Troldsøstre,13,14.0,,,,,
3,URN:NBN:no-nb_digibok_2009032303011,I Sarons dal,15,16.0,,,,,
4,URN:NBN:no-nb_digibok_2009032303011,Idyl,17,18.0,,,,,
...,...,...,...,...,...,...,...,...,...
2936,URN:NBN:no-nb_digibok_2010081610001,Et elskovsdrama,17,18.0,x,,,,
2937,URN:NBN:no-nb_digibok_2010081610001,Eneboerens livshistorie,18,21.0,x,,Del I til V,,
2938,URN:NBN:no-nb_digibok_2010081610001,Forkröblet kludder,22,32.0,,,"Replikker, dramatisk dikt",,
2939,URN:NBN:no-nb_digibok_2010081610001,For sent,33,33.0,,,,,


## Remove error

In [10]:
# Check the data types: page_start is `object` and page_end is float64,
# so neither page column holds clean integers yet.
df.dtypes

urn                 object
title               object
page_start          object
page_end           float64
overlapp            object
digital_visning     object
comment             object
dhlabid             object
pages               object
dtype: object

In [11]:
# Check for NaN values in page_end (rows with no end page recorded)
df.loc[df["page_end"].isna()]

Unnamed: 0,urn,title,page_start,page_end,overlapp,digital_visning,comment,dhlabid,pages
2592,URN:NBN:no-nb_digibok_2009032303010,Ein Grust,droppe,,,,På tysk!,,


In [12]:
# Drop rows with NaN in page_end. In this data the only such row is also the
# one whose page_start is a string, so this removes the invalid entry.
# The explicit .copy() makes the result an independent frame, so assigning to
# its columns afterwards does not trigger pandas' SettingWithCopyWarning.
df = df.loc[df["page_end"].notna()].copy()

In [13]:
# Convert page_end to int. The column is float64 at this point (presumably
# because it held a NaN); the cast requires that NaN rows are already removed.
df['page_end'] = df['page_end'].astype(int)

In [14]:
# Check for non-int values in page_end (an empty result means every value is an int)
df.loc[df["page_end"].apply(lambda x: not isinstance(x, int))]

Unnamed: 0,urn,title,page_start,page_end,overlapp,digital_visning,comment,dhlabid,pages


In [22]:
# Rebuild the Poem objects from the cleaned frame. Each row's values are
# passed positionally, so the column order must match Poem's field order.
poems = [Poem(*row_values) for row_values in df.to_numpy()]

In [23]:
# Test the poems again now that the invalid row has been removed. The trailing
# semicolon keeps the PoemsTester repr (with its memory address) out of the
# cell output; anything the tester prints is still shown.
PoemsTester(poems);

All tests passed


<norn.poems.PoemsTester at 0x7fb81af9a9b0>

## Assign an ID to each row

Each norn poem should have a dhlab ID of the form:

`dhlab_norn_poem_00001`

In [24]:
def assing_dhlabid(poems: List[Poem]) -> List[Poem]:
    """Give every poem a sequential dhlab identifier.

    Identifiers are 1-based, zero-padded to five digits and follow the order
    of ``poems`` (``dhlab_norn_poem_00001``, ``dhlab_norn_poem_00002``, ...).
    The poems are updated in place and the same list object is returned.
    """
    id_format = "dhlab_norn_poem_{:05}"

    for number, entry in enumerate(poems, start=1):
        entry.dhlabid = id_format.format(number)
    return poems

In [25]:
# Assign the ids. The poems are modified in place, so the returned list does
# not need to be kept.
assing_dhlabid(poems)

# Re-validate now that the ids have been set
PoemsTester(poems)

# Rebuild the table from the updated poems so the dhlabid column is filled in
df = pd.DataFrame(poems)
df

All tests passed


Unnamed: 0,urn,title,page_start,page_end,overlapp,digital_visning,comment,dhlabid,pages
0,URN:NBN:no-nb_digibok_2009032303011,Variation,8,9,,,,dhlab_norn_poem_00001,
1,URN:NBN:no-nb_digibok_2009032303011,Pigen med fuglefælden,10,12,,,,dhlab_norn_poem_00002,
2,URN:NBN:no-nb_digibok_2009032303011,Troldsøstre,13,14,,,,dhlab_norn_poem_00003,
3,URN:NBN:no-nb_digibok_2009032303011,I Sarons dal,15,16,,,,dhlab_norn_poem_00004,
4,URN:NBN:no-nb_digibok_2009032303011,Idyl,17,18,,,,dhlab_norn_poem_00005,
...,...,...,...,...,...,...,...,...,...
2935,URN:NBN:no-nb_digibok_2010081610001,Et elskovsdrama,17,18,x,,,dhlab_norn_poem_02936,
2936,URN:NBN:no-nb_digibok_2010081610001,Eneboerens livshistorie,18,21,x,,Del I til V,dhlab_norn_poem_02937,
2937,URN:NBN:no-nb_digibok_2010081610001,Forkröblet kludder,22,32,,,"Replikker, dramatisk dikt",dhlab_norn_poem_02938,
2938,URN:NBN:no-nb_digibok_2010081610001,For sent,33,33,,,,dhlab_norn_poem_02939,


## Export

In [26]:
# Write the final table to poems.csv (path relative to the working directory);
# index=False leaves the DataFrame's row index out of the file.
df.to_csv("poems.csv", index=False)

File page: https://api.nb.no/catalog/v1/metadata/URN:NBN:no-nb_digibok_2014110308039/altos/URN:NBN:no-nb_digibok_2014110308039_C1

https://api.nb.no/catalog/v1/metadata/URN:NBN:no-nb_digibok_2014110308039/altos/URN:NBN:no-nb_digibok_2014110308039_I1

