# Parse excel from norn collaborators

In [64]:
import pandas as pd
import numpy as np
import re
from typing import List, Iterable, Tuple, NamedTuple, TypedDict

In [65]:
data1 = "pabegynt_1880_dikt__sidetall.xlsx"
data2 = "/home/larsm/projects/NORN/Extract_poems/1890 enkeltdikt.xlsx"
data = data2


df = pd.read_excel(data, header=None)


In [66]:
class Poem(NamedTuple):
    urn: str
    title: str
    page_start: int
    page_end: int
    overlapp: str
    digital_visning: str
    comment: str
    
    
    
class PoemCollection(NamedTuple):
    poems: List[Poem]
    urn : str
    author: str
    title: str
    year: int
    publisher: str
    publisher_place: str
    
    
def is_valid_urn(string: str):
    pattern = r'^URN:NBN:no-nb_digibok_\d+$'
    return re.match(pattern, string) is not None

In [67]:
df

Unnamed: 0,0,1,2,3,4,5,6
0,1890 enkeltdikt,,,,,,
1,URN:NBN:no-nb_digibok_2009032303011,"Andersen, Tryggve",Digte,1898,,,
2,,Tittel på dikt,Sidetall START,Sidetall SLUTT,Overlapp,Mangler digital visning,Kommentarer
3,,Variation,8,9,,,
4,,Pigen med fuglefælden,10,12,,,
...,...,...,...,...,...,...,...
3222,,I kvelden,91,92,x,,
3223,,Ljose netter,92,93,x,,
3224,,17de mai 1892,93,95,x,,
3225,,Matpoesi,95,96,x,,


In [68]:
# Create list of books

book_list = []
book = None
for i, row in df.iterrows():
    
    #if row[0] is not np.nan:
    if is_valid_urn(str(row[0])):
        if book is not None:
            book_list.append(book)
        book = PoemCollection([], *row[:6])
    else:
        if row[1] == "Tittel på dikt":
            continue
        elif row[1] is np.nan:
            continue
        else:
            book.poems.append(Poem(book.urn, *row[1:7]))

In [69]:
poem_list = [x.poems for x in book_list if len(x.poems) > 0] # Remove books without poems added

In [70]:
poems = [x for sublist in poem_list for x in sublist] # Flatten list of lists

In [71]:
poems

[Poem(urn='URN:NBN:no-nb_digibok_2009032303011', title='Variation', page_start=8, page_end=9, overlapp=nan, digital_visning=nan, comment=nan),
 Poem(urn='URN:NBN:no-nb_digibok_2009032303011', title='Pigen med fuglefælden', page_start=10, page_end=12, overlapp=nan, digital_visning=nan, comment=nan),
 Poem(urn='URN:NBN:no-nb_digibok_2009032303011', title='Troldsøstre', page_start=13, page_end=14, overlapp=nan, digital_visning=nan, comment=nan),
 Poem(urn='URN:NBN:no-nb_digibok_2009032303011', title='I Sarons dal', page_start=15, page_end=16, overlapp=nan, digital_visning=nan, comment=nan),
 Poem(urn='URN:NBN:no-nb_digibok_2009032303011', title='Idyl', page_start=17, page_end=18, overlapp=nan, digital_visning=nan, comment=nan),
 Poem(urn='URN:NBN:no-nb_digibok_2009032303011', title='Jeg bygger mit hus', page_start=19, page_end=20, overlapp=nan, digital_visning=nan, comment=nan),
 Poem(urn='URN:NBN:no-nb_digibok_2009032303011', title='To venner', page_start=21, page_end=22, overlapp=nan, d

In [72]:
df = pd.DataFrame(poems)

In [97]:
class PoemsTester:
    
    def __init__(self, poems : pd.DataFrame | Iterable[Poem]):
        
        if isinstance(poems, pd.DataFrame):
            poems = [Poem(*x) for x in poems.values]
        
        self.poems = poems
        
        
        
        assert self.test_urn(), "URN not valid"
        assert self.test_page_start(), "Page start not valid"
        assert self.test_page_end(), "Page end not valid"
        
        print("All tests passed")
        
    def test_urn(self):
        return all([is_valid_urn(x.urn) for x in self.poems])
    
    def test_page_start(self):
        return all([isinstance(x.page_start, int) for x in self.poems])
    
    def test_page_end(self):
        return all([isinstance(x.page_end, int) for x in self.poems])
    
    def test_overlapp(self):
        return all([isinstance(x, str) for x in self.poems])
    
    

In [74]:
df.loc[df.page_start.apply(lambda x: isinstance(x, str))]

Unnamed: 0,urn,title,page_start,page_end,overlapp,digital_visning,comment
2592,URN:NBN:no-nb_digibok_2009032303010,Ein Grust,droppe,,,,På tysk!


In [98]:
PoemsTester(poems)

All tests passed


<__main__.PoemsTester at 0x7ff6a067d1b0>

In [45]:
is_valid_urn("URN:NBN:no-nb_digibok_2012082908133")

True

In [83]:
df.loc[df.urn.apply(is_valid_urn)]

Unnamed: 0,urn,title,page_start,page_end,overlapp,digital_visning,comment
0,URN:NBN:no-nb_digibok_2009032303011,Variation,8,9,,,
1,URN:NBN:no-nb_digibok_2009032303011,Pigen med fuglefælden,10,12,,,
2,URN:NBN:no-nb_digibok_2009032303011,Troldsøstre,13,14,,,
3,URN:NBN:no-nb_digibok_2009032303011,I Sarons dal,15,16,,,
4,URN:NBN:no-nb_digibok_2009032303011,Idyl,17,18,,,
...,...,...,...,...,...,...,...
2936,URN:NBN:no-nb_digibok_2010081610001,Et elskovsdrama,17,18,x,,
2937,URN:NBN:no-nb_digibok_2010081610001,Eneboerens livshistorie,18,21,x,,Del I til V
2938,URN:NBN:no-nb_digibok_2010081610001,Forkröblet kludder,22,32,,,"Replikker, dramatisk dikt"
2939,URN:NBN:no-nb_digibok_2010081610001,For sent,33,33,,,


In [20]:
df

Unnamed: 0,urn,title,page_start,page_end,overlapp,digital_visning,comment
0,URN:NBN:no-nb_digibok_2009032303011,Variation,8,9,,,
1,URN:NBN:no-nb_digibok_2009032303011,Pigen med fuglefælden,10,12,,,
2,URN:NBN:no-nb_digibok_2009032303011,Troldsøstre,13,14,,,
3,URN:NBN:no-nb_digibok_2009032303011,I Sarons dal,15,16,,,
4,URN:NBN:no-nb_digibok_2009032303011,Idyl,17,18,,,
...,...,...,...,...,...,...,...
2920,URN:NBN:no-nb_digibok_2010081610001,Et elskovsdrama,17,18,x,,
2921,URN:NBN:no-nb_digibok_2010081610001,Eneboerens livshistorie,18,21,x,,Del I til V
2922,URN:NBN:no-nb_digibok_2010081610001,Forkröblet kludder,22,32,,,"Replikker, dramatisk dikt"
2923,URN:NBN:no-nb_digibok_2010081610001,For sent,33,33,,,


## Remove error

In [76]:
df.dtypes

urn                 object
title               object
page_start          object
page_end           float64
overlapp            object
digital_visning     object
comment             object
dtype: object

In [77]:
df.loc[df["page_end"].isna()]

Unnamed: 0,urn,title,page_start,page_end,overlapp,digital_visning,comment
2592,URN:NBN:no-nb_digibok_2009032303010,Ein Grust,droppe,,,,På tysk!


In [78]:
df = df.loc[df["page_end"].notna()]

In [79]:
df['page_end'] = df['page_end'].astype(int)

In [90]:
df.loc[df["page_end"].apply(lambda x: not isinstance(x, int))]

Unnamed: 0,urn,title,page_start,page_end,overlapp,digital_visning,comment


In [81]:
poems = [Poem(*x) for x in df.values]

In [95]:
PoemsTester(poems)

All tests passed


<__main__.PoemsTester at 0x7ff6a067e830>

## Export

In [99]:
df.to_csv("poems.csv", index=False)

File page  https://api.nb.no/catalog/v1/metadata/URN:NBN:no-nb_digibok_2014110308039/altos/URN:NBN:no-nb_digibok_2014110308039_C1

https://api.nb.no/catalog/v1/metadata/URN:NBN:no-nb_digibok_2014110308039/altos/URN:NBN:no-nb_digibok_2014110308039_I1

