This notebook shows an example of how to extract data from the KB xml files and store the data in a pandas dataframe.

The script reads the KB xml files from the folder _kb_xml_files_ and outputs a dataframe.

Description of tag names.

- 008 - a language tag.
- 041 - an additional language tag. It occurs, for example, when the language code in 008 is 'mul' (multi-language).
- 096 - There are several fields in 096.
- 100 - author tag 1
- 198 - (300 + 500 + 505 fields that are mapped together) is a field with various metadata
- 245 - title field
- 260 - place of publication, publisher and year of publication
- 650 - subject field1. 
- 651 - subject field2
- 653 - subject field4
- 084 - subject field3
- 700 - author tag 2 (used when there is no content in 100; sometimes there is data in both fields)

### Inspect the KB collection xml files

In [2]:
from bs4 import BeautifulSoup 
import re
import pandas as pd
import os

In [3]:
# Move into the folder holding the KB xml files.
# A plain relative name works on every OS (the previous '.\kb_xml_files' only
# resolves on Windows), and the guard keeps the cell safe to re-run: a second
# chdir from inside the folder would otherwise raise FileNotFoundError.
if os.path.basename(os.getcwd()) != 'kb_xml_files':
    os.chdir('kb_xml_files')

In [16]:
# Load the complete xml export into memory as text
with open(r'kb_collections_1.xml', 'r', encoding='utf-8') as f:
    data_in = f.read()

# Parse the text with BeautifulSoup's xml parser
data = BeautifulSoup(data_in, "xml")

# Names of all tags found in the document (kept for manual inspection)
all_tags = [tag.name for tag in data.find_all()]

# Every <record> element; the extractor functions below take one of these each
record = list(data.find_all('record'))

print (f'The xml holds {len(record)} records.')

The xml holds 10000 records.


### 008 - extract content of language tags

In [70]:
def get_lang(rec):
    """Return the language code stored in control field 008 of a record.

    Parameters
    ----------
    rec : bs4.element.Tag
        A single ``<record>`` element.

    Returns
    -------
    str or None
        The characters at positions ``[-5:-2]`` of the text that follows the
        last "two digits + pipe" separator in field 008, or ``None`` when the
        record has no 008 control field.
    """
    content_008 = rec.find('controlfield', {'tag': '008'})
    if content_008 is None:
        # Same contract as the other extractors in this notebook: a missing
        # field yields None instead of raising AttributeError.
        return None
    # Keep only the part after the last "<two digits>|" separator ...
    tail = re.split(r'\d{2}\|', content_008.get_text())[-1]
    # ... and take the three characters that end two positions before its end.
    return tail[-5:-2]
    
# Language code of every record
language = [get_lang(i) for i in record]

# Number of records whose language code is missing (None) or empty
empty_lan_fields = sum(1 for item in language if item is None or item == '')
print(f'{empty_lan_fields} language fields are empty.')
print ('mul = multi-language, grc = greek, und = undefined')
# Uses `language`, the list built above; the former `language1` was never
# defined, so this line raised NameError on a fresh kernel.
print (f'A set of the first hundred: {set(language[0:100])}')


0 language fields are empty.
mul = multi-language, grc = greek, und = undefined
A set of the first hundred: {'dan', 'rus', 'heb', 'ger', 'ita', 'nor', 'und', 'fin', 'dut', 'swe', 'lat', 'fre'}


### 041 - extra language info

In [71]:
def get_extra_lang_info(rec):
    """Join the subfield texts of the first 041 datafield with ', '.

    Returns None when the record has no 041 datafield (the lookup then
    fails with AttributeError, which is caught here).
    """
    try:
        field_041 = rec.find('datafield', {'tag': '041'})
        texts = []
        for sub in field_041.find_all('subfield'):
            texts.append(sub.get_text())
        return ', '.join(texts)
    except AttributeError:
        return None

# 041 text of every record (None where the field is absent)
extra_lang_info = list(map(get_extra_lang_info, record))

# Tally the records without any 041 text
count = len([entry for entry in extra_lang_info if entry is None or entry == ''])
empty_ext_lan_fields = count

print(f'{empty_ext_lan_fields} extra language fields are empty.')

9891 extra language fields are empty.


### 096 - a mixed field
096 holds shelf numbers, subject codes, and more.

In [72]:
def get_shelf(rec):
    """Return the text of the first subfield of the first 096 datafield.

    Returns None when the record has no 096 datafield or that datafield
    has no subfield.
    """
    field_096 = rec.find('datafield', {'tag':'096'})
    if field_096 is None:
        return None
    first_subfield = field_096.find('subfield')
    if first_subfield is None:
        return None
    return first_subfield.get_text()


# The extractor is called get_shelf so that the result list can keep the name
# `shelf` without overwriting the function it was built with.
shelf = [get_shelf(i) for i in record]

# Number of records without a 096 value (None or empty string)
empty_shelf_fields = sum(1 for item in shelf if item is None or item == '')

print(f'{empty_shelf_fields} empty_shelf_fields are empty.')
print (f'A set of the first hundred values: {set(shelf[0:100])}')

3360 empty_shelf_fields are empty.
A set of the first hundred values: {'67, 214', '54,-352 8°', '151, 366', '58,-351 8 °', 'Folketeatret 8', 'Hielmst. 186 4°', '151, 438', '75:4, 357', '91, 9', None, 'Hielmst. 33 2°', 'Jens Peter Larsens samling', '67, 216', '171, 132', 'ACN MBV Biog', 'Bibliotheca Danica - Retropost', 'Hielmst. 1 8°', '151, 399', '89, 291', '18,-118 8°', 'DA-Småtryk', '1,-28 4°', '151, 344', '151, 401', '67, 220', '86, 241'}


### 198 - miscellaneous meta data

In [73]:
def get_misc(rec):
    """Return the text of the first 198 datafield with '<biblioteksnr>' removed.

    Returns None when the record has no 198 datafield.
    """
    field_198 = rec.find('datafield', {'tag':'198'})
    if field_198 is None:
        return None
    return field_198.get_text().replace('<biblioteksnr>','')


# The extractor is called get_misc so that the result list can keep the name
# `misc` (used later when the dataframe is built) without overwriting the function.
misc = [get_misc(i) for i in record]

# Number of records without a 198 value (None or empty string)
empty_misc_fields = sum(1 for item in misc if item is None or item == '')

print(f'{empty_misc_fields} empty_misc_fields are empty.')
print (f'A set of the first twenty {set(misc[0:20])}')

768 empty_misc_fields are empty.
A set of the first twenty {'(28)394 s.', '[8], 323, [5] bl. :portr.', '[1 bd.].', '43 s.', '176 s.', '144 sider.', '522 s.', '1 Bl. ;8°.', '1 bd.Revolutionsår: X.', '1 bd.', '1 bd. :ill.Originalår: 1853.', '16, 102 sider.Skuespil.Oversat fra græsk.', '16 s.', '128 s.'}


### 650 - subject field 1 contains data field "a" that we could be interested in.

In [74]:
def get_subject_field1(rec):
    """Return the text of subfield 'a' of the first 650 datafield.

    Returns None when the record has no 650 datafield or that datafield
    has no subfield with code 'a'.
    """
    field_650 = rec.find('datafield', {'tag': '650' })
    if field_650 is None:
        return None
    subfield_a = field_650.find('subfield', {'code':'a'})
    if subfield_a is None:
        return None
    return subfield_a.get_text()


# The extractor is called get_subject_field1 so that the result list can keep
# the name `subject_field1` (used later when the dataframe is built) without
# overwriting the function.
subject_field1 = [get_subject_field1(i) for i in record]

# Number of records without a 650 'a' value (None or empty string)
empty_subject_field1 = sum(1 for item in subject_field1 if item is None or item == '')

# The message previously said "empty_misc_fields", a leftover from the 198 cell.
print(f'{empty_subject_field1} subject field 1 entries are empty.')
print (f'Set of the first hundred: {set(subject_field1[0:100])}')

9481 empty_misc_fields are empty.
Set of the first hundred: {None, 'Stambger Heste.', 'Kvælstofgødning.', 'Surfaces.', 'belønning.'}


### 651 - subject field 2

In [75]:
def get_subject_field2(rec):
    """Return the texts of all subfields of the first 651 datafield as a list.

    Returns None when the record has no 651 datafield.
    """
    data_651 = rec.find('datafield', {'tag': '651' })
    if data_651 is None:
        return None
    # Extract text from each subfield within the datafield
    return [subfield.get_text() for subfield in data_651.find_all('subfield')]


def clean_sub_field2(subject):
    """Strip a trailing ' z' from `subject`; return None for any other string."""
    # NOTE(review): every string that does not end in ' z' is discarded (None).
    # That is what turns the text 'None' (str() of a missing 651 result) back
    # into None, but it would also drop a genuine 651 value that lacks the
    # trailing ' z' - confirm against the data that no such value exists.
    if subject[-2:] == ' z':
        return subject[:-2]
    return None


# The extractor is called get_subject_field2 so that the final list can keep
# the name `subject_field2` (used later when the dataframe is built) without
# overwriting the function.
subject_field2_var = [get_subject_field2(i) for i in record]
# str() turns each list (or None) into text; the regex then keeps the
# whitespace-separated tokens trimmed to their word boundaries, which drops
# the list brackets, quotes and commas.
subject_field2_var_2 = [' '.join(re.findall(r'\b\S+\b', str(i))) for i in subject_field2_var]
subject_field2 = [clean_sub_field2(i) for i in subject_field2_var_2]

# Number of records without a 651 value (None or empty string)
empty_subject_field2 = sum(1 for item in subject_field2 if item is None or item == '')

# The message previously said "empty_misc_fields", a leftover from the 198 cell.
print(f'{empty_subject_field2} subject field 2 entries are empty.')
print (f'Set of values {set(subject_field2)}')


9964 empty_misc_fields are empty.
Set of values {'Pergamon', 'Hyllested', 'Hillerød Frederiksborg Slot', None, 'Karnataka', 'Sverige', 'Rom Galleria Borghese', 'Frankrig', 'kunstsamlinger', 'København Det Kongelige Danske Kunstakademi', 'Italien', 'Island', 'Østrig', 'Danmark', 'Venezia', 'Tyskland', 'Ribe amt', 'København', 'Kattegat', 'Napoli Museo Nazionale', 'Storbritannien', 'København Kunstforeningen', 'Herlufsholm', 'København Thorvaldsens Museum'}


### 084 - subject tag 3 contains three data fields (o, n, a) that we are interested in.

In [76]:
def get_subject_field3(rec):
    """Return ('code X', text) for the first 084 datafield of a record.

    The subfield codes are tried in the order 'o', 'n', 'a'; the first one
    present decides the result. Returns None when the record has no 084
    datafield or none of the three subfields exists.
    """
    field_084 = rec.find('datafield', {'tag':'084'})
    if field_084 is None:
        return None
    # One lookup per code instead of repeating the whole search chain twice
    for code in ('o', 'n', 'a'):
        subfield = field_084.find('subfield', {'code' : code})
        if subfield is not None:
            return f'code {code}', subfield.get_text()
    return None


# The extractor is called get_subject_field3 so that the result list can keep
# the name `subject_field3` (used later when the dataframe is built) without
# overwriting the function.
subject_field3 = [get_subject_field3(i) for i in record]

# Inspect the result: number of records without a 084 value
empty_subject_field3 = sum(1 for item in subject_field3 if item is None or item == '')

# The message previously said "empty_misc_fields", a leftover from the 198 cell.
print(f'{empty_subject_field3} subject field 3 entries are empty.')
print (f'Set of the first hundred: {set(subject_field3[0:100])}')

5352 empty_misc_fields are empty.
Set of the first hundred: {('code a', '444'), ('code n', '67, 220'), ('code n', '91, 9'), None, ('code n', 'ACN'), ('code o', '1,-27'), ('code n', '86, 241'), ('code n', '75:4, 357'), ('code a', '396'), ('code a', '99.4 Dickens, Charles'), ('code o', '54,-352'), ('code n', '89, 291'), ('code n', 'DA58,-351'), ('code n', '151, 401'), ('code n', '67, 216'), ('code n', '151, 438'), ('code n', '151, 399'), ('code o', '18,-118'), ('code n', '67, 214'), ('code o', '2,-1'), ('code a', '92.4'), ('code n', '151, 344'), ('code n', '151, 366'), ('code o', '1,-28'), ('code n', '171, 132'), ('code a', '96.9'), ('code o', '4,-17')}


### 653 - subject field 4

In [77]:
def get_subject_field4(rec):
    """Return the subfield texts of the first 653 datafield joined by spaces.

    Returns None when the record has no 653 datafield.
    """
    data_653 = rec.find('datafield', {'tag': '653' })
    if data_653 is None:
        return None
    # Extract text from each subfield within the datafield
    return ' '.join(subfield.get_text() for subfield in data_653.find_all('subfield'))


# The extractor is called get_subject_field4 so that the result list can keep
# the name `subject_field4` (used later when the dataframe is built) without
# overwriting the function.
subject_field4 = [get_subject_field4(i) for i in record]
print (f'Set of the first hundred: {set(subject_field4[0:100])}')

Set of the first hundred: {None, 'Theologi Kirkehistorie.', 'Mathematik Ren Mathematisk Analyse.', 'Theologi Den christelige Moral.', 'Danmarks historie.', 'Theologi Indledning til Theologien.', 'Historie. I Danmark.', 'kvindeskildringer.', 'Danmarks litterære forhold.'}


### 245 - Title

In [78]:
def get_title(rec):
    """Return the title of a record from datafield 245.

    The texts of all direct children of the first 245 datafield are joined
    with single spaces.

    Returns
    -------
    str or None
        The joined title, or None when the record has no 245 datafield
        (previously this case raised TypeError).
    """
    title_tag = rec.find('datafield', {'tag': '245'})
    if title_tag is None:
        # Same contract as the other extractors in this notebook
        return None
    return ' '.join(child.get_text() for child in title_tag)

# Title of every record
title = list(map(get_title, record))

# Inspect the result: tally the records whose title is None or empty
count = len([entry for entry in title if entry is None or entry == ''])
empty_title_fields = count
print(f'{empty_title_fields} title fields are empty.')

0 title fields are empty.


### 100 - Author tag1

In [79]:
def get_author1(rec):
    """Return the cleaned author name from subfield 'a' of datafield 100.

    Brackets, braces, parentheses, colons, semicolons and apostrophes are
    each replaced by a space, and every run of two or more full stops is
    replaced by a single space. Returns None when the lookup fails with
    AttributeError (no 100 datafield, or no subfield 'a' in it).
    """
    # Each of these characters becomes one space
    to_space = str.maketrans({ch: ' ' for ch in "[]:();}{'"})
    try:
        raw_name = rec.find('datafield', {'tag':'100'}).find('subfield', {'code' : 'a'}).get_text()
        spaced_name = raw_name.translate(to_space)
        # sub two or more following full stops
        return re.sub(r'\.{2,}', ' ', spaced_name)
    except AttributeError:
        return None

    
# First author (datafield 100) of every record
author1 = list(map(get_author1, record))

# Inspect the result: tally the records whose author is None or empty
count = len([entry for entry in author1 if entry is None or entry == ''])
empty_author_fields = count
print(f'{empty_author_fields} author fields are empty.')

4716 author fields are empty.


### 700 - Author tag2

In [80]:
def get_author2(rec):
    """Return the cleaned author name from subfield 'a' of datafield 700.

    Brackets, braces, parentheses, colons, semicolons and apostrophes are
    each replaced by a space, and every run of two or more full stops is
    replaced by a single space. Returns None when the lookup fails with
    AttributeError (no 700 datafield, or no subfield 'a' in it).
    """
    # Each of these characters becomes one space
    to_space = str.maketrans({ch: ' ' for ch in "[]:();}{'"})
    try:
        raw_name = rec.find('datafield', {'tag':'700'}).find('subfield', {'code' : 'a'}).get_text()
        spaced_name = raw_name.translate(to_space)
        # sub two or more following full stops
        return re.sub(r'\.{2,}', ' ', spaced_name)
    except AttributeError:
        return None

    
# Second author (datafield 700) of every record
author2 = list(map(get_author2, record))

# Inspect the result: tally the records whose author is None or empty
count = len([entry for entry in author2 if entry is None or entry == ''])
empty_author_fields = count
print(f'{empty_author_fields} author fields are empty.')

7204 author fields are empty.


### 260 - extract the place, publisher, and year tags and store in a tuple

In [81]:
# Collect the place (a), publisher (b) and year (c) subfields of datafield 260
def get_ppy_tags(rec):
    """Return (subfield a, subfield b, subfield c) of the first 260 datafield.

    A subfield that is absent contributes an empty string; when a code occurs
    more than once the last occurrence wins. Returns None when the record has
    no datafield with tag '260'.
    """
    for field in rec.find_all('datafield'):
        if field['tag'] != '260':
            continue

        found = {'a': "", 'b': "", 'c': ""}
        for sub in field.find_all('subfield', {'code': ['a', 'b', 'c']}):
            code, text = sub['code'], sub.text
            if code in found:
                found[code] = text

        # Only the first 260 datafield is considered
        return found['a'], found['b'], found['c']
    return None

# (place, publisher, year) tuple of every record; None where field 260 is absent
ppy = list(map(get_ppy_tags, record))




# Extract place
# Return the cleaned content of the place tag, or '' when there is no content
def get_place(tup):
    """Return the cleaned place of publication from a (place, publisher, year) tuple.

    Commas, square brackets, colons, parentheses, semicolons and apostrophes
    are removed, runs of two or more full stops are dropped, and surrounding
    whitespace is stripped.

    Parameters
    ----------
    tup : tuple or None
        A (place, publisher, year) tuple, or None when the record had no
        such data.

    Returns
    -------
    str
        The cleaned place, or '' when `tup` is None, too short, or holds a
        non-string in position 0.
    """
    try:
        # clean text string
        text_string = tup[0].translate(str.maketrans('', '', ",[]:();'"))
        # sub two or more following full stops
        text_string = re.sub(r'\.{2,}', '', text_string)
    except (TypeError, IndexError, AttributeError):
        # Only the "no usable value" cases are swallowed; the former bare
        # `except:` also hid every unrelated error.
        return ''
    # Strip last, so whitespace exposed by the removed dots is trimmed as well
    return text_string.strip()
    
# Cleaned place of publication of every record
place = list(map(get_place, ppy))

# Inspect the result: tally the empty place strings
count = len([entry for entry in place if entry == ''])

empty_pla_fields = count
print(f'{empty_pla_fields} place fields are empty.')



# Publisher
# Extract and clean text strings from the publisher tag
def get_publisher(tup):
    """Return the cleaned publisher from a (place, publisher, year) tuple.

    Commas, square brackets, colons, parentheses, semicolons and apostrophes
    are removed, runs of two or more full stops are dropped, and surrounding
    whitespace is stripped.

    Parameters
    ----------
    tup : tuple or None
        A (place, publisher, year) tuple, or None when the record had no
        such data.

    Returns
    -------
    str
        The cleaned publisher, or '' when `tup` is None, too short, or holds
        a non-string in position 1.
    """
    try:
        # clean text string
        text_string = tup[1].translate(str.maketrans('', '', ",[]:();'"))
        # sub two or more following full stops
        text_string = re.sub(r'\.{2,}', '', text_string)
    except (TypeError, IndexError, AttributeError):
        # Only the "no usable value" cases are swallowed; the former bare
        # `except:` also hid every unrelated error.
        return ''
    # Strip last, so whitespace exposed by the removed dots is trimmed as well
    return text_string.strip()
    
# Cleaned publisher of every record
publisher = list(map(get_publisher, ppy))


# Inspect the result: tally the empty publisher strings
count = len([entry for entry in publisher if entry == ''])

empty_pub_fields = count
print(f'{empty_pub_fields} publisher fields are empty.')



# Year
# Extract and clean text strings from the year tag
def get_year(tup):
    """Return the cleaned year from a (place, publisher, year) tuple.

    Commas, square brackets, colons, parentheses, semicolons, apostrophes
    and every full stop are removed, and surrounding whitespace is stripped.

    Parameters
    ----------
    tup : tuple or None
        A (place, publisher, year) tuple, or None when the record had no
        such data.

    Returns
    -------
    str
        The cleaned year text, or '' when `tup` is None, too short, or holds
        a non-string in position 2.
    """
    try:
        # clean text string
        text_string = tup[2].translate(str.maketrans('', '', ",[]:();'"))
        # remove every full stop (unlike place/publisher, single dots go too)
        text_string = re.sub(r'\.', '', text_string)
    except (TypeError, IndexError, AttributeError):
        # Only the "no usable value" cases are swallowed; the former bare
        # `except:` also hid every unrelated error.
        return ''
    # Strip last, so whitespace exposed by the removed dots is trimmed as well
    return text_string.strip()
    
# Cleaned year text of every record
year = list(map(get_year, ppy))

# Inspect the result: tally the empty year strings
count = len([entry for entry in year if entry == ''])

empty_year_fields = count
print(f'{empty_year_fields} year fields are empty.')

# How many year items doesn't have four digits?
pattern = r'^\d{4}$'

# One is added for every year text the pattern does not match
count_not_four_digits = sum(1 for item in year if not re.match(pattern, item))

print(f"{count_not_four_digits} of the year items are not four-digit numbers.")


344 place fields are empty.
8242 publisher fields are empty.
37 year fields are empty.
1236 of the year items are not four-digit numbers.


### An example of a dataframe built from the xml

In [82]:
# One column per extracted list; all lists hold one entry per record, in record order
df = pd.DataFrame(dict(
    author1=author1,
    author2=author2,
    title=title,
    year=year,
    publisher=publisher,
    place=place,
    language=language,
    sub_1=subject_field1,
    sub_2=subject_field2,
    sub_3=subject_field3,
    sub_4=subject_field4,
    misc_comments=misc,
))

In [83]:
# Show the assembled dataframe as the cell output
df

Unnamed: 0,author1,author2,title,year,publisher,place,language,sub_1,sub_2,sub_3,sub_4,misc_comments
0,,,Onderwysinge in de christelijcke Religie by Vr...,1664,,Rotterdam,dut,,,"(code n, 91, 9)",,1 bd.
1,"Curtius Rufus, Quintus.",,Hoogberoemde historie van t'leven ende de dade...,1613,Adriaen Gerritsz,Delf,und,,,"(code n, 171, 132)",,"[8], 323, [5] bl. :portr."
2,Sulpicius Severus.,,Sulpicii Severi Historia sacra.,1635,Elsevir,Lugduni Batavorum,lat,,,"(code n, 86, 241)",,1 bd.
3,"Amyraut, Moyse.",,Six livres de la vocation des pasteurs.,1649,,Saumur,fre,,,"(code n, 89, 291)",,1 bd.
4,,"Bazin, R.","Le thermophosphate, sa nature, ses rendements,...",1891,,Paris,fre,Kvælstofgødning.,,"(code a, 396)",,43 s.
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,,"Delisle, Leopold.",Mandements et actes divers de Charles V (1364-...,1874,,Paris,fre,,,"(code n, 64, 28e)",,1 bd.
9996,"Kirchheim, R.",,Die neue Exegetenschule : eine kritische Dorne...,1867,,Breslau,ger,,,"(code n, 83, 20)",,40 s.
9997,"Lissauer, Abraham.",,Die praehistorischen Denkmäler der Provinz Wes...,1887,,Leipzig,ger,,,"(code n, 70, 92)",,"XI, 110 s., 9 tav."
9998,,"Jerdan, Will.","Rutland Papers : original Documents, illustrat...",1842,,London,eng,,,"(code n, 65, 30c)",,1 bd.


### An example of a dataframe that selects rows with value "dan" in column 'language'.

In [85]:
# Keep only the rows whose language is 'dan' and renumber them from 0
df_dan = df.loc[df['language'] == 'dan'].reset_index(drop=True)

In [86]:
# Show the 'dan' subset as the cell output
df_dan

Unnamed: 0,author1,author2,title,year,publisher,place,language,sub_1,sub_2,sub_3,sub_4,misc_comments
0,"Cramer, Chr.",,Arithmetica tyronica eller grundig Vejviisning...,1780,,Sorøe,dan,,,"(code o, 18,-118)",Mathematik Ren Mathematisk Analyse.,1 bd.
1,"Cramer, Chr.",,Arithmetica tyronica eller grundig Veiviisning...,1806,,Viborg,dan,,,"(code o, 18,-118)",Mathematik Ren Mathematisk Analyse.,176 s.
2,"Schjørring, Johanne.",,Rige Dage : Fortælling / Johanne Schjørring.,1877,Gyldendal,Kbh.,dan,,,,,144 sider.
3,Aristophanes.,,Ridderne / Aristofanes : Komedie ; oversat af ...,1857,Samfundet til Den Danske Literaturs Fremme,,dan,,,,,"16, 102 sider.Skuespil.Oversat fra græsk."
4,"Andersen, H. C.",,Billedbog uden Billeder.,1899,,Kbh,dan,,,,,[1 bd.].
...,...,...,...,...,...,...,...,...,...,...,...,...
3300,"Rostock, Mads Pedersen.",,Catechismus over Evangelierne og Epistlerne el...,1726,,Kjbh.,dan,,,,,912 s.
3301,"Blume, Adzer Hansen.",,Prædikener til hver Søn- og Helligdag i Kirkea...,1878,Hoffensberg Jespersen & Fr. Traps Etabl.,Kjbh.,dan,,,,,426 s.
3302,"Haas, Jonas.",,Samling af de evangeliske Biskoppers i Siællan...,1761,,Kibh.,dan,,,,,"120 s., [15] tav. (portr.).Rygtitel: Evangelis..."
3303,,,"Den augsburgske Confession, det er den evangel...",1818,,Haderslev,dan,,,,,16 s.
