# Create CBK data for Annif

In [25]:
import os
import pandas as pd
import numpy as np
from functools import reduce
import random
#from sklearn.model_selection import train_test_split
# from ast import literal_eval

## Create CBK thesaurus for Annif

In [2]:
# Load the CBK genre thesaurus (semicolon-separated CSV) and preview its first rows.
cbk_thesaurus_csv = '/Users/haighton_macbook/surfdrive/Demosaurus/clean-tables/thesaurus_CBK_genres.csv'
df_cbkthes = pd.read_csv(cbk_thesaurus_csv, sep=';')
df_cbkthes.head()

Unnamed: 0,identifier,genre
0,0,Aankleedpoppenboeken
1,1,Aanwijsboeken
2,2,Abc-boeken
3,3,Abc-prenten
4,4,Abecedaria


In [3]:
# Turn each thesaurus identifier into a URI of the form <http://kb.nl/thes/cbk/N>.
# The URI does not point to anything - just for testing purposes for now.
CBK_URI_PREFIX = '<http://kb.nl/thes/cbk/'

identifiers = df_cbkthes['identifier'].astype(str)
# Keep the cell idempotent: identifiers that already carry the prefix (because the
# cell was run before in this kernel) are left alone instead of being wrapped again.
already_uri = identifiers.str.startswith(CBK_URI_PREFIX)
df_cbkthes['identifier'] = identifiers.where(already_uri, CBK_URI_PREFIX + identifiers + '>')
df_cbkthes

Unnamed: 0,identifier,genre
0,<http://kb.nl/thes/cbk/0>,Aankleedpoppenboeken
1,<http://kb.nl/thes/cbk/1>,Aanwijsboeken
2,<http://kb.nl/thes/cbk/2>,Abc-boeken
3,<http://kb.nl/thes/cbk/3>,Abc-prenten
4,<http://kb.nl/thes/cbk/4>,Abecedaria
...,...,...
468,<http://kb.nl/thes/cbk/468>,Zilveren Zoenen
469,<http://kb.nl/thes/cbk/469>,Zoekplatenboeken
470,<http://kb.nl/thes/cbk/470>,Zondagsschoolboekjes
471,<http://kb.nl/thes/cbk/471>,Zuidafrikaanse verhalen


In [18]:
# Write the vocabulary as a headerless TSV with one '<URI><TAB>label' row per genre.
# index=False keeps the DataFrame row index out of the file: with the default
# (index=True) the running row number becomes the first column and the URI is
# pushed to the second, which is not the URI-first layout of an Annif TSV vocabulary.
# The explicit column list fixes the column order to URI, label.
df_cbkthes.to_csv(
    '/Users/haighton_macbook/Desktop/KB_Annif/data/vocabs/cbk_thesaurus_vocab.tsv',
    sep='\t',
    columns=['identifier', 'genre'],
    header=False,
    index=False,
)

## Create CBK dataset

In [4]:
# Load the publication tables and thesauri. All CSV inputs are semicolon-separated.
def _read_table(path, **read_csv_kwargs):
    """Read one semicolon-separated CSV, forwarding any extra options to pd.read_csv."""
    return pd.read_csv(path, sep=';', **read_csv_kwargs)


# Per-publication tables.
df_genre = _read_table('/Users/haighton_macbook/surfdrive/Demosaurus/clean-tables/publication_genrelist.csv')
df_samenvatting = _read_table(
    '/Users/haighton_macbook/surfdrive/Demosaurus/clean-tables/publication_samenvatting_inhoudsopgave.csv',
    usecols=[0, 1],  # only the first two columns are read
)
df_basicinfo = _read_table(
    '/Users/haighton_macbook/surfdrive/Demosaurus/clean-tables/publication_basicinfo.csv',
    usecols=['publication_ppn', 'titelvermelding', 'titelvermelding_processed'],
)
df_thema = _read_table('/Users/haighton_macbook/surfdrive/Demosaurus/clean-tables/publication_themalist.csv')
df_nur = _read_table(
    '/Users/haighton_macbook/surfdrive/Demosaurus/clean-tables/publication_NUR_rubriek.csv',
    dtype={'NUR_rubriek': np.int64},
)
df_nugi = _read_table('/Users/haighton_macbook/surfdrive/Demosaurus/clean-tables/publication_NUGI_genre.csv')
df_brinkman = _read_table(
    '/Users/haighton_macbook/surfdrive/Demosaurus/clean-tables/publication_brinkman.csv',
    usecols=[0, 1],  # only the first two columns are read
)

# Thesauri used to translate codes into terms.
df_brinkmanthes = _read_table('/Users/haighton_macbook/surfdrive/Demosaurus/clean-tables/thesaurus_brinkmantrefwoorden.csv')

# The NUR codes live in an HTML file: take the first table found in it and
# discard that table's first two columns.
nur_tables = pd.read_html('/Users/haighton_macbook/Desktop/KB_Annif/data/thesaurus_NUR_codes.html')
df_nurthes = nur_tables[0]
df_nurthes = df_nurthes.drop(columns=df_nurthes.columns[[0, 1]])

Er zijn maar weinig publicaties met een samenvatting (totaal 4404). Om toch nog wat 'unieke' data toe te voegen maken we gebruik van de bewerkte titel, NUR rubriek, brinkman en CBK thema. 

In [5]:
# Merge all publication tables into one DataFrame with one row per publication.

# Outer-join every table on publication_ppn, then replace all missing values by the
# sentinel string 'void'.
df_tmp = [df_basicinfo, df_genre, df_thema, df_samenvatting, df_nur, df_nugi, df_brinkman]
df = reduce(
    lambda left, right: pd.merge(left, right, on=['publication_ppn'], how='outer'),
    df_tmp,
).fillna('void')

# Get rid of publications without CBK genres.
df = df[df['genres'] != 'void']

# Add the Brinkman term next to its ppn. pd.merge defaults to an inner join, so rows
# whose 'brinkman' value has no match in the thesaurus are dropped here. A publication
# with several Brinkman terms has one row per term; these are collapsed again below.
df = pd.merge(df, df_brinkmanthes, left_on='brinkman', right_on='ppn').drop('ppn', axis=1)

# Add NUR-code meaning. Non-numeric values (the 'void' sentinel) are coerced to NaN;
# the inner join then keeps only rows whose code occurs in the NUR thesaurus.
df['NUR_rubriek'] = pd.to_numeric(df["NUR_rubriek"], errors='coerce', downcast='unsigned')
df = pd.merge(df, df_nurthes, left_on='NUR_rubriek', right_on='NUR-code')
df = df.drop('NUR_rubriek', axis=1)

# Change column names. This is positional, so it relies on the column order produced
# by the merges above.
df.columns = ['publication_ppn', 'titel', 'titel_processed', 'CBK_genre', 'CBK_thema', 'samenvatting', 'NUGI_genre', 'brinkman_ppn', 'brinkman_onderwerp', 'brinkman_type', 'NUR_code', 'NUR_onderwerp']

# Remove '@' from titles (literal character, not a regular expression).
df['titel'] = df['titel'].str.replace('@', '', regex=False)

# Deal with duplicates. drop_duplicates returns a new frame, so the result has to be
# assigned; calling it without assignment leaves the duplicates in place and repeats
# Brinkman terms in the joined strings below.
df = df.drop_duplicates()

# Collapse to one row per publication: single-valued fields keep their first value,
# the Brinkman fields of all rows are joined with ', '.
df = df.groupby('publication_ppn').agg({
    'titel': 'first',
    'titel_processed': 'first',
    'CBK_genre': 'first',
    'CBK_thema': 'first',
    'samenvatting': 'first',
    'NUGI_genre': 'first',
    'brinkman_ppn': ', '.join,
    'brinkman_onderwerp': ', '.join,
    'brinkman_type': ', '.join,
    'NUR_code': 'first',
    'NUR_onderwerp': 'first',
}).reset_index()

df

Unnamed: 0,publication_ppn,titel,titel_processed,CBK_genre,CBK_thema,samenvatting,NUGI_genre,brinkman_ppn,brinkman_onderwerp,brinkman_type,NUR_code,NUR_onderwerp
0,037096060,De Rode Prinses,"['rode', 'prins']","['zilveren griffels', 'zilveren penselen', 'ne...",void,void,220,075610744,jeugdboeken ; verhalen,vorm,282,Fictie 7 - 9 jaar
1,040931013,Wie alles prijsgeeft,"['al', 'prijsef']","['oorlogsverhalen', 'protestants-christelijke ...","['shintoïsme', 'christelijk geloof']",void,221,075610744,jeugdboeken ; verhalen,vorm,283,Fictie 10 - 12 jaar
2,04109770X,"Wipneus, Pim en de zonneparel","['wipneus', 'pim', 'zonneparel']",['kabouterverhalen'],['diefstallen'],void,220,075610744,jeugdboeken ; verhalen,vorm,280,Fictie kinder- en jeugdboeken algemeen
3,047412917,Joekeltjes zwerftocht,"['joekel', 'zwerftocht']","['dierenverhalen', 'protestants-christelijke v...",['verdwalen'],void,221,075610744,jeugdboeken ; verhalen,vorm,281,Fictie 4 - 6 jaar
4,049458221,Snuf en de IJsvogel,"['snuf', 'ijsvogel']",['detectiveverhalen'],void,void,221,075610744,jeugdboeken ; verhalen,vorm,283,Fictie 10 - 12 jaar
...,...,...,...,...,...,...,...,...,...,...,...,...
49621,84014881X,Wipneus en Pim bij de rovers,"['wipneus', 'pim', 'rover']",['kabouterverhalen'],void,void,220,075610744,jeugdboeken ; verhalen,vorm,280,Fictie kinder- en jeugdboeken algemeen
49622,841152926,"Wipneus, Pim en het plaagmannetje","['wipneus', 'pim', 'plaagman']",['kabouterverhalen'],void,void,220,075610744,jeugdboeken ; verhalen,vorm,280,Fictie kinder- en jeugdboeken algemeen
49623,84233310X,Wipneus en Pim en het circus,"['wipneus', 'pim', 'circus']","['kabouterverhalen', 'circusverhalen']",void,void,220,075610744,jeugdboeken ; verhalen,vorm,280,Fictie kinder- en jeugdboeken algemeen
49624,842848568,Prins Wipneus en zijn vriendje Pim,"['prin', 'wipneus', 'vriend', 'pim']",['kabouterverhalen'],void,void,220,075610744,jeugdboeken ; verhalen,vorm,280,Fictie kinder- en jeugdboeken algemeen


In [104]:
# Save the merged dataset as CSV, using to_csv's default settings.
dataset_csv = '/Users/haighton_macbook/Desktop/kinderboeken_dataset.csv'
df.to_csv(dataset_csv)

### Generate full-text document corpus subject files

In [6]:
# Set output location of created corpus (tsv subject files and text files).
# The cells below write one <ppn>.tsv and one <ppn>.txt per publication into this
# directory and later create the train/test/val sub-folders inside it.
OUTPUT_LOC = '/Users/haighton_macbook/Desktop/KB_Annif/data/corpora/cbk'

In [7]:
# Create subject tsv files.
# Is also needed to run 'Generate full-text document corpus text files' and 'Split dataset'.
# Result: dict_pub_tsv maps a publication_ppn to its list of '<URI>\t<genre>' lines;
# probleem_ppn collects the ppn's that have a genre missing from the thesaurus.

# CBK_thes genres - Remove whitespace and turn lowercase. We need this to check if a CBK_genre is present.
df_cbkthes['genre'] = df_cbkthes['genre'].str.lower().str.strip()

# Build the genre -> URI lookup once instead of scanning the thesaurus for every
# genre of every publication. setdefault keeps the first URI when a label occurs
# more than once.
genre_to_uri = {}
for genre_label, uri in zip(df_cbkthes['genre'], df_cbkthes['identifier']):
    genre_to_uri.setdefault(genre_label, uri)

dict_pub_tsv = {}
probleem_ppn = []
# df holds one row per publication_ppn (it was grouped on that column), so the rows
# can be walked directly rather than filtering the whole frame for each ppn.
for ppn, genre_repr in zip(df['publication_ppn'], df['CBK_genre']):
    tmp = []
    # CBK_genre is the text form of a list, e.g. "['a', 'b']": strip the list
    # punctuation and split on commas.
    for genres in genre_repr.replace('[', '').replace(']', '').replace("'", '').split(','):
        genre = genres.strip()
        cbk_uri = genre_to_uri.get(genre)
        if cbk_uri is None:
            # NOTE(review): genres parsed before the unknown one stay in tmp, so the
            # publication is still exported with a partial subject list while also
            # being reported in probleem_ppn - confirm that this is intended.
            probleem_ppn.append(ppn)
            break
        tmp.append(f'{cbk_uri}\t{genre}')
    if len(tmp):
        dict_pub_tsv[ppn] = tmp

In [209]:
# Write to tsv file for each publication: <OUTPUT_LOC>/<ppn>.tsv, one subject per line.
# Make sure the corpus directory exists before writing into it.
os.makedirs(OUTPUT_LOC, exist_ok=True)
for ppn, lines in dict_pub_tsv.items():
    # Explicit UTF-8: the labels contain non-ASCII characters and the platform default
    # encoding is not guaranteed to be UTF-8.
    with open(os.path.join(OUTPUT_LOC, ppn + '.tsv'), 'w', encoding='utf-8') as pub_tsv:
        pub_tsv.writelines(f'{line}\n' for line in lines)

### Generate full-text document corpus text files

In [None]:
# Create text files, with the following data points (one per line, in this order):
# titel|titel_processed|NUR_onderwerp|brinkman_onderwerp|samenvatting|CBK_thema
# samenvatting and CBK_thema are only included when they are not 'void'.
# You need to also run 'create subject tsv files' before - it needs 'dict_pub_tsv' to know the exact publication_ppn's.

text_columns = [
    'publication_ppn', 'titel', 'titel_processed', 'NUR_onderwerp',
    'brinkman_onderwerp', 'samenvatting', 'CBK_thema',
]

dict_pub_txt = {}
# Single pass over the rows instead of filtering the whole frame six times per ppn.
for (ppn, titel, titel_processed, nur_onderwerp,
     brinkman_onderwerp, samenvatting, cbk_thema) in df[text_columns].itertuples(index=False, name=None):
    # Only publications that got a subject file; should a ppn ever occur twice,
    # the first row is the one that counts.
    if ppn not in dict_pub_tsv or ppn in dict_pub_txt:
        continue
    data = [
        titel,
        # titel_processed is the text form of a list: strip the list punctuation.
        titel_processed.replace('[', '').replace(']', '').replace("'", ''),
        nur_onderwerp,
        brinkman_onderwerp,
    ]
    if samenvatting != 'void':
        data.append(samenvatting)
    if cbk_thema != 'void':
        # NOTE(review): unlike titel_processed, CBK_thema is written with its list
        # brackets and quotes intact - confirm whether it should be cleaned as well.
        data.append(cbk_thema)
    dict_pub_txt[ppn] = data

In [211]:
# Write data points for each publication to a .txt file: <OUTPUT_LOC>/<ppn>.txt,
# one data point per line.
# Make sure the corpus directory exists before writing into it.
os.makedirs(OUTPUT_LOC, exist_ok=True)
for ppn, datapoints in dict_pub_txt.items():
    # Explicit UTF-8: titles and summaries contain non-ASCII characters and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(os.path.join(OUTPUT_LOC, ppn + '.txt'), 'w', encoding='utf-8') as pub_txt:
        pub_txt.writelines(f'{point}\n' for point in datapoints)

In [213]:
# Report how many subject (.tsv) and text (.txt) files the corpus consists of.
n_tsv = len(dict_pub_tsv)
n_txt = len(dict_pub_txt)
print(f'{n_tsv} .tsv and {n_txt} .txt files where generated. A total of {n_tsv + n_txt} files.')

49262 .tsv and 49262 .txt files where generated. A total of 98524 files.


### Split dataset

In [34]:
# Split dataset into train, test and validate subsets - 80%-10%-10% split.
# Again we need 'dict_pub_tsv'

# Fixed seed so that re-running this cell (or the whole notebook) yields the same
# split; an unseeded shuffle silently disagrees with files that were already moved.
SPLIT_SEED = 42

ppn_datasetc = list(dict_pub_tsv.keys())
# A dedicated generator leaves the global `random` state untouched. sample() over
# the full length returns a shuffled copy, so ppn_datasetc keeps its original order.
ppn_dataset = random.Random(SPLIT_SEED).sample(ppn_datasetc, len(ppn_datasetc))

# Slice boundaries: first 80% train, next 10% test, remainder validation.
train_end = int(len(ppn_dataset) * 0.8)
test_end = int(len(ppn_dataset) * 0.9)

train = ppn_dataset[:train_end]
test = ppn_dataset[train_end:test_end]
val = ppn_dataset[test_end:]

# Every publication must end up in exactly one subset.
assert len(train) + len(test) + len(val) == len(ppn_dataset)

print(f'train subset has {len(train)} publications.')
print(f'test subset has {len(test)} publications.')
print(f'val subset has {len(val)} publications.')

train subset has 39409 publications.
test subset has 4926 publications.
val subset has 4927 publications.


In [33]:
# Create train, test and val folders:
train_dir = os.path.join(OUTPUT_LOC, 'train')
test_dir = os.path.join(OUTPUT_LOC, 'test')
val_dir = os.path.join(OUTPUT_LOC, 'val')

# exist_ok=True replaces the separate exists() check, so re-running is harmless.
for subset_dir in (train_dir, test_dir, val_dir):
    os.makedirs(subset_dir, exist_ok=True)

# Move files to respective folder: for every publication both its .txt (document
# text) and its .tsv (subjects) go from OUTPUT_LOC into the folder of its subset.
# This needs the files to still be in OUTPUT_LOC, so the cell can only be run once
# per generated corpus.
for subset_dir, subset_ppns in ((train_dir, train), (test_dir, test), (val_dir, val)):
    for ppn in subset_ppns:
        for extension in ('.txt', '.tsv'):
            filename = ppn + extension
            os.rename(os.path.join(OUTPUT_LOC, filename), os.path.join(subset_dir, filename))