# Generate dataset Annif

In [51]:
import os
import csv
import pandas as pd
import numpy as np

In [65]:
# Import GGC-data.
# The identifier columns are read as strings instead of relying on dtype
# inference: PPNs can start with '0' and can end in the check character 'X',
# and later cells compare `ppn` with string literals and concatenate it into
# file names.
df = pd.read_csv(
    'data/vraag_20190620.txt',
    sep='\t',
    dtype={'ppn': str, 'brinkman_520x_ppn': str},
)
print('Number of rows: ' + str(len(df)))
df.head(2)

Number of rows: 12243


Unnamed: 0,maa1,maa2,ppn,jvu_1100,taal_1500_publ,taal_1500_orig,isbn_2000,unesco_1121,unesco_1122,nur_codes_5061,brinkman_520x,brinkman_520x_ppn,onix_7880,prim_auteur_3000,sec_auteur_3011,titel_4000,ondertitel_4000,samenvatting_4207
0,A,Aa,322079640,2015,ned,,9789054000000.0,7,,321,levensbeschrijvingen,075613816,9789054292692,Han/van@Bree$aut$!069567727!Han van Bree 1957-,,De @geest van het Oude Loo,Juliana en haar vriendenkring 1947-1957,In het boek wordt de hofcrisis van 1956 voor h...
1,A,Aa,33015673X,2015,ned,fra,9789490000000.0,b,,736,perceptie | esthetiek,075618451 | 075605503,9789490334086,Gilles@Deleuze$aut$!06860873X!Gilles Deleuze 1...,Walter/van der@Star$trl$!125379315!Walter van ...,@Francis Bacon,logica van de gewaarwording,Esthetische analyse van het werk van de Britse...


## Preprocess GGC-data

### Combine title, subtitle and summary and tidy up data
Combine `titel_4000`, `ondertitel_4000` and `samenvatting_4207` values into a new column called `samenvatting_plus_titel`. 

Convert `brinkman_520x` and `brinkman_520x_ppn` string values into a list of strings split at `|` e.g. `"perceptie | esthetiek"` becomes `[perceptie, esthetiek]`.

In [66]:
# Replace NaN by a space (' ') in titel_4000 and ondertitel_4000 (subtitles) before merging data from the three columns.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form is deprecated in pandas and does not
# update `df` when Copy-on-Write is enabled.
df['titel_4000'] = df['titel_4000'].fillna(' ')
df['ondertitel_4000'] = df['ondertitel_4000'].fillna(' ')

# Remove @ from title and create new column of combined values.
# A missing summary is treated as a space as well; otherwise NaN propagates
# through the concatenation and the title and subtitle of that entry are lost.
df['samenvatting_plus_titel'] = (
    df['titel_4000'].str.replace('@', ' ', regex=False)
    + ' ' + df['ondertitel_4000']
    + ' ' + df['samenvatting_4207'].fillna(' ')
)

# Convert string with '|' (pipes) into list of Brinkman subjects and brinkman subjects' ppn's.
# Literal (non-regex) replacements are sufficient here and avoid escaping the pipe.
df['brinkman_520x_ppn'] = df['brinkman_520x_ppn'].str.replace(' ', '', regex=False).str.split('|')
df['brinkman_520x'] = df['brinkman_520x'].str.replace(' | ', '|', regex=False).str.split('|')

# Remove errors. (In exploratory research stage found problems with entries (mostly assigned subjects not in thesaurus))
# .copy() makes the filtered frame independent, so later column assignments do
# not operate on a slice of the unfiltered data.
df = df[~df['ppn'].isin(['406248214', '418852146', '421344679', '394609565', '405599978', '406344175', '420510567'])].copy()

df.head(2)

Unnamed: 0,maa1,maa2,ppn,jvu_1100,taal_1500_publ,taal_1500_orig,isbn_2000,unesco_1121,unesco_1122,nur_codes_5061,brinkman_520x,brinkman_520x_ppn,onix_7880,prim_auteur_3000,sec_auteur_3011,titel_4000,ondertitel_4000,samenvatting_4207,samenvatting_plus_titel
0,A,Aa,322079640,2015,ned,,9789054000000.0,7,,321,[levensbeschrijvingen],[075613816],9789054292692,Han/van@Bree$aut$!069567727!Han van Bree 1957-,,De @geest van het Oude Loo,Juliana en haar vriendenkring 1947-1957,In het boek wordt de hofcrisis van 1956 voor h...,De geest van het Oude Loo Juliana en haar vri...
1,A,Aa,33015673X,2015,ned,fra,9789490000000.0,b,,736,"[perceptie, esthetiek]","[075618451, 075605503]",9789490334086,Gilles@Deleuze$aut$!06860873X!Gilles Deleuze 1...,Walter/van der@Star$trl$!125379315!Walter van ...,@Francis Bacon,logica van de gewaarwording,Esthetische analyse van het werk van de Britse...,Francis Bacon logica van de gewaarwording Est...


## Create two separate DataFrames: one with entries whose subjects refer to content and one with entries whose subjects refer to form

### Import Brinkman subjects that refer to the form of an entry

In [67]:
# Import Brinkman subjects that refer to the form of an entry (e.g. autobiography).
# `ppn` is read as a string so that it can be compared with the string PPNs in
# the GGC data (`brinkman_520x_ppn`), which can carry leading zeros and an 'X'
# check character.
df_vorm = pd.read_csv('data/btr_vorm.tsv', sep='\t', dtype={'ppn': str})
df_vorm.head(3)

Unnamed: 0,ppn,btr
0,75598612,adresboeken
1,75656876,anekdoten
2,374639280,antiquariaatscatalogi


In [68]:
# Plain Python lists of the form subjects (PPNs and labels). The next cells
# turn them into sets to separate form subjects from content subjects.
ppn_vormtrefwoorden = df_vorm['ppn'].tolist()
vormtrefwoorden = df_vorm['btr'].tolist()

### Create GGC-dataset with only content subjects (`df_content`)

In [56]:
# Remove Brinkman subjects and subjects' PPN that refer to form.
# The result is built with .assign(), which returns a new DataFrame. A plain
# `df_content = df` would only create a second name for the same object, so
# the column assignments would overwrite `df` and leave nothing for the
# form-subject selection further down.
form_ppns = set(ppn_vormtrefwoorden)   # built once instead of once per row
form_terms = set(vormtrefwoorden)
df_content = df.assign(
    # dict.fromkeys() drops duplicates while keeping the original order, so the
    # PPN list and the label list stay aligned with each other.
    brinkman_520x_ppn=df['brinkman_520x_ppn'].apply(
        lambda ppns: list(dict.fromkeys(p for p in ppns if p not in form_ppns))),
    brinkman_520x=df['brinkman_520x'].apply(
        lambda terms: list(dict.fromkeys(t for t in terms if t not in form_terms))),
)

In [57]:
# Keep only entries that still have subjects left, i.e. drop entries whose
# subjects all referred to form. Both list columns are checked in turn.
has_content_ppn = df_content['brinkman_520x_ppn'].map(len) != 0
df_content = df_content[has_content_ppn]
has_content_term = df_content['brinkman_520x'].map(len) != 0
df_content = df_content[has_content_term]

In [58]:
# Preview of df_content: the entries that keep at least one assigned subject
# referring to content (5710 rows when this notebook was written).
df_content.head(3)

Unnamed: 0,maa1,maa2,ppn,jvu_1100,taal_1500_publ,taal_1500_orig,isbn_2000,unesco_1121,unesco_1122,nur_codes_5061,brinkman_520x,brinkman_520x_ppn,onix_7880,prim_auteur_3000,sec_auteur_3011,titel_4000,ondertitel_4000,samenvatting_4207,samenvatting_plus_titel
1,A,Aa,33015673X,2015,ned,fra,9789490000000.0,b,,736,"[esthetiek, perceptie]","[075618451, 075605503]",9789490334086,Gilles@Deleuze$aut$!06860873X!Gilles Deleuze 1...,Walter/van der@Star$trl$!125379315!Walter van ...,@Francis Bacon,logica van de gewaarwording,Esthetische analyse van het werk van de Britse...,Francis Bacon logica van de gewaarwording Est...
3,A,Aa,352699566,2015,ned,,9789462000000.0,z,,648,[vakantieverblijven],[075625156],9789462080744,Mieke@Dings$aut$!270022139!Mieke Dings 1979-,,@Tussen tent en villa,het vakantiepark in Nederland 1920-nu,Het vakantiepark: wie heeft er niet weleens ee...,Tussen tent en villa het vakantiepark in Nede...
5,A,Aa,363250565,2016,ned,,9789039000000.0,7,,320,[Marokkanen],[07566111X],9789038898254,Salaheddine@Benchikhi$aut$!296322547!Salaheddi...,,@Salaheddine punt NL,kom maar op met Nederland,In Salaheddine punt NL vertelt Salaheddine hoe...,Salaheddine punt NL kom maar op met Nederland...


### Create GGC-dataset with only form subjects (`df_form`)

In [69]:
# Remove Brinkman subjects and subjects' PPN that refer to content.
# The result is built with .assign(), which returns a new DataFrame. A plain
# `df_form = df` would only create a second name for the same object, so the
# column assignments would overwrite the subject columns of `df` itself.
form_ppns = set(ppn_vormtrefwoorden)   # built once instead of once per row
form_terms = set(vormtrefwoorden)
df_form = df.assign(
    # dict.fromkeys() drops duplicates while keeping the original order, so the
    # PPN list and the label list stay aligned with each other.
    brinkman_520x_ppn=df['brinkman_520x_ppn'].apply(
        lambda ppns: list(dict.fromkeys(p for p in ppns if p in form_ppns))),
    brinkman_520x=df['brinkman_520x'].apply(
        lambda terms: list(dict.fromkeys(t for t in terms if t in form_terms))),
)

In [70]:
# Keep only entries that still have subjects left, i.e. drop entries whose
# subjects all referred to content. Both list columns are checked in turn.
has_form_ppn = df_form['brinkman_520x_ppn'].map(len) != 0
df_form = df_form[has_form_ppn]
has_form_term = df_form['brinkman_520x'].map(len) != 0
df_form = df_form[has_form_term]

In [71]:
# Preview of df_form: the entries that keep at least one assigned subject
# referring to form (7428 rows when this notebook was written).
df_form.head(3)

Unnamed: 0,maa1,maa2,ppn,jvu_1100,taal_1500_publ,taal_1500_orig,isbn_2000,unesco_1121,unesco_1122,nur_codes_5061,brinkman_520x,brinkman_520x_ppn,onix_7880,prim_auteur_3000,sec_auteur_3011,titel_4000,ondertitel_4000,samenvatting_4207,samenvatting_plus_titel
0,A,Aa,322079640,2015,ned,,9789054000000.0,7,,321,[levensbeschrijvingen],[075613816],9789054292692,Han/van@Bree$aut$!069567727!Han van Bree 1957-,,De @geest van het Oude Loo,Juliana en haar vriendenkring 1947-1957,In het boek wordt de hofcrisis van 1956 voor h...,De geest van het Oude Loo Juliana en haar vri...
2,A,Aa,352655844,2015,ned,,9789460000000.0,7,,698,[levensbeschrijvingen],[075613816],9789460041228,Anton/van de@Sande$aut$!067525180!Antonius Wil...,,@Prins Frederik der Nederlanden 1797-1881,gentleman naast de troon,"Beschrijving van het leven van prins Frederik,...",Prins Frederik der Nederlanden 1797-1881 gent...
4,A,Aa,362837317,2015,ned,,9789047000000.0,4,,301,[romans en novellen ; oorspr. - Nederlands],[075629402],9789046815809,Jan/van der@Mast$aut$!07502943X!Jan van der Ma...,,@Agneta,,"Jacques van Marken (1845-1906), oprichter van ...","Agneta Jacques van Marken (1845-1906), opri..."


## Generate Full-text document corpus for Annif
[Annif document corpus formats](https://github.com/NatLibFi/Annif/wiki/Document-corpus-formats)

All files will be saved into ./data/fulltext_corpus directory.

In [72]:
# Import Brinkman TSV as python dictionary {subject label: subject identifier}
# (second column -> first column of the file).
# The encoding is set explicitly so the result does not depend on the platform
# default (pandas.read_csv, used for the other inputs, defaults to UTF-8), and
# newline='' is what the csv module expects for files passed to csv.reader.
dict_subjects = {}
with open('data/brinkmanthesaurus_vocab.tsv', mode='r', encoding='utf-8', newline='') as infile:
    reader = csv.reader(infile, delimiter="\t")
    for row in reader:
        # Skip blank or incomplete lines instead of failing with an IndexError.
        if len(row) < 2:
            continue
        dict_subjects[row[1]] = row[0]

#### Choose a DF to use to generate full-text corpus

In [73]:
# Choose the DataFrame (df_content, df_form or df) that feeds the full-text corpus.
# Switch by moving the comment marker: the cells below read `df_corp` to build
# the text files, the subject files and the train/test/eval split.
#df_corp = df_content
df_corp = df_form

### Generate full-text document corpus text files

In [74]:
# Map each work's ppn to its assigned Brinkman terms,
# e.g. {'33015673X': ['perceptie', 'esthetiek'], ...}
dict_ppn_bk = dict(zip(df_corp['ppn'], df_corp['brinkman_520x']))

# Resolve every assigned term to its identifier: {ppn: [[brinkman_id, brinkman_term], ...]}
# Terms missing from dict_subjects are collected in br_not_found and left out.
dict_assigned_sub = {}
br_not_found = []
for work_ppn, terms in dict_ppn_bk.items():
    resolved = []
    for term in terms:
        if term in dict_subjects:
            resolved.append([dict_subjects[term], term])
        else:
            br_not_found.append(term)
    dict_assigned_sub[work_ppn] = resolved

# Report the distinct terms that could not be resolved.
unmatched = set(br_not_found)
print(f'De {len(unmatched)} onderstaande toegewezen Brinkmantrefwoorden staan niet in brinkmanthesaurus_vocab.tsv, maar hebben nog wel andere termen toegewezen:\n')
for term in unmatched:
    print(term)

De 0 onderstaande toegewezen Brinkmantrefwoorden staan niet in brinkmanthesaurus_vocab.tsv, maar hebben nog wel andere termen toegewezen:



In [75]:
# Create fulltext corpus .txt files: one <ppn>.txt per entry, containing the
# combined title, subtitle and summary.
dict_sum = pd.Series(df_corp.samenvatting_plus_titel.values,index=df_corp.ppn).to_dict()
corpus_dir = os.path.join('data', 'fulltext_corpus')
# exist_ok=True replaces the separate exists() check and cannot race with it.
os.makedirs(corpus_dir, exist_ok=True)
for ppn, summ in dict_sum.items():
    # Write UTF-8 explicitly; without `encoding` the platform default is used
    # and the corpus would differ between operating systems.
    with open(os.path.join(corpus_dir, ppn + '.txt'), mode='w', encoding='utf-8') as sumfile:
        sumfile.write(str(summ))

### Generate full-text document corpus subject files
In tab-separated format (`.tsv` files): one `identifier<TAB>label` line per assigned subject.

In [76]:
# Create a subject file (<ppn>.tsv) for each ppn: one line per assigned
# subject, holding the subject identifier and the subject label separated by a tab.
err = []
corpus_dir = os.path.join('data', 'fulltext_corpus')
os.makedirs(corpus_dir, exist_ok=True)
for ppn, subj_id in dict_assigned_sub.items():
    # The file is created even when no subject could be resolved, because the
    # split step further down moves a .tsv file for every ppn in df_corp.
    # UTF-8 is set explicitly so the output does not depend on the platform default.
    with open(os.path.join(corpus_dir, ppn + '.tsv'), mode='w', encoding='utf-8') as subfile:
        # Every line ends with a newline, whether the entry has one subject or several.
        subfile.writelines(f'{brinkman_id}\t{brinkman_term}\n' for brinkman_id, brinkman_term in subj_id)
    if not subj_id:
        err.append(ppn)

# 'err' is a list with PPN's of problematic entries (no clear brinkman identifier)
# e.g. after split one of the single terms is not found in the vocabulary.
# Should be empty - I've removed them to tidy up GGC-data at the start of this notebook.
print(err)

[]


### Split dataset into train, test, eval

In [77]:
# Split dataset 80-15-5 (train / test / eval) by row position.
# Positional slicing gives the same three parts as np.split(df_corp, [...]),
# without going through DataFrame.swapaxes, which recent pandas deprecates.
# NOTE(review): the rows are split in file order, without shuffling. If the
# input is sorted (e.g. by category) the three parts will not be comparable -
# confirm whether a seeded shuffle is wanted before the split.
n_rows = len(df_corp)
train_end = int(.8 * n_rows)
test_end = int(.95 * n_rows)
train = df_corp.iloc[:train_end]
test = df_corp.iloc[train_end:test_end]
ev = df_corp.iloc[test_end:]

corpus_dir = os.path.join('data', 'fulltext_corpus')

# Create a folder per part and move the subject (.tsv) and text (.txt) file
# of every entry into the folder of its part.
for split_name, split_df in (('train', train), ('test', test), ('eval', ev)):
    split_dir = os.path.join(corpus_dir, split_name)
    os.makedirs(split_dir, exist_ok=True)
    for ppn in split_df['ppn'].tolist():
        for ext in ('.tsv', '.txt'):
            os.rename(os.path.join(corpus_dir, ppn + ext), os.path.join(split_dir, ppn + ext))