# Compare Brinkman and Thema thesaurus
In this document we try to find the amount of overlap between two thesauri: The Brinkman Catalog (used by the KB) and Thema (created by Centraal Bookhuis (CB)). This list will be used to choose our dataset for Annif.

In [1]:
# !pip3 install numpy
# !pip3 install pandas

import pandas as pd
import numpy as np

In [2]:
# Import TSV file for Thema and Brinkman as Series.
brinkman_data = pd.read_csv('data/brinkmanthesaurus_vocab.tsv', sep='\t', usecols=[1], header=None, squeeze=True)
thema_data = pd.read_csv('data/Thema_v1.3.2_nl.tsv', sep='\t', usecols=[1], header=None, squeeze=True)

In [3]:
# Sort data and print lenth.
brinkman_data.sort_values(ascending=True, inplace=True)
thema_data.sort_values(ascending=True, inplace=True)
print('Total Brinkman subjects:\t{}'.format(len(brinkman_data)))
print('Total Thema subjects:\t\t{}'.format(len(thema_data)))
# brinkman_data.head(10)
# thema_data.head(10)

Total Brinkman subjects:	14737
Total Thema subjects:		7362


In [4]:
# Intersection

# Case and punctuation mark sensitive comparison (e.g. 's Gravenhage, s gravenhage).
intersect_direct = pd.Series(list(set(thema_data).intersection(set(brinkman_data))))
print('Total common subjects (exact matches):\t{}'.format(len(intersect_direct)))

# Case incesitive comparison (convert all entries to lowercase before comparison).
intersect_lower = pd.Series(list(set(thema_data.str.lower()).intersection(set(brinkman_data.str.lower()))))
print('\nTotal matches when converting to lowercase first:\t{}\n'.format(len(intersect_lower)))
print(intersect_lower.head(10))

# Is Capitalization important to these listed subjects? Can we use the lowercase version intersection from here on out?
# To determine if a mapping could be done 1-on-1 between Brinkman and Thema we want to see if capitalization is important.

# Matches after converted to lowercase.
intersect_lower_uniq = pd.Series(list((set(intersect_lower) - set(intersect_direct.str.lower()))))

print('\n{} subjects that are in both lists but have different capitalization:\n'.format(len(intersect_lower_uniq)))

print(intersect_lower_uniq.head(10))

Total common subjects (exact matches):	668

Total matches when converting to lowercase first:	980

0                     burkina faso
1    internationaal belastingrecht
2                          i tjing
3            economische geografie
4                      volksdansen
5               sociale pedagogiek
6                       tomografie
7                       duinkerken
8                           guyana
9                         richmond
dtype: object

312 subjects that are in both lists but have different capitalization:

0              computerbeveiliging
1                    gewichtheffen
2    internationaal belastingrecht
3            economische geografie
4                      volksdansen
5               sociale pedagogiek
6                       tomografie
7                       modernisme
8                           qigong
9                        popmuziek
dtype: object


In [5]:
# Unique subjects
# What makes these subjects unique?

brinkman_uniq = pd.Series(list(set(brinkman_data.str.lower()) - set(thema_data.str.lower())))
print('Total unique subjects in Brinkman:\t{}\n'.format(len(brinkman_uniq)))

print(brinkman_uniq.head(10))

Total unique subjects in Brinkman:	13729

0                    zandwinning
1                  monte cassino
2          zevenkamp (rotterdam)
3                   fibromyalgie
4                       kerkrade
5                       hurwenen
6                 melcombe regis
7                     zuiderveen
8    leermiddelen ; omgangskunde
9              geloofsbelijdenis
dtype: object


In [6]:
thema_uniq = pd.Series(list(set(thema_data.str.lower()) - set(brinkman_data.str.lower())))
print('Total unique subjects in Thema:\t\t{}\n'.format(len(thema_uniq)))
print(thema_uniq.head(10))

Total unique subjects in Thema:		6355

0                                               loches
1                                       zuliana region
2                                  peru: amazonegebied
3                                     northwest mexico
4                                   bevallingsmethoden
5                                ca. 1980 tot ca. 1989
6                           sociologie en antropologie
7                                          stijlen (u)
8    kind en jeugd, naslagwerken: tweetalige / meer...
9                                     data warehousing
dtype: object


In [7]:
# Search what 'autisme' (a subject only in Brinkman) is in Thema ('autisme' is a very common subject).
thema_filter = thema_uniq[thema_uniq.str.contains('autisme')]
thema_filter

933     autisme en asperger syndroom
4122     omgaan met autisme/asperger
dtype: object

In [8]:
# Is the subject 'cleveland' in Thema equal as 'cleveland (gbr.)' in Brinkman?
brinkman_filter = brinkman_uniq[brinkman_uniq.str.contains('cleveland')]
brinkman_filter

5397    cleveland (gbr.)
7830       old cleveland
dtype: object

In [10]:
# Export 980 overlapping words to text file
intersect_lower.to_csv(r'/Users/haighton_macbook/Desktop/brinkman_thema.txt', header=None, index=None, sep=' ', mode='a')

In [11]:
# We need the PPN (other column which was not imported)
df = pd.read_csv('data/brinkmanthesaurus_vocab.tsv', sep='\t', header=None, squeeze=True)

df_ppn = df[df.iloc[:, 1].str.lower().isin(intersect_lower)]
df_ppn = (df_ppn.iloc[:, 0])
df_ppn = df_ppn.str.split('/').str[-1].str.rsplit('>').str[0]
df_ppn.to_csv(r'/Users/haighton_macbook/Desktop/brinkman_thema_ppn.txt', header=None, index=None, sep=' ', mode='a')

In [13]:
# Generate Brinkman TSV of overlapping subjects.
df_tsv_overlap = df[df.iloc[:, 1].str.lower().isin(intersect_lower)]
df_tsv_overlap.to_csv(r'/Users/haighton_macbook/Desktop/brinkman_thema_overlap.tsv', header=None, index=None, sep='\t', mode='a')
