In [1]:
import pandas as pd
import numpy as np
import networkx as nx

import spacy
from spacy.matcher import Matcher
from spacy.pipeline import Sentencizer

from IPython.display import clear_output
from datetime import datetime

import PyPDF2
import tabula

import random
import os
import re
import itertools
import langid
import voila

In [2]:
spacy.prefer_gpu()
nlp = spacy.load("nl_core_news_lg")
nlp_ministries = spacy.load("..\\data\\spacy labeled\\output\\model-last")
df = pd.read_csv('..\\data\\ocred\\files_df.csv', index_col = 0)



### Dates

TODO combine spacy dates and regex dates

In [3]:
# dates matcher
def getDatesBase(text, nlp):
    months = ['januari', 'februari', 'maart', 'april', 'mei', 'juni', 'juli', 'augustus', 'september', 'oktober', 'november', 'december',
         'january', 'february', 'march', 'april', 'may', 'june', 'juli', 'august', 'september', 'october', 'november', 'december',
         'jan', 'feb', 'mrt', 'apr', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    days = ['maandag', 'dinsdag', 'woensdag', 'donderdag', 'vrijdag', 'zaterdag', 'zondag',
        'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
    sent = ['datum', 'verzonden', 'sent', 'date', 'received']

    datesPattern = [{"LOWER" : {"IN" : days}, "OP" : "?"}, 
           {"IS_DIGIT": True}, 
           {"LOWER" : {"IN" : months}},
           {"IS_PUNCT" : True, "OP" : "?", "TEXT":'.'},
           {"IS_DIGIT": True, "OP" : "?"}]
    matcher = Matcher(nlp.vocab)
    matcher.add("Dates", [datesPattern])

    doc = nlp(text)
    matches = matcher(doc)

    regexMatches = regexMatcher(text)
    return [doc[start:end].text for match_id, start, end in matches], regexMatches
    


def getDates(text, nlp):
    months = ['januari', 'februari', 'maart', 'april', 'mei', 'juni', 'juli', 'augustus', 'september', 'oktober', 'november', 'december',
         'january', 'february', 'march', 'april', 'may', 'june', 'juli', 'august', 'september', 'october', 'november', 'december',
         'jan', 'feb', 'mrt', 'apr', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'okt']

    datesPattern = [ 
           {"IS_DIGIT": True}, 
           {"LOWER" : {"IN" : months}},
           {"IS_PUNCT" : True, "OP" : "?", "TEXT":'.'},
           {"IS_DIGIT": True}]
    matcher = Matcher(nlp.vocab)
    matcher.add("Dates", [datesPattern])
    
    text = re.sub(r'[^\w\s]', '', text)
    text = removeKenmerkDate(text)
    
    doc = nlp(text)
    matches = matcher(doc)

    return [doc[start:end].text for match_id, start, end in matches]

def validate(dates, sep, pat):
    goodDates = []
    for date in dates:
        date = date.replace(' ', '')
        try:
            date = date.replace(sep, ' ')
            datetime.strptime(date, pat)
            goodDates.append(date.replace(' ', sep))
        except:
            try:
                if len(date.split(' ')) == 3 and len(date.split(' ')[2]) == 2:
                    datetime.strptime(date, '%d %m %y')
                    goodDates.append(date.replace(' ', sep))
            except:
                pass     
    return goodDates
            

def regexMatcher(text):
    results = []
    
    results += validate(re.findall('[0-3]{0,1}[0-9]\/[0-1]{0,1}[0-9]', text), '/', '%d %m')

    results += validate(re.findall('[0-3]{0,1}[0-9]\/[0-1]{0,1}[0-9]\/[0-9]{2,4}', text), '/', '%d %m %Y')

    results += validate(re.findall('[0-3]{0,1}[0-9]-[0-1]{0,1}[0-9]', text), '-', '%d %m')

    results += validate(re.findall('[0-3]{0,1}[0-9]-[0-1]{0,1}[0-9]-[0-9]{2,4}', text), '-', '%d %m %Y')
        
    return results


# converts dates to timestamp or yyyy-mm-dd
def convertDate(date, timestamp=False):
    months = {
        'januari':'01','jan':'01',
        'februari':'02','feb':'02',
        'maart':'03','mrt':'03',
        'april':'04','apr':'04',
        'mei':'05','mei':'05',
        'juni':'06','jun':'06',
        'juli':'07','jul':'07',
        'augustus':'08','aug':'08',
        'september':'09','sep':'09',
        'oktober':'10','okt':'10',
        'november':'11','nov':'11',
        'december':'12','dec':'12'
    }


    date = date.split(' ')
    date[1] = months[date[1].lower()]

    if timestamp:
        return pd.Timestamp(year=int(date[2]), month=int(date[1]), day=int(date[0]))
    if len(date[0]) == 1:
        date[0] = '0' + date[0]
    return date[2] + '-' + date[1] + '-' + date[0]

# calculate days between two dates
def days_between(d1, d2):
    d1 = convertDate(d1)
    d2 = convertDate(d2)
    try:
        d1 = datetime.strptime(d1, "%Y-%m-%d")
        d2 = datetime.strptime(d2, "%Y-%m-%d")
    except:
        return None
    return abs((d2 - d1).days)

# extracts all fields for dates
def dateInformation(text):        
    # find dates in text with date extractor
    matches = getDates(text, nlp)

    # first found date is date when document was written
    # the date the wob request was completed
    if len(matches) < 3:
        return None, None, None, None
    completedDate = matches[0]

    # check if request was received on a different date then when it was send
    # this is the case if it states "ontvangen op"
    receivedDate = re.findall('ontvangen op ([^.]+?)\,', text)
    if not receivedDate:
        receivedDate = matches[1]
    else:
        receivedDate = matches[2]

    # calculate days between request and completion
    daysTaken = days_between(completedDate, receivedDate)
    
    # converts to yyyy-mm-dd
    start = convertDate(receivedDate)
    end = convertDate(completedDate)
    
    # converts to np.datetime 

    start = np.datetime64(start)
    end = np.datetime64(end)
    businessDaysTaken = np.busday_count(start, end)
    inTime = businessDaysTaken <= 42

    # check if request was fulfilled wihtin 42 business days

    return receivedDate, completedDate, daysTaken, inTime


### Cleaning cells

In [4]:
# because the extractor used periods as indicators, abbreviations like N.V.T. need to be removed
# this function converts it to N V T while keeping the periods at the end of sentences
def removeAbbreviation(text):
    sentence = ''

    # split sentence in words
    for word in text.split(' '):

        # if there is no period in the word, add it back to sentence
        if word.count('.') == 0 or '\n' in word:
            sentence += word + ' '
            continue
        
        # when there is one period in word
        elif word.count('.') == 1:

            # if it is at the end, keep it and add word to sentence
            if word[-1] == '.':
                sentence += word + ' '
            
            # if its in the middle replace it with a space
            else:
                word = word.replace('.', ' ')
                sentence += word + ' '
        
        # if there are  more than 1 periods, replace them all
        else:
            word = word.replace('.', ' ')
            sentence += word + ' '
            
    return sentence

# edge case when the sidebar of the doc is earlier than the heading
# in that case we have to remove a date that is not used
def removeKenmerkDate(text):
    text = text.split('\n')
    
    toRemove = None
    for i in range(len(text)):
        if '(kenmerk)' in text[i] and 'ons' not in text[i]:
            toRemove = [i, i+1]
            break
            
    if toRemove:
        text.remove(text[toRemove[0]])
        text.remove(text[toRemove[1]])

    text = '\n'.join(text)                        
    
    return text


### subject of request

In [5]:
# use regex to find the request reason from decision doc
def getRequestReason(text, removeAbbr = True):

    # text = rawText.replace('\n', ' ')
    text = text.lower()
    if removeAbbr:
        text = removeAbbreviation(text)
    text = text[:1500]
    patterns = ['verzocht([^.]+?)\.', 
                'u verzoekt([^.]+?)\.', 
                'om informatie over([^.]+?)\.', 
                'uw verzoek ziet([^.]+?)\.', 
                'om openbaarmaking van([^.]+?)\.',
                'uw verzoek([^.]+?)\.']
    matches = []

    # get matches for all keywords
    for pattern in patterns:
        matches += re.findall(pattern, text)

    uniqueMatches = []

    # check all matches against eachother
    for i in range(len(matches)):
        for j in range(len(matches)):
            if i == j:
                continue
            
            # if there is a duplicate we do not add it to uniqueMatches
            if matches[i] in matches[j]:
                break

        # add match to uniqueMatches if the j loop completes
        else:
            uniqueMatches.append(matches[i])

    return matches


### retreiving text
TODO

In [26]:
# this function gets a random decision doc from a given ministry
def getRandomDecisionDoc(ministrie):

    # set paths 
    baseDFPath = '..\\data\\openstate data\\'
    basePDFPath = 'F:\\Data files\\Master thesis\\verzoeken\\'

    # get df of ministry
    for f in os.listdir(baseDFPath):
        if ministrie in f.lower():
            file = f
            break

    # get dir of ministry 
    for d in os.listdir(basePDFPath):
        if ministrie in d.lower():
            dir = basePDFPath + d + '\\'
            break
    
    requests = [x for x in os.listdir(dir) if x != '.DS_Store']
    requestNr = int(random.choice(requests))
    
    # load in dataframe of ministry and get random sample
    testDf = pd.read_excel(baseDFPath + file)
    testDf.columns = [x.replace('\n', '') for x in testDf.columns]    
    s = testDf[testDf['WOB Verzoek'] == requestNr]

    if not os.path.exists(dir + str(requestNr)):
        return None, None, None

    # find the desicion document
    for p in os.listdir(dir + str(requestNr)):

        # if found, save it in pdfPath
        if 'besluit' in p.lower() and 'bijlage' not in p.lower() and p.endswith('.pdf'):
            pdfPath = p
            break
    
    # if there is no desicion document, try again
    else:
        return None, None, None

    return s, dir + str(requestNr) + '\\', pdfPath


# extract text with pdftotext
def textExtract(dir, name):
    txtName = '.'.join(name.split('.')[0:-1])
    txtName = dir + txtName  + '.txt'

    # extract text
    os.system(f'pdftotext -raw "{dir}{name}" "{txtName}"')
    
    if not os.path.exists(txtName): 
        return None
    
    # open file and return content
    with open(txtName, 'r', encoding='utf8') as f:
        text = f.read()
    return text

def textExtractPyPdf(name, nPages = 400):
    reader = PyPDF2.PdfFileReader(name)
    text = []
    pageCounter = 0
    for page in reader.pages:
        page = page.extract_text()
        print(page)
        page = page.replace('\n', ' ')

        text.append(page)
        if pageCounter == nPages:
            break
    return text


In [None]:
import pdfplumber

with pdfplumber.open("path/to/file.pdf") as pdf:
    first_page = pdf.pages[0]
    print(first_page.chars[0])

In [27]:
textExtractPyPdf(r'F:\Data files\Master thesis\verzoeken\Wob verzoeken LNV\202\\Bes-Besluit+op+Wob-verzoek+over+gesubsidieerde+sanering+van+varkenshouderijen.pdf')

 

 
 
 
 































 



 






 

 




 

 

 

 
 
 

 



 


 

 






 

 
































































































































































































































 
 




 


 
 

 



 


 

 

 

















 


 









 








































































































































































































































































































































['                                                                                                          ',
 '                                                                         ',
 '                                                                     ',
 '                                                                   ',
 '                                                                                 ',
 '                                                                                      ',
 '                                                                                  ',
 '                                                                                               ',
 '                                                       ']

### inventory docs

In [7]:
def inventory(path, pdf):

    # words to look for
    rating = {'deels openbaar':0, 'niet openbaar':0, 'openbaar':0, 'reeds openbaar':0, 
            'geweigerd':0, 'gedeeltelijk openbaar':0,
            'volledig openbaar': 0}
    
    # if an inventory document exists, use that
    for file in os.listdir(path):
        if 'inventaris' in file.lower():
            rating = inventoryListToDataframe(path + file, rating)
            break
    
    # else try to find inventory table in decision doc
    if not rating:
        rating = inventoryListToDataframe(path + pdf, rating)

    if not rating:
        return None, None, None, None

    # combine categories
    notPublic = rating['niet openbaar'] + rating['geweigerd'] + rating['reeds openbaar']
    partialPublic = rating['deels openbaar'] + rating['gedeeltelijk openbaar']
    public = rating['openbaar'] - rating['niet openbaar'] - rating['reeds openbaar'] - partialPublic
    total = public + notPublic + partialPublic

    if total == 0:
        return None, None, None, None
    
    return public, notPublic, partialPublic, public + notPublic + partialPublic


# makes dataframes from table in pdf
# then counts occurences of 
def inventoryListToDataframe(pdf, rating):
    try:
        tables = tabula.read_pdf(pdf, pages='all')


        if len(tables) == 0:
            return None

        for table in tables:
            for col in table.columns:
                col = list(table[col])
                col = [str(x).lower() for x in col]
                for key in rating:
                    for value in col:
                        if key in value:
                            rating[key] += 1
    except:
        return None

    return rating


### other request metadata functions

In [8]:
# find number of pages in pdf documents
def getNumberOfPages(path):
    nPages = 0

    # gets list of pdf files in a directory
    pdfs = [x for x in os.listdir(path) if x.endswith('.pdf')]
    
    # open all files and count pages
    for file in pdfs:
        try:
            with open(path + file, 'rb') as f:
                pdf = PyPDF2.PdfFileReader(f, strict=False)
                nPages += pdf.numPages
        except:
            return None
    return nPages

def nPagesOfPdf(path):
    try:
        with open(path, 'rb') as f:
            pdf = PyPDF2.PdfFileReader(f, strict = False)
            nPages = pdf.numPages
        return nPages
    except:
        return 0
        

# finds the number of considered documents in the decision doc
def nDocs(text):

    # look for the first mention of a number of documents
    nDocuments = re.findall('[0-9]+? documenten[a-z]{2} aangetroffen', text)
    
    if not nDocuments:
        nDocuments = re.findall('[0-9]+? document[a-z]{2}', text)
    
    if not nDocuments:
        if len(re.findall('één document', text)) == 1:
            return 1

    # return the first if found, else return None
    if type(nDocuments) == list:
        if len(nDocuments) == 0:
            return None
        return int(nDocuments[0].split(' ')[0])
    elif type(nDocuments) == int:
        return nDocuments

    return None

### network

In [9]:
def getMatches(doc):

    # list of entities to include
    wantedTypes = ['FAC','GPE','LOC', 'ORG', 'PERSON']

    bannedWords = ['sent', 'cc', 'for', 'te', 'we', 'to']

    wantedEnts = []

    # for all entities, check if to include and clean up the string
    for ent in doc.ents:
        if ent.label_ in wantedTypes:
            text = str(ent.text)
            text = text.lower()
            text = re.sub(r'\n', ' ', text)
            text = text.strip()

            if text.endswith('2e'):
                continue
            if text in bannedWords:
                continue
            if len(text) == 1:
                continue
                

            wantedEnts.append(text)    
    
    # retrun list of entities 
    return wantedEnts

# remove all entities that only occur in one document
def goodEnts(entities):
    ents = {}

    # get a dict of entities as keys and a list of documents in which the entity occurs as values
    for document in entities:
        for ent in entities[document]:
            if ent in ents:
                ents[ent].append(document)
            else:
                ents[ent] = [document]

    entsToDelete = []

    # find and deletes entities that only occur in one document
    for ent in ents:
        language = langid.classify(ent)
        if len(ents[ent]) <= 1:
            entsToDelete.append(ent)
        
        # checks if the ent is english
        if language[0] =='en' and language[1] < 0:
            entsToDelete.append(ent)

    for ent in set(entsToDelete):
        del ents[ent]

    documents = {}

    # revert back to original format
    # documents as keys, entities in said document as values
    for ent in ents:
        for doc in ents[ent]:
            if doc in documents:
                documents[doc].append(ent)
            else:
                documents[doc] = [ent]
    
    return documents


def getEntities(text, name):
    try:
        text = re.sub('\n+', '\n', text)
        text = re.sub(' +', ' ', text)

        if langid.classify(text)[0] != 'nl':
            return {name:[]}

        NERdoc = nlp(text)
        NERmatches = getMatches(NERdoc)
        
        # MinDoc = nlp_ministries(text)
        # MinMatches = getMatches(MinDoc)
    except:
        return {name:[]}

    return {name: list(set(NERmatches))}

def orderCheck(x, y):
    ordered = [x, y]
    ordered.sort()
    return ordered[0], ordered[1]

def getEdges(entities):
    entitiesDict = {}

    for d in entities:
        key = list(d.keys())[0]
        entitiesDict[key] = d[key]

    entitiesDict = goodEnts(entitiesDict)

    edges = {}

    for document in entitiesDict:
        if len(entitiesDict[document]) <= 1:
            continue
        
        for edge in itertools.combinations(entitiesDict[document], 2):
            node1, node2 = orderCheck(edge[0], edge[1])
            key = node1 + '\t' + node2 
            if key in edges:
                edges[key].append(document)
            else:
                edges[key] = [document]

    return edges

def makeGraph(pages):
    G = nx.Graph()
    
    entities = []

    for i in range(len(pages)):
        entities.append(getEntities(pages[i], i))

    edges = getEdges(entities)
    for edge in edges:
        nodes = edge.split('\t')
        # G.add_edge(nodes[0], nodes[1])
        G.add_edge(nodes[0], nodes[1], weight = len(edges[edge]))

    if len(list(nx.connected_components(G))) > 1:
        x = [len(c) for c in list(nx.connected_components(G))]
        x.sort(reverse=True)
        limit = x[1]
        del x
        for component in list(nx.connected_components(G)):
            if len(component)<=limit:
                for node in component:
                    G.remove_node(node)

    return G




### network centrality measures

In [10]:
def mostConnectedNodes(G, n = 10):
    nodeNeighbors = [(node, len(list(G.neighbors(node)))) for node in G.nodes]
    sortedNodes = sorted(nodeNeighbors, key=lambda tup: tup[1], reverse=True)
    
    return sortedNodes[:n]

def mostCooccurences(G, n = 10):
    weightedEdges =  G.edges(data='weight')
    sortedEdges = sorted(weightedEdges, key=lambda tup: tup[2], reverse=True)
    return sortedEdges[:n]

def calculateBetweenness(G, n = 10):
    betweenness = nx.betweenness_centrality(G)
    betweenness = [(x, betweenness[x]) for x in betweenness]
    betweenness = sorted(betweenness, key=lambda tup: tup[1], reverse=True)
    return betweenness[:n]

def addNodeWeight(d, node, weight):
    if node in d:
        d[node] += weight
    else:
        d[node] = weight
    return d

def calculateStrength(G, n = 10):
    nodeStrenghts = {}

    for edge in G.edges(data=True):
        node1 = edge[0]
        node2 = edge[1]
        weight = edge[2]['weight']    

        nodeStrenghts = addNodeWeight(nodeStrenghts, node1, weight)
        nodeStrenghts = addNodeWeight(nodeStrenghts, node2, weight)
    nodeStrenghts = [(x, nodeStrenghts[x]) for x in nodeStrenghts]
    nodeStrenghts = sorted(nodeStrenghts, key=lambda tup: tup[1], reverse=True)
    return nodeStrenghts[:n]



In [11]:
def extractRequestMetadata(text, pdfPath, pdfName):
    reason = getRequestReason(text)
    if len(reason) == 0:
        reason = ''
    else:
        match = reason[0]
    receivedDate, completedDate, daysTaken, inTime = dateInformation(text)       
    nDocuments = nDocs(text)
    nPages = getNumberOfPages(pdfPath)
    public, notPublic, partialPublic, total = inventory(pdfPath, pdfName)
    
    # tabula has a lot of output, this clears it
    clear_output()

    # convert dates to pd Timestamp dates to compare to ground truth      
    try:
        if receivedDate:
            receivedDate = convertDate(receivedDate, True)
    except:
        receivedDate = None
    try:
        if completedDate:
            completedDate = convertDate(completedDate, True)
    except:
        completedDate = None

    # if number of documents was not found in text, use total docs from inventory list
    if not nDocuments and total:
        nDocuments = total        

    # calculate days per doc
    if daysTaken and nDocuments:
        daysPerDoc = round(daysTaken / nDocuments, 2)
    elif total and daysTaken:
        daysPerDoc = round(daysTaken / total, 2)
    else:
        daysPerDoc = None

    return (reason, receivedDate, completedDate, daysTaken, inTime, nPages, public, notPublic, partialPublic, nDocuments, daysPerDoc)


In [12]:
def getText(requestDir):
    pageLimit = 400
    totalPages = 0
    text = []

    for file in os.listdir(requestDir):
        if file.startswith('Bes') and file.endswith('.pdf'):
            decisionDocPath = requestDir + file
            decisionDocName = file
            decisionDocPages = textExtractPyPdf(decisionDocPath)
            decisionDocText = ''
            for page in decisionDocPages:
                decisionDocText += page + '\n'
            totalPages = nPagesOfPdf(decisionDocPath)

        if file.endswith('.pdf') and not file.startswith('Bes'):
            fileName = requestDir + file
            numberOfPages = nPagesOfPdf(fileName)
            totalPages += nPagesOfPdf(fileName)
            if numberOfPages + totalPages > pageLimit:
                test =  textExtractPyPdf(fileName, nPages = numberOfPages - (totalPages - pageLimit))
                text += test
                break
            test =  textExtractPyPdf(fileName, nPages = numberOfPages - (totalPages - pageLimit))
            text += test

    text = [x.replace('\n', ' ') for x in text]
    return decisionDocName, decisionDocText, text



In [13]:
def dictonaryCounter(d):
    result = {}
    for item in d:
        if item in result:
            result[item] += 1
        else:
            result[item] = 1

    result = [(x, result[x]) for x in result]
    result = sorted(result, key=lambda tup: tup[1], reverse=True)
    
    return result
    

In [23]:
# requestDir = r'F:\Data files\Master thesis\verzoeken\WOB-verzoeken BZK\71\\'

requestDir = r'F:\Data files\Master thesis\verzoeken\Wob verzoeken LNV\202\\Bes-Besluit+op+Wob-verzoek+over+gesubsidieerde+sanering+van+varkenshouderijen.pdf'
# gets text
decisionDocName, decisionDocText, text = getText(requestDir)

In [24]:

# gets metadata from request
match, receivedDate, completedDate, daysTaken, inTime, nPages, public, notPublic, partialPublic, nDocuments, daysPerDoc = extractRequestMetadata(decisionDocText, requestDir, decisionDocName)
print(match, receivedDate, completedDate, daysTaken, inTime, nPages, public, notPublic, partialPublic, nDocuments, daysPerDoc)

 None None None None 933 92 14 211 317 None


In [16]:

# extract dates
spacyDates, regexDates = getDatesBase(' '.join(text), nlp)
dates = spacyDates + regexDates
dates = dictonaryCounter(dates) 
print(dates[:10])

[('31 december', 73), ('31 december 2019', 55), ('31-12', 41), ('31-12-2019', 21), ('18-2', 15), ('9 november', 14), ('31-12-2018', 14), ('9 februari', 11), ('14 december', 10), ('1 januari', 10)]


In [17]:

# extract ministries
doc = nlp_ministries(' '.join(text))
mins = dictonaryCounter([ent.text.lower() for ent in doc.ents])
mins


[('ministerie van binnenlandse zaken', 18),
 ('bzk', 5),
 ('defensie', 4),
 ('ministerie van buitenlandse zaken', 2),
 ('ministerie van bzk', 2),
 ('ministerie van defensie', 1),
 ('ministerie van vws', 1),
 ('vws', 1)]

In [18]:

# named entites with graph
G = makeGraph(text)
between = calculateBetweenness(G)
strength = calculateStrength(G)
connected = mostConnectedNodes(G)
cooccur = mostCooccurences(G)
print(between)
print(strength)
print(connected)
print(cooccur)


[('cda', 0.18618709241713685), ('christenunie', 0.1276681274734423), ('nederland', 0.10214585013255557), ('cda activiteitenverslag', 0.06522559836927351), ('den haag', 0.06475944577503105), ('europees parlement', 0.050019829612994525), ('wi', 0.045982560642835), ('cdja', 0.04500487555616356), ('europa', 0.04259117707101891), ('tweede kamer', 0.038336822838858996)]
[('cda', 674), ('cda activiteitenverslag', 359), ('christenunie', 323), ('nederland', 281), ('wi', 262), ('cdja', 262), ('europa', 252), ('europees parlement', 234), ('den haag', 191), ('wetenschappelijk instituut', 189)]
[('cda', 263), ('cda activiteitenverslag', 189), ('nederland', 165), ('cdja', 158), ('europa', 143), ('europees parlement', 141), ('christenunie', 140), ('wi', 130), ('tweede kamer', 120), ('den haag', 114)]
[('cda', 'cda activiteitenverslag', 23), ('cda', 'cdja', 19), ('amersfoort', 'christenunie', 18), ('cda', 'europa', 17), ('christenunie', 'wetenschappelijk instituut', 16), ('wi', 'wetenschappelijk insti