In [47]:
import numpy as np
import json
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split

from sklearn.model_selection import learning_curve
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [48]:
corpus = []
labels = []
urls = []
files = [
    'kamerstukken_topics-multi_20160602_20161201.json',
    'kamerstukken_topics-multi_20160101_20160601.json',
    'kamerstukken_topics-multi_20170101_20170301.json',
    'kamerstukken_topics-multi_20170302_20170601.json',
    'kamerstukken_topics-multi_20170602_20180101.json',
    'kamerstukken_topics-multi_20180101_20180401.json'
]

for file in files:
    data = json.load(open('../../data_resources/topics/kamerstukken/{}'.format(file)))
    for obj in data:
        if len(obj['categories']) > 0 and 'Antwoord van' not in obj['content']:
            urls.append(obj['url'])
            corpus.append(obj['content'])
            labels.append(obj['categories'])



print(len(corpus))
print(len(labels))

6976
6976


In [49]:
transformer = TfidfVectorizer(smooth_idf=False, min_df=0.001, max_df=0.08, sublinear_tf=True, ngram_range=(1,1))

In [50]:
X = transformer.fit_transform(corpus)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(labels)

In [51]:
clf = SGDClassifier(loss='log', penalty='none', alpha=1e-6, random_state=42, max_iter=10)
clf = OneVsRestClassifier(clf).fit(X, y)

def print_top10(vectorizer, clf, class_labels):
    """Prints features with the highest coefficient values, per class"""
    feature_names = vectorizer.get_feature_names()
    for i, class_label in enumerate(class_labels):
        top10 = np.argsort(clf.coef_[i])[-10:]
        print("%s: \n%s" % (class_label, "\n".join(feature_names[j] for j in top10)))
        


In [52]:
print_top10(transformer, clf, labels)

['Onderwijs en wetenschap | Hoger onderwijs', 'Werk | Werkgelegenheid']: 
sint
antillen
services
voorbij
raffinaderij
bosman
curaçao
aruba
eustatius
bonaire
['Natuur en milieu | Stoffen']: 
decentralisaties
binnenlandsbestuur
verhoudingen
milieuzones
overheden
omroepgelderland
vng
brp
accountants
gemeentelijke
['Economie | Overige economische sectoren']: 
duurder
gouden
staatshoofd
zei
koning
degene
onderhoud
bende
oranjes
oranje
['Recht | Strafrecht']: 
eerlijke
corruptie
stroomstoring
verkiezingen
ministeries
berg
politici
stuit
teeven
israel
['Recht | Staatsrecht', 'Recht | Strafrecht']: 
noodgedwongen
baan
kant
referendum
ondernemers
dure
giftige
rechtszaal
familielid
50
['Natuur en milieu | Energie', 'Huisvesting | Organisatie en beleid']: 
ambtenaren
limburgse
cybersecurity
provincies
moderne
wethouder
rotterdams
brabantse
geïntimideerd
burgemeesters
['Natuur en milieu | Organisatie en beleid']: 
aardbevingen
gaf
bzk
sollicitatieprocedure
hachchi
moderne
datum
tongeren
tonnen
omr

['Openbare orde en veiligheid | Terrorisme', 'Openbare orde en veiligheid | Criminaliteit']: 
inlichtingendiensten
formeel
snellere
tennet
heffen
mobiel
kv
mobiele
uitrol
bereik
['Internationaal | Organisatie en beleid']: 
hoogland
2050
file
monumenten
plannen
provincies
transport
opgelost
tunnels
bruggen
['Zorg en gezondheid | Ziekten en behandelingen', 'Zorg en gezondheid | Ethiek']: 
von
ehrm
martels
westerschelde
gesloopt
besproken
leegstand
detailhandel
sluis
muur
['Verkeer | Spoor']: 
rijkswaterstaat
zand
beveiligd
binnenlands
kapotte
waterstaat
waterschap
rws
waterschappen
sluizen
['Openbare orde en veiligheid | Organisatie en beleid', 'Sociale zekerheid | Ouderen', 'Verkeer | Weg']: 
yücel
roma
aboutaleb
gezinnen
toenemend
armoede
kinderbijslag
siderius
kinderopvang
kinderopvangtoeslag
['Internationaal | Ontwikkelingssamenwerking']: 
uitkering
vrijwilligerswerk
wacht
jeugdhulp
westerveld
verdwijnt
vennootschapsbelasting
geef
baan
jongeren
['Zorg en gezondheid | Organisatie en b

IndexError: index 111 is out of bounds for axis 0 with size 111

In [59]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

top_tfidf_feats(corpus[0], transformer.get_feature_names()).head()

Unnamed: 0,feature,tfidf
0,0,V


In [69]:
def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row) '''
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

top_feats_in_doc(X, transformer.get_feature_names(), 0)

Unnamed: 0,feature,tfidf
0,promovendi,0.393832
1,aanstelling,0.227078
2,dubieuze,0.214006
3,li,0.204763
4,aangeboden,0.202382
5,parttime,0.184472
6,contracten,0.182082
7,persbericht,0.176459
8,netwerk,0.165654
9,contract,0.164176


In [92]:
def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        indentified by indices in grp_ids. '''
    if grp_ids:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

top_mean_feats(X, transformer.get_feature_names())

Unnamed: 0,feature,tfidf
0,mondelinge,0.012009
1,kinderen,0.005942
2,politie,0.005801
3,eu,0.004216
4,gemeente,0.003809
5,turkse,0.003806
6,bedrijven,0.003666
7,studenten,0.003561
8,scholen,0.003504
9,ouders,0.003446


In [123]:

def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label. '''
    dfs = []
    labels = np.unique(y)
    for label in labels:
        ids = []
        for i in range(0,len(y)):
            if y[i] == label:
                ids.append(i)
        
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        feats_df.label = label
        dfs.append(feats_df)
    return dfs


labels = []
a = mlb.inverse_transform(y)

for obj in a:
    labels.append(obj[0])

    
for df in top_feats_by_class(X, labels, transformer.get_feature_names()):
    print(df.label)
    print(df)

Bestuur | De Nederlandse Antillen en Aruba
         feature     tfidf
0        curaçao  0.118381
1          aruba  0.097078
2         bosman  0.081530
3        bonaire  0.075072
4           sint  0.071322
5        maarten  0.065128
6   raffinaderij  0.047075
7         casino  0.027844
8      eustatius  0.027637
9     koninkrijk  0.026468
10       curacao  0.026278
11         citgo  0.025713
12     curaçaose  0.025130
13     venezuela  0.023854
14           uts  0.023765
15          neen  0.023489
16       aankoop  0.023355
17       lichaam  0.022858
18            st  0.022282
19           com  0.021377
20     begroting  0.021340
21      antillen  0.020325
22    gevangenis  0.020324
23    financieel  0.020034
24     caribisch  0.019433
Bestuur | Gemeenten
               feature     tfidf
0             gemeente  0.031321
1                  vng  0.023544
2        gemeentelijke  0.021580
3           raadsleden  0.020802
4                  brp  0.019755
5     persoonsgegevens  0.018846
6   

24              tijde  0.015737
Sociale zekerheid | Werkloosheid
                 feature     tfidf
0                     ww  0.157200
1              uitkering  0.141832
2                    uwv  0.116338
3   nabestaandenpensioen  0.090646
4                inkomen  0.068091
5              werklozen  0.059272
6            meegerekend  0.058722
7                 oudere  0.054449
8                     55  0.052084
9                    wia  0.050127
10            uitbetaald  0.047379
11               partner  0.046146
12                korten  0.041539
13                gekort  0.040295
14    sollicitatieplicht  0.040019
15             uitstroom  0.039586
16              plussers  0.038724
17             weyenberg  0.038002
18                 ouder  0.035678
19           uitbetaling  0.035365
20              werkzaam  0.034882
21           verdringing  0.033869
22          werkloosheid  0.032889
23  uitvoeringsinstituut  0.032680
24                   één  0.031856
Sociale zekerheid | Ziekt

[0]
