In [50]:
import pandas as pd
import spacy
from collections import Counter

In [74]:
nlp = spacy.load("nl_core_news_sm")

In [75]:
data = pd.read_csv('data/artikelen.csv')

In [76]:
data.head(2)

Unnamed: 0.1,Unnamed: 0,identifier,type,title,date,content,subcategory,category,Year,DL score,Length,spatial
0,141,http://resolver.kb.nl/resolve?urn=ddd:01026254...,artikel,DUITSCHLAND.,1869/03/03 00:00:00,"WEENEN, 28 Febr. Men zal zich herinneren dat o...",trein,trein,1869,76.958525,434,Landelijk
1,405,http://resolver.kb.nl/resolve?urn=MMKB23:00138...,artikel,Verspreide Berichten.,1869/08/03 00:00:00,De heer Gladstone is vrij ernstig ziek geweest...,trein,trein,1869,80.363636,275,Landelijk


In [77]:
len(data)

533

In [78]:
data = data.dropna(subset=['content'])

In [79]:
def process_text(text):
    return nlp(text)

def flatten(xss):
    return [x for xs in xss for x in xs]

In [80]:
data["doc"] = data["content"].apply(process_text)

In [81]:
import pickle

with open('data/processed_docs.pkl', 'wb') as f:
    pickle.dump(data, f)

In [82]:
with open('data/processed_docs.pkl', 'rb') as f:
    processed_docs = pickle.load(f)

In [83]:
processed_docs.head(2)

Unnamed: 0.1,Unnamed: 0,identifier,type,title,date,content,subcategory,category,Year,DL score,Length,spatial,doc
0,141,http://resolver.kb.nl/resolve?urn=ddd:01026254...,artikel,DUITSCHLAND.,1869/03/03 00:00:00,"WEENEN, 28 Febr. Men zal zich herinneren dat o...",trein,trein,1869,76.958525,434,Landelijk,"(WEENEN, ,, 28, Febr, ., Men, zal, zich, herin..."
1,405,http://resolver.kb.nl/resolve?urn=MMKB23:00138...,artikel,Verspreide Berichten.,1869/08/03 00:00:00,De heer Gladstone is vrij ernstig ziek geweest...,trein,trein,1869,80.363636,275,Landelijk,"(De, heer, Gladstone, is, vrij, ernstig, ziek,..."


PERSON:      People, including fictional.
NORP:        Nationalities or religious or political groups.
FAC:         Buildings, airports, highways, bridges, etc.
ORG:         Companies, agencies, institutions, etc.
GPE:         Countries, cities, states.
LOC:         Non-GPE locations, mountain ranges, bodies of water.
PRODUCT:     Objects, vehicles, foods, etc. (Not services.)
EVENT:       Named hurricanes, battles, wars, sports events, etc.
WORK_OF_ART: Titles of books, songs, etc.
LAW:         Named documents made into laws.
LANGUAGE:    Any named language.
DATE:        Absolute or relative dates or periods.
TIME:        Times smaller than a day.
PERCENT:     Percentage, including ”%“.
MONEY:       Monetary values, including unit.
QUANTITY:    Measurements, as of weight or distance.
ORDINAL:     “first”, “second”, etc.
CARDINAL:    Numerals that do not fall under another type.

In [84]:
spacy.explain('GPE')

'Countries, cities, states'

In [145]:
def get_gpe(doc):
    return [ent.text for ent in doc.ents if ent.label_ == 'NORP']

In [146]:
processed_docs['GPEs'] = processed_docs['doc'].apply(get_gpe)

In [147]:
fiets = processed_docs[processed_docs['category'] == 'fiets']

In [148]:
trein = processed_docs[processed_docs['category'] == 'trein']

In [149]:
trein_lan = trein[trein['spatial'] == 'Landelijk']

In [150]:
trein_reg = trein[trein['spatial'] == 'Regionaal/lokaal']

In [151]:
fiets_gpe = fiets['GPEs'].values.tolist()
fiets_gpe = flatten(fiets_gpe)

In [152]:
trein_lan_gpe = trein_lan['GPEs'].values.tolist()
trein_lan_gpe = flatten(trein_lan_gpe)

In [153]:
trein_reg_gpe = trein_reg['GPEs'].values.tolist()
trein_reg_gpe = flatten(trein_reg_gpe)

In [154]:
Counter(fiets_gpe).most_common(10)

[('Fransche', 15),
 ('Duitsche', 13),
 ('Nederlandsche', 12),
 ('H.', 11),
 ('franken', 11),
 ('Belgische', 9),
 ('Franschen', 9),
 ('Amerikaansche', 8),
 ('liberalen', 7),
 ('liberale', 6)]

In [155]:
Counter(trein_lan_gpe).most_common(10)

[('’t', 67),
 ('Nederlandsche', 30),
 ('liberale', 29),
 ('liberalen', 25),
 ('katholieke', 25),
 ('Fransche', 23),
 ('Katholieken', 22),
 ('Belgische', 15),
 ('H.', 14),
 ('Pruisen', 13)]

In [156]:
Counter(trein_reg_gpe).most_common(10)

[('Nederlandsche', 51),
 ('Fransche', 50),
 ('liberale', 47),
 ('H.', 42),
 ('Belgische', 41),
 ('liberalen', 33),
 ('franken', 30),
 ('Russische', 28),
 ('nederlandsche', 25),
 ('Indische', 25)]

In [254]:
trein_nouns = []

for index, row in trein.iterrows():
    doc = row['doc']
    for token in doc:
        if token.pos_ == 'NOUN':
            #print(token.pos_)
            trein_nouns.append(str(token))

In [256]:
fiets_nouns = []

for index, row in fiets.iterrows():
    doc = row['doc']
    for token in doc:
        if token.pos_ == 'NOUN':
            #print(token.pos_)
            fiets_nouns.append(str(token))

In [262]:
result = Counter(fiets_nouns).most_common(30)
result

[('heer', 132),
 ('tijd', 78),
 ('plaats', 59),
 ('zoo', 56),
 ('tusschen', 52),
 ('dagen', 48),
 ('uur', 39),
 ('jaar', 38),
 ('man', 38),
 ('wijze', 38),
 ('jaren', 38),
 ('dag', 36),
 ('vélocipède', 33),
 ('weder', 33),
 ('”', 31),
 ('zaak', 30),
 ('petroleum', 28),
 ('personen', 27),
 ('water', 26),
 ('vergadering', 26),
 ('gelegenheid', 25),
 ('voeten', 23),
 ('hand', 23),
 ('gebruik', 23),
 ('beweging', 22),
 ('politie', 22),
 ('stad', 22),
 ('uren', 22),
 ('huis', 21),
 ('paar', 21)]

In [261]:
result = Counter(trein_nouns).most_common(30)
result

[('heer', 6394),
 ('plaats', 4787),
 ('trein', 4470),
 ('zoo', 3990),
 ('tusschen', 2935),
 ('”', 2845),
 ('tijd', 2844),
 ('dagen', 2500),
 ('jaar', 2494),
 ('station', 2430),
 ('uur', 2291),
 ('man', 2066),
 ('weder', 2032),
 ('dag', 1985),
 ('dienst', 1904),
 ('jaren', 1903),
 ('wijze', 1792),
 ('zaak', 1688),
 ('personen', 1604),
 ('einde', 1571),
 ('aantal', 1565),
 ('leden', 1550),
 ('brug', 1530),
 ('gedeelte', 1515),
 ('gebruik', 1513),
 ('water', 1471),
 ('weg', 1470),
 ('gelegenheid', 1436),
 ('stad', 1380),
 ('aanleiding', 1357)]

In [263]:
trein_nouns = []

for index, row in trein.iterrows():
    doc = row['doc']
    for token in doc:
        if token.pos_ == 'VERB':
            #print(token.pos_)
            trein_nouns.append(str(token))

In [264]:
fiets_nouns = []

for index, row in fiets.iterrows():
    doc = row['doc']
    for token in doc:
        if token.pos_ == 'VERB':
            #print(token.pos_)
            fiets_nouns.append(str(token))

In [267]:
result = Counter(fiets_nouns).most_common(30)
result

[('doen', 54),
 ('maken', 50),
 ('heeft', 49),
 ('hebben', 48),
 ('inde', 47),
 ('gemaakt', 44),
 ('zien', 40),
 ('komen', 39),
 ('geven', 36),
 ('genomen', 35),
 ('brengen', 31),
 ('gaan', 29),
 ('laten', 29),
 ('gebracht', 29),
 ('gedaan', 29),
 ('had', 29),
 ('gegeven', 28),
 ('nemen', 27),
 ('voorzien', 25),
 ('volgende', 25),
 ('zijn', 23),
 ('gesteld', 23),
 ('gehad', 23),
 ('komt', 21),
 ('willen', 21),
 ('schijnt', 20),
 ('bestaat', 20),
 ('bewogen', 19),
 ('inden', 19),
 ('deed', 19)]

In [268]:
result = Counter(trein_nouns).most_common(30)
result

[('doen', 3794),
 ('inde', 3648),
 ('maken', 2694),
 ('hebben', 2398),
 ('had', 2347),
 ('heeft', 2248),
 ('gemaakt', 2225),
 ('komen', 2111),
 ('gegeven', 2060),
 ('kwam', 1971),
 ('genomen', 1957),
 ('geven', 1931),
 ('nemen', 1923),
 ('brengen', 1847),
 ('gaan', 1646),
 ('zien', 1615),
 ('volgende', 1600),
 ('gebracht', 1515),
 ('gehouden', 1478),
 ('gedaan', 1423),
 ('laten', 1366),
 ('gesteld', 1356),
 ('voorzien', 1315),
 ('ontvangen', 1286),
 ('liet', 1258),
 ('houden', 1228),
 ('gehad', 1179),
 ('stellen', 1151),
 ('gaf', 1139),
 ('komt', 1114)]

In [33]:
Counter(fiets_nouns).keys() # equals to list(set(words))
Counter(fiets_nouns).values() # counts the elements' frequency
fiets_dict = dict(Counter(fiets_gpe))

NameError: name 'fiets_nouns' is not defined