In [2]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import re
import json


# Regex fragments used by split_into_sentences. They are raw strings so that
# escapes such as `\s` reach the regex engine without relying on Python's
# deprecated handling of unknown string escape sequences.
caps = r"([A-Z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"

def split_into_sentences(text):
    """Split ``text`` into a list of sentences using regex heuristics.

    Periods that do not end a sentence (titles, single-letter initials,
    acronyms, company suffixes, a few domain suffixes, "Ph.D.") are
    temporarily replaced by a ``<prd>`` placeholder. Every remaining ``.``,
    ``?`` and ``!`` is followed by a ``<stop>`` marker on which the text is
    finally split.

    Parameters
    ----------
    text : str
        The text to split. Newlines are treated as spaces.

    Returns
    -------
    list of str
        The sentences, stripped of surrounding whitespace. Trailing text
        without closing punctuation is returned as a final sentence rather
        than being discarded; an empty input yields an empty list.
    """
    # Pad so that patterns anchored on a leading/trailing space also match
    # at the very beginning and end of the text.
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # Protect periods that are not sentence boundaries.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + caps + "[.] ", r" \1<prd> ", text)
    # An acronym directly followed by a typical sentence starter does end
    # the sentence, so mark the boundary before its periods are protected.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(caps + "[.]" + caps + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + caps + "[.]", r" \1<prd>", text)

    # Move closing quotes in front of the terminator so the quote stays
    # attached to the sentence it closes.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # Mark the real sentence boundaries, then restore the protected periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    sentences = [s.strip() for s in text.split("<stop>")]
    # The final segment is only padding when the text ended with a
    # terminator. Drop it in that case only, so an unterminated trailing
    # sentence is kept instead of silently lost.
    if sentences and not sentences[-1]:
        sentences.pop()
    return sentences

In [3]:
def get_response_url(partial_url, timeout=30):
    """Request a site-relative path on vng.nl and return the response's URL.

    Parameters
    ----------
    partial_url : str
        Path that is appended to ``https://vng.nl``.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get``. Without a timeout a
        single unresponsive connection would block the crawl indefinitely.

    Returns
    -------
    str
        The ``url`` attribute of the response, i.e. the address requests
        ended up at after following any redirects.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (includes timeouts).
    """
    full_url = 'https://vng.nl{}'.format(partial_url)
    r = requests.get(full_url, timeout=timeout)
    return r.url

def get_soup_from_partial_url(partial_url, timeout=30):
    """Download a site-relative path on vng.nl and parse it with html5lib.

    Parameters
    ----------
    partial_url : str
        Path that is appended to ``https://vng.nl``.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get``. Without a timeout a
        single unresponsive connection would block the crawl indefinitely.

    Returns
    -------
    BeautifulSoup
        The parsed response body.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (includes timeouts).
    """
    full_url = 'https://vng.nl{}'.format(partial_url)
    r = requests.get(full_url, timeout=timeout)
    return BeautifulSoup(r.content, "html5lib")

def get_news_article_urls_from_page(soup):
    """Return the href of every link on the page that contains '/nieuws/'.

    ``soup`` must offer ``find_all``; only ``<a>`` elements that carry an
    ``href`` attribute are inspected. The hrefs are returned in document
    order and are not de-duplicated.
    """
    anchors = soup.find_all('a', href=True)
    return [anchor['href'] for anchor in anchors if '/nieuws/' in anchor['href']]

def next_page_soup(soup):
    """Fetch and parse the page behind the pager's "next" link.

    Looks for an ``<li class="pager-next">`` element in ``soup``. When there
    is none, ``None`` is returned; otherwise the href of the link inside that
    element is downloaded via ``get_soup_from_partial_url``.
    """
    pager_item = soup.find('li', {'class' : 'pager-next'})

    # No "next" entry in the pager: nothing further to fetch.
    if not pager_item:
        return None

    return get_soup_from_partial_url(pager_item.a['href'])

    
def read_news_content_from_url(url):
    """Download a news article and return its text without 'vng-dossier' sentences.

    The page is fetched with ``get_soup_from_partial_url``. The article is the
    ``<div>`` whose ``about`` attribute equals ``url``; its text is taken from
    the nested ``<div class="content">`` and split into sentences with
    ``split_into_sentences``.

    Parameters
    ----------
    url : str
        Site-relative URL of the article.

    Returns
    -------
    str
        The remaining sentences joined by single spaces, or an empty string
        when the article or its content element is not found.
    """
    page_soup = get_soup_from_partial_url(url)
    article = page_soup.find('div', {'about' : url})
    content = article.find('div', {'class' : 'content'}) if article else None

    # Treat a missing content element like a missing article instead of
    # failing with an AttributeError on None.
    if content is None:
        return ''

    sentences = split_into_sentences(content.get_text())

    # Build a new list rather than removing from the list being iterated:
    # removing in place skips the element after each removal, so consecutive
    # 'vng-dossier' sentences were not all filtered out.
    kept = [sentence for sentence in sentences if 'vng-dossier' not in sentence.lower()]

    return ' '.join(kept)

def read_label_from_url(url):
    splitted = url.split('/')
    return splitted[2]

In [4]:
# Walk the paginated news overview, starting at '/nieuws', and gather the
# article links of every page until there is no "next" page left.
urls = []

current_soup = get_soup_from_partial_url('/nieuws')

while current_soup:
    current_urls = get_news_article_urls_from_page(current_soup)
    urls.extend(current_urls)
    # Running total of collected links, printed once per overview page.
    print(len(urls))

    current_soup = next_page_soup(current_soup)

14
24
34
44
54
64
74
84
94
104
114
124
134
144
154
164
174
184
194
204
214
224
234
244
254
264
274
284
294
304
314
324
334
344
354
364
374
384
394
404
414
424
434
444
454
464
474
484
494
504
514
524
534
544
554
564
574
584
594
604
614
624
634
644
654
664
674
684
694
704
714
724
734
744
754
764
774
784
794
804
814
824
834
844
854
864
874
884
894
904
914
924
934
944
954
964
974
984
994
1004
1014
1024
1034
1044
1054
1064
1074
1084
1094
1104
1114
1124
1134
1144
1154
1164
1174
1184
1194
1204
1214
1224
1234
1244
1254
1264
1274
1284
1294
1304
1314
1324
1334
1344
1354
1364
1374
1384
1394
1404
1414
1424
1434
1444
1454
1464
1474
1484
1494
1504
1514
1524
1534
1544
1554
1564
1574
1584
1594
1604
1614
1624
1634
1644
1654
1664
1674
1684
1694
1704
1714
1724
1734
1744
1754
1764
1774
1784
1794
1804
1814
1824
1834
1844
1854
1864
1874
1884
1894
1904
1914
1924
1934
1944
1954
1964
1974
1984
1994
2004
2014
2024
2034
2044
2054
2064
2074
2084
2094
2104
2114
2124
2134
2144
2154
2164
2174
2184
2194
2204
2214
222

In [10]:
# Build one record per collected article URL and store all records as JSON.
result = []

for index, url in enumerate(urls):
    content = read_news_content_from_url(url)
    response_url = get_response_url(url)

    # Pieces 4 and 5 of the '/'-split response URL are stored as the main
    # topic and the sub topic of the article.
    url_parts = response_url.split('/')
    main = url_parts[4]
    sub = url_parts[5]

    # Progress output every 100 articles; index equals the number of records
    # collected so far because each iteration appends exactly one.
    if index % 100 == 0:
        print(index)

    record = {
        'response_url' : response_url,
        'main_topic' : main,
        'sub_topic' : sub,
        'content' : content,
    }
    result.append(record)


with open('../data_resources/topics/vng_training.json', 'w') as outfile:
    json.dump(result, outfile)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500


In [13]:
# Sanity check: print the first scraped record.
print(result[0])

{'response_url': 'https://vng.nl/onderwerpenindex/werk-en-inkomen/armoedebeleid-en-schuldhulpverlening/nieuws/rondetafelgesprek-kamer-aanpak-schulden-in-de-schijnwerpers', 'main_topic': 'werk-en-inkomen', 'sub_topic': 'armoedebeleid-en-schuldhulpverlening', 'content': 'Schulden en armoede staan volop in de schijnwerpers. Vandaag praat de Tweede Kamer over de aanpak van problematische schulden. Wethouder Arjan Vliegenthart neemt namens de VNG deel aan het Rondetafelgesprek met de vaste Kamercommissie SZW. Hiervoor dienden VNG en Divosa samen een position paper in. Met het position paper roepen gemeenten de Tweede Kamer en het kabinet op om concreet acht punten aan te pakken. Deze punten zijn randvoorwaardelijk\xa0om de gemeentelijke ambitie waar te kunnen maken. Wat is er concreet nodig? Zorg voor financiële educatie in het onderwijs. Neem belemmeringen privacy en gegevensuitwisseling weg om outreachende integrale schuldhulpverlening te kunnen bieden. Realiseer sociale incasso en pas de

In [16]:
# Write the scraped records to the training-data JSON file, replacing any
# existing file at that path.
output_path = '../data_resources/topics/vng_training.json'
with open(output_path, mode='w') as outfile:
    json.dump(result, outfile)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)