In [1]:
from webScraper import *
import pandas as pd
import csv

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mikey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<font size="5">
    Step 1: Scrape Latest Articles
</font>

In [2]:
# Scrape latest articles from https://nos.nl
# Outputs latest articles in 'articles_scrapped.json'
webDataScrape()

<font size="5">
    Step 2: Add to Master Article List
</font>

In [3]:
# Updates master_article_list.json, using articles_scrapped.json as input
updateMasterList()

In [4]:
# Check length of master article list
master_article_list = []
with open('master_article_list.json') as json_file:
    master_article_data = json.load(json_file)
    for article in master_article_data:
        master_article_list.append(Article(article['ID'], article['url'], article['title'], article['date_time'], article['text']))
print(len(master_article_list))

151


<font size="5">
    Step 3: Convert text in Articles to list of words with frequency
</font>

In [5]:
# Retrieve all articles from master list
master_article_list = []
with open('master_article_list.json') as json_file:
    master_article_data = json.load(json_file)
    for article in master_article_data:
        master_article_list.append(Article(article['ID'], article['url'], article['title'], article['date_time'], article['text']))

In [6]:
# Retrieve list of words from each article
full_word_list = []
for article in master_article_list:
    full_word_list.extend(article.retrieveWordList())
print(full_word_list)

['IN', 'BRUSSEL', 'HEEFT', 'DE', 'POLITIE', 'INGEGREPEN', 'OP', 'DE', 'EERSTE', 'DAG', 'VAN', 'DE', 'NATIONAL', 'CONSERVATISM', 'CONFERENCE', 'NATCON', "ZO'N", 'VEERTIG', 'CONSERVATIEVE', 'EN', 'RECHTSE', 'POLITICI', 'ZOUDEN', 'SPREKEN', 'OP', 'DIE', 'CONFERENTIE', 'ONDER', 'WIE', 'DE', 'BRITSE', 'MINISTER', 'SUELLA', 'BRAVERMAN', 'EN', 'DE', 'FRANSE', 'POLITICUS', 'ÉRIC', 'ZEMMOUR', 'BURGEMEESTER', 'VAN', 'DE', 'BRUSSELSE', 'DEELGEMEENTE', 'SINT-JOOST-TEN-NODE', 'EMIR', 'KIR', 'VERBOOD', 'HET', 'EVENEMENT', 'KORT', 'NA', 'DE', 'START', 'UIT', 'VREES', 'VOOR', 'DE', 'VERSTORING', 'VAN', 'DE', 'OPENBARE', 'ORDE', 'DE', 'POLITIE', 'GREEP', 'IN', 'TOEN', 'DE', 'BRITSE', 'POLITICUS', 'NIGEL', 'FARAGE', 'OP', 'HET', 'PODIUM', 'STOND', 'DE', 'GEESTELIJK', 'VADER', 'VAN', 'DE', 'BREXIT', 'DE', 'ZAAL', 'WERD', 'NIET', 'ONTRUIMD', 'MAAR', 'DE', 'POLITIE', 'LIET', 'GEEN', 'NIEUWE', 'BEZOEKERS', 'MEER', 'TOE', 'BEZOEKERS', 'DIE', 'NAAR', 'BUITEN', 'GINGEN', 'KONDEN', 'DAARNA', 'DE', 'ZAAL', 'NIET

In [7]:
# Converts word list to dictionary containing count of each word
word_list_count_dict = {}
for word in full_word_list:
    if word in word_list_count_dict.keys():
        word_list_count_dict[word] = word_list_count_dict[word] + 1
    else:
        word_list_count_dict[word] = 1
print(word_list_count_dict)

{'IN': 1375, 'BRUSSEL': 16, 'HEEFT': 233, 'DE': 3682, 'POLITIE': 57, 'INGEGREPEN': 2, 'OP': 686, 'EERSTE': 55, 'DAG': 33, 'VAN': 1724, 'NATIONAL': 2, 'CONSERVATISM': 1, 'CONFERENCE': 4, 'NATCON': 1, "ZO'N": 57, 'VEERTIG': 3, 'CONSERVATIEVE': 5, 'EN': 1124, 'RECHTSE': 2, 'POLITICI': 8, 'ZOUDEN': 35, 'SPREKEN': 7, 'DIE': 561, 'CONFERENTIE': 7, 'ONDER': 95, 'WIE': 22, 'BRITSE': 13, 'MINISTER': 54, 'SUELLA': 1, 'BRAVERMAN': 1, 'FRANSE': 9, 'POLITICUS': 2, 'ÉRIC': 1, 'ZEMMOUR': 1, 'BURGEMEESTER': 26, 'BRUSSELSE': 5, 'DEELGEMEENTE': 2, 'SINT-JOOST-TEN-NODE': 2, 'EMIR': 1, 'KIR': 1, 'VERBOOD': 2, 'HET': 1777, 'EVENEMENT': 5, 'KORT': 17, 'NA': 75, 'START': 7, 'UIT': 182, 'VREES': 5, 'VOOR': 571, 'VERSTORING': 1, 'OPENBARE': 7, 'ORDE': 20, 'GREEP': 2, 'TOEN': 63, 'NIGEL': 1, 'FARAGE': 1, 'PODIUM': 3, 'STOND': 8, 'GEESTELIJK': 2, 'VADER': 5, 'BREXIT': 1, 'ZAAL': 4, 'WERD': 139, 'NIET': 434, 'ONTRUIMD': 3, 'MAAR': 305, 'LIET': 11, 'GEEN': 146, 'NIEUWE': 80, 'BEZOEKERS': 14, 'MEER': 202, 'TOE': 37

In [8]:
# Convert to Dataframe and sort
word_count_df = pd.DataFrame(columns=['word', 'count'])
for word, count in word_list_count_dict.items():
    word_count_df.loc[len(word_count_df.index)] = [word, count]

word_count_sorted_df = word_count_df.sort_values(by=['count'], ascending=False)

In [10]:
# Output sorted word list to csv
word_count_sorted_df.to_csv('word_list.csv', index=False)

<font size="5">
    Step 4: Remove Invalid Words and Custom Filtered Words
</font>

In [15]:
# Remove words from invalid_words.csv - These are either symbol characters, names of people etc.
with open('invalid_words.csv', newline='') as csv_file:
    reader = csv.reader(csv_file)
    invalid_word_list = [element[0] for element in list(reader)]
print(invalid_word_list)

['``', "''", ':', "'", '-']


In [16]:
# Remove already learned words
learned_words_list = []
words_to_remove_list = invalid_word_list + learned_words_list


In [23]:
word_count_filtered_df = word_count_sorted_df.copy()
for remove_word in words_to_remove_list:
    word_count_filtered_df = word_count_filtered_df[word_count_filtered_df.word !=remove_word]

In [24]:
print(list(word_count_filtered_df.word))

['DE', 'HET', 'VAN', 'EEN', 'IN', 'EN', 'DAT', 'IS', 'OP', 'TE', 'ZIJN', 'VOOR', 'DIE', 'MET', 'NIET', 'ER', 'OM', 'OOK', 'AAN', 'MAAR', 'HIJ', 'BIJ', 'ALS', 'ZE', 'HEEFT', 'WORDEN', 'DOOR', 'NOG', 'NAAR', 'MEER', 'WORDT', 'OVER', 'HEBBEN', 'JAAR', 'OF', 'UIT', 'ZEGT', 'WAS', 'DAN', 'VOLGENS', 'MENSEN', 'TOT', 'AL', 'GEEN', 'WE', 'DIT', 'WERD', 'JE', 'ZICH', 'HUN', 'TEGEN', 'VEEL', 'DEZE', 'WEL', 'KUNNEN', 'NU', 'GAAT', 'IK', 'ZO', 'KAN', 'MOET', 'ONDER', 'MOETEN', 'ANDERE', 'TWEE', 'NEDERLAND', 'OMDAT', "'S", 'DAAR', 'WAT', 'ZOU', 'NIEUWE', 'WIL', 'GAAN', 'NA', 'ALLE', 'EURO', 'ZEI', 'KOMT', 'WAREN', 'ALLEEN', 'ONDERZOEK', 'GROTE', 'HAAR', 'WAAR', 'NEDERLANDSE', 'HAD', 'AANTAL', 'TOEN', 'WERDEN', 'TUSSEN', 'PROCENT', 'KRIJGEN', 'MAKEN', 'VANDAAG', 'KOMEN', 'STAAT', "ZO'N", 'POLITIE', 'HOE', 'LAND', 'WEER', 'EERSTE', 'PER', 'DUS', 'MINISTER', 'TIJD', 'DOEN', 'EUROPESE', 'DIEREN', 'SINDS', 'AF', 'ZOALS', 'MILJOEN', 'BEDRIJF', 'MINDER', 'ZAL', 'TWEEDE', 'KAMER', 'ZIJ', 'VOORAL', 'GOED', 