In [1]:
from webScraper import *
import pandas as pd
import csv
from openpyxl import load_workbook

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mikey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<font size="5">
    Step 1: Scrape Latest Articles
</font>

In [2]:
# Scrape latest articles from https://nos.nl
# Outputs latest articles in 'articles_scrapped.json'
webDataScrape()

<font size="5">
    Step 2: Add to Master Article List
</font>

In [3]:
# Updates master_article_list.json, using articles_scrapped.json as input
updateMasterList()

In [4]:
# Check length of master article list
master_article_list = []
with open('master_article_list.json') as json_file:
    master_article_data = json.load(json_file)
    for article in master_article_data:
        master_article_list.append(Article(article['ID'], article['url'], article['title'], article['date_time'], article['text']))
print(len(master_article_list))

1074


<font size="5">
    Step 3: Convert text in Articles to list of words with frequency
</font>

In [5]:
# Retrieve all articles from master list
master_article_list = []
with open('master_article_list.json') as json_file:
    master_article_data = json.load(json_file)
    for article in master_article_data:
        master_article_list.append(Article(article['ID'], article['url'], article['title'], article['date_time'], article['text']))

In [6]:
# Retrieve list of words from each article
full_word_list = []
for article in master_article_list:
    full_word_list.extend(article.retrieveWordList())
print(full_word_list)

['IN', 'BRUSSEL', 'HEEFT', 'DE', 'POLITIE', 'INGEGREPEN', 'OP', 'DE', 'EERSTE', 'DAG', 'VAN', 'DE', 'NATIONAL', 'CONSERVATISM', 'CONFERENCE', 'NATCON', "ZO'N", 'VEERTIG', 'CONSERVATIEVE', 'EN', 'RECHTSE', 'POLITICI', 'ZOUDEN', 'SPREKEN', 'OP', 'DIE', 'CONFERENTIE', 'ONDER', 'WIE', 'DE', 'BRITSE', 'MINISTER', 'SUELLA', 'BRAVERMAN', 'EN', 'DE', 'FRANSE', 'POLITICUS', 'ÉRIC', 'ZEMMOUR', 'BURGEMEESTER', 'VAN', 'DE', 'BRUSSELSE', 'DEELGEMEENTE', 'SINT-JOOST-TEN-NODE', 'EMIR', 'KIR', 'VERBOOD', 'HET', 'EVENEMENT', 'KORT', 'NA', 'DE', 'START', 'UIT', 'VREES', 'VOOR', 'DE', 'VERSTORING', 'VAN', 'DE', 'OPENBARE', 'ORDE', 'DE', 'POLITIE', 'GREEP', 'IN', 'TOEN', 'DE', 'BRITSE', 'POLITICUS', 'NIGEL', 'FARAGE', 'OP', 'HET', 'PODIUM', 'STOND', 'DE', 'GEESTELIJK', 'VADER', 'VAN', 'DE', 'BREXIT', 'DE', 'ZAAL', 'WERD', 'NIET', 'ONTRUIMD', 'MAAR', 'DE', 'POLITIE', 'LIET', 'GEEN', 'NIEUWE', 'BEZOEKERS', 'MEER', 'TOE', 'BEZOEKERS', 'DIE', 'NAAR', 'BUITEN', 'GINGEN', 'KONDEN', 'DAARNA', 'DE', 'ZAAL', 'NIET

In [7]:
# Converts word list to dictionary containing count of each word
word_list_count_dict = {}
for word in full_word_list:
    if word in word_list_count_dict.keys():
        word_list_count_dict[word] = word_list_count_dict[word] + 1
    else:
        word_list_count_dict[word] = 1
print(word_list_count_dict)

{'IN': 10177, 'BRUSSEL': 68, 'HEEFT': 1704, 'DE': 27999, 'POLITIE': 693, 'INGEGREPEN': 9, 'OP': 5117, 'EERSTE': 379, 'DAG': 261, 'VAN': 12674, 'NATIONAL': 13, 'CONSERVATISM': 1, 'CONFERENCE': 4, 'NATCON': 1, "ZO'N": 393, 'VEERTIG': 17, 'CONSERVATIEVE': 26, 'EN': 8310, 'RECHTSE': 17, 'POLITICI': 56, 'ZOUDEN': 222, 'SPREKEN': 65, 'DIE': 3696, 'CONFERENTIE': 13, 'ONDER': 871, 'WIE': 215, 'BRITSE': 75, 'MINISTER': 257, 'SUELLA': 1, 'BRAVERMAN': 1, 'FRANSE': 115, 'POLITICUS': 33, 'ÉRIC': 1, 'ZEMMOUR': 1, 'BURGEMEESTER': 114, 'BRUSSELSE': 9, 'DEELGEMEENTE': 2, 'SINT-JOOST-TEN-NODE': 2, 'EMIR': 2, 'KIR': 1, 'VERBOOD': 6, 'HET': 13437, 'EVENEMENT': 31, 'KORT': 116, 'NA': 790, 'START': 46, 'UIT': 1507, 'VREES': 17, 'VOOR': 3685, 'VERSTORING': 5, 'OPENBARE': 36, 'ORDE': 59, 'GREEP': 16, 'TOEN': 468, 'NIGEL': 1, 'FARAGE': 1, 'PODIUM': 37, 'STOND': 101, 'GEESTELIJK': 3, 'VADER': 40, 'BREXIT': 3, 'ZAAL': 32, 'WERD': 1239, 'NIET': 3058, 'ONTRUIMD': 25, 'MAAR': 1941, 'LIET': 81, 'GEEN': 871, 'NIEUWE'

In [8]:
# Convert to Dataframe and sort
word_count_df = pd.DataFrame(columns=['word', 'count'])
for word, count in word_list_count_dict.items():
    word_count_df.loc[len(word_count_df.index)] = [word, count]

word_count_sorted_df = word_count_df.sort_values(by=['count'], ascending=False)

In [9]:
# Output sorted word list to csv
word_count_sorted_df.to_csv('word_list.csv', index=False)

<font size="5">
    Step 4: Remove Invalid Words and Custom Filtered Words
</font>

In [10]:
# Remove words from invalid_words.csv - These are either symbol characters, names of people etc.
with open('invalid_words.csv', newline='') as csv_file:
    reader = csv.reader(csv_file)
    invalid_word_list = [element[0] for element in list(reader)]

# Remove already learned words
with open('learned_words.csv', newline='') as csv_file:
    reader = csv.reader(csv_file)
    learned_words_list = [element[0] for element in list(reader)]

# Combine lists
words_to_remove_list = invalid_word_list + learned_words_list

In [11]:
# Remove words from current dataframe word count list
word_count_filtered_df = word_count_sorted_df.copy()
for remove_word in words_to_remove_list:
    word_count_filtered_df = word_count_filtered_df[word_count_filtered_df.word !=remove_word]

In [12]:

top_1000_words_df = word_count_filtered_df.sort_values('count', ascending=False).head(1000).reset_index(drop=True)

In [13]:
# Output filtered word list to csv
top_1000_words_df.to_csv('word_list_filtered.csv', index=False)