In [2]:
from webScraper import *
import pandas as pd
import csv
from openpyxl import load_workbook

<font size="5">
    Step 1: Scrape Latest Articles
</font>

In [3]:
# Scrape latest articles from https://nos.nl
# Outputs latest articles in 'articles_scrapped.json'
webDataScrape()

<font size="5">
    Step 2: Add to Master Article List
</font>

In [4]:
# Updates master_article_list.json, using articles_scrapped.json as input
updateMasterList()

In [5]:
# Check length of master article list
master_article_list = []
with open('master_article_list.json') as json_file:
    master_article_data = json.load(json_file)
    for article in master_article_data:
        master_article_list.append(Article(article['ID'], article['url'], article['title'], article['date_time'], article['text']))
print(len(master_article_list))

695


<font size="5">
    Step 3: Convert text in Articles to list of words with frequency
</font>

In [6]:
# Retrieve all articles from master list
master_article_list = []
with open('master_article_list.json') as json_file:
    master_article_data = json.load(json_file)
    for article in master_article_data:
        master_article_list.append(Article(article['ID'], article['url'], article['title'], article['date_time'], article['text']))

In [7]:
# Retrieve list of words from each article
full_word_list = []
for article in master_article_list:
    full_word_list.extend(article.retrieveWordList())
print(full_word_list)

['IN', 'BRUSSEL', 'HEEFT', 'DE', 'POLITIE', 'INGEGREPEN', 'OP', 'DE', 'EERSTE', 'DAG', 'VAN', 'DE', 'NATIONAL', 'CONSERVATISM', 'CONFERENCE', 'NATCON', "ZO'N", 'VEERTIG', 'CONSERVATIEVE', 'EN', 'RECHTSE', 'POLITICI', 'ZOUDEN', 'SPREKEN', 'OP', 'DIE', 'CONFERENTIE', 'ONDER', 'WIE', 'DE', 'BRITSE', 'MINISTER', 'SUELLA', 'BRAVERMAN', 'EN', 'DE', 'FRANSE', 'POLITICUS', 'ÉRIC', 'ZEMMOUR', 'BURGEMEESTER', 'VAN', 'DE', 'BRUSSELSE', 'DEELGEMEENTE', 'SINT-JOOST-TEN-NODE', 'EMIR', 'KIR', 'VERBOOD', 'HET', 'EVENEMENT', 'KORT', 'NA', 'DE', 'START', 'UIT', 'VREES', 'VOOR', 'DE', 'VERSTORING', 'VAN', 'DE', 'OPENBARE', 'ORDE', 'DE', 'POLITIE', 'GREEP', 'IN', 'TOEN', 'DE', 'BRITSE', 'POLITICUS', 'NIGEL', 'FARAGE', 'OP', 'HET', 'PODIUM', 'STOND', 'DE', 'GEESTELIJK', 'VADER', 'VAN', 'DE', 'BREXIT', 'DE', 'ZAAL', 'WERD', 'NIET', 'ONTRUIMD', 'MAAR', 'DE', 'POLITIE', 'LIET', 'GEEN', 'NIEUWE', 'BEZOEKERS', 'MEER', 'TOE', 'BEZOEKERS', 'DIE', 'NAAR', 'BUITEN', 'GINGEN', 'KONDEN', 'DAARNA', 'DE', 'ZAAL', 'NIET

In [8]:
# Converts word list to dictionary containing count of each word
word_list_count_dict = {}
for word in full_word_list:
    if word in word_list_count_dict.keys():
        word_list_count_dict[word] = word_list_count_dict[word] + 1
    else:
        word_list_count_dict[word] = 1
print(word_list_count_dict)

{'IN': 6591, 'BRUSSEL': 40, 'HEEFT': 1106, 'DE': 18380, 'POLITIE': 493, 'INGEGREPEN': 8, 'OP': 3397, 'EERSTE': 259, 'DAG': 168, 'VAN': 8334, 'NATIONAL': 5, 'CONSERVATISM': 1, 'CONFERENCE': 4, 'NATCON': 1, "ZO'N": 252, 'VEERTIG': 8, 'CONSERVATIEVE': 17, 'EN': 5391, 'RECHTSE': 11, 'POLITICI': 43, 'ZOUDEN': 147, 'SPREKEN': 43, 'DIE': 2479, 'CONFERENTIE': 13, 'ONDER': 544, 'WIE': 134, 'BRITSE': 33, 'MINISTER': 172, 'SUELLA': 1, 'BRAVERMAN': 1, 'FRANSE': 73, 'POLITICUS': 20, 'ÉRIC': 1, 'ZEMMOUR': 1, 'BURGEMEESTER': 85, 'BRUSSELSE': 5, 'DEELGEMEENTE': 2, 'SINT-JOOST-TEN-NODE': 2, 'EMIR': 2, 'KIR': 1, 'VERBOOD': 4, 'HET': 8605, 'EVENEMENT': 21, 'KORT': 81, 'NA': 525, 'START': 35, 'UIT': 964, 'VREES': 11, 'VOOR': 2439, 'VERSTORING': 3, 'OPENBARE': 25, 'ORDE': 36, 'GREEP': 14, 'TOEN': 305, 'NIGEL': 1, 'FARAGE': 1, 'PODIUM': 28, 'STOND': 66, 'GEESTELIJK': 3, 'VADER': 30, 'BREXIT': 2, 'ZAAL': 25, 'WERD': 803, 'NIET': 1944, 'ONTRUIMD': 18, 'MAAR': 1269, 'LIET': 51, 'GEEN': 587, 'NIEUWE': 384, 'BEZ

In [9]:
# Convert to Dataframe and sort
word_count_df = pd.DataFrame(columns=['word', 'count'])
for word, count in word_list_count_dict.items():
    word_count_df.loc[len(word_count_df.index)] = [word, count]

word_count_sorted_df = word_count_df.sort_values(by=['count'], ascending=False)

In [10]:
# Output sorted word list to csv
word_count_sorted_df.to_csv('word_list.csv', index=False)

<font size="5">
    Step 4: Remove Invalid Words and Custom Filtered Words
</font>

In [11]:
# Remove words from invalid_words.csv - These are either symbol characters, names of people etc.
with open('invalid_words.csv', newline='') as csv_file:
    reader = csv.reader(csv_file)
    invalid_word_list = [element[0] for element in list(reader)]

# Remove already learned words
with open('learned_words.csv', newline='') as csv_file:
    reader = csv.reader(csv_file)
    learned_words_list = [element[0] for element in list(reader)]

# Combine lists
words_to_remove_list = invalid_word_list + learned_words_list

In [12]:
# Remove words from current dataframe word count list
word_count_filtered_df = word_count_sorted_df.copy()
for remove_word in words_to_remove_list:
    word_count_filtered_df = word_count_filtered_df[word_count_filtered_df.word !=remove_word]

In [13]:

top_1000_words_df = word_count_filtered_df.sort_values('count', ascending=False).head(1000).reset_index(drop=True)

In [14]:
# Output filtered word list to csv
top_1000_words_df.to_csv('word_list_filtered.csv', index=False)