In [3]:
## Libraries for file management
from os import listdir
import pickle

## Libraries for data management
import numpy as np

## Libraries for website interactions
import urllib3
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer


def soupify(url):
    """Fetch the page at ``url`` with urllib3 and parse it into a bs4 soup.

    A GET request is issued through a fresh ``urllib3.PoolManager`` and the
    raw response body is handed to BeautifulSoup's built-in ``html.parser``.
    """
    response = urllib3.PoolManager().request('GET', url)
    return BeautifulSoup(response.data, 'html.parser')


def processing(text):
    """Compute simple word and sentence statistics for a string.

    Returns a 4-tuple ``(n_words, n_unique_words, n_sentences, counts)``
    where ``counts`` maps every lower-cased token to its number of
    occurrences in ``text``.
    """
    ## Tokens are runs of word characters, taken from the lower-cased text
    tokens = RegexpTokenizer(r'\w+').tokenize(text.lower())

    ## Tally how often each token occurs (first-seen order is kept)
    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1

    ## A sentence boundary is taken to be a period followed by a space
    n_sentences = len(text.split(". "))

    return len(tokens), len(counts), n_sentences, counts


def extract(soup):
    """Extract one article from its parsed page and store it in the database.

    The article text, publication date, categories and title are read from
    ``soup``. The text is run through ``processing`` and the record
    ``[text, categories, dictionary, n_words, dict_size, n_sentences]`` is
    stored as a 1-D object array in ``article_database.pickle`` under the key
    ``"<first 10 chars of the datetime>_<cleaned title>"``. An existing entry
    with the same key is replaced.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a single article page.

    Returns
    -------
    int
        1 if the article was written to the database, 0 if the page was
        skipped because it has no text passages, no ``<time>`` tag or no
        title heading.

    Raises
    ------
    FileNotFoundError
        If ``article_database.pickle`` does not exist yet.
    """
    ## Pages without any text passage are not articles; skip them before
    ## touching the other tags so they cannot raise on a missing title/date.
    passages = soup.find_all("p", "text_3v_J6Y0G")
    if not passages:
        return 0

    time_tags = soup.find_all("time")
    title_tag = soup.find("h1", "title_iP7Q1aiP")
    if not time_tags or title_tag is None:
        return 0

    text = " ".join(passage.text for passage in passages)
    text = text.replace('"', '').replace("'", "")

    ## The last <time> tag on the page is used as the article date
    date = time_tags[-1]['datetime']
    categories = [x.text for x in soup.find_all("a", class_="link_2imnEnEf")]

    ## Strip punctuation from the title before using it as part of the key.
    ## "â€¢" is the mis-decoded form of the bullet "•"; it is kept so titles
    ## containing that literal sequence are cleaned exactly as before.
    title = title_tag.text
    for x in [".", ",", ":", "/", "•", "â€¢", "'", '"', "?", "*"]:
        title = title.replace(x, "")
    title = title.strip()

    w, d, s, dictionary = processing(text)

    ## The record mixes str, list, dict and ints, so it must be an object
    ## array; without dtype=object recent NumPy raises on the ragged input.
    arr = np.array([text, categories, dictionary, w, d, s], dtype=object)

    ## NOTE: pickle.load can execute arbitrary code; only use a trusted file.
    with open('article_database.pickle', 'rb') as handle:
        db = pickle.load(handle)

    db[f"{date[:10]}_{title}"] = arr

    ## Serialise first, then truncate and rewrite the file. Dumping into the
    ## handle used for reading would append a second pickle after the old one,
    ## and later loads would keep returning the stale database.
    payload = pickle.dumps(db, protocol=pickle.HIGHEST_PROTOCOL)
    with open('article_database.pickle', 'wb') as handle:
        handle.write(payload)

    ## NOTE(review): 1 is returned even when the key already existed, so
    ## callers counting "new" articles also count overwritten ones.
    return 1


def scrape(link_categories):
    """Scrape the articles from a given list of categories on the NOS website.

    For every category the overview page ``https://nos.nl/nieuws/<category>/``
    is fetched, all linked articles except liveblogs are downloaded and each
    one is handed to ``extract``. Progress is printed on a single line as
    ``| <category> | <category> |``.

    Parameters
    ----------
    link_categories : iterable of str
        Category names as they appear in the NOS news URL.

    Returns
    -------
    tuple of (int, int)
        The sum of the values returned by ``extract`` and the number of
        entries in ``article_database.pickle`` after scraping. An empty
        ``link_categories`` yields ``(0, <current database size>)``.
    """
    n_new_articles = 0

    print("|", end="")
    for category in link_categories:
        ## Parse each category page
        soup = soupify(f"https://nos.nl/nieuws/{category}/")

        ## Find all of the articles and remove all liveblogs
        article_blocks = soup.find_all("a", class_="link-block list-items__link")
        article_links = [
            f"https://nos.nl{article['href']}"
            for article in article_blocks
            if "liveblog" not in article['href']
        ]

        for link in article_links:
            n_new_articles += extract(soupify(link))
        print(f" {category} |", end="")

    ## Read the database once, after all categories are done. Loading it
    ## inside the loop left `db` unbound for an empty category list and
    ## re-read the whole file for every category.
    with open('article_database.pickle', 'rb') as handle:
        db = pickle.load(handle)

    return n_new_articles, len(db)

In [25]:
# path = ".\\articles"
# articles = listdir(path)
# articles.remove(".ipynb_checkpoints")

# database = {}

# for article in articles:
#     contents = np.load(f"{path}\\{article}", allow_pickle=True)
#     database[article[:-4]] = contents

# print(len(database))

3481


In [26]:
# One-off creation of the database file from the in-memory `database` dict
# (kept commented out so re-running the cell does not overwrite the file):
# with open('article_database.pickle', 'wb') as handle:
#     pickle.dump(database, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Read the stored database back from disk as a sanity check.
# NOTE: pickle.load can execute arbitrary code; only load a trusted file.
with open('article_database.pickle', 'rb') as handle:
    b = pickle.load(handle)

# Example lookup of a single article by its "<date>_<title>" key:
# b["2014-05-07_Ontvoeren schoolkinderen is in Nigeria inmiddels een winstgevende onderneming"]
# Number of entries stored in the database.
print(len(b))

3481


C:\Users\marco\Documents\GitHub\NLP-exploration-with-NOS-articles\extractor.py:65: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.