In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import spacy
import nltk
import spacy_transformers
from nltk.corpus import stopwords
# The Swedish stop words live in the NLTK "stopwords" corpus, which is not
# bundled with the nltk package. Download it on demand so the notebook survives
# Restart & Run All on a fresh environment instead of failing with LookupError.
try:
    stop_words = stopwords.words("swedish")
except LookupError:
    nltk.download("stopwords")
    stop_words = stopwords.words("swedish")

# spaCy pipeline used by preprocess_text for tokenization and lemmatization.
nlp = spacy.load("sv_pipeline")

# One row per chronicle; 'content' is filled in after the article is fetched.
df = pd.DataFrame(columns=['author', 'url', 'heading', 'content'])
# Running row counter for df.
i = 0

def preprocess_text(text):
    """Lowercase, lemmatize and stop-word-filter a piece of text.

    The text is lowercased and run through the module-level ``nlp`` pipeline.
    Each token's lemma is stripped of surrounding whitespace; lemmas that end
    up empty or that appear in the module-level ``stop_words`` collection are
    dropped.

    Args:
        text: The raw text to process.

    Returns:
        The remaining lemmas joined by single spaces.
    """
    doc = nlp(text.lower())
    stripped_lemmas = (token.lemma_.strip() for token in doc)
    kept = [lemma for lemma in stripped_lemmas if lemma and lemma not in stop_words]
    return " ".join(kept)

# Maps columnist name -> URL of that columnist's page.
authors = {}

# Step 1: Collect all urls to authors
# A timeout keeps a stalled connection from blocking the cell forever, and
# raise_for_status() stops early instead of parsing an HTTP error page.
index_response = requests.get("https://www.expressen.se/kronikorer/", timeout=30)
index_response.raise_for_status()
soup = BeautifulSoup(index_response.content, "html.parser")

# The columnist teasers sit in the element carrying exactly this class string.
author_column = soup.find(class_="site-body__column-2 widget-area widget-area--hard-divide")
if author_column is None:
    raise RuntimeError(
        "Could not find the columnist list on the index page; the page layout may have changed"
    )

author_urls = author_column.find_all(class_="teaser")
for author in author_urls:
    # Look the link up once; only teasers whose first link points at a
    # chronicle ("kronikor") page are treated as columnists.
    link = author.find("a", href=True)
    if link is not None and "kronikor" in link["href"]:
        # The name is the text of all <h2> elements in the teaser, space-joined.
        full_name = " ".join(h2.text for h2 in author.find_all("h2"))
        authors[full_name] = link["href"]

# Step 2: Collect the urls to all chronicles and heading
# Rows are gathered in a list and turned into a DataFrame once, instead of
# growing the frame one row at a time.
chronicle_rows = []
for columnist, columnist_url in authors.items():
    try:
        # The timeout keeps one stalled connection from blocking the whole run.
        author_response = requests.get(columnist_url, timeout=30)
        author_response.raise_for_status()
    except requests.RequestException as error:
        # One unreachable columnist page should not abort the whole scrape.
        print(f"Skipping columnist {columnist} ({columnist_url}): {error}")
        continue

    author_chronicle_teasers = BeautifulSoup(author_response.content, "html.parser").find_all(class_="teaser")
    for teaser in author_chronicle_teasers:
        # Only teasers with a "row row--100" link to a chronicle are kept;
        # per the original author's note this excludes premium chronicles.
        link = teaser.find(class_="row row--100", href=True)
        if link is None or "kronikor" not in link["href"]:
            continue
        # Keep the chronicle even if its teaser has no <h2> heading.
        heading_tag = link.find("h2")
        heading = heading_tag.text if heading_tag is not None else ""
        url = "https://www.expressen.se" + link["href"]
        chronicle_rows.append(
            {'author': columnist, 'url': url, 'heading': heading, 'content': ""}
        )

df = pd.DataFrame(chronicle_rows, columns=['author', 'url', 'heading', 'content'])
# Keep the running row counter in sync with the number of collected rows.
i = len(df)

# Step 3: Collect and preprocess text
def _extract_chronicle_text(html):
    """Return the body text of a chronicle page.

    Collects the text of every <p> and <blockquote> found inside the
    ``div.rich-text`` containers of the page, in document order.

    Args:
        html: The raw HTML of the chronicle page.

    Returns:
        The collected passages joined by single spaces; an empty string when
        the page has no matching elements.
    """
    passages = []
    for container in BeautifulSoup(html, "html.parser").find_all("div", class_="rich-text"):
        for tag in container.find_all(["p", "blockquote"]):
            # find_all() walks every descendant, so a <p> nested in a
            # <blockquote> would otherwise be counted on its own as well as
            # through its parent's text. Keep only the outermost passage.
            if tag.find_parent(["p", "blockquote"]) is not None:
                continue
            passages.append(tag.text)
    # Join with a space so the last word of one paragraph is not fused with
    # the first word of the next before tokenization.
    return " ".join(passages)


for row in df.itertuples():
    try:
        # The timeout keeps one stalled connection from blocking the whole run.
        article_response = requests.get(row.url, timeout=30)
        article_response.raise_for_status()
    except requests.RequestException as error:
        # Leave 'content' empty for this chronicle rather than losing the
        # articles that were already processed.
        print(f"Skipping chronicle {row.url}: {error}")
        continue
    df.at[row.Index, 'content'] = preprocess_text(_extract_chronicle_text(article_response.content))

df.to_csv("expressen-kronikor.csv", encoding='utf-8', index=False)


KeyboardInterrupt: 

In [9]:
#!pip install -U spacy==3.1.3