<a href="https://colab.research.google.com/github/LUMII-AILab/NLP_Course/blob/main/notebooks/BSSDH2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Acquiring plain-text data for the corpus

## Getting the source documents

In [None]:
!pip install requests

In [2]:
import requests

### Via RSS feeds

Consider [Europe Media Monitor](https://emm.newsbrief.eu) feeds:

* [Current top 10 stories](https://emm.newsbrief.eu/NewsBrief/clusteredition/en/latest_en.html) (in English) ⇒ [machine-readable feed](https://emm.newsbrief.eu/rss/rss?type=rtn&language=en&duplicates=false) (RSS/XML)
* [Biggest 10 Stories Over Last 24h](https://emm.newsbrief.eu/NewsBrief/clusteredition/en/24hrs_en.html) ⇒ [machine-readable feed](https://emm.newsbrief.eu/rss/rss?type=24hrs&language=en&duplicates=false) (RSS/XML)

The Really Simple Syndication (RSS) standard and its XML format: https://www.w3schools.com/xml/xml_rss.asp

The Python feedparser library: https://feedparser.readthedocs.io

In [None]:
!pip install feedparser

In [4]:
import feedparser

from urllib.parse import urlparse
from collections import Counter

In [None]:
# Language code that is interpolated into both feed URLs below.
LANG = 'en'

# Europe Media Monitor RSS endpoints: the current top stories and the
# biggest stories of the last 24 hours.
url_current = f'https://emm.newsbrief.eu/rss/rss?type=rtn&language={LANG}&duplicates=false'
url_last24h = f'https://emm.newsbrief.eu/rss/rss?type=24hrs&language={LANG}&duplicates=false'

# Only the "current" feed is downloaded and parsed here.
feed = feedparser.parse(url_current)

# Collect the link of every feed entry, in feed order.
LINKS = [item.link for item in feed.entries]

# Show each link followed by the total number of links.
for link in LINKS:
    print(link)
print(len(LINKS))

In [None]:
# Domain whose articles are kept for the corpus.
# Deliberately not called `filter`: that name would shadow the built-in
# filter() for every cell executed afterwards.
DOMAIN_FILTER = 'telegraph.co.uk'

# Compare against the host part of the URL only, so that the domain appearing
# in another site's path or query string does not count as a match.
FILTERED_LINKS = [link for link in LINKS if DOMAIN_FILTER in urlparse(link).netloc]

# Show each retained link followed by the number of retained links.
for link in FILTERED_LINKS:
    print(link)
print(len(FILTERED_LINKS))

#### Data analysis

In [None]:
# Host part (netloc) of every collected link, in the order of LINKS.
domains = [urlparse(url).netloc for url in LINKS]

# Tally how many links each host contributed.
frequencies = Counter()
frequencies.update(domains)

# Print the tally in first-seen order.
for domain, count in frequencies.items():
    print(f'{domain}: {count}')

In [None]:
# Recompute the per-host tally of the collected links.
domains = [urlparse(url).netloc for url in LINKS]
frequencies = Counter(domains)

# Keep only the hosts that contributed more than one link.
pruned = {host: hits for host, hits in frequencies.items() if hits > 1}

# most_common() lists the remaining hosts from the highest count to the lowest.
ranked = Counter(pruned).most_common()
for domain, count in ranked:
    print(f'{domain}: {count}')

### Via web crawling

## Extracting useful content

In [None]:
!pip install beautifulsoup4

In [10]:
from bs4 import BeautifulSoup

import requests
import json

In [11]:
def extract_plain_text(url, timeout=30):
    """Download a web page and return the text of its paragraphs.

    Args:
        url: Address of the page to download.
        timeout: Number of seconds to wait for the server before giving up.
            Without a timeout, requests.get() can block indefinitely on a
            host that never answers.

    Returns:
        The text of every <p> element found in the page, in document order,
        joined with newline characters.

    Raises:
        requests.exceptions.RequestException: if requests.get() fails, e.g.
            on a connection error or when the timeout expires.
    """
    response = requests.get(url, timeout=timeout)

    # The parser is given the undecoded payload (response.content).
    soup = BeautifulSoup(response.content, 'html.parser')

    paragraphs = soup.find_all('p')
    return '\n'.join(p.get_text() for p in paragraphs)

In [None]:
# Download every filtered article and collect it, with its metadata,
# as one record per article.
dataset = []

for link in FILTERED_LINKS:
    try:
        content = extract_plain_text(link)
    except requests.RequestException as error:
        # A single unreachable article must not abort the whole run and
        # prevent corpus.json from being written: report it and move on.
        print(f'Skipping {link}: {error}')
        continue

    # Preview: the first 140 characters of the text, then a separator line.
    print(content[:140], '\n' + '='*140)

    article = {
        'language': LANG,
        'domain': urlparse(link).netloc,
        'link': link,
        'text': content
    }

    dataset.append(article)

# ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 file.
with open('corpus.json', 'w', encoding='utf-8') as json_file:
    json.dump(dataset, json_file, ensure_ascii=False, indent=4)

## Some challenges

### Messy HTML source code

### PDF documents

# Creating an annotated text corpus

## Syntactic parsing

Documentation:
* Available models per language: https://stanfordnlp.github.io/stanza/models.html
* Supported processors and pipelines: https://stanfordnlp.github.io/stanza/pipeline.html
* Data objects: https://stanfordnlp.github.io/stanza/data_objects.html

In [None]:
!pip install stanza

In [14]:
import stanza

In [None]:
# Download the Stanza resources for English ('en'); see the "Available models
# per language" link above for other language codes.
stanza.download('en')

In [None]:
# Build the English pipeline once; it is applied to each article's text when
# the corpus is loaded. Requested processors: tokenize, mwt, pos, lemma, depparse.
NLP_PIPE = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse')

In [17]:
# Parsed corpus: one record per article, with the article metadata and the
# document object returned by the NLP pipeline.
CORPUS = []

# Read the harvested articles back from disk.
with open('corpus.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

for article in data:
    # Copy the metadata fields unchanged, then attach the parsed text.
    parsed = {key: article[key] for key in ('language', 'domain', 'link')}
    parsed['document'] = NLP_PIPE(article['text'])
    CORPUS.append(parsed)

### CoNLL-U output

Format: https://universaldependencies.org/docs/format.html

In [18]:
# Write the parsed corpus as CoNLL-U: a "# text" comment before each sentence,
# one tab-separated 10-column line per word, and a blank line after the sentence.
with open('corpus.conllu', 'w', encoding='utf-8') as conllu_file:
    for article in CORPUS:

        for s in article['document'].sentences:
            # A comment has to fit on one line. Paragraphs were joined with
            # '\n' when the text was extracted, so the sentence text may still
            # contain line breaks; replace them with spaces.
            sentence_text = ' '.join(s.text.splitlines())
            conllu_file.write(f'# text = {sentence_text}\n')

            for w in s.words:
                columns = (
                    w.id,      # ID
                    w.text,    # FORM
                    w.lemma,   # LEMMA
                    w.upos,    # UPOS
                    None,      # XPOS   (not exported)
                    None,      # FEATS  (not exported)
                    w.head,    # HEAD
                    w.deprel,  # DEPREL
                    None,      # DEPS   (not exported)
                    None,      # MISC   (not exported)
                )
                # CoNLL-U marks an unspecified field with "_"; a missing
                # attribute must not end up in the file as the word "None".
                conllu_file.write(
                    '\t'.join('_' if c is None else str(c) for c in columns) + '\n'
                )

            conllu_file.write("\n")

### VERT output

In [20]:
# Write the parsed corpus in vertical (VERT) form: one word per line inside
# <s> elements, which are grouped into one <doc> element per article.
with open('corpus.vert', 'w', encoding='utf-8') as vert_file:
    for article in CORPUS:
        vert_file.write('<doc>\n')

        for s in article['document'].sentences:
            vert_file.write('<s>\n')

            for w in s.words:
                # Word form, POS tag and lemma, followed by six "_" placeholders.
                columns = [f'{w.text}', f'{w.upos}', f'{w.lemma}'] + ['_'] * 6
                vert_file.write('\t'.join(columns) + '\n')

            vert_file.write('</s>\n')

        vert_file.write('</doc>\n')