# BBC News Preparation

This notebook prepares the political news articles from the BBC news dataset, which can be downloaded from: http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip.

It processes the separate text files to extract the title and text of each article. Then it performs Named Entity Recognition (NER) to identify the people mentioned in each article.

In [None]:
import urllib
import urllib.request
import zipfile
from io import BytesIO, TextIOWrapper

import pandas as pd
import spacy

## Download and preprocess

In [None]:
# Download the zipped corpus and buffer it fully in memory: ZipFile needs a
# seekable file object, which the streaming HTTP response is not.
# The `with` block closes the HTTP connection as soon as the bytes are read.
with urllib.request.urlopen("http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip") as response:
    data_zipped = BytesIO(response.read())

In [None]:
# Build one record per politics article: the first line of each text file is
# taken as the title, everything after it as the article body.
news = []

with zipfile.ZipFile(data_zipped, "r") as archive:
    for file_name in archive.namelist():
        # Skip every archive entry that is not a politics article text file
        if not (file_name.startswith("bbc/politics/") and file_name.endswith(".txt")):
            continue

        # Keep the binary handle and its text wrapper under separate names and
        # close both deterministically when the file has been read.
        with archive.open(file_name) as raw_file, TextIOWrapper(raw_file, encoding="utf-8") as text_file:
            # Default to "" so an empty file yields an empty record instead of
            # raising StopIteration.
            title = next(text_file, "").strip()
            article = text_file.read().strip()

        news.append({"filename": file_name, "title": title, "article": article})

# Explicit columns keep the schema stable even when no file matched the filter
news_df = pd.DataFrame(news, columns=["filename", "title", "article"])

In [None]:
# Preview the first rows only instead of rendering the whole frame
news_df.head()

## Named Entity Recognition

In [None]:
# Load the spaCy pipeline "en_core_web_sm"; its entity annotations are used for the NER step below.
# The model must be installed before this cell runs.
# Use command: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

In [None]:
def _person_names(text):
    """Return the set of stripped entity texts that spaCy labels PERSON in ``text``."""
    names = set()
    for entity in nlp(text).ents:
        if entity.label_ == "PERSON":
            names.add(entity.text.strip())
    return names


# Add a "person" column holding, per article, the set of all persons mentioned
news_df = news_df.assign(person=news_df["article"].map(_person_names))

In [None]:
# Spot-check a few random rows. The fixed seed shows the same rows on every run,
# and the sample size is capped so frames with fewer than five rows do not raise.
news_df.sample(n=min(5, len(news_df)), random_state=42)

In [None]:
# Write one CSV row per (article, person) pair. Sets have no defined iteration
# order, so each set is sorted into a list before exploding; otherwise the row
# order of the CSV could differ between runs. Articles without any person keep
# a single row with an empty person field.
(
    news_df
    .assign(person=lambda df: df["person"].map(sorted))
    .explode("person")
    .to_csv("bbc_news_political.csv", index=False)
)