In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from dotenv import load_dotenv
import os
from database_utils import split_texts_for_db, calculate_embedding, add_articles_to_qdrant, assign_keywords
from text_preparation_utils import sanitize_text, drop_similar_rows

# Credentials are never hard-coded: load_dotenv() (python-dotenv) is expected to
# expose the entries of a local .env file as environment variables first.
load_dotenv()

# Each value is None when the corresponding variable is not set.
QDRANT_KEY = os.environ.get('QDRANT_KEY')
QDRANT_CLUSTER_URL = os.environ.get('QDRANT_CLUSTER_URL')
HUGGING_FACE_TOKEN = os.environ.get('HUGGING_FACE_TOKEN')

# Load base BBC dataset extended with additional features

In [7]:
# Load the extended BBC dataset and discard the columns that no later cell
# reads ('Unnamed: 0' is presumably an index saved with the CSV).
df_base = pd.read_csv('bbc_news_base.csv').drop(
    columns=[
        'Unnamed: 0',
        'category_encoded',
        'no_sentences',
        'Flesch Reading Ease Score',
        'Dale-Chall Readability Score',
    ]
)

In [8]:
# Preview the first five rows of the remaining columns.
df_base.head(n=5)

Unnamed: 0,text,labels,keywords,summary
0,Ad sales boost Time Warner profit Quarterly p...,business,"['Time Warner', 'Quarterly profits', 'AOL', 'm...",Its profits were buoyed by one-off gains which...
1,Dollar gains on Greenspan speech The dollar h...,business,"['Federal Reserve', 'Greenspan speech', 'highe...",The dollar has hit its highest level against t...
2,Yukos unit buyer faces loan claim The owners ...,business,"['embattled Russian', 'Russian oil', 'unit buy...",The owners of embattled Russian oil giant Yuko...
3,High fuel prices hit BA's profits British Air...,business,"['British Airways', 'High fuel', 'blamed high'...",Looking ahead to its full year results to Marc...
4,Pernod takeover talk lifts Domecq Shares in U...,business,"['Allied Domecq', 'Domecq Shares', 'Pernod Ric...",Reports in the Wall Street Journal and the Fin...


In [9]:
# Distinct category labels present in the base dataset.
df_base['labels'].unique()

array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype=object)

In [12]:
# Run split_texts_for_db on every article, then calculate_embedding on each
# result (the preview shows a list of text pieces and a nested list of floats
# per row). The helpers are passed directly; no wrapping lambda is needed.
df_base['splitted_text'] = df_base['text'].apply(split_texts_for_db)
df_base['embeddings'] = df_base['splitted_text'].apply(calculate_embedding)
df_base.head()

Unnamed: 0,text,labels,keywords,summary,splitted_text,embeddings
0,Ad sales boost Time Warner profit Quarterly p...,business,"['Time Warner', 'Quarterly profits', 'AOL', 'm...",Its profits were buoyed by one-off gains which...,[Ad sales boost Time Warner profit Quarterly ...,"[[-0.06185626, -0.054555155, 0.0021503558, -0...."
1,Dollar gains on Greenspan speech The dollar h...,business,"['Federal Reserve', 'Greenspan speech', 'highe...",The dollar has hit its highest level against t...,[Dollar gains on Greenspan speech The dollar ...,"[[0.006948544, -0.05492217, -0.020653008, -0.0..."
2,Yukos unit buyer faces loan claim The owners ...,business,"['embattled Russian', 'Russian oil', 'unit buy...",The owners of embattled Russian oil giant Yuko...,[Yukos unit buyer faces loan claim The owners...,"[[-0.07373838, 0.0147340745, -0.018761022, -0...."
3,High fuel prices hit BA's profits British Air...,business,"['British Airways', 'High fuel', 'blamed high'...",Looking ahead to its full year results to Marc...,[High fuel prices hit BA's profits British Ai...,"[[0.038942885, -0.024428101, -0.015652342, 0.0..."
4,Pernod takeover talk lifts Domecq Shares in U...,business,"['Allied Domecq', 'Domecq Shares', 'Pernod Ric...",Reports in the Wall Street Journal and the Fin...,[Pernod takeover talk lifts Domecq Shares in ...,"[[0.023181796, -0.04872723, 0.00045929168, -0...."


In [47]:
# Record the dataset's origin on every row, then hand the frame to
# add_articles_to_qdrant together with the Qdrant credentials.
df_base = df_base.assign(
    data_source='https://www.kaggle.com/datasets/jacopoferretti/bbc-articles-dataset'
)
add_articles_to_qdrant(df_base, QDRANT_KEY, QDRANT_CLUSTER_URL)

# Read 2,225 articles published on the BBC News website during 2004-2005

In [2]:
# Load the 2004-2005 articles and pass every text through sanitize_text.
df_old = pd.read_csv('bbc-text.csv').assign(
    text=lambda frame: frame['text'].apply(sanitize_text)
)

# Report the row count on either side of the de-duplication step.
# NOTE(review): 98 is presumably a similarity threshold in percent - confirm
# against drop_similar_rows.
print(f"Before deletion of similar texts {len(df_old)}")
df_old = drop_similar_rows(df_old, 'text', 98)
print(f"After the deletion of similar texts {len(df_old)}")

Before deletion of similar texts 2225
After the deletion of similar texts 1915


In [3]:
# Keep only texts with at least 10 whitespace-separated words. The helper
# count column is removed again, so the frame's columns end up unchanged.
df_old = (
    df_old
    .assign(word_count=df_old['text'].apply(lambda text: len(text.split())))
    .loc[lambda frame: frame['word_count'] >= 10]
    .drop(columns=['word_count'])
)

In [4]:
# Row count once the minimum-length filter has been applied.
print(
    f"After the deletion of too short text {len(df_old)}"
)

After the deletion of too short text 1915


In [5]:
# Derive keywords for every article with assign_keywords. This dataset has no
# summaries, so that column is filled with empty strings.
df_old = df_old.assign(
    keywords=df_old['text'].apply(assign_keywords),
    summary="",
)

In [6]:
# Same two steps as for the base dataset: split_texts_for_db on each text,
# then calculate_embedding on each split result.
df_old['splitted_text'] = df_old['text'].apply(split_texts_for_db)
df_old['embeddings'] = df_old['splitted_text'].apply(calculate_embedding)
df_old.head()

Unnamed: 0,category,text,keywords,summary,splitted_text,embeddings
0,tech,tv future in the hands of viewers with home th...,"[home theatre, living room, las vegas, recorde...",,[tv future in the hands of viewers with home t...,"[[-0.0015538835, -0.06727467, 0.011174222, -0...."
1,business,worldcom boss left books alone former worldc...,"[told jurors, ebbers, accounting, boss left, m...",,[worldcom boss left books alone former world...,"[[-0.08354733, 0.059484005, -0.0131955845, -0...."
2,sport,tigers wary of farrell gamble leicester say ...,"[switch codes, great britain, tigers wary, and...",,[tigers wary of farrell gamble leicester say...,"[[-0.055961896, -0.008481116, -0.025843337, -0..."
3,sport,yeading face newcastle in fa cup premiership s...,"[ryman premier, side, yeading face, non-league...",,[yeading face newcastle in fa cup premiership ...,"[[0.015004399, -0.12569566, -0.028033992, -0.0..."
5,politics,howard hits back at mongrel jibe michael howar...,"[howard, party, labour, michael, tory leader]",,[howard hits back at mongrel jibe michael howa...,"[[0.0056457724, -0.025242308, 0.0383256, 0.028..."


In [8]:
# Record the dataset's origin and rename 'category' to 'labels' (the column
# name used by the base dataset) before passing the frame to
# add_articles_to_qdrant.
df_old = (
    df_old
    .assign(data_source='https://www.kaggle.com/datasets/bhavikjikadara/bbc-news-articles')
    .rename(columns={'category': 'labels'})
)
add_articles_to_qdrant(df_old, QDRANT_KEY, QDRANT_CLUSTER_URL)

# Latest BBC News articles from a Hugging Face dataset

In [9]:
from huggingface_hub import login

# Authenticate with the token read from the environment before touching the hub.
login(token=HUGGING_FACE_TOKEN)

# Read the dataset's single parquet shard through the hf:// filesystem path.
df_hf = pd.read_parquet(
    "hf://datasets/RealTimeData/bbc_latest/data/train-00000-of-00001.parquet"
)
df_hf.head(n=5)

Unnamed: 0,title,published_date,authors,description,section,content,link
0,Newscast - Keir Diary… A Week in the Life of t...,2024-09-02,,Recapping a busy week for Sir Keir Starmer.,,Keir Diary… A Week in the Life of the PM Keir ...,http://www.bbc.co.uk/sounds/play/m0022js5
1,Americast - Profile: Who is Donald Trump? (Par...,2024-09-02,,How the man became The Donald,,Americanswers! Do Harris/Trump actually need A...,http://www.bbc.co.uk/sounds/play/p0jmwg24
2,Titanic: Scan reveals world's most famous wrec...,2024-09-02,,It's the first full-sized digital scan of the ...,,The first full-sized digital scan of the Titan...,http://www.bbc.co.uk/news/science-environment-...
3,Holocaust: Nicholas Winton meets children he s...,2024-09-03,,It was 1988 when Sir Nicholas was surprised by...,,This is the moment Nicholas Winton came face-t...,http://www.bbc.co.uk/news/uk-wales-66467077
4,Titanic: Scan reveals world's most famous wrec...,2024-09-03,,It's the first full-sized digital scan of the ...,,The first full-sized digital scan of the Titan...,http://www.bbc.co.uk/news/science-environment-...


In [38]:
# Show only the rows whose article body is longer than 500 characters.
df_hf.loc[df_hf['content'].str.len() > 500]

Unnamed: 0,title,published_date,authors,description,section,content,link
3,Holocaust: Nicholas Winton meets children he s...,2024-09-03,,It was 1988 when Sir Nicholas was surprised by...,,This is the moment Nicholas Winton came face-t...,http://www.bbc.co.uk/news/uk-wales-66467077
6,Holocaust: Nicholas Winton meets children he s...,2024-09-04,,It was 1988 when Sir Nicholas was surprised by...,,This is the moment Nicholas Winton came face-t...,http://www.bbc.co.uk/news/uk-wales-66467077
8,Watch: Maori MP performs haka before swearing ...,2024-09-05,,"Rawiri Waititim, an advocate for Maori rights,...",,New Zealand politician Rawiri Waititi broke pr...,http://www.bbc.co.uk/news/world-asia-67622272
9,In pictures: The 2024 Paralympic Games - BBC News,2024-09-06,,Amazing images from the Paris Paralympics,,The 2024 Summer Paralympic Games are drawing t...,http://www.bbc.co.uk/news/resources/idt-6c21f8...
14,In pictures: The 2024 Paralympic Games - BBC News,2024-09-08,,Amazing images from the Paris Paralympics,,The 2024 Summer Paralympic Games are drawing t...,http://www.bbc.co.uk/news/resources/idt-6c21f8...


In [41]:
# Full article body of the row at position 6.
df_hf['content'].iloc[6]

"This is the moment Nicholas Winton came face-to-face with some of the 669 Jewish children he saved from the Nazis.\n\nIt was 1988 and for millions watching That's Life! on TV at home it was an unforgettable moment.\n\nNow Sir Nicholas's story is to be brought to life on the big screen with Port Talbot's Sir Anthony Hopkins in the starring role.\n\nOne Life will tell how Sir Nicholas, a London stockbroker who died in Berkshire in 2015, helped get young Jewish refugees out of occupied Czechoslovakia in 1938."

In [10]:
import datasets

# Load the same hub dataset through the datasets library, pinned to the
# '2023-08-20' revision.
ds = datasets.load_dataset(
    path='RealTimeData/bbc_latest',
    revision='2023-08-20',
)

# URL-only dataset (article links, no body text) covering 07 March 2022 to 03 July 2024 — described as 35,860 rows, although the frame inspected below reports 42,115

In [4]:
# Load the URL-only dataset. The inspection, sampling and scraping cells that
# follow all refer to this frame as `df_ver_1`, a name no cell defined, so they
# raised NameError on a fresh kernel. Bind it here, and keep `df_urls` so any
# code that uses the original name still works.
df_urls = pd.read_csv('bbc_news.csv')
df_ver_1 = df_urls
df_urls.head()

Unnamed: 0,title,pubDate,guid,link,description
0,Ukraine: Angry Zelensky vows to punish Russian...,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-606380...,The Ukrainian president says the country will ...
1,War in Ukraine: Taking cover in a town under a...,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-606418...,"Jeremy Bowen was on the frontline in Irpin, as..."
2,Ukraine war 'catastrophic for global food',"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941,https://www.bbc.co.uk/news/business-60623941?a...,One of the world's biggest fertiliser firms sa...
3,Manchester Arena bombing: Saffie Roussos's par...,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079,https://www.bbc.co.uk/news/uk-60579079?at_medi...,The parents of the Manchester Arena bombing's ...
4,Ukraine conflict: Oil price soars to highest l...,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786,https://www.bbc.co.uk/news/business-60642786?a...,Consumers are feeling the impact of higher ene...


In [5]:
# Full (untruncated) link of the row at position 2.
print(df_ver_1.iloc[2]['link'])

https://www.bbc.co.uk/news/business-60623941?at_medium=RSS&at_campaign=KARANGA


In [6]:
# Full (untruncated) description of the row at position 2.
print(df_ver_1.iloc[2]['description'])

One of the world's biggest fertiliser firms says the conflict could deliver a shock to food supplies.


In [7]:
# Number of rows in the URL dataset.
print(df_ver_1.shape[0])

42115


In [8]:
# Keep a 5% random sample of the URLs; the fixed random_state makes the draw
# repeatable. Re-running this cell samples the already reduced frame again.
df_ver_1 = df_ver_1.sample(frac=0.05, random_state=42)
print(df_ver_1.shape[0])

2106


In [10]:
def scrape_bbc_article(url, timeout=10):
    """Download a BBC article page and extract its headline and body text.

    Parameters
    ----------
    url : str
        Address of the article page.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so that an unresponsive server cannot block
        the scraping loop indefinitely. Defaults to 10 seconds.

    Returns
    -------
    tuple
        ``(headline, article_text)``. ``headline`` is the text of the first
        ``<h1>`` element, or ``'No headline found'`` when the page has none.
        ``article_text`` is the text of the ``<p>`` elements inside the first
        ``<article>`` element joined with newlines, skipping paragraphs that
        start with one of the ignored prefixes; it is an empty string when
        the page has no ``<article>`` element.

        If anything goes wrong (network error, HTTP error status, parsing
        problem) the error is printed and ``(None, None)`` is returned, so the
        result can always be unpacked into two names. The function itself
        never raises.
    """
    # Paragraphs starting with one of these labels are left out of the text
    # (presumably teasers linking to other coverage - confirm with the author).
    ignore_prefixes = ("LIVE:", "IN KYIV:", "ANALYSIS:", "EXPLAINED:", "IN DEPTH:")

    try:
        # Send a GET request to the URL; raise for 4xx/5xx responses
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the headline (look the tag up once instead of twice)
        headline_tag = soup.find('h1')
        if headline_tag is not None:
            headline = headline_tag.get_text(strip=True)
        else:
            headline = 'No headline found'

        # Extract the paragraphs of the main article container, if there is one
        article_body = soup.find('article')
        paragraphs = article_body.find_all('p') if article_body else []

        # str.startswith accepts a tuple, so one call checks every prefix
        paragraph_texts = (p.get_text(strip=True) for p in paragraphs)
        article_text = "\n".join(
            text for text in paragraph_texts if not text.startswith(ignore_prefixes)
        )

        return headline, article_text

    except Exception as e:
        # Best-effort scraper: report the problem but keep the two-element
        # return shape so `headline, text = scrape_bbc_article(url)` still works.
        print(f"An error occurred: {e}")
        return None, None

In [11]:
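# Fetch the article text for every sampled URL, pausing 3 s between requests (loop left commented out; the last run was interrupted)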
#for i, row in df_ver_1.iterrows():
#    url = row['link']
#    try:
#        headline, article_text = scrape_bbc_article(url)
#    except Exception as e:
#        print(e)
#    df_ver_1.loc[i, "text"]= article_text
#    time.sleep(3)

KeyboardInterrupt: 

# Sources

1. https://www.kaggle.com/datasets/jacopoferretti/bbc-articles-dataset
2. https://www.kaggle.com/datasets/bhavikjikadara/bbc-news-articles
3. https://huggingface.co/datasets/SetFit/bbc-news