In [3]:
import os
from dotenv import load_dotenv
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from huggingface_hub import login


from database_utils import (
    split_texts_for_db,
    calculate_embedding,
    add_articles_to_qdrant,
    add_article_to_qdrant,
    assign_keywords,
    classify_text,
)
from text_preparation_utils import (
    sanitize_text,
    drop_similar_rows,
)
import warnings

# Silence all Python warnings for the rest of the session
warnings.filterwarnings(action="ignore")

# Load the .env file, then read the service credentials from the environment;
# each name is bound to None when its variable is not set
load_dotenv()
QDRANT_KEY, QDRANT_CLUSTER_URL, HUGGING_FACE_TOKEN = (
    os.environ.get(variable_name)
    for variable_name in ('QDRANT_KEY', 'QDRANT_CLUSTER_URL', 'HUGGING_FACE_TOKEN')
)

# Load base BBC dataset extended with additional features

In [5]:
# Load the preprocessed 'bbc_news_base.csv' and, in the same chain, discard the
# columns this notebook does not use
df_base = pd.read_csv('bbc_news_base.csv').drop(
    columns=[
        'Unnamed: 0',
        'category_encoded',
        'no_sentences',
        'Flesch Reading Ease Score',
        'Dale-Chall Readability Score',
    ]
)

In [6]:
# Preview the first rows to check which columns remain after the drop
df_base.head()

Unnamed: 0,text,labels,keywords,summary
0,Ad sales boost Time Warner profit Quarterly p...,business,"['Time Warner', 'Quarterly profits', 'AOL', 'm...",Its profits were buoyed by one-off gains which...
1,Dollar gains on Greenspan speech The dollar h...,business,"['Federal Reserve', 'Greenspan speech', 'highe...",The dollar has hit its highest level against t...
2,Yukos unit buyer faces loan claim The owners ...,business,"['embattled Russian', 'Russian oil', 'unit buy...",The owners of embattled Russian oil giant Yuko...
3,High fuel prices hit BA's profits British Air...,business,"['British Airways', 'High fuel', 'blamed high'...",Looking ahead to its full year results to Marc...
4,Pernod takeover talk lifts Domecq Shares in U...,business,"['Allied Domecq', 'Domecq Shares', 'Pernod Ric...",Reports in the Wall Street Journal and the Fin...


In [7]:
# List the distinct values found in the 'labels' column
df_base.labels.unique()

array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype=object)

In [None]:
# Run 'split_texts_for_db' on every entry of 'text'; results go to 'splitted_text'
df_base['splitted_text'] = df_base['text'].apply(split_texts_for_db)

# Run 'calculate_embedding' on every entry of 'splitted_text'; results go to 'embeddings'
df_base['embeddings'] = df_base['splitted_text'].apply(calculate_embedding)

# Show the first rows so the two new columns can be inspected
df_base.head()

In [None]:
# Tag every row with the URL of the BBC articles dataset it came from
df_base = df_base.assign(
    data_source='https://www.kaggle.com/datasets/jacopoferretti/bbc-articles-dataset'
)

# Send the rows of df_base to Qdrant with the configured key and cluster URL
add_articles_to_qdrant(df_base, QDRANT_KEY, QDRANT_CLUSTER_URL, "bbc_news_articles")

# Read 2,225 articles published on the BBC News website during 2004-2005

In [None]:
# Read 'bbc-text.csv' and replace its 'text' column with the output of
# 'sanitize_text' applied to each entry
df_old = pd.read_csv('bbc-text.csv').assign(
    text=lambda frame: frame['text'].apply(sanitize_text)
)

# Report the row count before and after 'drop_similar_rows' is run on the
# 'text' column with a threshold of 98
print(f"Before deletion of similar texts {len(df_old)}")
df_old = drop_similar_rows(df_old, 'text', 98)
print(f"After the deletion of similar texts {len(df_old)}")

In [None]:
# 'keywords': output of assign_keywords for each article text
df_old['keywords'] = df_old['text'].apply(assign_keywords)

# 'splitted_text': output of split_texts_for_db for each article text
df_old['splitted_text'] = df_old['text'].apply(split_texts_for_db)

# 'embeddings': output of calculate_embedding for each entry of 'splitted_text'
df_old['embeddings'] = df_old['splitted_text'].apply(calculate_embedding)

# Preview the enriched frame
df_old.head()

In [None]:
# Record the page this dataset was obtained from in the 'data_source' column
df_old['data_source'] = 'http://mlg.ucd.ie/datasets/bbc.html'

# Rename the 'category' column to 'labels' so it uses the same column name as
# df_base; reassigning (instead of inplace=True) keeps the step explicit
df_old = df_old.rename(columns={'category': 'labels'})

# Add the articles to Qdrant. The original call referenced the undefined name
# QDRANT_CLUSTER_UR, which raises NameError; the variable is QDRANT_CLUSTER_URL.
add_articles_to_qdrant(df_old, QDRANT_KEY, QDRANT_CLUSTER_URL, "bbc_news_articles")

# Latest BBC News articles from a dataset hosted on Hugging Face

In [None]:
# Log in to Hugging Face with the token so the dataset can be fetched
login(HUGGING_FACE_TOKEN)

# Load the dataset from a Parquet file (addressed with an hf:// path) into a DataFrame
df_hf = pd.read_parquet("hf://datasets/RealTimeData/bbc_latest/data/train-00000-of-00001.parquet")
# Preview the first rows
df_hf.head()

In [None]:
from typing import Any, Dict, List

def process_articles(contents: List[str], titles: List[str], sources: List[str]) -> None:
    """
    Sanitizes, annotates, embeds, classifies and uploads articles one by one.

    For each (content, title, source) triple the title and content are joined
    with a single space and sanitized. Keywords, split text, embeddings and the
    label are all derived from that sanitized text, so every stored field
    describes the same text that is stored under "text".

    The three lists are iterated with zip, so processing stops at the end of
    the shortest list.

    Args:
        contents (List[str]): List of article contents.
        titles (List[str]): List of article titles.
        sources (List[str]): List of article sources (links).

    Returns:
        None. Each article is handed to add_article_to_qdrant as a dict with the
        keys "text", "keywords", "splitted_text", "embeddings", "labels" and
        "data_source".
    """
    for content, title, source in zip(contents, titles, sources):
        # Join with a space so the last word of the title is not fused with
        # the first word of the content.
        sanitized_text = sanitize_text(title + " " + content)

        # Derive every feature from the sanitized text, as the other ingestion
        # cells of this notebook do.
        keywords = assign_keywords(sanitized_text)
        split_text = split_texts_for_db(sanitized_text)
        embeddings = calculate_embedding(split_text)
        label = classify_text(sanitized_text)

        row: Dict[str, Any] = {
            "text": sanitized_text,
            "keywords": keywords,
            "splitted_text": split_text,
            "embeddings": embeddings,
            "labels": label,
            "data_source": source,
        }

        add_article_to_qdrant(row, QDRANT_KEY, QDRANT_CLUSTER_URL, "bbc_news_articles")


# Pull the 'content', 'title' and 'link' columns out of df_hf as plain lists
contents, titles, sources = (
    df_hf[column].tolist() for column in ('content', 'title', 'link')
)

# Run the processing pipeline on the extracted contents, titles and sources
process_articles(contents, titles, sources)

In [None]:
import datasets

# Load the '2023-08-20' revision of the dataset and convert its 'train' split
# to a DataFrame
ds = datasets.load_dataset('RealTimeData/bbc_latest', revision='2023-08-20')
df_hf = ds['train'].to_pandas()

# Pull the 'content', 'title' and 'link' columns out of df_hf as plain lists
contents, titles, sources = (
    df_hf[column].tolist() for column in ('content', 'title', 'link')
)

# Run the processing pipeline on the extracted contents, titles and sources
process_articles(contents, titles, sources)

# Dataset containing only URLs: 35,860 rows from 07 March 2022 to 03 July 2024

In [None]:
# Load the URL dataset from 'bbc_news.csv' and preview its first rows
df_urls = pd.read_csv('bbc_news.csv')
df_urls.head()

In [None]:
# Print the full link of the third row. The frame loaded from 'bbc_news.csv'
# is named df_urls; df_ver_1 is never defined in this notebook.
print(df_urls.iloc[2].link)

In [None]:
# Print the full description of the third row. The frame loaded from
# 'bbc_news.csv' is named df_urls; df_ver_1 is never defined in this notebook.
print(df_urls.iloc[2].description)

In [None]:
def scrape_bbc_article(url, timeout=10):
    """
    Download an article page and extract its headline and body text.

    Args:
        url (str): Address of the article page.
        timeout (float | tuple): Passed to requests.get as its ``timeout``
            argument so a stalled connection cannot block the notebook
            indefinitely. Defaults to 10.

    Returns:
        tuple[str, str]: ``(headline, article_text)`` on success. ``headline``
            is the stripped text of the first <h1>, or 'No headline found' when
            the page has none. ``article_text`` is the newline-joined text of
            the <p> elements inside the first <article>, without paragraphs
            that start with one of the ignored prefixes; it is an empty string
            when the page has no <article>.
        str: ``"An error occurred: <exception>"`` when any step raises
            (including HTTP errors and timeouts). NOTE: this is a single
            string, not a tuple, so callers that unpack the result must check
            the type first.
    """
    # Paragraphs that start with any of these prefixes are left out of the text
    ignore_prefixes = ("LIVE:", "IN KYIV:", "ANALYSIS:", "EXPLAINED:", "IN DEPTH:")

    try:
        # Send a GET request to the URL; raise for HTTP error status codes
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the headline (look the tag up once and reuse it)
        headline_tag = soup.find('h1')
        headline = headline_tag.get_text(strip=True) if headline_tag is not None else 'No headline found'

        # Extract the paragraphs of the main article container, if there is one
        article_body = soup.find('article')
        paragraphs = article_body.find_all('p') if article_body is not None else []

        # Keep only paragraphs that do not start with an ignored prefix
        # (str.startswith accepts a tuple of prefixes)
        paragraph_texts = (p.get_text(strip=True) for p in paragraphs)
        article_text = "\n".join(
            text for text in paragraph_texts if not text.startswith(ignore_prefixes)
        )

        return headline, article_text

    except Exception as e:
        # Best-effort scraping: report the failure as a message instead of raising
        return f"An error occurred: {e}"

In [None]:
# get texts of articles
#for i, row in df_ver_1.iterrows():
#    url = row['link']
#    try:
#        headline, article_text = scrape_bbc_article(url)
#    except Exception as e:
#        print(e)
#    df_ver_1.loc[i, "text"]= article_text
#    time.sleep(3)

# Data Sources

1. https://www.kaggle.com/datasets/jacopoferretti/bbc-articles-dataset
2. https://www.kaggle.com/datasets/bhavikjikadara/bbc-news-articles
3. https://huggingface.co/datasets/SetFit/bbc-news
4. http://mlg.ucd.ie/datasets/bbc.html