In [20]:
import os
import pandas as pd

# Set the path to the root directory containing subfolders like 'business', 'sport', etc.
data_dir = "file_path"

# Collected (text, category) pairs, one per article file
data = []

# os.listdir() returns entries in arbitrary order; sorting both levels keeps
# the row order of the resulting CSV reproducible from run to run.
for category in sorted(os.listdir(data_dir)):
    category_path = os.path.join(data_dir, category)

    # Only sub-directories (e.g., 'business', 'entertainment') count as categories
    if not os.path.isdir(category_path):
        continue

    for file_name in sorted(os.listdir(category_path)):
        file_path = os.path.join(category_path, file_name)

        # Skip anything that is not a regular file (nested folders, etc.)
        if not os.path.isfile(file_path):
            continue

        # errors="ignore" drops bytes that are not valid UTF-8 instead of raising
        with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
            text = file.read().strip()

        # The folder name is used as the label for every file inside it
        data.append((text, category))

# Create a DataFrame and save it as a CSV file
df = pd.DataFrame(data, columns=["text", "label"])
df.to_csv("bbc_raw_dataset.csv", index=False)

print("✅ Dataset saved as 'bbc_raw_dataset.csv'")


In [None]:
import os
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from transformers import pipeline

# File paths
# NOTE(review): "/content" looks like a Google Colab working directory —
# confirm or adjust before running in another environment.
DATA_PATH = "/content/bbc_raw_dataset.csv"  # input CSV loaded by main()
OUTPUT_DIR = "/content"  # directory main() writes its result CSVs into
# Runs at import time; exist_ok=True makes it a no-op when the directory exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)


def load_dataset(path):
    """
    Read a CSV file into a DataFrame.

    Args:
        path: Location of the CSV, handed unchanged to ``pd.read_csv``.

    Returns:
        pd.DataFrame: The parsed contents of the file.
    """
    frame = pd.read_csv(path)
    return frame


def extract_subtopics(df, category, n_clusters=4):
    """
    Perform K-Means clustering on text to extract subtopics.

    The texts labelled ``category`` are TF-IDF vectorized (English stop words
    removed, at most 1000 features) and clustered; each cluster is described
    by the 10 terms with the highest weight in its centroid.

    Args:
        df (pd.DataFrame): The full dataset with "text" and "label" columns.
        category (str): One of the main categories to cluster.
        n_clusters (int): Requested number of topic clusters. Reduced to the
            number of available documents when the category is smaller.

    Returns:
        pd.DataFrame: One row per cluster with columns "category", "cluster"
        and "keywords" (comma-separated). Empty, but with the same columns,
        when the category has no usable texts.
    """
    columns = ["category", "cluster", "keywords"]

    # Rows with missing text (NaN) cannot be vectorized, so leave them out.
    texts = df.loc[df["label"] == category, "text"].dropna()
    if texts.empty:
        # Nothing to cluster: hand back an empty frame with a stable schema
        # instead of letting the vectorizer fail on an empty corpus.
        return pd.DataFrame(columns=columns)

    vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
    X = vectorizer.fit_transform(texts)

    # KMeans cannot fit more clusters than there are samples.
    effective_clusters = min(n_clusters, X.shape[0])
    kmeans = KMeans(n_clusters=effective_clusters, random_state=42)
    kmeans.fit(X)
    feature_names = vectorizer.get_feature_names_out()

    records = []
    for cluster_id, center in enumerate(kmeans.cluster_centers_):
        # argsort is ascending: take the last 10 and reverse for best-first.
        top_indices = center.argsort()[-10:][::-1]
        keywords = ", ".join(feature_names[idx] for idx in top_indices)
        records.append({
            "category": category,
            "cluster": cluster_id,
            "keywords": keywords
        })

    return pd.DataFrame(records, columns=columns)


def summarize_april_sentences(df):
    """
    Extract and summarize sentences mentioning 'April'.

    Each article is split on '.', '?' and '!'. Fragments that contain the
    exact string "April" and are longer than 30 characters after cleanup are
    summarized with the "facebook/bart-large-cnn" summarization pipeline.
    The pipeline is only created when at least one such fragment exists.

    Args:
        df (pd.DataFrame): The dataset with full articles ("text" and
            "label" columns). Rows whose text is not a string are skipped.

    Returns:
        pd.DataFrame: Columns "category", "original" and "summary", one row
        per summarized sentence. Empty, with the same columns, when nothing
        qualifies.
    """
    columns = ["category", "original", "summary"]

    # Pass 1: collect (label, sentence) candidates without touching the model.
    candidates = []
    for _, row in df.iterrows():
        text = row["text"]
        if not isinstance(text, str):
            # Missing text shows up as NaN (a float); re.split would raise on it.
            continue
        for sentence in re.split(r"[.?!]", text):
            if "April" not in sentence:
                continue
            clean = sentence.strip().replace("\n", " ")
            if len(clean) > 30:
                candidates.append((row["label"], clean))

    if not candidates:
        # Avoid creating the summarization pipeline when there is nothing to do.
        return pd.DataFrame(columns=columns)

    # Pass 2: summarize each candidate.
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    april_data = []
    for label, clean in candidates:
        input_len = len(clean.split())
        # Cap the summary at about 80% of the sentence's word count, never below 10.
        max_len = max(10, int(input_len * 0.8))
        summary = summarizer(
            clean,
            max_length=max_len,
            min_length=5,
            do_sample=False
        )[0]["summary_text"]
        april_data.append({
            "category": label,
            "original": clean,
            "summary": summary
        })

    return pd.DataFrame(april_data, columns=columns)


def extract_names(text):
    """
    Find person-like names with a simple regex heuristic.

    A "name" here is two adjacent words separated by a single whitespace
    character, each made of one uppercase ASCII letter followed by one or
    more lowercase ASCII letters.

    Args:
        text (str): News article text.

    Returns:
        list: Matched two-word strings in order of appearance.
    """
    name_pattern = re.compile(r"\b[A-Z][a-z]+\s[A-Z][a-z]+\b")
    return [hit.group(0) for hit in name_pattern.finditer(text)]


# Role -> trigger keywords, checked in this order; the first role that hits wins.
_JOB_KEYWORDS = (
    ("Musician", ("singer", "album", "music", "concert")),
    ("Actor", ("actor", "actress", "film", "movie")),
    ("Politician", ("prime minister", "president", "parliament", "election")),
    ("Athlete", ("match", "player", "coach", "goal", "tournament")),
)

# Each keyword is anchored with \b at the START of a word only: plurals and
# derived forms ("films", "elections", "parliamentary") still match, while a
# keyword buried inside another word ("factor", "contractor") does not.
_JOB_PATTERNS = tuple(
    (role, re.compile(r"\b(?:" + "|".join(re.escape(word) for word in words) + r")"))
    for role, words in _JOB_KEYWORDS
)


def infer_job(text):
    """
    Infer the job of a named entity based on text context.

    The text is lower-cased and scanned for role keywords. Roles are tried in
    the order Musician, Actor, Politician, Athlete, and the first role with a
    keyword found at the beginning of a word is returned.

    Args:
        text (str): Full article text.

    Returns:
        str: "Musician", "Actor", "Politician", "Athlete", or "Unknown" when
        no keyword is found.
    """
    lowered = text.lower()
    for role, pattern in _JOB_PATTERNS:
        if pattern.search(lowered):
            return role
    return "Unknown"


def extract_media_entities(df):
    """
    Extract and classify media personalities from text.

    For every article, candidate names come from ``extract_names`` and a
    single role for the whole article comes from ``infer_job``; that role is
    attached to each name found in the article. Only the first occurrence of
    each (name, role) pair is kept.

    Args:
        df (pd.DataFrame): Dataset with news articles ("text" and "label"
            columns). Rows whose text is not a string are skipped.

    Returns:
        pd.DataFrame: Columns "name", "role", "category" and "context" (the
        first 200 characters of the article). Empty, with the same columns,
        when no names are found.
    """
    columns = ["name", "role", "category", "context"]
    media_records = []

    for _, row in df.iterrows():
        text = row["text"]
        if not isinstance(text, str):
            # Missing text shows up as NaN (a float); the regex helpers need str.
            continue

        names = extract_names(text)
        if not names:
            continue

        # Role and context depend only on the article, so compute them once
        # per row rather than once per name.
        role = infer_job(text)
        context = text[:200]
        media_records.extend(
            {
                "name": name,
                "role": role,
                "category": row["label"],
                "context": context
            }
            for name in names
        )

    # Passing columns explicitly keeps the schema stable even with no records.
    result = pd.DataFrame(media_records, columns=columns)
    return result.drop_duplicates(subset=["name", "role"])


def main():
    """
    Run the full pipeline end-to-end.

    Loads the dataset from ``DATA_PATH`` and writes three CSV files into
    ``OUTPUT_DIR``: ``subtopics.csv``, ``april_summaries.csv`` and
    ``media_entities.csv``.
    """
    df = load_dataset(DATA_PATH)

    # Step 1: Subtopics for a fixed set of categories, stacked into one table
    subtopics = pd.concat([
        extract_subtopics(df, category)
        for category in ("business", "entertainment", "sport")
    ])
    subtopics.to_csv(os.path.join(OUTPUT_DIR, "subtopics.csv"), index=False)

    # Step 2: April summarization
    april_df = summarize_april_sentences(df)
    april_df.to_csv(os.path.join(OUTPUT_DIR, "april_summaries.csv"), index=False)

    # Step 3: Media personalities
    media_df = extract_media_entities(df)
    media_df.to_csv(os.path.join(OUTPUT_DIR, "media_entities.csv"), index=False)

    # Report the directory actually written to, not a hard-coded name.
    print(f"✅ All tasks completed. Outputs saved in '{OUTPUT_DIR}'.")


if __name__ == "__main__":
    main()

