In [None]:
# Mount Google Drive at /content/drive (Colab-only) so files stored under
# /content/drive/MyDrive can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import re
import unicodedata

In [None]:
# Read the NewsSumm CSV stored on the mounted Drive into a DataFrame.
file_to_check = "/content/drive/MyDrive/NewsSumm_clean.csv"
df = pd.read_csv(filepath_or_buffer=file_to_check)

In [None]:
# Report the (rows, columns) shape of the frame right after loading.
print("Initial dataset shape:", df.shape)


Initial dataset shape: (347815, 11)


In [None]:
# Strip leading/trailing whitespace from the column names so later lookups by
# exact name succeed, then show the resulting header.
df.columns = df.columns.str.strip()
print(df.columns)

Index(['newspaper_name', 'published_date', 'headline', 'article_text',
       'human_summary', 'news_category', 'article_clean', 'summary_clean',
       'article_tokens', 'summary_tokens', 'compression_ratio'],
      dtype='object')


In [None]:
# Column names the input CSV has to provide; the dataset is validated against
# this list before any processing happens.
EXPECTED_COLUMNS = [
    # source metadata and raw text
    "newspaper_name", "published_date", "headline",
    "article_text", "human_summary", "news_category",
    # cleaned text
    "article_clean", "summary_clean",
    # length statistics
    "article_tokens", "summary_tokens", "compression_ratio",
]

In [None]:
# Fail fast when the loaded frame lacks any column the pipeline relies on.
missing_cols = set(EXPECTED_COLUMNS).difference(df.columns)
if len(missing_cols) > 0:
    raise ValueError(f"Missing columns in dataset: {missing_cols}")

In [None]:
# Keep only the first occurrence of each (article_text, human_summary) pair.
df = df.drop_duplicates(
    subset=["article_text", "human_summary"],
    keep="first",
)
print("After duplicate removal:", df.shape)

After duplicate removal: (327869, 11)


In [None]:
# A row is unusable when any of these fields is missing, so such rows are dropped.
MANDATORY_COLS = ["headline", "article_text", "human_summary"]
df = df.dropna(axis="index", how="any", subset=MANDATORY_COLS)

In [None]:
# Optional metadata: replace missing values with the "Unknown" placeholder
# instead of dropping the row.
df = df.fillna({"news_category": "Unknown", "newspaper_name": "Unknown"})

In [None]:

# Report the shape after dropping rows with missing mandatory fields and
# filling the optional metadata columns.
print("After missing value handling:", df.shape)

After missing value handling: (327869, 11)


In [None]:
# Parse publication dates into datetimes; errors="coerce" turns values that
# cannot be parsed into NaT instead of raising.
df["published_date"] = df["published_date"].pipe(pd.to_datetime, errors="coerce")

In [None]:

def clean_text(text):
    """Return a normalized, markup-free version of ``text``.

    Steps, in order: NFKC Unicode normalization, HTML tag removal, URL
    removal, dropping of code points that cannot be encoded as UTF-8,
    replacement of symbols other than word characters, whitespace and basic
    punctuation with spaces, and whitespace collapsing.

    Args:
        text: The raw text. Any non-string value (None, NaN, numbers, ...)
            is treated as missing.

    Returns:
        The cleaned string with leading/trailing whitespace removed, or an
        empty string when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Unicode normalization (folds compatibility forms such as ligatures)
    text = unicodedata.normalize("NFKC", text)

    # Remove HTML tags. The negated class also matches newlines, so tags that
    # wrap across lines are removed; requiring a tag-like character right
    # after "<" keeps plain comparisons such as "a < b and c > d" intact.
    text = re.sub(r"<[A-Za-z/!?][^>]*>", " ", text)

    # Remove URLs. Require "://" after http(s) or "." after www so ordinary
    # words that merely contain those letters (e.g. "httpd") are left alone.
    text = re.sub(r"https?://\S+|www\.\S+", " ", text, flags=re.IGNORECASE)

    # Drop code points that cannot be encoded as UTF-8 (e.g. lone surrogates)
    text = text.encode("utf-8", "ignore").decode("utf-8")

    # Remove unwanted symbols (keep punctuation)
    text = re.sub(r"[^\w\s.,!?;:()\"'-]", " ", text)

    # Normalize whitespace
    text = re.sub(r"\s+", " ", text)

    return text.strip()


In [None]:
# 7. APPLY TEXT CLEANING

# Build the cleaned text columns from the raw article and summary text.
df["article_clean"] = df["article_text"].apply(clean_text)
df["summary_clean"] = df["human_summary"].apply(clean_text)

# Keep rows whose cleaned article is longer than 50 characters and whose
# cleaned summary is longer than 10 characters.
# .copy() makes the filtered frame an independent object, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
df = df[
    (df["article_clean"].str.len() > 50) &
    (df["summary_clean"].str.len() > 10)
].copy()

print("After text cleaning:", df.shape)

After text cleaning: (327197, 11)


In [None]:
# Token count = number of whitespace-separated pieces in the cleaned text.
df["article_tokens"] = df["article_clean"].str.split().str.len()
df["summary_tokens"] = df["summary_clean"].str.split().str.len()

In [None]:
# 9. COMPRESSION RATIO

# Summary length expressed as a fraction of the article length (token counts).
df["compression_ratio"] = df["summary_tokens"].div(df["article_tokens"])

In [None]:
# Thresholds an article/summary pair must satisfy to be kept.
MIN_ARTICLE_TOKENS = 100       # minimum article length, in tokens
MIN_SUMMARY_TOKENS = 15        # minimum summary length, in tokens
MAX_COMPRESSION_RATIO = 0.5    # summary may be at most half as long as the article

# .copy() makes the filtered frame an independent object, so adding the
# cluster_id column afterwards does not raise SettingWithCopyWarning.
df = df[
    (df["article_tokens"] >= MIN_ARTICLE_TOKENS) &
    (df["summary_tokens"] >= MIN_SUMMARY_TOKENS) &
    (df["compression_ratio"] <= MAX_COMPRESSION_RATIO)
].copy()

In [None]:
# Report the shape after the token-count and compression-ratio filters.
print("After token & ratio filtering:", df.shape)

After token & ratio filtering: (220247, 11)


In [None]:

# 10. CLUSTER CREATION & FILTERING (IMPORTANT)
# Group articles sharing the same cleaned summary: ngroup() assigns one
# integer per distinct summary_clean value, so rows with identical cleaned
# summaries receive the same cluster_id.
df["cluster_id"] = df.groupby("summary_clean").ngroup()

In [None]:
# Compute cluster sizes: number of rows per cluster, indexed by cluster_id.
cluster_sizes = df.groupby("cluster_id").size()

In [None]:
# Retain only clusters that contain at least two documents, then renumber
# the rows from zero.
valid_clusters = cluster_sizes.loc[cluster_sizes.ge(2)].index
in_valid_cluster = df["cluster_id"].isin(valid_clusters)
df = df.loc[in_valid_cluster].reset_index(drop=True)

In [None]:
# Summarise the surviving clusters: total rows, number of distinct clusters,
# and the mean number of rows per cluster.
print("============================================")
print("After cluster filtering:")
print("Filtered rows:", len(df))
print("Filtered clusters:", df["cluster_id"].nunique())
print("Avg docs per cluster:",
      df.groupby("cluster_id").size().mean())
print("============================================")

After cluster filtering:
Filtered rows: 4335
Filtered clusters: 2060
Avg docs per cluster: 2.104368932038835


In [None]:
# Fix the set and order of columns that go into the exported file.
df = df.loc[
    :,
    [
        "newspaper_name", "published_date", "headline",
        "article_text", "human_summary", "news_category",
        "article_clean", "summary_clean",
        "article_tokens", "summary_tokens", "compression_ratio",
        "cluster_id",
    ],
]

In [None]:
# Persist the final frame as UTF-8 CSV without the index column.
# The file name ends in .csv because to_csv writes delimited text; an .xlsx
# extension would mislabel the file as an Excel workbook.
df.to_csv("/content/drive/MyDrive/NewsSumm_perfect_clean.csv", index=False, encoding="utf-8")

In [None]:
# Report the number of rows in the final frame.
print("Number of rows:", len(df))


Number of rows: 4335
