In [1]:
import os, re, uuid, warnings, logging, math
from pathlib import Path
import pandas as pd
import torch
import spacy
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    logging as hf_logging,
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
# ─── 0) SILENCE WARNINGS & SET DEVICE ──────────────────────────────────────
warnings.filterwarnings("ignore")
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"]  = "1"
logging.getLogger("huggingface_hub").setLevel(logging.ERROR)
hf_logging.set_verbosity_error()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [3]:
# ─── 1) MOUNT GOOGLE DRIVE & LOAD RAW CSVS ────────────────────────────────
from google.colab import drive
from pathlib import Path
import pandas as pd

# 1.1 Mount your Drive (you’ll be prompted to authenticate once)
drive.mount("/content/drive")

# 1.2 Point to the folder in your Drive where the two CSVs live
#    e.g. if you uploaded them to MyDrive/79kDataset/, then:
DATA_DIR = Path("/content/drive/MyDrive")

# 1.3 Read the FAKE / TRUE splits
fake = pd.read_csv(DATA_DIR / "DataSet_Misinfo_FAKE.csv").rename(columns=str.lower)
true = pd.read_csv(DATA_DIR / "DataSet_Misinfo_TRUE.csv").rename(columns=str.lower)

# 1.4 Tag and concat
fake["label"], true["label"] = "fake", "true"
raw = pd.concat([fake, true], ignore_index=True)

# 1.5 Ensure there's an 'id' column
if "id" not in raw.columns:
    raw.insert(0, "id", range(len(raw)))

print("Loaded raw articles:", raw.shape)




Mounted at /content/drive
Loaded raw articles: (78617, 4)


In [4]:
# ──────────────────────────────────────────────────────────────────────────
# STEP 2 – CLEAN & CHUNK INTO ≤120-WORD PIECES
# ──────────────────────────────────────────────────────────────────────────
import re, uuid
import pandas as pd
import spacy

# lightweight sentencizer
nlp_sent = spacy.blank("en")
nlp_sent.add_pipe("sentencizer")

MAX_TOK = 120
def clean(text: str) -> str:
    text = re.sub(r"https?://\S+", "URL", str(text))
    return re.sub(r"\s+", " ", text).strip()

def chunk_article(text: str) -> list[str]:
    out, buf = [], []
    doc = nlp_sent(clean(text))
    for sent in doc.sents:
        words = sent.text.split()
        if len(buf) + len(words) > MAX_TOK:
            out.append(" ".join(buf))
            buf = []
        buf += words
    if buf:
        out.append(" ".join(buf))
    return out

pieces = []
for art in raw.itertuples(index=False):
    # combine title/subject + body
    title   = getattr(art, "title", "") or getattr(art, "subject", "")
    content = getattr(art, "text", "")  or getattr(art, "content", "")
    body    = f"{title}. {content}" if title else content
    for chunk in chunk_article(body):
        pieces.append({
            "chunk_id":   uuid.uuid4().hex,
            "article_id": art.id,
            "chunk":      chunk,
            "src_label":  art.label
        })

chunks = pd.DataFrame(pieces)
print("Total chunks:", len(chunks))
chunks.head(3)



Total chunks: 390529


Unnamed: 0,chunk_id,article_id,chunk,src_label
0,ec513be9eda447be83f8e390fe687a85,0,Donald Trump just couldn t wish all Americans ...,fake
1,3dea585fb56541919438cabbb5b66177,0,As our Country rapidly grows stronger and smar...,fake
2,46a2b318a20e4c1ea5b85ffdc1932e32,0,Bishop Talbert Swan (@TalbertSwan) December 31...,fake


In [5]:
# ──────────────────────────────────────────────────────────────────────────
# STEP 3 – WEAK-LABEL TOPICS & SAMPLE SEED TRAINING SET
# ──────────────────────────────────────────────────────────────────────────
import random

TOPICS = ["Politics","Health","Business","SciTech",
          "Environment","CrimeLaw","Entertainment","Other"]

SEED = {
    "Politics":     ["election","president","Security","Senate","Trump","Republicans"],
    "Health":       ["virus","covid","vaccine","disease","hospital","doctor"],
    "Business":     ["market","stock","company","profit","trade"],
    "SciTech":      ["ai","quantum","nasa","spacex","researchers","technology"],
    "Environment":  ["climate","wildfire","hurricane","pollution","carbon"],
    "CrimeLaw":     ["arrest","court","police","fraud","terror"],
    "Entertainment":["movie","music","celebrity","festival","award"],
}

def weak_topic(txt: str) -> str:
    t = txt.lower()
    for topic, kws in SEED.items():
        if any(kw in t for kw in kws):
            return topic
    return "Other"

chunks["weak_topic"] = chunks["chunk"].map(weak_topic)

# sample up to 1k per weak topic
train_df = (
    chunks
    .groupby("weak_topic", group_keys=False)
    .apply(lambda g: g.sample(min(1000, len(g)), random_state=42))
    .reset_index(drop=True)
)
print("Seed counts:\n", train_df.weak_topic.value_counts())


Seed counts:
 weak_topic
Business         1000
CrimeLaw         1000
Entertainment    1000
Environment      1000
Health           1000
Other            1000
Politics         1000
SciTech          1000
Name: count, dtype: int64


In [6]:
# ──────────────────────────────────────────────────────────────────────────
# STEP 4 – FINE-TUNE BERT FOR TOPIC CLASSIFICATION
# ──────────────────────────────────────────────────────────────────────────
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# 1) Prepare HF Dataset
df_train = train_df[["chunk","weak_topic"]].rename(columns={"weak_topic":"labels"})
ds = Dataset.from_pandas(df_train, preserve_index=False)

# 2) Load tokenizer & label maps
MODEL = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
label2id  = {t:i for i,t in enumerate(TOPICS)}
id2label  = {i:t for t,i in label2id.items()}

# 3) Tokenize & map labels→IDs
def preprocess(batch):
    enc = tokenizer(batch["chunk"],
                    truncation=True,
                    padding="max_length",
                    max_length=128)
    enc["labels"] = [label2id[l] for l in batch["labels"]]
    return enc

ds = ds.map(preprocess, batched=True, remove_columns=["chunk","labels"])

# 4) Load & train model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL, num_labels=len(TOPICS),
    id2label=id2label, label2id=label2id
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

args = TrainingArguments(
    output_dir="bert-topic-model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    disable_tqdm=True,
    report_to=[]
)
Trainer(model=model, args=args, train_dataset=ds).train()

# 5) Save
model.save_pretrained("bert-topic-model")
tokenizer.save_pretrained("bert-topic-model")
print("BERT fine-tuned & saved → bert-topic-model")





tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

{'loss': 0.9911, 'grad_norm': 4.370952606201172, 'learning_rate': 1.002e-05, 'epoch': 1.0}
{'loss': 0.4066, 'grad_norm': 2.8939096927642822, 'learning_rate': 2e-08, 'epoch': 2.0}
{'train_runtime': 390.8151, 'train_samples_per_second': 40.94, 'train_steps_per_second': 2.559, 'train_loss': 0.6988599853515625, 'epoch': 2.0}
BERT fine-tuned & saved → bert-topic-model


In [7]:
# ───────────────────────────────────────────
# STEP 5 – PREDICT topic_o FOR EVERY CHUNK
# ───────────────────────────────────────────

# 1) Install & import tqdm for a progress bar
!pip install -q tqdm
from tqdm.auto import tqdm

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# 2) (Re)load your fine-tuned model & tokenizer if needed
try:
    model, tokenizer
except NameError:
    model     = AutoModelForSequenceClassification.from_pretrained("bert-topic-model")
    tokenizer = AutoTokenizer.from_pretrained("bert-topic-model")

# 3) Send model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()

# 4) Prepare an integer-keyed id2label mapping
raw_id2label = model.config.id2label
# convert string keys to ints if necessary
id2label = {int(k):v for k,v in raw_id2label.items()} if isinstance(next(iter(raw_id2label)), str) else raw_id2label

# 5) Batch‐wise inference with a tqdm progress bar
batch_size = 64
pred_ids   = []

for start in tqdm(range(0, len(chunks), batch_size), desc="Inferring topics"):
    texts = chunks.chunk[start : start + batch_size].tolist()
    enc   = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        logits = model(**enc).logits

    pred_ids.extend(logits.argmax(-1).cpu().tolist())

# 6) Map integer IDs back to topic strings
chunks["topic_o"] = [ id2label[i] for i in pred_ids ]

# 7) Report
print("✓ STEP 5 complete – topic_o assigned")
print(chunks.topic_o.value_counts())




Inferring topics:   0%|          | 0/6103 [00:00<?, ?it/s]

✓ STEP 5 complete – topic_o assigned
topic_o
Other            114502
SciTech          109183
Politics          99561
CrimeLaw          23568
Business          22595
Health             9286
Entertainment      6775
Environment        5059
Name: count, dtype: int64


In [8]:
# ─────────────────────────────────────────────────────────────
# STEP 6 – EXTRACT raw entities with spaCy small-model NER
#           (uses nlp.pipe + tqdm for live progress)
# ─────────────────────────────────────────────────────────────

# 1) Install the small English model package via pip
!pip install -q en-core-web-sm

# 2) Import & load only the NER component from en_core_web_sm
import en_core_web_sm
nlp_ner = en_core_web_sm.load(
    disable=["parser", "tagger", "attribute_ruler", "lemmatizer"]
)

# 3) Define which spaCy entity types to keep
KEEP = {"PERSON", "ORG", "GPE", "PRODUCT", "NORP", "LOC"}

# 4) Helper: pick the first matching entity from a Doc
def first_ent(doc):
    for ent in doc.ents:
        if ent.label_ in KEEP:
            return ent.text
    return "_NONE_"

# 5) Perform batched NER with a live tqdm progress bar
from tqdm.auto import tqdm

texts    = chunks["chunk"].tolist()
out_ents = []

# Wrap the nlp.pipe generator in tqdm for live feedback
for doc in tqdm(
    nlp_ner.pipe(texts, batch_size=64, n_process=1),
    total=len(texts),
    desc="Extracting entities"
):
    out_ents.append(first_ent(doc))

# 6) Attach results back to the DataFrame
chunks["entity_raw"] = out_ents

# 7) Sanity‐check output
print("✓ STEP 6 complete — sample entity counts:")
print(chunks.entity_raw.value_counts().head(10))



Extracting entities:   0%|          | 0/390529 [00:00<?, ?it/s]

✓ STEP 6 complete — sample entity counts:
entity_raw
Trump           19755
_NONE_          13370
U.S.             7255
Donald Trump     6754
Obama            5656
Clinton          5462
American         3931
Republican       3867
Republicans      3033
America          2987
Name: count, dtype: int64


In [9]:
# ────────────────────────────────────────────────────────────────
# STEP 7 – CLUSTER & CANONICALISE similar entity strings
#           (fast MiniBatchKMeans + live tqdm bar)
# ────────────────────────────────────────────────────────────────

import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans
from tqdm.auto import tqdm

# 1) Check that STEP 6 has run:
if "entity_raw" not in chunks.columns:
    raise RuntimeError("`chunks['entity_raw']` not found. Please re-run STEP 6 before STEP 7.")

# 2) Gather unique non-NONE entities
ents = [e for e in chunks["entity_raw"].unique() if e != "_NONE_"]
print(f"▸ {len(ents)} unique entities (excluding '_NONE_')")

# 3) Compute TF-IDF on character n-grams
vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5))
X = vectorizer.fit_transform(ents)

# 4) Decide on √N clusters
n_clusters = max(1, int(math.sqrt(len(ents))))
print(f"▸ Clustering into {n_clusters} clusters with MiniBatchKMeans…")

# 5) Fit the KMeans (very fast)
mbk = MiniBatchKMeans(n_clusters=n_clusters, batch_size=1000, random_state=42)
labels = mbk.fit_predict(X)

# 6) Pick the shortest string per cluster (with live bar)
rep = {}
for lbl, ent in tqdm(zip(labels, ents), total=len(ents), desc="Selecting reps"):
    if lbl not in rep or len(ent) < len(rep[lbl]):
        rep[lbl] = ent

# 7) Build canonical map and apply
canon_map = {ent: rep[lbl] for ent, lbl in zip(ents, labels)}
chunks["entity_e"] = chunks["entity_raw"].map(lambda e: canon_map.get(e, e))

# 8) Sanity check
print("✓ STEP 7 complete – sample canonical entities:")
print(chunks["entity_e"].value_counts().head(10))

▸ 72263 unique entities (excluding '_NONE_')
▸ Clustering into 268 clusters with MiniBatchKMeans…


Selecting reps:   0%|          | 0/72263 [00:00<?, ?it/s]

✓ STEP 7 complete – sample canonical entities:
entity_e
s             260524
Trump          30312
The﻿           18869
_NONE_         13370
Bama           12464
Ameri          12121
Iowa            7791
Clinton         5500
Democrat        4863
Washington      3429
Name: count, dtype: int64


In [11]:
# ──────────────────────────────────────────────────────────────────────────
# STEP 8 – SAVE THE FULLY LABELED TABLE TO PARQUET (with live progress)
# ──────────────────────────────────────────────────────────────────────────

# 1) Install pyarrow if needed
!pip install -q pyarrow tqdm

import pyarrow as pa
import pyarrow.parquet as pq
from pathlib import Path
from tqdm.auto import tqdm

# 2) Define output path
OUT_PATH = Path("/content/drive/MyDrive") / "79k_topic_entity_labeled.parquet"

# 3) Columns to save
cols = ["chunk_id","article_id","topic_o","entity_e","chunk","src_label"]

# 4) Prepare a PyArrow schema from the first batch
batch_size = 50000  # adjust if you like
sample = chunks[cols].iloc[:batch_size]
schema = pa.Schema.from_pandas(sample, preserve_index=False)

# 5) Open a ParquetWriter
writer = pq.ParquetWriter(str(OUT_PATH), schema)

# 6) Loop over DataFrame in batches with a tqdm bar
n = len(chunks)
for start in tqdm(range(0, n, batch_size), desc="Writing to Parquet"):
    end = min(start + batch_size, n)
    df_batch = chunks[cols].iloc[start:end]
    table   = pa.Table.from_pandas(df_batch, schema=schema, preserve_index=False)
    writer.write_table(table)

# 7) Close the writer
writer.close()

print(f"✓ STEP 8 complete — saved to {OUT_PATH}")
print("\nFinal topic distribution:")
print(chunks.topic_o.value_counts())
print("\nEmpty‐entity rate:", (chunks.entity_e=="_NONE_").mean().round(3))


Writing to Parquet:   0%|          | 0/8 [00:00<?, ?it/s]

✓ STEP 8 complete — saved to /content/drive/MyDrive/79k_topic_entity_labeled.parquet

Final topic distribution:
topic_o
Other            114502
SciTech          109183
Politics          99561
CrimeLaw          23568
Business          22595
Health             9286
Entertainment      6775
Environment        5059
Name: count, dtype: int64

Empty‐entity rate: 0.034


In [12]:
# 1) Mount Google Drive (skip if already mounted)
from google.colab import drive
drive.mount("/content/drive")

# 2) Import pandas & Path
import pandas as pd
from pathlib import Path

# 3) Point to your Parquet file
PARQUET = Path("/content/drive/MyDrive/79k_topic_entity_labeled.parquet")
CSV     = PARQUET.with_suffix(".csv")

# 4) Read the Parquet file
df = pd.read_parquet(PARQUET)

# 5) Write to CSV (no index column)
df.to_csv(CSV, index=False)

print(f"🔄 Converted {PARQUET.name} → {CSV.name}")
print(f"Rows: {len(df):,} • Columns: {df.shape[1]}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
🔄 Converted 79k_topic_entity_labeled.parquet → 79k_topic_entity_labeled.csv
Rows: 390,529 • Columns: 6
