In [None]:
import os
import re
import json
from pathlib import Path
from tqdm.auto import tqdm

import pandas as pd
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

In [None]:
# CELL 2 - load CSV and province list

# Input locations; edit these two constants if your files live elsewhere.
CSV_FN = "disaster_articless.csv"   # change if different
PH_JS_PATH = Path("static/phprovinces.js")  # attempt to read province names from this file

# --------------- load CSV ---------------
if Path(CSV_FN).exists():
    # Read every column as text, then turn missing cells into empty strings
    # so later cells can treat all fields uniformly as str.
    df = pd.read_csv(CSV_FN, dtype=str)
    df = df.fillna("")
else:
    # Fail fast with a readable message instead of a pandas traceback.
    raise FileNotFoundError(f"CSV file not found at {CSV_FN}. Put your CSV in same directory as notebook.")

# Expect columns: Date, Headline, Keyword, Link, Tags, Abstract, Article
print("Loaded", len(df), "rows from", CSV_FN)

# --------------- load province names ---------------
def load_provinces_from_ph_js(js_path):
    """
    Extract province names from a JS file that wraps a GeoJSON-like object.

    The file is expected to look like ``var phProvinces = {...};``. Everything
    from the first ``{`` to the last ``}`` is treated as the object literal.
    The literal is first parsed as strict JSON, untouched. Only when that
    fails is a crude JS-to-JSON cleanup applied (quoting bareword keys and
    dropping trailing commas) before a second attempt. Cleaning up front
    would corrupt valid JSON whose string values contain ``word:`` sequences
    such as URLs.

    Parameters
    ----------
    js_path : str or Path
        Location of the JS file; read as UTF-8.

    Returns
    -------
    list of str
        Unique, whitespace-stripped names in first-seen order. For each entry
        of ``features`` the first non-blank string among the properties
        ``NAME_1``, ``name``, ``adm2_en`` and ``Prov`` is used. An empty list
        is returned when no object literal is found or it cannot be parsed.

    Raises
    ------
    OSError
        If the file cannot be read (not handled here; the caller decides).
    """
    s = Path(js_path).read_text(encoding="utf-8")
    # naive extraction: slice from the first '{' to the last '}'
    start = s.find('{')
    end = s.rfind('}')
    if start == -1 or end < start:
        return []
    json_text = s[start:end + 1]

    try:
        try:
            geo = json.loads(json_text)
        except ValueError:
            # Not strict JSON: quote bareword keys that directly follow '{' or ','
            # and drop trailing commas. Still crude (single-quoted strings are
            # not handled), but it only runs when strict parsing already failed.
            cleaned = re.sub(r'([{,]\s*)(\w+)\s*:', r'\1"\2":', json_text)
            cleaned = re.sub(r",(\s*[}\]])", r"\1", cleaned)
            geo = json.loads(cleaned)
    except Exception as e:
        # fallback: no provinces loaded
        print("Could not parse phprovinces.js as JSON:", e)
        return []

    if not isinstance(geo, dict):
        return []

    names = []
    for feature in geo.get("features") or []:
        props = feature.get("properties") if isinstance(feature, dict) else None
        if not isinstance(props, dict):
            # e.g. "properties": null -- skip this feature, keep the rest
            continue
        # several possible keys may hold the name: NAME_1, name, adm2_en, Prov
        for key in ("NAME_1", "name", "adm2_en", "Prov"):
            value = props.get(key)
            if isinstance(value, str) and value.strip():
                names.append(value.strip())
                break
    # dict.fromkeys de-duplicates while preserving first-seen order
    return list(dict.fromkeys(names))

# Start empty; the JS file is optional and only consulted when it is present.
provinces = []
if PH_JS_PATH.exists():
    try:
        provinces = load_provinces_from_ph_js(PH_JS_PATH)
    except Exception as exc:
        # Best effort: report the problem and fall through to the built-in list.
        print("Error loading provinces from JS:", exc)

# Fallback list (shortened common provinces). If you want exhaustive list, replace below with a full list.
if len(provinces) == 0:
    provinces = [
        "Abra","Agusan del Norte","Agusan del Sur","Aklan","Albay","Antique","Apayao","Aurora",
        "Basilan","Bataan","Batanes","Batangas","Benguet","Biliran","Bohol","Bukidnon","Bulacan",
        "Cagayan","Camarines Norte","Camarines Sur","Camiguin","Capiz","Catanduanes","Cavite",
        "Cebu","Cotabato","Davao del Norte","Davao del Sur","Davao de Oro","Davao Occidental","Davao Oriental",
        "Dinagat Islands","Eastern Samar","Guimaras","Ifugao","Ilocos Norte","Ilocos Sur","Iloilo","Isabela",
        "Kalinga","La Union","Laguna","Lanao del Norte","Lanao del Sur","Leyte","Maguindanao","Marinduque",
        "Masbate","Metro Manila","Misamis Occidental","Misamis Oriental","Mountain Province","Negros Occidental",
        "Negros Oriental","Northern Samar","Nueva Ecija","Nueva Vizcaya","Occidental Mindoro","Oriental Mindoro",
        "Palawan","Pampanga","Pangasinan","Quezon","Quirino","Rizal","Romblon","Samar","Sarangani","Siquijor",
        "Sorsogon","South Cotabato","Southern Leyte","Sultan Kudarat","Surigao del Norte","Surigao del Sur",
        "Tarlac","Tawi-Tawi","Zambales","Zamboanga del Norte","Zamboanga del Sur","Zamboanga Sibugay"
    ]

# Lower-cased copy of the names, kept in the same order as `provinces` (for matching)
provinces_normalized = []
for province_name in provinces:
    provinces_normalized.append(province_name.lower())
print(f"Using {len(provinces_normalized)} provinces for detection (sample):", provinces[:8])


In [None]:
# CELL 3 - load model and tokenizer (uses GPU if available)
# Pick the device once; later cells move their input tensors to it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

MODEL_NAME = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)
model = BartForConditionalGeneration.from_pretrained(MODEL_NAME)
model = model.to(device)
# Switch to evaluation mode; the notebook only runs inference with this model.
model.eval()

# Summarization helpers (single pass + chunking for long inputs)
def _generate_summary(text, max_input_tokens, max_summary_tokens):
    """Run one beam-search summarization pass over `text` and return the decoded summary."""
    # truncation is kept as a safety net so the model never receives more than
    # max_input_tokens ids, whatever the caller passed in.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_input_tokens)
    input_ids = inputs["input_ids"].to(device)
    with torch.no_grad():
        summary_ids = model.generate(input_ids, num_beams=4, max_length=max_summary_tokens, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def summarize_text(text, max_input_tokens=1024, max_summary_tokens=150, chunk_overlap=50):
    """
    Summarize `text` with the module-level `model` / `tokenizer`, chunking long inputs.

    Inputs that fit the model are summarized in one pass. Longer inputs are
    split into overlapping token windows, each window is summarized, and the
    concatenated window summaries are summarized again. If the concatenation
    is itself too long, the chunking pass is repeated on it instead of letting
    the tokenizer silently truncate it, so the end of a long article is not dropped.

    Parameters
    ----------
    text : str
        Text to summarize. None, empty or whitespace-only input yields "".
    max_input_tokens : int
        Maximum number of token ids fed to the model per pass, *including* the
        special tokens the tokenizer adds around the text.
    max_summary_tokens : int
        Passed to `model.generate` as `max_length` for every pass.
    chunk_overlap : int
        Number of tokens shared by consecutive windows. Only used (and only
        validated) when the input needs chunking.

    Returns
    -------
    str
        The decoded summary.

    Raises
    ------
    ValueError
        If chunking is required and `chunk_overlap` is negative or leaves no
        room to advance the window.
    """
    if not text or not text.strip():
        return ""

    tokens = tokenizer.encode(text, add_special_tokens=False)
    # Encoding the empty string yields only the special tokens, which tells us
    # how many slots of max_input_tokens are not available for content.
    special_count = len(tokenizer.encode("", add_special_tokens=True))
    budget = max_input_tokens - special_count

    if len(tokens) <= budget:
        return _generate_summary(text, max_input_tokens, max_summary_tokens)

    stride = budget - chunk_overlap
    if chunk_overlap < 0 or stride <= 0:
        raise ValueError(
            f"chunk_overlap must be >= 0 and smaller than the {budget} content tokens "
            f"available per chunk (got {chunk_overlap})."
        )

    while len(tokens) > budget:
        summaries = []
        start = 0
        while True:
            # Windows are `budget` tokens wide so that re-encoding the decoded
            # chunk (which adds the special tokens back) still fits the model.
            chunk_text = tokenizer.decode(tokens[start:start + budget], skip_special_tokens=True)
            summaries.append(_generate_summary(chunk_text, max_input_tokens, max_summary_tokens))
            if start + budget >= len(tokens):
                break
            start += stride
        combined = " ".join(summaries)
        combined_tokens = tokenizer.encode(combined, add_special_tokens=False)
        if len(combined_tokens) >= len(tokens):
            # The pass did not shrink the text (e.g. max_summary_tokens is close
            # to the window size); stop here rather than loop forever and let
            # the final pass truncate.
            break
        tokens = combined_tokens

    # final compress summarization over the combined chunk summaries
    return _generate_summary(combined, max_input_tokens, max_summary_tokens)


In [None]:
# CELL 4 - disaster classification by keywords
# Disaster label -> trigger keywords. Every keyword is lower-cased while the
# table is built, so lookups can compare directly against lower-cased text.
disaster_type_keywords = {
    label: [word.lower() for word in words]
    for label, words in (
        ("flooding", ["flood", "flooding", "floods", "inundation", "water level", "submerged", "flash flood", "flooded"]),
        ("typhoon", ["typhoon", "super typhoon", "bagyo", "storm", "tropical storm", "landfall"]),
        ("earthquake", ["earthquake", "tremor", "aftershock", "magnitude", "richter"]),
        ("fire", ["fire", "blaze", "wildfire", "burned", "arson", "inferno", "smoke"]),
        ("volcanic eruption", ["volcano", "eruption", "ashfall", "lahar", "pyroclastic", "volcanic"]),
    )
}

def classify_disaster_types(text):
    """
    Label `text` with disaster types using case-insensitive substring matching.

    Each label in the module-level `disaster_type_keywords` table is reported
    when at least one of its keywords occurs anywhere in the text.

    Returns
    -------
    list of str
        The matching labels in sorted order. When no label matches, the result
        is ["other"] if the text contains one of a few generic emergency terms
        ("evacuate", "evacuee", "calamity", "disaster", "incident"), and an
        empty list otherwise. None or empty input yields an empty list.
    """
    lowered = text.lower() if text else ""

    # Dict keys are unique, so collecting labels directly needs no de-duplication.
    detected = sorted(
        label
        for label, keywords in disaster_type_keywords.items()
        if any(keyword in lowered for keyword in keywords)
    )
    if detected:
        return detected

    # No specific type found: fall back to generic emergency vocabulary.
    for generic_term in ("evacuate", "evacuee", "calamity", "disaster", "incident"):
        if generic_term in lowered:
            return ["other"]
    return []


In [None]:
# CELL 5 - province detection helpers

def detect_provinces(text, provinces_list=None):
    """
    Return the province names (original casing, sorted) mentioned in `text`.

    Matching is case-insensitive and whole-word: "Abra" does not match inside
    "Abraham". Longer names win over shorter ones they contain, so a mention
    of "Eastern Samar" is not also reported as "Samar". Any run of whitespace
    in the text is accepted where a name contains a space.

    Parameters
    ----------
    text : str or None
        Text to scan. None or empty text yields an empty list.
    provinces_list : iterable of str, optional
        Names to look for. When omitted, the module-level `provinces` list is
        used, looked up at call time.

    Returns
    -------
    list of str
        Sorted, de-duplicated names from `provinces_list` found in the text.
        "Metro Manila" is additionally reported when the word "manila" occurs
        on its own and "Metro Manila" is one of the candidate names.
    """
    if provinces_list is None:
        provinces_list = provinces

    haystack = (text or "").lower()

    # Lower-cased, whitespace-normalized name -> name as spelled by the caller.
    by_key = {}
    for name in provinces_list:
        key = " ".join(name.lower().split())
        if key:
            by_key.setdefault(key, name)
    if not by_key:
        return []

    # Longest alternatives first: the regex engine tries alternatives in order,
    # so the longer name is consumed before a shorter name it contains.
    ordered_keys = sorted(by_key, key=len, reverse=True)
    alternatives = "|".join(
        r"\s+".join(re.escape(word) for word in key.split()) for key in ordered_keys
    )
    # Lookarounds instead of \b so names ending in punctuation would still work.
    pattern = r"(?<!\w)(?:" + alternatives + r")(?!\w)"

    matches = set()
    for found in re.finditer(pattern, haystack):
        matches.add(by_key[" ".join(found.group(0).split())])

    # additional heuristic: common short form (Manila -> Metro Manila)
    if "Metro Manila" in provinces_list and re.search(r"(?<!\w)manila(?!\w)", haystack):
        matches.add("Metro Manila")
    return sorted(matches)


In [None]:
# CELL 6 - main processing
OUTPUT_JSON = "static/data/summaries_by_province.json"
OUTPUT_CSV = "static/data/summaries_flat.csv"

# Create the output folders BEFORE the expensive loop: otherwise a missing
# directory only surfaces at the final write, after every article has
# already been summarized, and the work is lost.
for out_path in (OUTPUT_JSON, OUTPUT_CSV):
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

results = []   # flat list of processed records, one dict per article

# iterate rows (tqdm shows progress; summarization dominates the runtime)
for idx, row in tqdm(df.iterrows(), total=len(df)):
    # Headline + Abstract + Article together give the classifier and the
    # summarizer the most context. Blank fields are dropped individually, so a
    # whitespace-only Article no longer discards a row that still has a
    # usable Headline or Abstract.
    fields = [str(row.get(col, "")).strip() for col in ("Headline", "Abstract", "Article")]
    combined_text = " ".join(field for field in fields if field)
    if not combined_text:
        continue

    # classify disaster types (keyword rules)
    types = classify_disaster_types(combined_text)

    # detect provinces mentioned in the text; if none, look in 'Tags',
    # and finally fall back to 'Unknown'
    provinces_found = detect_provinces(combined_text)
    if not provinces_found:
        provinces_found = detect_provinces(row.get("Tags") or "")
    if not provinces_found:
        provinces_found = ["Unknown"]

    # summarize the combined text (every non-empty row is summarized)
    summary = summarize_text(combined_text)

    results.append({
        "index": int(idx),
        "Date": row.get("Date", ""),
        "Headline": row.get("Headline", ""),
        "Link": row.get("Link", ""),
        "Detected_Provinces": provinces_found,
        # rows with no recognised type are filed under "other"
        "Detected_Types": types if types else ["other"],
        "Summary": summary
    })

# save flat CSV (list-valued columns are written as their Python repr)
pd.DataFrame(results).to_csv(OUTPUT_CSV, index=False, encoding="UTF-8")
print("Wrote flat results to", OUTPUT_CSV)


In [None]:
# CELL 7 - aggregate
from collections import defaultdict

# province -> disaster type -> list of article entries
agg = defaultdict(lambda: defaultdict(list))
for record in results:
    for province in record["Detected_Provinces"]:
        by_type = None
        for dtype in record["Detected_Types"]:
            if by_type is None:
                # Touch the province bucket only once a type exists for it.
                by_type = agg[province]
            # A fresh dict per (province, type) pair so entries are never shared.
            by_type[dtype].append({
                field: record[field] for field in ("Date", "Headline", "Link", "Summary")
            })

# Convert the nested defaultdicts to plain dicts for JSON serialization
agg_out = {}
for province, by_type in agg.items():
    agg_out[province] = dict(by_type)

with open(OUTPUT_JSON, "w", encoding="utf-8") as fh:
    json.dump(agg_out, fh, ensure_ascii=False, indent=2)

print("Wrote aggregated summaries to", OUTPUT_JSON)


In [None]:
# CELL 8 - quick check and example display
# Read the aggregated file back from disk to confirm it is valid JSON.
# (json is already imported in the notebook's first cell.)
with open(OUTPUT_JSON, encoding="utf-8") as fh:
    data = json.loads(fh.read())

# Show, for the first ten provinces, how many entries each disaster type has
for position, (prov, by_type) in enumerate(data.items()):
    if position >= 10:
        break
    entry_counts = {}
    for dtype, entries in by_type.items():
        entry_counts[dtype] = len(entries)
    print(prov, "->", entry_counts)
