In [5]:
# === Config ===
from pathlib import Path
import re, json, sys

# Set your input directory (folder containing v3.1_oag_author.part00000.jsonl ... part00357.jsonl)
INPUT_DIR = Path(r"C:\Users\XxRui\Desktop\newData\oag_chunks")  # <-- change me
# Output directory (will be created). If you prefer "alongside", set OUTPUT_DIR = INPUT_DIR
OUTPUT_DIR = Path(r"C:\Users\XxRui\Desktop\newData\output") 
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# === Build country lexicon ===
def build_country_set():
    base = set()

    # Fallback: built-in list (covers most occurrences in OAG org strings)
    fallback = {
        # Common English names
        "china","japan","france","germany","spain","italy","portugal","mexico","canada","australia",
        "new zealand","united kingdom","netherlands","sweden","norway","denmark","finland","iceland",
        "ireland","poland","czech republic","slovakia","hungary","slovenia","croatia","serbia",
        "romania","bulgaria","greece","turkey","israel","egypt","south africa","nigeria","kenya",
        "russia","russian federation","ukraine","belarus","kazakhstan","uzbekistan",
        "india","bangladesh","pakistan","sri lanka","nepal","bhutan",
        "thailand","vietnam","laos","cambodia","myanmar","philippines","malaysia","singapore",
        "indonesia","brunei","taiwan","republic of china","r.o.c.","roc",
        "south korea","republic of korea","korea","north korea",
        "saudi arabia","united arab emirates","uae","qatar","kuwait","oman","bahrain","iran","iraq","jordan","lebanon",
        "brazil","argentina","chile","peru","colombia","venezuela","uruguay","paraguay","bolivia","ecuador",
        "switzerland","austria","belgium","luxembourg","liechtenstein","monaco","andorra","malta","cyprus",
        # English variants frequently seen
        "united states","united states of america","usa","u.s.a.","u.s.","u.s.a","u.s",
        "PRC","ROC",
        "uk","u.k.","the netherlands","the bahamas","côte d’ivoire","cote d’ivoire","ivory coast",
        "the gambia","dominican republic","costa rica","el salvador","guatemala","honduras","nicaragua","panama",
        # Native language endonyms commonly appearing in org lines
        "deutschland","españa","méxico","brasil","rossiyskaya federatsiya","российская федерация",
        "suomi","norge","danmark","polska","magyarország","italia","sverige","schweiz","österreich","elláda","ελλάδα",
        "대한민국","日本","中国","россия","台灣","臺灣","香港","澳门","澳門",
        # UK component nations sometimes used informally
        "england","scotland","wales","northern ireland"
    }
    base |= {c.lower() for c in fallback}

    # Try to add canonical names from pycountry if available
    try:
        import pycountry
        for c in pycountry.countries:
            base.add(c.name.lower())
            if hasattr(c, "official_name"):
                base.add(c.official_name.lower())
            for alpha in ("alpha_2","alpha_3"):
                if getattr(c, alpha, None):
                    base.add(getattr(c, alpha).lower())
        # Common historical/short forms not always in pycountry
        base |= {"uk","u.k.","usa","u.s.a.","u.s."}
    except Exception:
        pass

    # normalize spacing for matching
    return {re.sub(r"\s+", " ", x.strip()) for x in base}

COUNTRIES = build_country_set()

# === Compile a regex to capture `, Country` pattern robustly
# We will:
#  1) normalize org whitespace (collapse spaces)
#  2) search for comma (allow optional spaces around), then a country token
#  3) accept matches before end, semicolon, period, parenthesis, or EOL
COMMA_COUNTRY_RE = re.compile(r",\s*([A-Za-z\.\u00C0-\u024F\u0370-\u03FF\u0400-\u04FF\u3040-\u30FF\u4E00-\u9FFF][^;()\[\]]{0,60})")

def normalize_text(s: str) -> str:
    # collapse multiple spaces, remove stray NBSPs
    return re.sub(r"\s+", " ", (s or "").replace("\xa0"," ")).strip()

def extract_comma_country(org: str):
    """
    Returns (True, matched_country) if org has a ', Country' style segment
    where 'Country' matches our country lexicon (case-insensitive),
    after basic normalization. Otherwise returns (False, None).
    """
    if not org:
        return (False, None)

    org_norm = normalize_text(org)
    for m in COMMA_COUNTRY_RE.finditer(org_norm):
        raw_tail = m.group(1).strip()
        # stop at typical terminators
        raw_tail = re.split(r"[.;,)]+", raw_tail)[0].strip()

        # try single or first 1-3 tokens (to catch things like 'Republic of Korea', 'United States',
        # and also variants like 'U.S.A.' and 'U.K.')
        candidates = set()

        # exact tail
        candidates.add(raw_tail.lower())

        # progressively shorten to first k tokens (k up to 4)
        toks = raw_tail.split()
        for k in range(1, min(4, len(toks)) + 1):
            candidates.add(" ".join(toks[:k]).lower())

        # punctuation-stripped variants (e.g., 'U.S.A.' -> 'usa')
        punct_free = re.sub(r"[^\w\s]","", raw_tail).lower()
        if punct_free:
            candidates.add(punct_free)
            toks2 = punct_free.split()
            for k in range(1, min(4, len(toks2)) + 1):
                candidates.add(" ".join(toks2[:k]).lower())

        # final check against lexicon
        for cand in candidates:
            cand_norm = re.sub(r"\s+", " ", cand).strip()
            if cand_norm in COUNTRIES:
                return (True, cand_norm)

    return (False, None)

# === Stream filter ===
def iter_jsonl(path: Path):
    with path.open("r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                yield json.loads(line)
            except Exception:
                # Skip malformed line
                continue

def output_name_for(input_path: Path) -> Path:
    # v3.1_oag_author.part00000.jsonl -> v3.1_oag_author.part00000.filtered.jsonl
    return OUTPUT_DIR / (input_path.stem + ".filtered.jsonl")

def filter_batch_file(in_path: Path) -> dict:
    keep = 0
    total = 0
    out_path = output_name_for(in_path)
    with out_path.open("w", encoding="utf-8") as out_f:
        for obj in iter_jsonl(in_path):
            total += 1
            id_  = obj.get("id")
            name = obj.get("name")
            org  = obj.get("org")

            # Must have required keys
            if not (id_ and name and org):
                continue

            has_country, _ = extract_comma_country(org)
            if has_country:
                keep += 1
                # Emit only the required fields in normalized form
                record = {
                    "id": str(id_),
                    "name": normalize_text(name),
                    "org": normalize_text(org),
                }
                out_f.write(json.dumps(record, ensure_ascii=False) + "\n")

    return {"file": in_path.name, "kept": keep, "total": total, "out": out_path.name}

# === Run over all parts ===
parts = sorted(INPUT_DIR.glob("v3.1_oag_author.part*.jsonl"))
if not parts:
    print(f"⚠️ No files matched in {INPUT_DIR}")
else:
    summary = []
    for p in parts:
        stats = filter_batch_file(p)
        summary.append(stats)
        print(f"✓ {stats['file']}: kept {stats['kept']}/{stats['total']} → {stats['out']}")

    kept_sum = sum(s["kept"] for s in summary)
    total_sum = sum(s["total"] for s in summary)
    print("\n==== Done ====")
    print(f"Total kept: {kept_sum} / {total_sum} ({(kept_sum/total_sum*100 if total_sum else 0):.2f}%)")
    print(f"Output dir: {OUTPUT_DIR}")


In [17]:
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd

COUNTER = Counter()  # to collect country frequencies globally

def extract_countries(org: str):
    """Return list of all distinct matched country names (normalized)"""
    if not org:
        return []
    org_norm = normalize_text(org)
    matches = []
    for m in COMMA_COUNTRY_RE.finditer(org_norm):
        raw_tail = m.group(1).strip()
        raw_tail = re.split(r"[.;,)]+", raw_tail)[0].strip()

        toks = raw_tail.split()
        # try combos up to 4 tokens
        candidates = {raw_tail.lower()}
        for k in range(1, min(4, len(toks)) + 1):
            candidates.add(" ".join(toks[:k]).lower())
        punct_free = re.sub(r"[^\w\s]", "", raw_tail).lower()
        if punct_free:
            candidates.add(punct_free)
            toks2 = punct_free.split()
            for k in range(1, min(4, len(toks2)) + 1):
                candidates.add(" ".join(toks2[:k]).lower())

        for cand in candidates:
            cand_norm = re.sub(r"\s+", " ", cand.strip())
            if cand_norm in COUNTRIES:
                matches.append(cand_norm)
                break
    # unique normalized list
    return sorted(set(matches))

def filter_batch_file_with_country(in_path: Path) -> dict:
    keep = 0
    total = 0
    out_path = output_name_for(in_path)
    with out_path.open("w", encoding="utf-8") as out_f:
        for obj in iter_jsonl(in_path):
            total += 1
            id_  = obj.get("id")
            name = obj.get("name")
            org  = obj.get("org")
            if not (id_ and name and org):
                continue

            countries = extract_countries(org)
            if len(countries) == 1:
                country = countries[0]
                keep += 1
                COUNTER[country] += 1
                record = {
                    "id": str(id_),
                    "name": normalize_text(name),
                    "org": normalize_text(org),
                    "country": country,
                }
                out_f.write(json.dumps(record, ensure_ascii=False) + "\n")

    return {"file": in_path.name, "kept": keep, "total": total, "out": out_path.name}

# === Run across all parts ===
parts = sorted(INPUT_DIR.glob("v3.1_oag_author.part*.jsonl"))
summary = []
for p in parts:
    stats = filter_batch_file_with_country(p)
    summary.append(stats)
    print(f"✓ {stats['file']}: kept {stats['kept']}/{stats['total']} → {stats['out']}")

kept_sum = sum(s["kept"] for s in summary)
total_sum = sum(s["total"] for s in summary)
print("\n==== Done ====")
print(f"Total kept: {kept_sum} / {total_sum} ({(kept_sum/total_sum*100 if total_sum else 0):.2f}%)")


✓ v3.1_oag_author.part00000.jsonl: kept 7296/100000 → v3.1_oag_author.part00000.filtered.jsonl
✓ v3.1_oag_author.part00001.jsonl: kept 4951/100000 → v3.1_oag_author.part00001.filtered.jsonl
✓ v3.1_oag_author.part00002.jsonl: kept 3098/100000 → v3.1_oag_author.part00002.filtered.jsonl
✓ v3.1_oag_author.part00003.jsonl: kept 3922/100000 → v3.1_oag_author.part00003.filtered.jsonl
✓ v3.1_oag_author.part00004.jsonl: kept 6736/100000 → v3.1_oag_author.part00004.filtered.jsonl
✓ v3.1_oag_author.part00005.jsonl: kept 3924/100000 → v3.1_oag_author.part00005.filtered.jsonl
✓ v3.1_oag_author.part00006.jsonl: kept 1538/100000 → v3.1_oag_author.part00006.filtered.jsonl
✓ v3.1_oag_author.part00007.jsonl: kept 6689/100000 → v3.1_oag_author.part00007.filtered.jsonl
✓ v3.1_oag_author.part00008.jsonl: kept 5295/100000 → v3.1_oag_author.part00008.filtered.jsonl
✓ v3.1_oag_author.part00009.jsonl: kept 3689/100000 → v3.1_oag_author.part00009.filtered.jsonl
✓ v3.1_oag_author.part00010.jsonl: kept 2809/10000