In [1]:
from pathlib import Path
import re, json, unicodedata, string
from collections import Counter, defaultdict
from tqdm import tqdm

# Directory that holds the *.filtered.jsonl input batches.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
INPUT_DIR = Path(r"C:\Users\XxRui\Desktop\newData\output")

# Merged batches go into a sub-folder of the input directory
# (one output file per input file); the folder is created on first run.
OUTPUT_DIR = INPUT_DIR.joinpath("country_merged_batches")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Inclusive range of part indices to process: part00000 … part00300.
START_IDX = 0
END_IDX = 300


In [3]:
def strip_accents(s: str) -> str:
    """Return `s` with combining marks removed.

    The string is decomposed with NFKD and every character whose canonical
    combining class is non-zero is dropped. The result is left in decomposed
    form (it is not recomposed afterwards).
    """
    decomposed = unicodedata.normalize("NFKD", s)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))

# Translation table that deletes every ASCII punctuation character.
PUNC_TABLE = str.maketrans("", "", string.punctuation)

def norm_key(s: str) -> str:
    """Build a lookup key: strip accents, lowercase, remove punctuation, collapse spaces.

    Args:
        s: Raw text. Any falsy value (``""``, ``None``) yields ``""``.

    Returns:
        The normalized key, with runs of whitespace collapsed to a single
        space and leading/trailing whitespace removed.
    """
    if not s:
        return ""
    s = strip_accents(s)
    s = s.lower().translate(PUNC_TABLE)
    # string.punctuation only covers ASCII, so characters such as the curly
    # apostrophe (U+2019) or an en dash used to survive and produced keys that
    # differed from their ASCII-punctuated twins. Drop every character in a
    # Unicode punctuation category (P*) as well.
    s = "".join(ch for ch in s if not unicodedata.category(ch).startswith("P"))
    s = re.sub(r"\s+", " ", s).strip()
    return s


In [4]:
# Lookup table: normalized alias key -> lowercase canonical country name.
# Built as a defaultdict whose factory returns None, so indexing an unknown
# key yields None instead of raising KeyError.
ALIASES = defaultdict(lambda: None)

def add_alias(alias, canonical):
    """Register `alias` as a spelling of `canonical`.

    The alias is stored under its `norm_key` form and the canonical name is
    stored lowercased. Registering the same normalized alias twice overwrites
    the earlier entry.
    """
    canonical_name = canonical.lower()
    ALIASES[norm_key(alias)] = canonical_name

# --- Merged canonical forms (all lowercase) ---
# Each entry is (canonical name, spellings that should collapse into it).
# Entries are registered top to bottom, spellings left to right.
ALIAS_GROUPS = (
    # united states
    ("usa", (
        "usa", "u.s.a.", "u.s.", "us",
        "united states", "united states of america", "america",
    )),
    # china (includes hong kong, macau)
    ("china", (
        "china", "prc", "people's republic of china", "peoples republic of china",
        "中国", "中國", "zhongguo",
        "hong kong", "香港", "香港特別行政區", "xianggang", "香港特别行政区",
        "macau", "macao", "澳门", "澳門", "aomen", "澳門特別行政區", "澳门特别行政区",
    )),
    # taiwan (ROC)
    ("taiwan", ("taiwan", "台灣", "臺灣", "republic of china", "roc", "r.o.c.")),
    ("poland", ("polska",)),
    ("japan", ("日本",)),
    ("hungary", ("magyarorszag",)),
    ("united arab emirates", ("uae",)),
    # korea
    ("south korea", ("south korea", "republic of korea", "korea", "대한민국")),
    ("north korea", ("north korea", "dprk", "democratic people's republic of korea")),
    # uk and home nations
    ("united kingdom", (
        "uk", "u.k.", "united kingdom", "great britain", "gb", "g.b.",
        "england", "scotland", "wales", "northern ireland",
    )),
    # russia
    ("russia", (
        "russia", "russian federation", "россия",
        "российская федерация", "rossiyskaya federatsiya",
    )),
)

for canonical_name, spellings in ALIAS_GROUPS:
    for spelling in spellings:
        add_alias(spelling, canonical_name)

# european and language variants (one spelling -> one canonical name)
alias_pairs = {
    "méxico": "mexico",
    "brasil": "brazil",
    "deutschland": "germany",
    "sverige": "sweden",
    "österreich": "austria",
    "schweiz": "switzerland",
    "suomi": "finland",
    "norge": "norway",
    "danmark": "denmark",
    "españa": "spain",
    "italia": "italy",
    "the netherlands": "netherlands",
    "côte d’ivoire": "côte d’ivoire",
    "cote d’ivoire": "côte d’ivoire",
    "cote d'ivoire": "côte d’ivoire",
    "ivory coast": "côte d’ivoire",
}
for variant in alias_pairs:
    add_alias(variant, alias_pairs[variant])


In [7]:
def normalize_country_to_canonical(country_raw: str) -> str:
    """Return the lowercase canonical country name for a raw country string.

    Lookup order:
      1. the `norm_key` form of the input in ALIASES;
      2. if that key starts with "the ", the key without the article;
      3. otherwise the normalized key itself.

    Args:
        country_raw: Raw country text. Any falsy value yields ``""``.

    Returns:
        The canonical name registered in ALIASES, or the normalized key when
        no alias matches.
    """
    if not country_raw:
        return ""
    key = norm_key(country_raw)

    canon = ALIASES.get(key)
    if canon:
        return canon

    # The former "remove periods and retry" step was dropped: norm_key already
    # deletes "." together with the rest of string.punctuation, so that retry
    # always repeated the lookup above with an identical key.

    # Retry without a leading article, e.g. "the united states".
    if key.startswith("the "):
        canon = ALIASES.get(key[4:])
        if canon:
            return canon

    # Default: normalized lowercase itself
    return key


In [9]:
def part_index_from_name(name: str):
    """Extract the numeric part index from a batch file name.

    The name must end in ``.partNNNNN.filtered.jsonl`` (five digits).
    Returns the index as an int, or None when the name does not match.
    """
    match = re.search(r"\.part(\d{5})\.filtered\.jsonl$", name)
    if match is None:
        return None
    return int(match.group(1))

def out_path_for(in_path: Path) -> Path:
    """Map an input batch path to its output path inside OUTPUT_DIR.

    The file name is kept, with ".filtered.jsonl" replaced by
    ".country_merged.jsonl".
    """
    merged_name = in_path.name.replace(".filtered.jsonl", ".country_merged.jsonl")
    return OUTPUT_DIR / merged_name

# Rewrite every selected batch with its "country" field replaced by the
# canonical lowercase name, and tally the resulting country distribution.
files = sorted(INPUT_DIR.glob("v3.1_oag_author.part*.filtered.jsonl"))

total_in, total_out = 0, 0
total_skipped = 0  # non-blank lines dropped because they were not a JSON object
merged_counter = Counter()
unknown_samples = Counter()

for fp in files:
    # Only process files whose part index lies in [START_IDX, END_IDX].
    idx = part_index_from_name(fp.name)
    if idx is None or not (START_IDX <= idx <= END_IDX):
        continue

    out_fp = out_path_for(fp)
    written = 0
    skipped = 0

    # errors="ignore" drops undecodable bytes instead of raising.
    with fp.open("r", encoding="utf-8", errors="ignore") as fin, \
         out_fp.open("w", encoding="utf-8") as fout:

        for line in fin:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except (ValueError, RecursionError):
                # json.JSONDecodeError is a ValueError subclass. Malformed
                # lines are still skipped, but counted so the loss is visible.
                skipped += 1
                continue
            if not isinstance(obj, dict):
                # Valid JSON that is not an object has no "country" field to
                # rewrite (and would crash on .get); skip and count it.
                skipped += 1
                continue

            total_in += 1
            raw_country = obj.get("country") or ""
            merged = normalize_country_to_canonical(raw_country)
            obj["country"] = merged.lower()

            fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
            written += 1
            total_out += 1
            merged_counter[merged] += 1

    total_skipped += skipped
    print(f"✓ {fp.name} → {out_fp.name} ({written} records)")
    if skipped:
        print(f"  ! skipped {skipped} unparseable line(s) in {fp.name}")

print(f"\n✅ Done. Processed {total_in:,} records, wrote {total_out:,} records.")
if total_skipped:
    print(f"Skipped {total_skipped:,} unparseable line(s) in total.")


✓ v3.1_oag_author.part00000.filtered.jsonl → v3.1_oag_author.part00000.country_merged.jsonl (7296 records)
✓ v3.1_oag_author.part00001.filtered.jsonl → v3.1_oag_author.part00001.country_merged.jsonl (4951 records)
✓ v3.1_oag_author.part00002.filtered.jsonl → v3.1_oag_author.part00002.country_merged.jsonl (3098 records)
✓ v3.1_oag_author.part00003.filtered.jsonl → v3.1_oag_author.part00003.country_merged.jsonl (3922 records)
✓ v3.1_oag_author.part00004.filtered.jsonl → v3.1_oag_author.part00004.country_merged.jsonl (6736 records)
✓ v3.1_oag_author.part00005.filtered.jsonl → v3.1_oag_author.part00005.country_merged.jsonl (3924 records)
✓ v3.1_oag_author.part00006.filtered.jsonl → v3.1_oag_author.part00006.country_merged.jsonl (1538 records)
✓ v3.1_oag_author.part00007.filtered.jsonl → v3.1_oag_author.part00007.country_merged.jsonl (6689 records)
✓ v3.1_oag_author.part00008.filtered.jsonl → v3.1_oag_author.part00008.country_merged.jsonl (5295 records)
✓ v3.1_oag_author.part00009.filtered.

In [11]:
import pandas as pd

# Build the per-country distribution table and save it as CSV.
# The grand total is computed once; it was previously re-summed for every row.
total_records = sum(merged_counter.values())

df = pd.DataFrame(
    [
        (country, count, count / total_records * 100)
        for country, count in merged_counter.items()
    ],
    columns=["country", "count", "percent"]
).sort_values("count", ascending=False)

out_csv = OUTPUT_DIR / "country_distribution_merged_lowercase.csv"
# Written with encoding "utf-8-sig", i.e. UTF-8 with a leading byte-order mark.
df.to_csv(out_csv, index=False, encoding="utf-8-sig")

print(f"Saved lowercase merged distribution → {out_csv}")


Saved lowercase merged distribution → C:\Users\XxRui\Desktop\newData\output\country_merged_batches\country_distribution_merged_lowercase.csv
