<a href="https://colab.research.google.com/github/LaoM10617/CrossLingual_InformationRetrieval2526/blob/main/Constructing_Datasets2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 挂载google Drive&设置缓存
#@title 0) Mount Drive (optional) & set cache dir
USE_DRIVE = True  #@param {type:"boolean"}

if USE_DRIVE:
    from google.colab import drive
    drive.mount('/content/drive')
    import os
    os.environ["HF_DATASETS_CACHE"] = "/content/drive/MyDrive/hf_cache"
    BASE_DIR = "/content/drive/MyDrive/mini_kept_rag"
else:
    BASE_DIR = "/content/mini_kept_rag"

import os, pathlib
pathlib.Path(BASE_DIR).mkdir(parents=True, exist_ok=True)
print("BASE_DIR =", BASE_DIR)
print("HF_DATASETS_CACHE =", os.environ.get("HF_DATASETS_CACHE"))

Mounted at /content/drive
BASE_DIR = /content/drive/MyDrive/mini_kept_rag
HF_DATASETS_CACHE = /content/drive/MyDrive/hf_cache


In [7]:
#@title 1) Install deps
!pip install datasets>=2.19.0
!pip install pandas>=2.0.0
!pip install numpy>=1.24.0
!pip install sentence-transformers>=2.7.0
!pip install tqdm>=4.66.0
!pip install faiss-cpu
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.4.0


In [4]:
#@title 2) Config
import re

LANGS = ["en","de","zh"]        # 可加 "id","vi","th" 等
PER_LANG_MAX = 20000            # 每语种抽取上限（先小后大）
MIN_TOK, MAX_TOK = 20, 300

# 文化主题关键词（标题+正文）：可按需要调整
TOPIC_RE = re.compile(r"(festival|holiday|cuisine|food|dish|wedding|custom|tradition|greeting|family)", re.I)
print("Config:", LANGS, PER_LANG_MAX, MIN_TOK, MAX_TOK, TOPIC_RE.pattern)

Config: ['en', 'de', 'zh'] 20000 20 300 (festival|holiday|cuisine|food|dish|wedding|custom|tradition|greeting|family)


In [8]:
#@title 3) Stream finewiki & write per-language JSONL
import os, json
from tqdm import tqdm
from datasets import load_dataset
from unidecode import unidecode

mini_dir = f"{BASE_DIR}/mini_finewiki"
os.makedirs(mini_dir, exist_ok=True)

def tok_count(s): return len((s or "").split())

def keep_example(ex):
    t = (ex.get("title") or "") + " " + (ex.get("text") or "")
    if not TOPIC_RE.search(unidecode(t)):
        return False
    n = tok_count(ex.get("text",""))
    return MIN_TOK <= n <= MAX_TOK

for lang in LANGS:
    print(f"[{lang}] streaming…")
    ds = load_dataset("HuggingFaceFW/finewiki", lang, split="train", streaming=True)
    out_path = f"{mini_dir}/{lang}.jsonl"
    kept, buf, BATCH = 0, [], 512
    with open(out_path, "w", encoding="utf-8") as f:
        for ex in tqdm(ds, total=None):
            if keep_example(ex):
                row = {
                    "in_language": lang,
                    "title": (ex.get("title") or "").strip(),
                    "text": (ex.get("text") or "").strip(),
                    "wikidata_id": (ex.get("wikidata_id") or "").strip()
                }
                buf.append(json.dumps(row, ensure_ascii=False))
                if len(buf) >= BATCH:
                    f.write("\n".join(buf) + "\n")
                    buf.clear()
                kept += 1
                if kept >= PER_LANG_MAX:
                    break
        if buf:
            f.write("\n".join(buf) + "\n")
    print(f"[{lang}] kept = {kept}, saved → {out_path}")

[en] streaming…


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

247891it [07:45, 532.22it/s]


[en] kept = 20000, saved → /content/drive/MyDrive/mini_kept_rag/mini_finewiki/en.jsonl
[de] streaming…


1199491it [38:28, 519.64it/s]


[de] kept = 20000, saved → /content/drive/MyDrive/mini_kept_rag/mini_finewiki/de.jsonl
[zh] streaming…


1295955it [27:56, 772.83it/s]

[zh] kept = 4614, saved → /content/drive/MyDrive/mini_kept_rag/mini_finewiki/zh.jsonl





In [10]:
#@title 4) Build CLK via wikidata_id
import json, uuid
from collections import defaultdict
import numpy as np

paths = {lang: f"{mini_dir}/{lang}.jsonl" for lang in LANGS}

# 建轻量索引：wid -> [{title,text}, ...]
index = {lang: defaultdict(list) for lang in LANGS}
for lang, p in paths.items():
    with open(p, "r", encoding="utf-8") as f:
        for line in f:
            ex = json.loads(line)
            wid = (ex.get("wikidata_id") or "").strip()
            if wid and wid.lower() != "none":
                index[lang][wid].append({"title": ex["title"], "text": ex["text"]})

clk_out = f"{mini_dir}/CLK.jsonl"
cnt = 0
with open(clk_out, "w", encoding="utf-8") as fout:
    all_wids = list(set(k for lang in LANGS for k in index[lang].keys()))
    for wid in all_wids:
        langs_have = [lang for lang in LANGS if wid in index[lang]]
        if len(langs_have) < 2:
            continue
        # 两两配对，每对抽1条
        for i in range(len(langs_have)):
            for j in range(i+1, len(langs_have)):
                l1, l2 = langs_have[i], langs_have[j]
                r1 = np.random.choice(index[l1][wid])
                r2 = np.random.choice(index[l2][wid])
                pair = {
                    "pair_id": f"CLK_{uuid.uuid4().hex[:12]}",
                    "type": "CLK",
                    "lang_1": l1, "lang_2": l2,
                    "text_1": r1["text"], "text_2": r2["text"],
                    "entity_id": wid,
                    "title_1": r1["title"], "title_2": r2["title"],
                    "alignment_source": "wikidata_id",
                    "source": "finewiki"
                }
                fout.write(json.dumps(pair, ensure_ascii=False) + "\n")
                cnt += 1
print("CLK pairs:", cnt, "→", clk_out)

CLK pairs: 121 → /content/drive/MyDrive/mini_kept_rag/mini_finewiki/CLK.jsonl


In [11]:
#@title 5) Build ILK within-language
import json, uuid

def split_two_chunks(text, min_tok=15):
    s = (text or "").split(". ")
    if len(s) < 3: return None
    a = " ".join(s[:max(1, len(s)//3)]).strip()
    b = " ".join(s[-max(1, len(s)//3):]).strip()
    if len(a.split()) < min_tok or len(b.split()) < min_tok:
        return None
    return a, b

ilk_out = f"{mini_dir}/ILK.jsonl"
with open(ilk_out, "w", encoding="utf-8") as fout:
    for lang, p in paths.items():
        with open(p, "r", encoding="utf-8") as f:
            for line in f:
                ex = json.loads(line)
                two = split_two_chunks(ex["text"])
                if not two: continue
                a, b = two
                pair = {
                    "pair_id": f"ILK_{uuid.uuid4().hex[:12]}",
                    "type": "ILK",
                    "lang": lang,
                    "text_1": a, "text_2": b,
                    "entity_id": ex.get("wikidata_id",""),
                    "title_1": ex["title"], "title_2": ex["title"],
                    "source": "finewiki"
                }
                fout.write(json.dumps(pair, ensure_ascii=False) + "\n")
print("ILK written →", ilk_out)

ILK written → /content/drive/MyDrive/mini_kept_rag/mini_finewiki/ILK.jsonl


In [12]:
#@title 6) Merge & split to train/dev/test JSONL
import json, random, os
random.seed(42)

pairs_dir = f"{BASE_DIR}/pairs"
os.makedirs(pairs_dir, exist_ok=True)

def stream_jsonl(path):
    with open(path,"r",encoding="utf-8") as f:
        for line in f:
            yield json.loads(line)

buf = []
for p in [f"{mini_dir}/CLK.jsonl", f"{mini_dir}/ILK.jsonl"]:
    for obj in stream_jsonl(p):
        buf.append(obj)
random.shuffle(buf)

n = len(buf)
t = int(n*0.9); d = int(n*0.05)
splits = {"train": buf[:t], "dev": buf[t:t+d], "test": buf[t+d:]}
for k, items in splits.items():
    outp = f"{pairs_dir}/mini_kept_pairs.{k}.jsonl"
    with open(outp, "w", encoding="utf-8") as f:
        for x in items:
            f.write(json.dumps(x, ensure_ascii=False)+"\n")
    print(k, "→", outp, ":", len(items))

with open(f"{pairs_dir}/stats.json","w",encoding="utf-8") as f:
    json.dump({k: len(v) for k,v in splits.items()}, f, indent=2, ensure_ascii=False)
print("Done.")

train → /content/drive/MyDrive/mini_kept_rag/pairs/mini_kept_pairs.train.jsonl : 26568
dev → /content/drive/MyDrive/mini_kept_rag/pairs/mini_kept_pairs.dev.jsonl : 1476
test → /content/drive/MyDrive/mini_kept_rag/pairs/mini_kept_pairs.test.jsonl : 1477
Done.


In [13]:
#@title 7) Tiny dual-encoder training demo
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

base_model = "sentence-transformers/distiluse-base-multilingual-cased-v2"
model = SentenceTransformer(base_model)

train_ds = load_dataset("json", data_files={"train": f"{pairs_dir}/mini_kept_pairs.train.jsonl"})["train"]
def to_examples(split):
    exs = []
    for r in split:
        exs.append(InputExample(texts=[r["text_1"], r["text_2"]]))
    return exs

train_examples = to_examples(train_ds)
train_loader = DataLoader(train_examples, shuffle=True, batch_size=64)
train_loss = losses.MultipleNegativesRankingLoss(model)

# Demo 用 1 epoch；正式建议 3–5 epoch（Colab GPU 约 6–8 h, 50k 样本量级）
model.fit(train_objectives=[(train_loader, train_loss)], epochs=1, warmup_steps=500, use_amp=True)
model.save(f"{BASE_DIR}/mini_kept_encoder")
print("Saved encoder →", f"{BASE_DIR}/mini_kept_encoder")

modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlaom10617[0m ([33mlaom10617-ludwig-maximilian-university-of-munich[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


Saved encoder → /content/drive/MyDrive/mini_kept_rag/mini_kept_encoder


In [None]:
#@title 8) Build FAISS index for RAG
import faiss, json
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer(f"{BASE_DIR}/mini_kept_encoder")

def corpus_iter():
    # 用 pair 生成索引语料，并去重
    seen = set()
    for split in ["train","dev","test"]:
        path = f"{pairs_dir}/mini_kept_pairs.{split}.jsonl"
        with open(path,"r",encoding="utf-8") as f:
            for line in f:
                r = json.loads(line)
                if r["type"]=="ILK":
                    for lang,title,text in [(r["lang"], r["title_1"], r["text_1"]),
                                            (r["lang"], r["title_2"], r["text_2"])]:
                        key=(lang,title,text[:64])
                        if key in seen: continue
                        seen.add(key)
                        yield {"lang":lang,"title":title,"text":text}
                else:
                    for (lang,title,text) in [(r.get("lang_1",""), r.get("title_1",""), r["text_1"]),
                                             (r.get("lang_2",""), r.get("title_2",""), r["text_2"])]:
                        key=(lang,title,text[:64])
                        if key in seen: continue
                        seen.add(key)
                        yield {"lang":lang,"title":title,"text":text}

meta = []
DIM = 768
index = faiss.IndexFlatIP(DIM)

BATCH = 256
batch_texts, batch_meta = [], []
for item in tqdm(corpus_iter()):
    batch_texts.append(item["text"])
    batch_meta.append(item)
    if len(batch_texts) >= BATCH:
        embs = encoder.encode(batch_texts, normalize_embeddings=True, convert_to_numpy=True, batch_size=256, show_progress_bar=False)
        index.add(embs)
        meta.extend(batch_meta)
        batch_texts, batch_meta = [], []

if batch_texts:
    embs = encoder.encode(batch_texts, normalize_embeddings=True, convert_to_numpy=True, batch_size=256, show_progress_bar=False)
    index.add(embs); meta.extend(batch_meta)

faiss.write_index(index, f"{BASE_DIR}/rag_index.faiss")
with open(f"{BASE_DIR}/rag_meta.jsonl","w",encoding="utf-8") as f:
    for m in meta:
        f.write(json.dumps(m, ensure_ascii=False)+"\n")

print("Index size:", index.ntotal)
print("Saved:", f"{BASE_DIR}/rag_index.faiss")
print("Saved:", f"{BASE_DIR}/rag_meta.jsonl")

In [None]:
#@title 9) Quick retrieval test
import faiss, json, numpy as np
from sentence_transformers import SentenceTransformer

index = faiss.read_index(f"{BASE_DIR}/rag_index.faiss")
encoder = SentenceTransformer(f"{BASE_DIR}/mini_kept_encoder")

def preview_meta(path, n=3):
    out = []
    with open(path,"r",encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i>=n: break
            out.append(json.loads(line))
    return out

print(preview_meta(f"{BASE_DIR}/rag_meta.jsonl", 3))

query = "What food is commonly eaten during Chinese New Year?"
q = encoder.encode([query], normalize_embeddings=True, convert_to_numpy=True)
D, I = index.search(q, 5)
print("Scores:", D[0])

# read back the top-5
hits = []
with open(f"{BASE_DIR}/rag_meta.jsonl","r",encoding="utf-8") as f:
    meta_lines = f.readlines()
for idx in I[0]:
    hits.append(json.loads(meta_lines[idx]))

for h in hits:
    print(f"[{h['lang']}] {h['title']}: {h['text'][:200]}…")