In [None]:
!pip install nltk





In [None]:
import random
import json
from itertools import product
import os

# Logic templates as (premise, hypothesis, label) triples. "{A}" and "{B}" are
# filled with plural nouns. Labels follow the traditional reading in which
# "All A are B" presupposes that at least one A exists; the entailment
# template below relies on that reading.
TEMPLATES = [
    # Entailment: "All A are B" implies "Some A are B".
    ("All {A} are {B}.", "Some {A} are {B}.", "entailment"),
    # Contradiction: "All A are B" and "No A are B" cannot both be true.
    ("All {A} are {B}.", "No {A} are {B}.", "contradiction"),
    # Neutral: the converse "All B are A" is neither implied nor ruled out.
    # "Some B are A" must NOT be used here: it is logically equivalent to
    # "Some A are B", which the first template labels as entailed.
    ("All {A} are {B}.", "All {B} are {A}.", "neutral")
]

# Noun pools used to fill the templates: A_NOUNS go into the "{A}" slot,
# B_NOUNS into the "{B}" slot. All entries are plural.
A_NOUNS = [
    "dragons", "robots", "teachers", "aliens", "wizards",
    "carrots", "knights", "monkeys", "computers", "ghosts",
    "doctors", "soldiers", "mermaids", "pirates", "zombies",
    "students", "dinosaurs", "ogres", "cows", "musicians",
    "chefs", "ghost hunters", "painters", "goblins", "clowns",
    "farmers", "bakers", "spies", "scientists", "inventors",
]
B_NOUNS = [
    "animals", "machines", "creatures", "vegetables", "warriors",
    "objects", "humans", "entities", "tools", "beings",
    "professionals", "instruments", "foods", "species", "myths",
    "characters", "occupations", "devices", "intelligences", "structures",
    "organisms", "monsters", "things", "concepts", "forces",
    "vehicles", "technologies", "roles", "symbols", "systems",
]

# How many examples to generate
NUM_EXAMPLES = 1_000

# Where the generated JSONL file is written
OUTPUT_PATH = "synthetic_nli_english.jsonl"

def generate_examples(num_examples, seed=None, templates=None, a_nouns=None, b_nouns=None):
    """Build synthetic English NLI examples by filling logic templates with nouns.

    For every example one template, one A noun and one B noun are drawn
    independently and uniformly at random, so the same example can be
    produced more than once.

    Args:
        num_examples: Number of examples to generate. Zero or a negative
            number yields an empty list.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used, so the result is reproducible and the global generator is
            left untouched. When ``None`` (default) the module-level
            ``random`` state is used.
        templates: Optional sequence of ``(premise, hypothesis, label)``
            triples whose strings contain ``{A}`` / ``{B}`` placeholders.
            Defaults to the module-level ``TEMPLATES``.
        a_nouns: Optional pool for the ``{A}`` slot; defaults to ``A_NOUNS``.
        b_nouns: Optional pool for the ``{B}`` slot; defaults to ``B_NOUNS``.

    Returns:
        A list of dicts with the keys ``language`` (always ``"en"``),
        ``premise``, ``hypothesis`` and ``label``.
    """
    # The module itself exposes the same choice() API as a Random instance.
    rng = random if seed is None else random.Random(seed)
    templates = TEMPLATES if templates is None else templates
    a_nouns = A_NOUNS if a_nouns is None else a_nouns
    b_nouns = B_NOUNS if b_nouns is None else b_nouns

    examples = []
    for _ in range(num_examples):
        # Keep the draw order (template, A, B) so runs that seed the global
        # generator produce the same sequence as before.
        premise_tpl, hypothesis_tpl, label = rng.choice(templates)
        A = rng.choice(a_nouns)
        B = rng.choice(b_nouns)
        examples.append({
            "language": "en",
            "premise": premise_tpl.format(A=A, B=B),
            "hypothesis": hypothesis_tpl.format(A=A, B=B),
            "label": label
        })
    return examples

# Generate and save the dataset.
# Seed the global generator first so that "Restart & Run All" rebuilds exactly
# the same examples; generate_examples(NUM_EXAMPLES) draws from the
# module-level `random` state.
SEED = 42
random.seed(SEED)
dataset = generate_examples(NUM_EXAMPLES)

# Save to JSONL: one JSON object per line.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    for example in dataset:
        # ensure_ascii=False matches how the code-switched files are written.
        json.dump(example, f, ensure_ascii=False)
        f.write("\n")

OUTPUT_PATH


'/content/drive/MyDrive/THESIS/Multilingual Stress Testing LLMs - Entailment/synthetic_nli_english.jsonl'

# Translating the English dataset into the target languages

In [None]:
import json
from typing import List
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import os

# Build a MarianMT translation pipeline with Hugging Face Transformers
def get_translation_pipeline(src_lang: str, tgt_lang: str):
    """Return a ``"translation"`` pipeline for one source/target language pair.

    The checkpoint id is ``Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}``; its
    tokenizer and seq2seq model are loaded with ``from_pretrained`` and handed
    to ``pipeline``.
    """
    checkpoint = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
    mt_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    mt_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    return pipeline("translation", model=mt_model, tokenizer=mt_tokenizer)

# Translate a batch of texts
def translate_batch(texts: List[str], translator, batch_size: int = 8) -> List[str]:
    """Translate ``texts`` with ``translator`` and return the translated strings.

    Args:
        texts: Sentences to translate.
        translator: Callable that accepts the texts plus ``batch_size`` and
            ``truncation`` keyword arguments and returns one mapping with a
            ``"translation_text"`` key per input.
        batch_size: Value forwarded to ``translator`` as ``batch_size``.

    Returns:
        The ``"translation_text"`` value of each result, in the order the
        translator returned them. An empty input returns ``[]`` without
        calling the translator.
    """
    if not texts:
        # Nothing to translate; skip the model call entirely.
        return []
    results = translator(texts, batch_size=batch_size, truncation=True)
    return [r['translation_text'] for r in results]

# Target languages (language code -> name), picked from different families
TARGET_LANGUAGES = dict(
    fr="French",   # Romance
    de="German",   # Germanic
    sw="Swahili",  # Bantu
    hi="Hindi",    # Indic
    ar="Arabic",   # Semitic
)

# Read the English NLI examples: every line of the file is one JSON object
INPUT_PATH = "synthetic_nli_english.jsonl"
with open(INPUT_PATH, "r", encoding="utf-8") as f:
    english_data = list(map(json.loads, f))

# Folder that receives one translated file per target language
output_dir = "multilingual_nli"
os.makedirs(output_dir, exist_ok=True)

# Translate and save for each language
for tgt_lang_code, lang_name in TARGET_LANGUAGES.items():
    print(f"Translating to {lang_name} ({tgt_lang_code})...")
    translator = get_translation_pipeline("en", tgt_lang_code)

    # Hand all premises followed by all hypotheses to translate_batch in a
    # single call, so the pipeline can batch them instead of running the model
    # on one sentence at a time.
    premises = [example["premise"] for example in english_data]
    hypotheses = [example["hypothesis"] for example in english_data]
    translations = translate_batch(premises + hypotheses, translator) if english_data else []

    # Errors are deliberately not swallowed per example: dropping a row would
    # shift every later row, and the translated files are paired line by line
    # across languages when the code-switched datasets are built.
    if len(translations) != len(premises) + len(hypotheses):
        raise RuntimeError(
            f"Translation to {lang_name} ({tgt_lang_code}) returned "
            f"{len(translations)} results for {len(premises) + len(hypotheses)} inputs."
        )
    trans_premises = translations[:len(premises)]
    trans_hypotheses = translations[len(premises):]

    translated_data = [
        {
            "language": tgt_lang_code,
            "premise": trans_premise,
            "hypothesis": trans_hypothesis,
            "label": example["label"]
        }
        for example, trans_premise, trans_hypothesis
        in zip(english_data, trans_premises, trans_hypotheses)
    ]

    # Save translated dataset (one JSON object per line)
    output_path = os.path.join(output_dir, f"nli_{tgt_lang_code}.jsonl")
    with open(output_path, "w", encoding="utf-8") as f:
        for item in translated_data:
            # ensure_ascii=False stores the translated text as UTF-8 rather
            # than \uXXXX escapes, matching the code-switched output files.
            json.dump(item, f, ensure_ascii=False)
            f.write("\n")

output_dir


# Code Switching

In [None]:
import os
import json
from itertools import permutations
from tqdm import tqdm

# Paths
DATA_DIR = "multilingual_nli"
OUTPUT_DIR = "code_switch"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Get all available languages from file names of the form "nli_<lang>.jsonl".
# Sorted because os.listdir returns entries in arbitrary order; the code is
# taken as everything between prefix and suffix so codes that contain "_" or
# "." still map back to their own file.
FILE_PREFIX = "nli_"
FILE_SUFFIX = ".jsonl"
lang_files = sorted(
    name for name in os.listdir(DATA_DIR)
    if name.startswith(FILE_PREFIX) and name.endswith(FILE_SUFFIX)
)
languages = [name[len(FILE_PREFIX):-len(FILE_SUFFIX)] for name in lang_files]

# A code-switched pair needs two different languages.
if len(languages) < 2:
    raise ValueError(
        f"Need at least two nli_<lang>.jsonl files in {DATA_DIR!r}, found {len(languages)}."
    )

# Load all datasets into memory
lang_data = {}
for lang in languages:
    file_path = os.path.join(DATA_DIR, f"nli_{lang}.jsonl")
    with open(file_path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline) carry no example.
        lang_data[lang] = [json.loads(line) for line in f if line.strip()]

# Ensure all datasets are the same length
example_count = len(next(iter(lang_data.values())))
assert all(len(data) == example_count for data in lang_data.values()), "Mismatch in dataset lengths."

# Generate code-switched pairs for all language permutations (L1 ≠ L2)
for lang1, lang2 in permutations(languages, 2):
    switched_data = []

    for row, (ex1, ex2) in enumerate(zip(lang_data[lang1], lang_data[lang2])):
        # Rows are paired purely by position, so differing labels mean the
        # two files are not aligned and the mixed pair would be mislabeled.
        if ex1["label"] != ex2["label"]:
            raise ValueError(
                f"Row {row}: label {ex1['label']!r} in {lang1!r} does not match "
                f"{ex2['label']!r} in {lang2!r}; the datasets are not aligned."
            )
        switched_data.append({
            "premise": ex1["premise"],
            "hypothesis": ex2["hypothesis"],
            "label": ex1["label"],
            "lang_premise": lang1,
            "lang_hypothesis": lang2,
            "codeswitch_type": f"{lang1}_prem_{lang2}_hyp"
        })

    # NOTE(review): the content is JSON Lines although the extension is
    # ".json"; the name is left unchanged in case later steps read it.
    output_file = f"codeswitch_{lang1}_prem_{lang2}_hyp.json"
    output_path = os.path.join(OUTPUT_DIR, output_file)
    with open(output_path, "w", encoding="utf-8") as f:
        for ex in switched_data:
            json.dump(ex, f, ensure_ascii=False)
            f.write("\n")

print("✅ All language pair code-switched datasets created.")


✅ All language pair code-switched datasets created.
