# Yoruba Proverbs CG‑CoT Full Study Notebook
This notebook implements the complete CG‑CoT study pipeline solely using `data/yoruba.json`:
- Zero-Shot (ZS)
- Zero-Shot + Chain-of-Thought (ZS-CoT)
- Few-Shot (FS)
- Real Cultural-KB CG‑CoT (RealKB-CG) (KB auto-generated from the dataset)
- Retrieval-Augmented Generation (RAG) for GPT and Anthropic
- Automatic evaluation (BLEU, BERTScore, Exact Match)
- GPT-based explanation scoring (accuracy, depth)
- Statistical analysis

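**Note:** No external KB file is required. On the first run the KB is generated by prompting the GPT model with a sample of the English translations in the JSON, and it is then cached in `cultural_kb.txt` for later runs.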

In [None]:
# 1. Imports & Setup
import ast
import hashlib
import json
import os
import random
import re
from pathlib import Path

from openai import OpenAI
import anthropic
from sentence_transformers import SentenceTransformer
import faiss
import sacrebleu
from bert_score import score
import numpy as np
from scipy.stats import ttest_rel, wilcoxon

# API keys are read from the environment and must never be written into the
# notebook. Assigning placeholder values to os.environ here would overwrite real
# keys that are already exported and make every API call fail authentication.
_missing_keys = [
    name for name in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY") if not os.getenv(name)
]
if _missing_keys:
    raise RuntimeError(
        "Missing API key environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Export them before starting the kernel."
    )
client = OpenAI()  # picks up OPENAI_API_KEY from the environment
anthropic_client = anthropic.Client(api_key=os.getenv("ANTHROPIC_API_KEY"))

# Paths
DATA_PATH = Path("data/yoruba.json")
RESULTS_DIR = Path("results")
RESULTS_DIR.mkdir(exist_ok=True)

# Experiment parameters
TEST_N = 100  # number of shuffled proverbs used as the test split
DEV_N = 20    # number of following proverbs used as the dev (few-shot) split
MODEL_GPT = "gpt-4o"
# NOTE(review): call_anthropic passes its own model id rather than this
# constant - confirm which model the study is meant to use.
MODEL_ANTHROPIC = "claude-3-5-haiku-20241022"
EMBED_MODEL = SentenceTransformer('all-MiniLM-L6-v2')


  from .autonotebook import tqdm as notebook_tqdm





In [62]:
# 2. Load & Sample Data
# Read the full list of proverb records from the dataset file.
with open(DATA_PATH, encoding="utf-8") as dataset_file:
    data = json.load(dataset_file)

# Fixed seed so the in-place shuffle, and therefore the split, is repeatable.
random.seed(42)
random.shuffle(data)

# The first TEST_N shuffled records are the test split; the next DEV_N are the dev split.
dev_end = TEST_N + DEV_N
test_set, dev_set = data[:TEST_N], data[TEST_N:dev_end]

print("Sample test items:")
for preview_item in test_set[:5]:
    print(preview_item)


Sample test items:
{'yoruba': 'Ojú ò rọ́lá rí; ó bímọ ẹ̀ ó sọ ọ́ ní Ọláníyọnu.', 'english': 'A person only newly acquainted with wealth; he has a son and names him Ọlaniyọnu.'}
{'yoruba': 'Ìwọ̀n eku nìwọ̀n ìtẹ́; olongo kì í gbé tìmùtìmù.', 'english': 'The measure of the rat is the measure of the nest; a robin does not live on a cushion.'}
{'yoruba': 'Ẹyẹ tó fi ara wé igún, ẹ̀hìn àdìrò ní ńsùn.', 'english': 'Whatever bird emulates the vulture will find itself behind the cooking hearth.'}
{'yoruba': 'Àwúrèbeé ní òún lè yẹ̀nà; ta ní jẹ́ tọ ọ̀nà àwúrèbe?', 'english': 'Àwúrebe says it can make a path; who would wish to follow a path it makes?'}
{'yoruba': 'Má tẹ̀ẹ́ lọ́wọ́ oníle, má tẹ̀ẹ́ lọ́wọ́ àlejò; lọ́wọ́ ara ẹni la ti ńtẹ́.', 'english': 'Save face with members of your household and save face with complete strangers, such a person loses face with himself/herself.'}


In [63]:
# 3. Auto-generate Cultural KB from the dataset
import os
import random
import json


def generate_kb_facts(data, kb_file="cultural_kb.txt", raw_file="raw_cultural_kb.txt"):
    """Load the cultural knowledge base from disk, or generate it with the GPT model.

    If ``kb_file`` exists, its non-blank lines are returned and no API call is
    made. Otherwise up to 50 entries are sampled from ``data`` (using the
    module-level ``random`` state), their English translations are sent to the
    model, and each non-blank line of the reply becomes one fact.

    Args:
        data: list of dicts that each have an ``"english"`` key.
        kb_file: path of the parsed, one-fact-per-line cache file.
        raw_file: path where the unparsed model reply is stored.

    Returns:
        A list of fact strings. The list is empty when the API call fails or
        the reply has no usable content; in that case ``kb_file`` is not
        written, so the next run tries again instead of loading an empty KB.
    """
    # Reuse a previously generated knowledge base when there is one.
    if os.path.exists(kb_file):
        print(f"[INFO] Loading knowledge base from {kb_file}")
        with open(kb_file, "r", encoding="utf-8") as f:
            # Skip blank lines so empty strings are never returned as facts.
            return [line.strip() for line in f if line.strip()]

    # Use a representative English subset to derive facts.
    sample = random.sample(data, min(len(data), 50))
    english_list = [entry["english"] for entry in sample]
    prompt = (
        # The trailing newline keeps the first bullet off the introduction line.
        "You are an expert on Yoruba culture. Given these Yoruba proverbs in English:\n"
        + "\n".join(f"- {t}" for t in english_list)
        + "\n\nProvide 10 concise bullet points on common cultural themes, symbolism, and usage contexts as a list."
    )

    print("[DEBUG] Prompt sent to LLM:")
    print(prompt)

    try:
        resp = client.chat.completions.create(
            model=MODEL_GPT,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,
            max_tokens=512
        )
    except Exception as e:
        print(f"[ERROR] Failed to get response from LLM: {e}")
        return []

    print("[DEBUG] Response received from LLM:")
    print(resp)

    # Extract the reply text; the response is an object, so a malformed one
    # surfaces as AttributeError/TypeError as well as IndexError/KeyError.
    try:
        content = resp.choices[0].message.content
    except (AttributeError, IndexError, KeyError, TypeError) as e:
        print(f"[ERROR] Failed to extract content from response: {e}")
        return []
    if not content:
        print("[ERROR] LLM returned an empty response; knowledge base not generated.")
        return []
    print("[DEBUG] Extracted content:")
    print(content)

    # One fact per non-blank line, with leading/trailing dashes and spaces removed.
    facts = [line.strip("- ").strip() for line in content.splitlines() if line.strip()]
    facts = [fact for fact in facts if fact]
    print("[INFO] Successfully parsed response into a list.")

    # Keep the raw reply for inspection (best effort).
    try:
        with open(raw_file, "w", encoding="utf-8") as f:
            f.write(content)
        print(f"[INFO] Raw response saved to {raw_file}")
    except Exception as e:
        print(f"[ERROR] Failed to save raw response to file: {e}")

    # Only cache a non-empty KB: an empty cache file would be loaded on every
    # later run and the KB would never be regenerated.
    if not facts:
        print("[WARNING] No facts parsed; knowledge base file not written.")
        return facts
    try:
        with open(kb_file, "w", encoding="utf-8") as f:
            f.write("\n".join(facts))
        print(f"[INFO] Knowledge base saved to {kb_file}")
    except Exception as e:
        print(f"[ERROR] Failed to save knowledge base to file: {e}")

    return facts


# Build the KB (or load the cached copy), then preview at most five entries.
kb_facts = generate_kb_facts(data)
print("[INFO] Generated Cultural KB facts:")
for preview_fact in kb_facts[:5]:
    print("-", preview_fact)


[INFO] Loading knowledge base from cultural_kb.txt
[INFO] Generated Cultural KB facts:
- 1. **Courage and Duty**: Proverbs like "A palace guard does not receive arrows on his back" emphasize bravery and the importance of facing challenges head-on, reflecting the cultural value placed on courage and duty.
- 2. **Wisdom and Humility**: Many proverbs, such as "Having people to advise one is nothing like knowing how to take advice," highlight the importance of wisdom, humility, and the ability to learn from others.
- 3. **Self-awareness and Limitations**: Proverbs like "It is an elder who does not know his limitations that is washed away by a river" stress the importance of self-awareness and recognizing one's limitations.
- 4. **Community and Social Roles**: The proverbs often reflect the significance of social roles and community, as seen in "The person whom people have seated on a pig should moderate his or her strutting," which advises humility regardless of one's social position.
- 5.

In [64]:
# Debug dump: prints the complete kb_facts list, not just a five-entry preview.
print(kb_facts)



In [65]:
# 4. Build RAG Index Over Yoruba Texts
# Embed every Yoruba proverb in the dataset and add the vectors to a flat L2
# index; position i in the index corresponds to texts[i].
texts = [record["yoruba"] for record in data]
embs = EMBED_MODEL.encode(texts, convert_to_numpy=True)
embedding_dim = embs.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embs)

def retrieve_similar(query, k=3):
    """Return up to ``k`` indexed proverbs closest to ``query`` (nearest first).

    Args:
        query: text to embed with ``EMBED_MODEL`` and look up in ``index``.
        k: maximum number of neighbours to return.

    Returns:
        A list of strings taken from ``texts``. It has fewer than ``k`` items
        when the index holds fewer than ``k`` vectors, and is empty for
        ``k <= 0`` or an empty index.
    """
    # Never request more neighbours than the index contains.
    k = min(k, index.ntotal)
    if k <= 0:
        return []
    vec = EMBED_MODEL.encode([query], convert_to_numpy=True)
    _, neighbour_ids = index.search(vec, k)
    # FAISS reports missing results as -1; skipping them avoids silently
    # returning texts[-1] (the last proverb) as if it were a match.
    return [texts[i] for i in neighbour_ids[0] if i >= 0]

# Sanity check: nearest neighbours of the first test proverb.
print("Example retrieval:")
example_neighbours = retrieve_similar(test_set[0]['yoruba'])
print(example_neighbours)


Example retrieval:
['Ojú ò rọ́lá rí; ó bímọ ẹ̀ ó sọ ọ́ ní Ọláníyọnu.', 'Ojú ò ti oníṣègùn, ó ní àna òun ńkú lọ.', 'Inú burúkú làgbà ńní, àgbà kì í ní ojú burúkú.']


In [66]:
# 5. Prompt Builders
def _nearest_other_proverbs(yor, n):
    """Return up to ``n`` retrieved proverbs that are not ``yor`` itself."""
    # Ask for one extra hit: the retrieval index is built over the whole
    # dataset, so the target proverb normally comes back as its own best match.
    candidates = retrieve_similar(yor, k=n + 1)
    return [s for s in candidates if s != yor][:n]


def build_prompt(cond, proverb):
    """Build the prompt text for one proverb under one experimental condition.

    Args:
        cond: one of ``"ZS"``, ``"ZS-CoT"``, ``"FewShot"``, ``"RealKB-CG"``, ``"RAG"``.
        proverb: dict with a ``"yoruba"`` key holding the proverb text.

    Returns:
        The prompt string.

    Raises:
        ValueError: if ``cond`` is not a known condition, or if "RealKB-CG"
            cannot find two similar proverbs other than the target.

    NOTE(review): "RealKB-CG" injects retrieved similar proverbs; it does not
    read ``kb_facts`` - confirm this matches the intended study design.
    """
    yor = proverb["yoruba"]
    if cond == "ZS":
        return f"Translate this Yoruba proverb into English:\n“{yor}”"
    if cond == "ZS-CoT":
        return f"Translate this Yoruba proverb into English. Let's think step by step.\nProverb: “{yor}”"
    if cond == "FewShot":
        # The first three dev-split items serve as worked examples.
        ex = ""
        for exmp in dev_set[:3]:
            ex += f"Yoruba: {exmp['yoruba']}\nEnglish: {exmp['english']}\n\n"
        return ex + f"Now translate this one:\nYoruba: {yor}\nEnglish:"
    if cond == "RealKB-CG":
        # Filter the target out explicitly rather than assuming it is hit 0.
        neighbours = _nearest_other_proverbs(yor, 2)
        if len(neighbours) < 2:
            raise ValueError(
                "RealKB-CG needs two similar proverbs besides the target, "
                f"but only {len(neighbours)} were retrieved."
            )
        f1, f2 = neighbours

        return f"""You are a Culturally-Grounded Chain-of-Thought assistant. Follow these steps exactly:

        Task: Translate and interpret the proverb "{yor}"

        Step 0 – Context Injection  
        Similar Phrase 1: {f1}

        Step 1 – Reasoning with Similar Phrase 1  
        > Reflect on how this phrase informs the imagery or symbolism of the proverb.  
        Chain-of-Thought:  
        1. …  
        2. …  
        Interim Insight: …

        Step 2 – Literal Translation  
        > Provide a literal English translation of the proverb.  
        Translation: …

        Step 3 – Context Injection  
        Similar Phrase 2: {f2}

        Step 4 – Reasoning with Similar Phrase 2  
        > Explain how this phrase adds depth or cultural nuance to the meaning.  
        Chain-of-Thought:  
        1. …  
        2. …  
        Interim Insight: …

        Final Synthesis:  
        - Final Translation: …  
        - Culturally-Grounded Interpretation: …  
        - References: Similar Phrase 1, Similar Phrase 2

        Final Answer:
        """

    if cond == "RAG":
        # Exclude the target so it is not presented as its own "similar proverb".
        sims = _nearest_other_proverbs(yor, 3)
        sim_str = "\n".join(f"{i+1}. {s}" for i, s in enumerate(sims))
        return f"Here are similar Yoruba proverbs:\n{sim_str}\n\nTranslate & explain the target proverb: “{yor}”"

    # Fail loudly instead of returning None, which would be sent on as a prompt.
    raise ValueError(f"Unknown prompt condition: {cond!r}")


# Show the prompt each condition produces for the first test proverb.
print("Sample prompts for each condition:")
preview_proverb = test_set[0]
for condition_name in ("ZS", "ZS-CoT", "FewShot", "RealKB-CG", "RAG"):
    print("----", condition_name)
    print(build_prompt(condition_name, preview_proverb), "\n")


Sample prompts for each condition:
---- ZS
Translate this Yoruba proverb into English:
“Ojú ò rọ́lá rí; ó bímọ ẹ̀ ó sọ ọ́ ní Ọláníyọnu.” 

---- ZS-CoT
Translate this Yoruba proverb into English. Let's think step by step.
Proverb: “Ojú ò rọ́lá rí; ó bímọ ẹ̀ ó sọ ọ́ ní Ọláníyọnu.” 

---- FewShot
Yoruba: Àgbà tí kò mọ ìwọ̀n ara-a rẹ̀ lodò ńgbé lọ́.
English: It is an elder who does not know his limitations that is washed away by a river.

Yoruba: Ejò kì í ti ojú Ààrẹ gun ọgbà lọ.
English: A snake does not escape over the fence while a warrior

Yoruba: Apá èkúté-ilé ò ká awùsá; kìkìi yíyíkiri ló mọ.
English: The mouse cannot get a grip on the awùsá nut; all it can do is roll it around.

Now translate this one:
Yoruba: Ojú ò rọ́lá rí; ó bímọ ẹ̀ ó sọ ọ́ ní Ọláníyọnu.
English: 

---- RealKB-CG
You are a Culturally-Grounded Chain-of-Thought assistant. Follow these steps exactly:

        Task: Translate and interpret the proverb "Ojú ò rọ́lá rí; ó bímọ ẹ̀ ó sọ ọ́ ní Ọláníyọnu."

     

In [3]:
# 6. API Wrappers with Caching
def call_gpt(prompt):
    """Send ``prompt`` as a single user message to ``MODEL_GPT`` and return the reply text.

    The request uses temperature 0.0 and a 512-token completion limit.
    """
    completion = client.chat.completions.create(
        model=MODEL_GPT,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
        max_tokens=512,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def call_anthropic(prompt, model="claude-3-7-sonnet-20250219", max_tokens=1024):
    """Send ``prompt`` as a single user message to an Anthropic model and return the reply text.

    Args:
        prompt: the user message.
        model: Anthropic model id. The default is the id this function has
            always used. NOTE(review): it differs from the ``MODEL_ANTHROPIC``
            constant in the setup cell - confirm which one the study intends.
        max_tokens: completion token limit.

    Returns:
        The text of the first content block of the response.
    """
    # Reuse the shared client from the setup cell instead of constructing a
    # new client (and connection pool) for every request.
    response = anthropic_client.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    return response.content[0].text

def call_model(model_type, prompt):
    """Route ``prompt`` to the backend named by ``model_type``.

    ``"GPT"`` goes to ``call_gpt`` and ``"Anthropic"`` to ``call_anthropic``;
    any other value raises ``ValueError``.
    """
    # Guard clause: reject unknown backends before touching either wrapper.
    if model_type not in ("GPT", "Anthropic"):
        raise ValueError(f"Unsupported model type: {model_type}")
    if model_type == "GPT":
        return call_gpt(prompt)
    return call_anthropic(prompt)

def call_and_cache(cond, proverb, model_type):
    """Generate (or load from cache) the model output for one proverb.

    Args:
        cond: prompt condition passed to ``build_prompt``.
        proverb: dict with ``"yoruba"`` and ``"english"`` keys.
        model_type: backend name passed to ``call_model``.

    Returns:
        A dict with keys ``"yoruba"``, ``"gold"`` and ``"gen"``. The dict is
        stored as JSON in ``RESULTS_DIR`` and reused on later calls with the
        same condition, backend and proverb text.
    """
    # The key must be identical in every kernel session. Python's built-in
    # hash() of a str is salted per process, so it cannot name a persistent
    # cache file; a content digest can.
    digest = hashlib.sha256(proverb["yoruba"].encode("utf-8")).hexdigest()[:16]
    key = f"{cond}_{model_type}_{digest}"
    cache_file = RESULTS_DIR / f"{key}.json"
    if cache_file.exists():
        return json.loads(cache_file.read_text(encoding="utf-8"))
    prompt = build_prompt(cond, proverb)
    gen = call_model(model_type, prompt)
    out = {"yoruba": proverb["yoruba"], "gold": proverb["english"], "gen": gen}
    cache_file.write_text(json.dumps(out, indent=2), encoding="utf-8")
    return out

# Smoke test: one generation for the first test proverb. This needs test_set
# from the data-loading cell to exist in the kernel.
smoke_result = call_and_cache("RealKB-CG", test_set[0], "GPT")
print(smoke_result)


NameError: name 'test_set' is not defined

In [69]:
# 7. Batch Generation
conditions = ["ZS", "ZS-CoT", "FewShot", "RealKB-CG", "RAG"]
model_types = ("GPT", "Anthropic")

# all_results[condition][model] is the list of outputs in test_set order.
all_results = {c: {m: [] for m in model_types} for c in conditions}

for cond in conditions:
    print("Generating for", cond)
    outputs_by_model = all_results[cond]
    for ex in test_set:
        # GPT first, then Anthropic, for every proverb.
        for model_type in model_types:
            outputs_by_model[model_type].append(call_and_cache(cond, ex, model_type))

    # Write one JSON file per model once the condition is finished.
    for model_type in model_types:
        results_path = RESULTS_DIR / f"{cond}_{model_type}_results.json"
        with open(results_path, "w", encoding="utf-8") as f:
            json.dump(outputs_by_model[model_type], f, indent=2)

# Preview
print(all_results["RealKB-CG"]["GPT"][0])


Generating for ZS
Generating for ZS-CoT
Generating for ZS-CoT
Generating for FewShot
Generating for FewShot
Generating for RealKB-CG
Generating for RealKB-CG
Generating for RAG
Generating for RAG
{'yoruba': 'Ojú ò rọ́lá rí; ó bímọ ẹ̀ ó sọ ọ́ ní Ọláníyọnu.', 'gold': 'A person only newly acquainted with wealth; he has a son and names him Ọlaniyọnu.', 'gen': '**Step 0 – Context Injection**  \nSimilar Phrase 1: Ojú ò ti oníṣègùn, ó ní àna òun ńkú lọ.\n\n**Step 1 – Reasoning with Similar Phrase 1**  \n> Reflect on how this phrase informs the imagery or symbolism of the proverb.  \nChain-of-Thought:  \n1. The phrase "Ojú ò ti oníṣègùn, ó ní àna òun ńkú lọ" suggests that a person who is not easily embarrassed or ashamed can make bold claims or statements, even if they are questionable or untrue.  \n2. It highlights the idea of audacity and the lack of shame in making declarations, regardless of the reality or truth of the situation.  \nInterim Insight: This phrase suggests that the eyes (or

In [75]:
# Save results to CSV
import csv

def save_results_to_csv(all_results, output_file="results_summary.csv"):
    """Flatten the nested results into a CSV file with one row per generation.

    Args:
        all_results: mapping of condition -> model -> list of result dicts
            with (optional) ``"yoruba"``, ``"gold"`` and ``"gen"`` keys;
            a missing key is written as an empty string.
        output_file: destination path; overwritten if it exists.
    """
    header = ["Condition", "Model", "Yoruba Proverb", "Gold Translation", "Generated Translation"]
    # Lazily yield rows in condition -> model -> result order.
    flattened_rows = (
        [cond, model, result.get("yoruba", ""), result.get("gold", ""), result.get("gen", "")]
        for cond, models in all_results.items()
        for model, results in models.items()
        for result in results
    )
    with open(output_file, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(header)
        writer.writerows(flattened_rows)

# Write every generation held in all_results to the function's default
# output path, results_summary.csv.
save_results_to_csv(all_results)

In [11]:
import pandas as pd
import json
from sacrebleu import corpus_bleu
import time

# Load the CSV file
csv_path = "results_summary.csv"
# keep_default_na=False: every column is free text, so empty cells must stay ""
# and strings such as "NA" or "null" must not be turned into NaN (which would
# be rendered as "nan" inside the prompt).
# The frame is named results_df rather than `data` so it does not overwrite
# the proverb list that earlier cells read from `data`.
results_df = pd.read_csv(csv_path, keep_default_na=False)

# Prepare prompts for ChatGPT: one judging prompt per CSV row, in row order.
prompts = []
for _, row in results_df.iterrows():
    prompt = (
        f"You are a Yoruba language expert. Here is a proverb, its English translation, and a model output:\n\n"
        f"Proverb: \"{row['Yoruba Proverb']}\"\n"
        f"Gold translation: \"{row['Gold Translation']}\"\n"
        f"Model output:\n{row['Generated Translation']}\n\n"
        "Extract the final translation and rate the explanation part on:\n"
        "1. core-meaning accuracy (0 or 1)\n"
        "2. cultural depth (1-5)\n"
        "Reply as a list of strings using single quotes. Example:\n"
        "['final_translation: ...', 'accuracy: 1', 'depth: 5']"
    )
    prompts.append(prompt)

# Function to call GPT with logging and validation
def _parse_judge_reply(reply):
    """Convert the judge model's reply into the result dict, or raise ValueError."""
    text = (reply or "").strip()
    # Keep only the bracketed list; this tolerates surrounding whitespace and
    # Markdown code fences around the answer.
    start, end = text.find("["), text.rfind("]")
    if start == -1 or end < start:
        raise ValueError("Response does not start and end with square brackets.")
    text = text[start:end + 1]

    # SECURITY: the reply is untrusted model output, so it is parsed with
    # ast.literal_eval (literals only) and never with eval().
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        parsed = None

    if (
        isinstance(parsed, list)
        and len(parsed) == 3
        and all(isinstance(part, str) and ":" in part for part in parsed)
    ):
        translation, accuracy, depth = (part.split(":", 1)[1].strip() for part in parsed)
    else:
        # Fallback for replies that are not valid Python literals, e.g. an
        # unescaped apostrophe inside the single-quoted translation.
        match = re.search(
            r"final_translation:\s*(.*?)['\"]?\s*,\s*['\"]?accuracy:\s*(\d+)"
            r"['\"]?\s*,\s*['\"]?depth:\s*(\d+)",
            text,
            re.DOTALL,
        )
        if match is None:
            raise ValueError("Response is not in the expected list format.")
        translation, accuracy, depth = (group.strip() for group in match.groups())

    return {
        "final_translation": translation.strip("'"),
        "accuracy": int(accuracy),
        "depth": int(depth),
    }


def call_gpt_with_logging(prompt, retries=3):
    """Ask the GPT judge to score one output and return the parsed scores.

    Args:
        prompt: judging prompt that asks for a reply of the form
            ``['final_translation: ...', 'accuracy: 1', 'depth: 5']``.
        retries: maximum number of attempts; one second is slept between attempts.

    Returns:
        A dict with ``"final_translation"`` (str), ``"accuracy"`` (int) and
        ``"depth"`` (int). If every attempt fails, the placeholder
        ``{"final_translation": "", "accuracy": 0, "depth": 0}`` is returned;
        a warning is printed so such rows can be identified.
    """
    print("[INFO] Sending prompt to GPT:")
    print(prompt)
    for attempt in range(retries):
        try:
            response = call_gpt(prompt)  # Use existing GPT call function
            print("[INFO] Received response from GPT:")
            print(response)
            return _parse_judge_reply(response)
        except Exception as e:
            print(f"[ERROR] GPT call failed on attempt {attempt + 1}: {e}")
        if attempt < retries - 1:
            time.sleep(1)  # Wait before retrying
    # Retries exhausted: make the fallback visible instead of silent.
    print("[WARNING] No parsable GPT response; returning placeholder scores (accuracy=0, depth=0).")
    return {"final_translation": "", "accuracy": 0, "depth": 0}

# Judge every prompt in order, pausing briefly between requests.
results = []
for judge_prompt in prompts:
    results.append(call_gpt_with_logging(judge_prompt))
    time.sleep(0.1)

# Persist the parsed judge scores as a JSON list (one entry per prompt).
with open("chatgpt_test_results.json", "w", encoding="utf-8") as out_file:
    json.dump(results, out_file, indent=2)

# Echo the collected scores for a quick visual check.
print("Test Results:")
print(results)

[INFO] Sending prompt to GPT:
You are a Yoruba language expert. Here is a proverb, its English translation, and a model output:

Proverb: "Ojú ò rọ́lá rí; ó bímọ ẹ̀ ó sọ ọ́ ní Ọláníyọnu."
Gold translation: "A person only newly acquainted with wealth; he has a son and names him Ọlaniyọnu."
Model output:
The Yoruba proverb "Ojú ò rọ́lá rí; ó bímọ ẹ̀ ó sọ ọ́ ní Ọláníyọnu" translates to English as: "The future is unpredictable; it gives birth to a child and names it 'Wealth is Worrisome.'" This proverb suggests that the future is uncertain and can bring unexpected challenges or responsibilities, even when it seems promising.

Extract the final translation and rate the explanation part on:
1. core-meaning accuracy (0 or 1)
2. cultural depth (1-5)
Reply as a list of strings using single quotes. Example:
['final_translation: ...', 'accuracy: 1', 'depth: 5']
[INFO] Received response from GPT:
['final_translation: The future is unpredictable; it gives birth to a child and names it "Wealth i

In [3]:
import json
import pandas as pd
from sacrebleu import corpus_bleu
from bert_score import score
from collections import defaultdict

# Load ChatGPT results
with open("chatgpt_test_results.json", "r", encoding="utf-8") as f:
    chatgpt_results = json.load(f)

# Load the CSV file containing conditions and gold translations.
# keep_default_na=False keeps empty or "NA"-like text cells as strings; the
# frame is named results_df so the proverb list in `data` is not overwritten.
results_df = pd.read_csv("results_summary.csv", keep_default_na=False)

# Judge results are matched to CSV rows purely by position. zip() would
# silently drop the tail on a length mismatch, so check it explicitly.
if len(chatgpt_results) != len(results_df):
    raise ValueError(
        f"chatgpt_test_results.json has {len(chatgpt_results)} entries but "
        f"results_summary.csv has {len(results_df)} rows; they must align row by row."
    )

# Extract metrics for evaluation
final_translations = [r.get("final_translation", "") for r in chatgpt_results]
accuracies = [r.get("accuracy", 0) for r in chatgpt_results]
depths = [r.get("depth", 0) for r in chatgpt_results]
row_conditions = results_df["Condition"].tolist()
gold_translations = results_df["Gold Translation"].tolist()

# Per-method metrics. Rows from every model are pooled within a condition.
methods = results_df['Condition'].unique()
bleu_scores = defaultdict(list)
bert_scores = defaultdict(list)
average_accuracies = defaultdict(float)
average_depths = defaultdict(float)

for method in methods:
    row_ids = [i for i, cond in enumerate(row_conditions) if cond == method]
    method_translations = [final_translations[i] for i in row_ids]
    method_gold = [gold_translations[i] for i in row_ids]

    # Corpus BLEU. A failure is reported and stored as NaN so it cannot be
    # mistaken for a genuine score of 0.
    try:
        bleu = corpus_bleu(method_translations, [method_gold]).score
    except Exception as e:
        print(f"[ERROR] BLEU failed for {method}: {e}")
        bleu = float("nan")
    bleu_scores[method].append(bleu)

    # Mean BERTScore F1, with the same failure policy as BLEU.
    try:
        P, R, F1 = score(method_translations, method_gold, lang="en", verbose=True)
        bert_f1 = F1.mean().item()
    except Exception as e:
        print(f"[ERROR] BERTScore failed for {method}: {e}")
        bert_f1 = float("nan")
    bert_scores[method].append(bert_f1)

    # Mean judge scores.
    # NOTE(review): rows where the judge reply could not be parsed carry the
    # placeholder accuracy=0 / depth=0 and are included in these means.
    method_accuracies = [accuracies[i] for i in row_ids]
    method_depths = [depths[i] for i in row_ids]
    average_accuracies[method] = sum(method_accuracies) / len(method_accuracies) if method_accuracies else 0.0
    average_depths[method] = sum(method_depths) / len(method_depths) if method_depths else 0.0

# Save final metrics to a text file
metrics_output_path = "yoruba_full_cg_cot_study_updated_metrics.txt"
with open(metrics_output_path, "w", encoding="utf-8") as metrics_file:
    metrics_file.write("Final Metrics:\n")
    for method in methods:
        metrics_file.write(f"Method: {method}\n")
        metrics_file.write(f"  BLEU Score: {sum(bleu_scores[method]) / len(bleu_scores[method]):.2f}\n")
        metrics_file.write(f"  BERTScore: {sum(bert_scores[method]) / len(bert_scores[method]):.2f}\n")
        metrics_file.write(f"  Average Accuracy: {average_accuracies[method]:.2f}\n")
        metrics_file.write(f"  Average Depth: {average_depths[method]:.2f}\n\n")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler

calculating scores...
computing bert embedding.


100%|██████████| 5/5 [01:06<00:00, 13.31s/it]
100%|██████████| 5/5 [01:06<00:00, 13.31s/it]


computing greedy matching.


100%|██████████| 4/4 [00:00<00:00, 25.03it/s]



done in 66.74 seconds, 3.00 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 5/5 [00:53<00:00, 10.77s/it]
100%|██████████| 5/5 [00:53<00:00, 10.77s/it]


computing greedy matching.


100%|██████████| 4/4 [00:00<00:00, 43.15it/s]



done in 53.95 seconds, 3.71 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 5/5 [00:56<00:00, 11.28s/it]
100%|██████████| 5/5 [00:56<00:00, 11.28s/it]


computing greedy matching.


100%|██████████| 4/4 [00:00<00:00, 45.28it/s]



done in 56.51 seconds, 3.54 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 5/5 [00:45<00:00,  9.11s/it]
100%|██████████| 5/5 [00:45<00:00,  9.11s/it]


computing greedy matching.


100%|██████████| 4/4 [00:00<00:00, 44.46it/s]



done in 45.64 seconds, 4.38 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 5/5 [00:45<00:00,  9.17s/it]
100%|██████████| 5/5 [00:45<00:00,  9.17s/it]


computing greedy matching.


100%|██████████| 4/4 [00:00<00:00, 42.92it/s]



done in 45.95 seconds, 4.35 sentences/sec


In [8]:
import pandas as pd
from scipy.stats import ttest_rel, wilcoxon

import json

import numpy as np
from bert_score import score
from sacrebleu import sentence_bleu

# Paired tests need one score per item. The aggregated metrics file holds a
# single number per method, and a paired test on n=1 has no meaningful p-value,
# so the per-item inputs are loaded instead.
stats_df = pd.read_csv("results_summary.csv", keep_default_na=False)
with open("chatgpt_test_results.json", "r", encoding="utf-8") as f:
    judged_rows = json.load(f)
if len(judged_rows) != len(stats_df):
    raise ValueError(
        f"chatgpt_test_results.json has {len(judged_rows)} entries but "
        f"results_summary.csv has {len(stats_df)} rows; they must align row by row."
    )

# Per-item scores: sentence-level BLEU, BERTScore F1, judge accuracy and depth.
# NOTE(review): rows whose judge reply could not be parsed carry the
# placeholder accuracy=0 / depth=0 / empty translation and are included here.
hypotheses = [str(row.get("final_translation") or "") for row in judged_rows]
references = stats_df["Gold Translation"].tolist()
_, _, bert_f1 = score(hypotheses, references, lang="en", verbose=False)
stats_df = stats_df.assign(
    bleu=[sentence_bleu(hyp, [ref]).score for hyp, ref in zip(hypotheses, references)],
    bert=bert_f1.tolist(),
    accuracy=[row.get("accuracy", 0) for row in judged_rows],
    depth=[row.get("depth", 0) for row in judged_rows],
)
# Position of each row within its condition, used to pair the same
# (model, proverb) item across conditions.
stats_df["item"] = stats_df.groupby("Condition").cumcount()

methods = list(stats_df["Condition"].unique())
metric_columns = {"BLEU": "bleu", "BERT": "bert", "Accuracy": "accuracy", "Depth": "depth"}


def paired_p_values(values_a, values_b):
    """Return (paired t-test p-value, Wilcoxon p-value) for two aligned samples.

    A string placeholder is returned in place of a p-value when the test
    cannot be run: "Insufficient data" (fewer than two pairs), "No variation"
    (all paired values equal) or "Error" (the test raised).
    """
    a = np.asarray(values_a, dtype=float)
    b = np.asarray(values_b, dtype=float)
    if len(a) < 2:
        return "Insufficient data", "Insufficient data"
    if np.allclose(a, b):
        # Both tests are undefined when every paired difference is zero.
        return "No variation", "No variation"
    try:
        _, p_t = ttest_rel(a, b)
    except Exception as e:
        print(f"[ERROR] paired t-test failed: {e}")
        p_t = "Error"
    try:
        _, p_w = wilcoxon(a, b)
    except Exception as e:
        print(f"[ERROR] Wilcoxon test failed: {e}")
        p_w = "Error"
    return p_t, p_w


# Compare each method with the next one, in order of first appearance.
results = []
for method_a, method_b in zip(methods, methods[1:]):
    rows_a = stats_df[stats_df["Condition"] == method_a].set_index("item")
    rows_b = stats_df[stats_df["Condition"] == method_b].set_index("item")
    shared_items = rows_a.index.intersection(rows_b.index)
    rows_a, rows_b = rows_a.loc[shared_items], rows_b.loc[shared_items]

    # Refuse to test if the positional pairing does not line up on the same item.
    same_item = (rows_a["Model"] == rows_b["Model"]) & (
        rows_a["Yoruba Proverb"] == rows_b["Yoruba Proverb"]
    )
    if not same_item.all():
        raise ValueError(f"Rows for {method_a} and {method_b} do not pair up item by item.")

    t_test_results, wilcoxon_results = {}, {}
    for label, column in metric_columns.items():
        p_t, p_w = paired_p_values(rows_a[column], rows_b[column])
        t_test_results[f"{label} p-value"] = p_t
        wilcoxon_results[f"{label} p-value"] = p_w

    results.append({
        "Comparison": f"{method_a} vs {method_b}",
        "Paired t-test": t_test_results,
        "Wilcoxon test": wilcoxon_results,
    })

# Save statistical test results to a text file
stats_output_path = "yoruba_full_cg_cot_study_stats.txt"
with open(stats_output_path, "w", encoding="utf-8") as stats_file:
    stats_file.write("Statistical Test Results:\n")
    for result in results:
        stats_file.write(f"Comparison: {result['Comparison']}\n")
        stats_file.write("Paired t-test:\n")
        for key, value in result["Paired t-test"].items():
            stats_file.write(f"  {key}: {value}\n")
        stats_file.write("Wilcoxon test:\n")
        for key, value in result["Wilcoxon test"].items():
            stats_file.write(f"  {key}: {value}\n")
        stats_file.write("\n")