In [1]:
import subprocess
import pandas as pd
import json
from tqdm import tqdm
from pathlib import Path
import time

# ==========================
# CONFIG
# ==========================
MODELS = ["gpt-oss:20b"]  # Corrected model name (hyphen, not dash)
TRIALS = [1, 2, 3]
BATCH_SIZE = 5  # Now actually used
PROMPT_TEMPLATE = Path("./prompts/fewshot_prompt.txt").read_text(encoding="utf-8")
RETRIES = 3

# ==========================
# HELPERS
# ==========================
def run_ollama(prompt: str, model: str) -> str:
    """Call Ollama CLI and return output text. Now with error handling."""
    try:
        result = subprocess.run(
        ["ollama", "run", model],
        input=prompt,
        capture_output=True,
        text=True,
        encoding="utf-8",   # Force UTF-8
        timeout=60,
    )


        if result.stderr:
            print(f"Warning: Ollama stderr for {model}: {result.stderr.strip()}")
        output = result.stdout.strip()
        if not output:
            print(f"Warning: Empty output from {model}")
        return output
    except subprocess.TimeoutExpired:
        print(f"Timeout for {model}")
        return ""
    except Exception as e:
        print(f"Error running {model}: {e}")
        return ""


def build_prompt(sentences):
    """Inject sentences into the classification prompt. Now handles lists only."""
    if not sentences:
        return ""
    task_intro = PROMPT_TEMPLATE.strip()
    task_intro += "\n\nClassify the following sentences:\n (Only output JSON, nothing else.)"

    # Append sentences in table-like style
    for idx, s in enumerate(sentences, start=1):
        task_intro += f"{idx}\t{s}\n"

    task_intro += """
IMPORTANT:
Return the output strictly as a JSON array.
Each item must be an object with keys: "index", "sentence", "label".
Only output JSON, nothing else.
"""

    return task_intro


def parse_predictions(raw_output, batch_size):
    """
    Parse Ollama output as JSON.
    If it fails, return a list of 'UNK' with batch_size length.
    """
    if not raw_output:
        print("Parse error: Empty raw output")
        return ["UNK"] * batch_size
    try:
        preds_json = json.loads(raw_output)
        preds = [item.get("label", "UNK") for item in preds_json]
        # Ensure same length as input batch
        if len(preds) < batch_size:
            preds.extend(["UNK"] * (batch_size - len(preds)))
        elif len(preds) > batch_size:
            preds = preds[:batch_size]  # Truncate if too many
        return preds
    except Exception as e:
        print("Parse error:", e)
        print(f"Raw output preview: {raw_output[:300]}...")  # Debug: Print raw for inspection
        return ["UNK"] * batch_size


# ==========================
# MAIN BENCHMARK LOOP
# ==========================
def benchmark():
    for trial in TRIALS:
        print(f"\n=== TRIAL {trial} ===")
        try:
            text_df = pd.read_excel(f"./Dataset/Trial{trial}_text.xlsx")
            region_df = pd.read_excel(f"./Dataset/Trial{trial}_region.xlsx")
        except FileNotFoundError as e:
            print(f"Error: Dataset file not found - {e}")
            continue
        except Exception as e:
            print(f"Error loading datasets: {e}")
            continue

        sentences = text_df["sent"].tolist()
        ground_truth = region_df["lang"].tolist()

        if len(sentences) != len(ground_truth):
            print(f"Warning: Mismatched lengths - sentences: {len(sentences)}, ground_truth: {len(ground_truth)}")
            min_len = min(len(sentences), len(ground_truth))
            sentences = sentences[:min_len]
            ground_truth = ground_truth[:min_len]

        for model in MODELS:
            print(f"\n--- Model: {model} ---")
            predictions = []

            # Process in batches
            for start_idx in range(0, len(sentences), BATCH_SIZE):
                batch_sentences = sentences[start_idx:start_idx + BATCH_SIZE]
                success = False
                for attempt in range(RETRIES):
                    prompt = build_prompt(batch_sentences)
                    raw = run_ollama(prompt, model)

                    # Debug: Uncomment for raw output inspection
                    # print(f"DEBUG RAW (first 300 chars): {raw[:300]}")

                    preds = parse_predictions(raw, len(batch_sentences))
                    if all(p != "UNK" for p in preds):  # Success if no UNK in batch
                        predictions.extend(preds)
                        success = True
                        break
                    else:
                        print(f"Retry {attempt + 1}/{RETRIES} for batch starting at {start_idx}")
                        time.sleep(1)  # Brief pause before retry

                if not success:
                    # If all retries fail, mark UNK for the batch
                    predictions.extend(["UNK"] * len(batch_sentences))

            # Evaluate accuracy
            correct = sum(p == g for p, g in zip(predictions, ground_truth))
            accuracy = correct / len(ground_truth)
            print(f"Accuracy = {accuracy*100:.2f}%")

            # Save results
            out_df = pd.DataFrame({
                "Sentence": sentences,
                "GroundTruth": ground_truth,
                "Prediction": predictions
            })
            out_df.to_csv(f"results_trial{trial}_{model}.csv", index=False, encoding="utf-8-sig")
            print(f"Results saved to results_trial{trial}_{model}.csv")


if __name__ == "__main__":
    benchmark()


=== TRIAL 1 ===

--- Model: gpt-oss:20b ---
Timeout for gpt-oss:20b
Parse error: Empty raw output
Retry 1/3 for batch starting at 0
Timeout for gpt-oss:20b
Parse error: Empty raw output
Retry 2/3 for batch starting at 0
Timeout for gpt-oss:20b
Parse error: Empty raw output
Retry 3/3 for batch starting at 0
Timeout for gpt-oss:20b
Parse error: Empty raw output
Retry 1/3 for batch starting at 5
Timeout for gpt-oss:20b
Parse error: Empty raw output
Retry 2/3 for batch starting at 5
Timeout for gpt-oss:20b
Parse error: Empty raw output
Retry 3/3 for batch starting at 5
Timeout for gpt-oss:20b
Parse error: Empty raw output
Retry 1/3 for batch starting at 10
Timeout for gpt-oss:20b
Parse error: Empty raw output
Retry 2/3 for batch starting at 10
Timeout for gpt-oss:20b
Parse error: Empty raw output
Retry 3/3 for batch starting at 10
Timeout for gpt-oss:20b
Parse error: Empty raw output
Retry 1/3 for batch starting at 15
Timeout for gpt-oss:20b
Parse error: Empty raw output
Retry 2/3 for bat

KeyboardInterrupt: 

In [12]:
y = pd.read_excel("./Dataset/Trial1_region.xlsx", na_values=[], keep_default_na=False)
y['lang'].unique()

array(['NA', 'IRAQ', 'YEM', 'GULF', 'MSA', 'LEV', 'NILE'], dtype=object)

In [18]:
actual_values = y.iloc[:,0].tolist()

with open("./results/results_trial1_fewshot_gptoss20b.json", "r", encoding="utf-8") as f:
    predictions_json = json.load(f)

# Extract predicted labels in the same order
predictions = [item["label"] for item in predictions_json]
print(f"{len(actual_values)} {len(predictions)}")
assert len(actual_values) == len(predictions), "Mismatch in number of labels!"

# Compare and calculate accuracy
correct = [1 if gt == pred else 0 for gt, pred in zip(actual_values, predictions)]
accuracy = sum(correct) / len(correct)
print(f"Accuracy: {accuracy*100:.2f}%")

# Optional: create a comparison DataFrame
df_compare = pd.DataFrame({
    "GroundTruth": actual_values,
    "Prediction": predictions
})
df_compare.to_csv("comparison_trial1.csv", index=False, encoding="utf-8-sig")

70 70
Accuracy: 40.00%
