In [4]:
import pandas as pd
import re

def sample_reviews_by_language(input_file='bmw_app_analysis/results/bmw_reviews_consolidated_20250504_155236.csv',
                              output_file='bmw_app_analysis/results/bmw_reviews_sampledHE.csv'):
    """
    Sample up to 30 reviews EACH from German, Italian, Spanish and French
    reviews that contain at least 5 words, and save them to a CSV file.

    Args:
        input_file: Path to the input CSV file. Must contain the columns
            'reviewId', 'content', 'language' and 'content_english'.
        output_file: Path to save the sampled reviews.

    Returns:
        DataFrame with columns 'reviewId', 'content', 'language' and
        'english_content', ordered German, Italian, Spanish, French.
        Returns None when a required column is missing, and an empty
        DataFrame (nothing is written) when no review passes the filters.
    """
    output_columns = ['reviewId', 'content', 'language', 'english_content']

    # Read the consolidated file
    print(f"Reading {input_file}...")
    df = pd.read_csv(input_file)

    # Verify columns exist
    required_columns = ['reviewId', 'content', 'language', 'content_english']
    missing_columns = [col for col in required_columns if col not in df.columns]

    if missing_columns:
        print(f"ERROR: Missing required columns: {missing_columns}")
        print(f"Available columns: {df.columns.tolist()}")
        return None

    # Apply minimum word filter; non-string content (e.g. NaN) counts as 0 words
    min_words = 5
    print(f"Filtering reviews with at least {min_words} words...")
    word_count = df['content'].apply(
        lambda x: len(re.findall(r'\b\w+\b', x)) if isinstance(x, str) else 0
    )
    df_filtered_by_length = df[word_count >= min_words]

    print(f"Reviews with {min_words}+ words: {len(df_filtered_by_length)} out of {len(df)}")

    # Target languages - in desired output order
    target_languages = ["German", "Italian", "Spanish", "French"]

    # Apply language filter
    df_filtered = df_filtered_by_length[df_filtered_by_length['language'].isin(target_languages)]

    # Report on available reviews per language
    print("\nReviews available after filtering by language and length:")
    for lang in target_languages:
        count = len(df_filtered[df_filtered['language'] == lang])
        print(f"  {lang}: {count} reviews")

    # Sample 30 from each language group (or all available if less than 30)
    sampled_reviews = []
    target_per_language = 30

    for lang in target_languages:
        lang_df = df_filtered[df_filtered['language'] == lang]

        if len(lang_df) == 0:
            print(f"No reviews found for {lang}")
            continue

        # Fixed random_state keeps the sample reproducible between runs
        sample_size = min(target_per_language, len(lang_df))
        sampled = lang_df.sample(sample_size, random_state=42)
        sampled_reviews.append(sampled)
        print(f"Sampled {len(sampled)} reviews from {lang}")

    # pd.concat raises ValueError on an empty list, so bail out explicitly
    if not sampled_reviews:
        print("No reviews matched the language and length filters; nothing was saved.")
        return pd.DataFrame(columns=output_columns)

    # Combine all samples
    sampled_df = pd.concat(sampled_reviews, ignore_index=True)

    # Select only the required columns
    output_df = sampled_df[['reviewId', 'content', 'language', 'content_english']].copy()

    # Clean up the translation: missing values become '' (not the text "nan"),
    # then literal double quotes are removed
    output_df['content_english'] = (
        output_df['content_english']
        .fillna('')
        .astype(str)
        .str.replace('"', '', regex=False)
    )

    # Rename content_english to english_content
    output_df = output_df.rename(columns={'content_english': 'english_content'})

    # Sort by language in the specified order; a stable sort keeps the
    # within-language sample order deterministic
    lang_order = {lang: i for i, lang in enumerate(target_languages)}
    output_df['lang_order'] = output_df['language'].map(lang_order)
    output_df = (
        output_df
        .sort_values('lang_order', kind='stable')
        .drop('lang_order', axis=1)
        .reset_index(drop=True)
    )

    # Save to CSV
    output_df.to_csv(output_file, index=False)
    print(f"Saved sampled reviews to {output_file}")

    print("\nFinal sample distribution:")
    for lang in target_languages:
        count = len(output_df[output_df['language'] == lang])
        print(f"  {lang}: {count} reviews")

    return output_df

# Execute the function
if __name__ == "__main__":
    # Run the sampling with its default input/output paths
    sampled_reviews = sample_reviews_by_language()

    # Preview one truncated example per language when sampling produced rows
    has_rows = sampled_reviews is not None and not sampled_reviews.empty
    if has_rows:
        print("\nExample reviews from each language:")
        for lang in ["German", "Italian", "Spanish", "French"]:
            first_match = sampled_reviews.loc[sampled_reviews['language'] == lang].head(1)
            if first_match.empty:
                continue

            print(f"\n{lang} example:")
            for _, row in first_match.iterrows():
                # Show at most 50 characters, with an ellipsis when truncated
                original_tail = '...' if len(row['content']) > 50 else ''
                print(f"  Original: {row['content'][:50]}{original_tail}")
                english_tail = '...' if len(row['english_content']) > 50 else ''
                print(f"  English:  {row['english_content'][:50]}{english_tail}")

Reading bmw_app_analysis/results/bmw_reviews_consolidated_20250504_155236.csv...
Filtering reviews with at least 5 words...
Reviews with 5+ words: 13312 out of 18350

Reviews available after filtering by language and length:
  German: 3839 reviews
  Italian: 920 reviews
  Spanish: 670 reviews
  French: 1155 reviews
Sampled 30 reviews from German
Sampled 30 reviews from Italian
Sampled 30 reviews from Spanish
Sampled 30 reviews from French
Saved sampled reviews to bmw_app_analysis/results/bmw_reviews_sampledHE.csv

Final sample distribution:
  German: 30 reviews
  Italian: 30 reviews
  Spanish: 30 reviews
  French: 30 reviews

Example reviews from each language:

German example:
  Original: Die App kann schon viel, aber warum lassen sich di...
  English:  The app can already do a lot, but why can't the wi...

Italian example:
  Original: Dalla versione 2.11.0, con IDrive 6 e telefono And...
  English:  Since version 2.11.0, with iDrive 6 and an Android...

Spanish example:
  Original: A

In [None]:
# bmw_topic_cards.py  –  v6  (2025‑05‑02)
# ================================================================
# • Stratified random sampling (mirrors topic sentiment mix)
# • One paragraph summary  +  “‑ ” negative bullets  +  “+ ” positive bullets
# • Prompts stored inside TopicSummary.prompts  (audit / debugging)
# • No JSON parsing, so Ollama chatter can’t break the pipeline
# ---------------------------------------------------------------

from __future__ import annotations

import html
import logging
import subprocess
import textwrap
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from typing import Callable, Dict, List, Optional

import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk
import pandas as pd
from IPython.display import HTML, display
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm.auto import tqdm

# -------------------------------------------------------------------
#  CONFIGURATION
# -------------------------------------------------------------------
# Tunable settings shared by the sampling, prompting and rendering helpers.
# The most frequent TOP_N_TOPICS topics are processed; each prompt receives
# at most PROMPT_SAMPLE_SIZE reviews.
TOP_N_TOPICS        = 15          # how many topics to visualise
PROMPT_SAMPLE_SIZE  = 300         # reviews fed to each LLM prompt
# Number of most-common single tokens / adjacent-token pairs shown per topic.
N_KEYWORDS          = 8
N_PHRASES           = 5
# Upper bound on parsed bullets; also quoted in the issues/positives prompts.
MAX_BULLETS         = 7           # issues / positives
# Accent colour injected into the card stylesheet as the --blue CSS variable.
BMW_BLUE            = "#0066B1"

# Inline SVG of one gold star; TopicSummary.stars repeats it per rating point.
STAR_SVG = (
    "<svg width='14' height='14' viewBox='0 0 24 24' "
    "xmlns='http://www.w3.org/2000/svg' style='vertical-align:-2px'>"
    "<polygon fill='#FFD700' "
    "points='12 2 15.09 8.26 22 9.27 17 14.14 "
    "18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26'/></svg>"
)

# -------------------------------------------------------------------
#  NLTK – bootstrap stop‑words
# -------------------------------------------------------------------
# Each resource lives in its own sub-directory of the NLTK data path:
# stop-word lists under corpora/, tokenizer models under tokenizers/.
# Looking "punkt" up under corpora/ never succeeds, so the lookup path is
# stored next to the download name. "punkt_tab" is the tokenizer data that
# recent NLTK releases load for word_tokenize; requesting it on an older
# release does not raise.
for res_path, res_name in (
    ("corpora/stopwords", "stopwords"),
    ("tokenizers/punkt", "punkt"),
    ("tokenizers/punkt_tab", "punkt_tab"),
):
    try:
        nltk.data.find(res_path)
    except LookupError:
        nltk.download(res_name, quiet=True)

# English stop-words plus domain words that appear in nearly every review
# and would otherwise dominate the keyword and phrase lists.
STOPWORDS = set(stopwords.words("english")) | {
    "bmw", "app", "car", "please", "would", "also", "get", "use", "using"
}

# -------------------------------------------------------------------
#  OLLAMA RUNNER  (temperature 0 for determinism)
# -------------------------------------------------------------------
def _subprocess_runner(prompt: str, model: str) -> str:
    proc = subprocess.run(
        ["ollama", "run", model, "-t", "0"],
        input=prompt,
        text=True,
        capture_output=True,
        check=False,
    )
    return proc.stdout

# -------------------------------------------------------------------
#  STRATIFIED SAMPLING  (mirror sentiment mix)
# -------------------------------------------------------------------
def _stratified_sample(df: pd.DataFrame, size: int, seed: int = 0) -> List[str]:
    dist = df["sentiment"].value_counts(normalize=True)
    quota = {s: int(round(dist.get(s, 0) * size)) for s in ("negative", "positive", "neutral")}
    diff = size - sum(quota.values())
    if diff:
        quota[max(dist, key=dist.get, default="negative")] += diff

    col = "content_english" if "content_english" in df.columns else "content"
    texts: List[str] = []

    for sentiment, n in quota.items():
        if n <= 0:
            continue
        bucket = df[df["sentiment"] == sentiment]
        sample_n = min(n, len(bucket))
        texts.extend(bucket.sample(sample_n, random_state=seed)[col].tolist())

    # top‑up if any bucket ran short
    if len(texts) < size:
        short = size - len(texts)
        remainder = df.drop(df.index[df[col].isin(texts)]).sample(short, random_state=seed)
        texts.extend(remainder[col].tolist())

    return texts

# -------------------------------------------------------------------
#  BULLET‑PARSER
# -------------------------------------------------------------------
def _parse_bullets(text: str, prefix: str) -> List[str]:
    """Return list of lines starting with the prefix (‘- ’ or ‘+ ’)."""
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    bullets = [ln[len(prefix):].strip() for ln in lines if ln.startswith(prefix)]
    # fallback to comma‑sep if model ignored prefixes
    if not bullets and "," in text:
        bullets = [part.strip() for part in text.split(",") if part.strip()]
    return bullets[:MAX_BULLETS]

# -------------------------------------------------------------------
#  DATACLASS
# -------------------------------------------------------------------
@dataclass
class TopicSummary:
    """Everything rendered on one topic card, plus the prompts that produced it."""

    summary: str                      # paragraph returned by the summary prompt
    issues: List[str]                 # parsed negative bullets
    positives: List[str]              # parsed positive bullets
    avg_rating: float                 # mean score of the topic's reviews
    review_count: int                 # number of reviews matched to the topic
    sentiment_dist: Dict[str, float]  # sentiment label -> share of reviews
    top_keywords: List[str]
    top_phrases: List[str]
    prompts: Dict[str, str] = field(default_factory=dict)  # prompt kind -> prompt text

    @property
    def stars(self) -> str:
        """Star icons for the rating: one per whole point, plus one more when
        the fractional part is at least 0.5."""
        whole = int(self.avg_rating)
        rounds_up = self.avg_rating - whole >= 0.5
        star_count = whole + 1 if rounds_up else whole
        return STAR_SVG * star_count

# -------------------------------------------------------------------
#  MAIN ENTRY
# -------------------------------------------------------------------
def create_topic_cards_from_classified(
    df: pd.DataFrame,
    *,
    ollama_model_name: str = "gemma3:12b",
    ollama_runner: Callable[[str, str], str] = _subprocess_runner,
) -> Dict[str, dict]:
    """
    Render topic cards & a rating chart for the most frequent topics.

    Args:
        df: Classified reviews. Must contain 'topics' (comma-separated topic
            names) and 'sentiment'. 'content_english' is used as review text
            when present, otherwise 'content'. When 'score' is absent it is
            derived from the sentiment (positive 4.5, neutral 3.0,
            negative 1.5, anything else 3.0). The caller's frame is not
            modified.
        ollama_model_name: Model name handed to `ollama_runner`.
        ollama_runner: Callable(prompt, model) -> raw model output.

    Returns:
        dict[topic] -> dict of the TopicSummary fields (summary, issues,
        positives, avg_rating, review_count, sentiment_dist, top_keywords,
        top_phrases, prompts). The prompts sent to the model are stored
        under 'prompts'.

    Raises:
        ValueError: if 'topics' or 'sentiment' is missing from df.
    """
    import re  # local import: only needed to escape topic names below

    log = _setup_logger()

    # -------- sanity checks -----------------------------------------
    if {"topics", "sentiment"} - set(df.columns):
        raise ValueError("DataFrame must include 'topics' & 'sentiment'.")

    text_col = "content_english" if "content_english" in df.columns else "content"

    if "score" not in df.columns:
        # assign() returns a new frame, so the caller's DataFrame is untouched
        df = df.assign(
            score=df["sentiment"]
            .map({"positive": 4.5, "neutral": 3.0, "negative": 1.5})
            .fillna(3.0)
        )

    # -------- pick top topics --------------------------------------
    topics_freq = df["topics"].str.get_dummies(sep=",").sum()
    top_topics = (
        topics_freq.sort_values(ascending=False).head(TOP_N_TOPICS).index
    )

    results: Dict[str, TopicSummary] = {}

    for topic in tqdm(top_topics, desc="Topics"):
        # Escape the topic so names containing regex metacharacters (e.g.
        # "UI/UX (beta)") are matched literally; the look-arounds act like
        # \b but also work when the name starts or ends with a non-word char.
        topic_pattern = rf"(?<!\w){re.escape(topic)}(?!\w)"
        sub = df[df["topics"].str.contains(topic_pattern, case=False, na=False)]
        if sub.empty:
            continue

        avg = sub["score"].mean()
        mix = sub["sentiment"].value_counts(normalize=True).to_dict()

        # keywords = most common tokens; phrases = most common adjacent pairs
        tokens = word_tokenize(" ".join(sub[text_col].fillna("").str.lower()))
        tokens = [t for t in tokens if t.isalpha() and t not in STOPWORDS and len(t) > 2]
        keywords = [w for w, _ in Counter(tokens).most_common(N_KEYWORDS)]
        bigrams = [f"{a} {b}" for a, b in zip(tokens, tokens[1:])]
        phrases = [p for p, _ in Counter(bigrams).most_common(N_PHRASES)]

        sample_n = min(PROMPT_SAMPLE_SIZE, len(sub))
        sample_texts = _stratified_sample(sub, sample_n)
        sample_blob = "\n".join(f"[{i}] {txt}" for i, txt in enumerate(sample_texts))

        ctx_block = (
            f"Average score {avg:.2f} (n={len(sub)}). "
            f"Sentiment mix: neg {mix.get('negative',0):.0%}, "
            f"pos {mix.get('positive',0):.0%}, "
            f"neu {mix.get('neutral',0):.0%}.\n"
            f"Top keywords: {', '.join(keywords)}.\n"
            f"Top phrases: {', '.join(phrases)}."
        )

        prompts: Dict[str, str] = {}

        def build(kind: str) -> str:
            """Return fully‑formed prompt & store it"""
            header = (
                "You are an analytical assistant. Follow ALL rules:\n"
                f"• Focus **only** on the topic '{topic}'.\n"
                "• Ignore unrelated features.\n"
                "• Use English even if reviews were translated.\n"
                "• Be factual, concise (temperature 0).\n"
            )
            if kind == "summary":
                body = (
                    "Write one concise paragraph (3‑5 sentences) that "
                    "summarises what users say about this topic, covering "
                    "both pain points and praise."
                )
            else:
                label = "negative issues" if kind == "issues" else "positive aspects"
                prefix = "-" if kind == "issues" else "+"
                body = (
                    f"List up to {MAX_BULLETS} main {label}. "
                    f"Each line MUST start with '{prefix} '. "
                    "Use short noun phrases, no duplication, no periods."
                )

            prompt = (
                f"{header}\n\nCONTEXT:\n{ctx_block}\n\nSAMPLES:\n{sample_blob}\n\nTASK:\n{body}"
            )
            prompts[kind] = prompt
            return prompt

        # ----- call Ollama (three prompts in parallel) ----------------
        # build() runs in this thread while the futures are submitted, so the
        # per-iteration closure variables are read before the loop advances.
        with ThreadPoolExecutor(max_workers=3) as pool:
            futures = {
                pool.submit(ollama_runner, build(k), ollama_model_name): k
                for k in ("summary", "issues", "positives")
            }
            raw = {k: f.result().strip() for f, k in futures.items()}

        summary_text = raw["summary"]
        issues_list  = _parse_bullets(raw["issues"], "-")
        pos_list     = _parse_bullets(raw["positives"], "+")

        results[topic] = TopicSummary(
            summary=summary_text,
            issues=issues_list,
            positives=pos_list,
            avg_rating=avg,
            review_count=len(sub),
            sentiment_dist=mix,
            top_keywords=keywords,
            top_phrases=phrases,
            prompts=prompts,
        )

    # ------------- render & plot -------------------------------------
    _render_cards(results)
    _plot_chart(results, df)
    return {k: v.__dict__ for k, v in results.items()}

# -------------------------------------------------------------------
#  VISUAL HELPER – HTML CARDS
# -------------------------------------------------------------------
def _render_cards(data: Dict[str, TopicSummary]) -> None:
    """Display one HTML card per topic, largest review count first.

    The topic named "other" (any letter case) is skipped. All text coming
    from the data or the model is HTML-escaped before it is inserted.
    """
    # Braces are doubled because the template goes through str.format().
    stylesheet = textwrap.dedent(
        """
        <style>
          :root {{--blue:{blue};--neg:#E53935;--pos:#43A047;--neu:#FFA000;
                 --bg:#fff;--text:#1a1a1a;}}
          @media (prefers-color-scheme: dark) {{
              :root {{--bg:#121212;--text:#e0e0e0;}}
          }}
          body {{background:var(--bg);color:var(--text);}}

/* container */
          .cards {{max-width:1200px;margin:0 auto;font-family:'Helvetica Neue',Arial,sans-serif;}}

/* card */
          .card {{
              display:grid;grid-template-columns:220px 1fr;
              border:1px solid #bbb;border-radius:8px;margin:18px 0;
              overflow:hidden;box-shadow:0 4px 10px rgba(0,0,0,.12);
              animation:fadeIn .4s ease;
          }}
          @keyframes fadeIn {{from {{opacity:0;transform:translateY(8px);}}
                              to   {{opacity:1;transform:translateY(0);}}}}
          header {{
              grid-column:1/-1;background:linear-gradient(135deg,var(--blue) 0%,#0088cc 100%);
              color:#fff;padding:12px 18px;font-size:20px;font-weight:600;
          }}

/* sidebar */
          .side {{
              background:#f5f7fa;border-right:1px solid #ddd;
              padding:14px;display:flex;flex-direction:column;gap:14px;
          }}
          .stat .num {{font-weight:700;font-size:19px;line-height:1;color:var(--text);}}
          .stat .label{{font-size:11px;text-transform:uppercase;font-weight:600;
                       letter-spacing:.4px;color:var(--text);}}
          .bar {{display:flex;height:10px;border-radius:4px;overflow:hidden;margin-top:4px;}}
          .seg {{flex-shrink:0;transition:opacity .2s;}} .seg:hover {{opacity:.75;}}
          .neg{{background:var(--neg);}} .pos{{background:var(--pos);}} .neu{{background:var(--neu);}}

/* main */
          .main {{padding:18px;line-height:1.6;}}
          .summary {{
              background:rgba(0,102,177,.07);border-left:3px solid var(--blue);
              padding:10px;margin-bottom:16px;line-height:1.6;
          }}
          .cols {{display:flex;flex-wrap:wrap;gap:24px;margin-bottom:16px;}}
          h4 {{margin:0 0 6px;color:var(--blue);font-weight:600;}}
          ul {{margin:0;padding-left:18px;}}
          li {{margin:4px 0;}}

/* chips */
          .chips {{margin-bottom:10px;}}
          .chip {{
              display:inline-block;padding:4px 10px;border-radius:20px;
              font-size:11px;font-weight:700;margin:2px;cursor:pointer;
              transition:opacity .2s;
          }}
          .chip:hover {{opacity:.8;}}
          .kw{{background:#e3f2fd;color:var(--blue);}}
          .ph{{background:#e8f5e9;color:#2e7d32;border:1px solid #c8e6c9;}}

/* responsive */
          @media (max-width:768px){{
              .card{{grid-template-columns:1fr}}
              .side{{order:2;border:none;border-top:1px solid #ddd;
                    flex-direction:row;justify-content:space-around}}
          }}
        </style>
        """
    ).format(blue=BMW_BLUE)

    fragments: List[str] = [stylesheet, "<div class='cards'>"]

    # biggest topics first
    ranked = sorted(data.items(), key=lambda item: item[1].review_count, reverse=True)
    for topic, info in ranked:
        if topic.lower() == "other":
            continue

        sentiment_bar = "".join(
            _bar_seg(label, share) for label, share in info.sentiment_dist.items()
        )
        keyword_chips = "".join(
            f"<span class='chip kw'>{html.escape(word)}</span>" for word in info.top_keywords
        )
        phrase_chips = "".join(
            f"<span class='chip ph'>{html.escape(phrase)}</span>" for phrase in info.top_phrases
        )

        issue_items = "".join(f"<li>{html.escape(entry)}</li>" for entry in info.issues)
        if not issue_items:
            issue_items = "<li>No major issues</li>"

        positive_items = "".join(f"<li>{html.escape(entry)}</li>" for entry in info.positives)
        if not positive_items:
            positive_items = "<li>No specific positives</li>"

        card_parts = [
            "<div class='card'>",
            f"<header>{html.escape(topic)}</header>",
            # sidebar: counts, rating and sentiment bar
            "<section class='side'>",
            f"<div class='stat'><span class='num'>{info.review_count}</span><span class='label'>Reviews</span></div>",
            f"<div class='stat'><span class='num'>{info.avg_rating:.2f}/5 {info.stars}</span>",
            "<span class='label'>Average</span></div>",
            f"<div class='stat'><span class='label'>Sentiment</span><div class='bar'>{sentiment_bar}</div></div>",
            "</section>",
            # main column: summary, bullet lists and chips
            "<section class='main'>",
            f"<p class='summary'>{html.escape(info.summary)}</p>",
            "<div class='cols'>",
            f"<div><h4>Issues</h4><ul>{issue_items}</ul></div>",
            f"<div><h4>Positives</h4><ul>{positive_items}</ul></div>",
            "</div>",
            f"<div class='chips'><h4>Keywords</h4>{keyword_chips}</div>",
            f"<div class='chips'><h4>Phrases</h4>{phrase_chips}</div>",
            "</section>",
            "</div>",
        ]
        fragments.append("".join(card_parts))

    fragments.append("</div>")
    display(HTML("\n".join(fragments)))

def _bar_seg(name: str, frac: float) -> str:
    cls = "pos" if name == "positive" else "neg" if name == "negative" else "neu"
    pct = max(frac * 100, 1)
    return f"<span class='seg {cls}' style='width:{pct}%' title='{name}: {frac:.1%}'></span>"

# -------------------------------------------------------------------
#  VISUAL HELPER – MATPLOTLIB CHART
# -------------------------------------------------------------------
def _plot_chart(data: Dict[str, TopicSummary], df: pd.DataFrame) -> None:
    """Draw a horizontal bar chart of the average rating per topic.

    Topics are ordered by rating, the topic "other" is left out, each bar is
    annotated with its review count, and a dashed line marks the mean of
    df["score"]. Nothing is drawn when no topic remains.
    """
    entries = [
        (name, summary.avg_rating, summary.review_count)
        for name, summary in data.items()
        if name.lower() != "other"
    ]
    if not entries:
        return

    # list.sort is stable, so equally rated topics keep their original order
    entries.sort(key=lambda entry: entry[1])
    topics = [entry[0] for entry in entries]
    ratings = [entry[1] for entry in entries]
    counts = [entry[2] for entry in entries]

    palette = mpl.colormaps.get_cmap("viridis").resampled(len(topics))
    bar_colors = [palette(i) for i in range(len(topics))]

    plt.figure(figsize=(12, 10))
    bars = plt.barh(topics, ratings, color=bar_colors)

    for bar, cnt, rating in zip(bars, counts, ratings):
        # short bars get a black label to their right, long bars a white one inside
        is_short = rating < 2.5
        x = rating + 0.05 if is_short else rating - 0.6
        clr = "black" if is_short else "white"
        y = bar.get_y() + bar.get_height() / 2
        plt.text(x, y, f"n={cnt}", va="center", color=clr, fontweight="bold")

    mean = df["score"].mean()
    plt.axvline(mean, linestyle="--", linewidth=2, label=f"Overall {mean:.2f}")
    plt.xticks(range(1, 6), ["★" * i for i in range(1, 6)])
    plt.xlim(0, 5.2)
    plt.xlabel("Star Rating")
    plt.ylabel("Topic")
    plt.title("BMW App – Average Rating by Topic")
    plt.legend()
    plt.grid(axis="x", linestyle="--", alpha=0.5)
    plt.tight_layout()
    plt.show()

# -------------------------------------------------------------------
#  MISC HELPERS
# -------------------------------------------------------------------
def _setup_logger() -> logging.Logger:
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
    return logging.getLogger("bmw_topic_cards")

In [19]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Evaluate two candidate translations of app reviews by giving a simple 
numerical score (1-5) to each translation compared to the original.

Key features
----------------
* Simple 1-5 scoring for each translation
* Automatic delimiter detection (handles ';', ',', or '\t')
* Early validation that required columns exist
* Summary statistics broken down by language

Author: <your name>
Date: 2025‑05‑04
"""

import csv
import os
import subprocess
import time
import json
from typing import Optional, Dict, Any

import pandas as pd
from tqdm import tqdm


# --------------------------------------------------------------------------- #
#                              Helper functions                               #
# --------------------------------------------------------------------------- #
def detect_delimiter(file_path: str, sample_bytes: int = 2048) -> str:
    """Return the most likely delimiter in the CSV file.

    The decision is based on the header line: the candidate (tab, ';', ',')
    that occurs most often there wins, ties going to the earlier candidate.
    Looking only at the header avoids being fooled by a ';' or tab that
    merely occurs inside a review text. If the header contains none of the
    candidates, csv.Sniffer is tried on the whole sample.

    Args:
        file_path: Path of the CSV file (read as UTF-8).
        sample_bytes: Number of characters read from the start of the file.

    Returns:
        The detected delimiter, or ',' when nothing can be determined.
    """
    candidates = ("\t", ";", ",")

    with open(file_path, "r", encoding="utf-8") as f:
        sample = f.read(sample_bytes)

    if not sample.strip():
        return ","  # empty file: default

    header = sample.splitlines()[0]
    counts = {delim: header.count(delim) for delim in candidates}
    best = max(candidates, key=counts.get)
    if counts[best] > 0:
        return best

    # Fallback to csv.Sniffer, restricted to the delimiters we support
    try:
        dialect = csv.Sniffer().sniff(sample, delimiters="".join(candidates))
        return dialect.delimiter
    except Exception:
        return ","  # default


def run_ollama(prompt: str, model_name: str, max_retries: int = 2) -> str:
    """Call `ollama run` with retries and a timeout; return raw stdout.

    A timeout, any other exception and a non-zero exit status all count as a
    failed attempt. Failed attempts are retried after a 2 second pause.

    Args:
        prompt: Prompt text sent on standard input.
        model_name: Ollama model to run.
        max_retries: Number of additional attempts after the first one.

    Returns:
        The stripped standard output on success, otherwise a string starting
        with "ERROR:" that describes the last failure.
    """
    last_error = "ERROR: no attempts made"  # only returned when max_retries < 0

    for attempt in range(max_retries + 1):
        try:
            proc = subprocess.run(
                ["ollama", "run", model_name],
                input=prompt,
                text=True,
                capture_output=True,
                timeout=60,  # Reduced timeout for simpler prompt
            )
            if proc.returncode != 0:
                # Surface the CLI's own message instead of returning empty
                # output that would later be reported as "No JSON found".
                detail = (proc.stderr or "").strip() or f"exit code {proc.returncode}"
                raise RuntimeError(f"ollama failed: {detail}")
            return proc.stdout.strip()
        except subprocess.TimeoutExpired:
            failure = "Timeout"
        except Exception as exc:
            failure = str(exc)

        last_error = f"ERROR: {failure}"
        if attempt < max_retries:
            print(f"{failure}. Retrying {attempt + 1}/{max_retries} …")
            time.sleep(2)

    return last_error


def create_simple_prompt(row: pd.Series) -> str:
    """Return the scoring prompt for one review.

    Reads 'language', 'content', 'Translation A' and 'Translation B' from
    `row` and asks the model for a JSON object holding two 1-5 scores.
    """
    source_language = row['language']
    source_text = row['content']
    candidate_a = row['Translation A']
    candidate_b = row['Translation B']

    return f"""You are an expert translator evaluating machine translations.

SOURCE TEXT ({source_language}):
{source_text}

TRANSLATION A:
{candidate_a}

TRANSLATION B:
{candidate_b}

TASK:
Rate each translation on a scale from 1 (poor) to 5 (excellent) based on how well it captures 
the meaning, tone, and intent of the source text.

RESPONSE FORMAT:
Respond with ONLY a JSON object with two scores like this:
{{
  "score_A": 3,
  "score_B": 5
}}

Use only integers from 1-5. No explanation or additional text.
"""


def parse_score_response(response: str) -> Dict[str, Any]:
    """
    Parse the LLM response containing simple scores.

    The first '{' ... last '}' span is parsed as JSON. If that does not yield
    two valid scores, the text is searched for "score_A: 3" / "Score B = 5" /
    "Translation A: 4" style fragments. A score is valid when it is a number
    (or numeric string) between 1 and 5.

    Args:
        response: Raw model output, or a string starting with "ERROR:" as
            produced by the runner on failure.

    Returns:
        On success: {"score_A": ..., "score_B": ..., "winner": "A"|"B"|"Tie"}.
        On failure: {"error": message, "score_A": None, "score_B": None,
        "winner": "Error"} for runner errors, or the same with
        "winner": "Unclear" when no valid pair of scores could be extracted.
    """
    import re  # local import: only needed for the text fallback

    def _coerce(value):
        """Return value as a number in 1..5 (int when integral), else None."""
        if isinstance(value, bool):
            return None
        if isinstance(value, str):
            try:
                value = float(value.strip())
            except ValueError:
                return None
        # the range test also rejects NaN and infinities
        if not isinstance(value, (int, float)) or not 1 <= value <= 5:
            return None
        return int(value) if value == int(value) else value

    def _verdict(score_a, score_b):
        """Build the success result, deriving the winner from the scores."""
        if score_a > score_b:
            winner = "A"
        elif score_b > score_a:
            winner = "B"
        else:
            winner = "Tie"
        return {"score_A": score_a, "score_B": score_b, "winner": winner}

    def _failure(message, winner="Unclear"):
        """Build a failure result with the same keys as a success result."""
        return {"error": message, "score_A": None, "score_B": None, "winner": winner}

    if response.startswith("ERROR:"):
        return _failure(response, winner="Error")

    # Try to extract JSON from the response
    start_idx = response.find("{")
    end_idx = response.rfind("}")
    if start_idx >= 0 and end_idx > start_idx:
        try:
            data = json.loads(response[start_idx:end_idx + 1])
        except json.JSONDecodeError:
            reason = "Could not extract scores"
        else:
            if isinstance(data, dict):
                score_a = _coerce(data.get("score_A"))
                score_b = _coerce(data.get("score_B"))
                if score_a is not None and score_b is not None:
                    return _verdict(score_a, score_b)
            reason = "Missing scores in response"
    else:
        reason = "No JSON found in response"

    # Text fallback: the first valid value found for each translation wins
    pattern = r"""(?:score|translation)[\s_]*([ab])\b["']?\s*[:=]\s*["']?(\d+(?:\.\d+)?)"""
    found: Dict[str, Any] = {}
    for match in re.finditer(pattern, response, flags=re.IGNORECASE):
        value = _coerce(match.group(2))
        if value is not None:
            found.setdefault(match.group(1).upper(), value)

    if "A" in found and "B" in found:
        return _verdict(found["A"], found["B"])

    return _failure(reason)


# --------------------------------------------------------------------------- #
#                             Main entry‑point                                #
# --------------------------------------------------------------------------- #
def _save_evaluation_results(df: pd.DataFrame, output_file: str) -> bool:
    """Write df to output_file as a fully quoted CSV, falling back to Excel; return success."""
    try:
        df.to_csv(output_file, index=False, quoting=csv.QUOTE_ALL)
        print(f"✅ Results written to {output_file}")
        return True
    except Exception as exc:
        print(f"⚠️ CSV save failed ({exc}); trying Excel …")

    try:
        df.to_excel(output_file.replace(".csv", ".xlsx"), index=False)
        print("✅ Saved as Excel instead")
        return True
    except Exception as exc2:
        print(f"❌ Could not save results at all: {exc2}")
        return False


def _print_evaluation_summary(df: pd.DataFrame) -> None:
    """Print overall, per-language and score-difference statistics; adds df['score_diff']."""
    # The score columns may be object dtype holding ints, None and pd.NA;
    # convert once so means and differences are computed on real numbers.
    score_a = pd.to_numeric(df["score_A"], errors="coerce")
    score_b = pd.to_numeric(df["score_B"], errors="coerce")

    print("\n====== EVALUATION SUMMARY ======")

    # Overall summary
    print("\n🌐 OVERALL RESULTS:")
    counts = df["judgment"].value_counts(dropna=False)
    for label, cnt in counts.items():
        pct = cnt / len(df) * 100
        print(f"  {label:8}: {cnt:>4}  ({pct:5.1f} %)")

    a_avg = score_a.mean()
    b_avg = score_b.mean()
    diff = a_avg - b_avg

    print(f"\n  Average Scores (1-5):")
    print(f"  Translation A: {a_avg:.2f}")
    print(f"  Translation B: {b_avg:.2f}")
    print(f"  Difference (A-B): {diff:+.2f}")

    # Per-language statistics
    print("\n🔍 RESULTS BY LANGUAGE:")

    # Languages are reported in this fixed order
    languages = ["German", "Italian", "French", "Spanish"]

    # Missing language values are dropped so sorting never mixes NaN and str
    present_languages = sorted(str(lang) for lang in df["language"].dropna().unique())
    print(f"  Languages in dataset: {', '.join(present_languages)}")

    for lang in languages:
        in_lang = df["language"] == lang
        n_lang = int(in_lang.sum())

        if n_lang == 0:
            print(f"\n  ❌ No reviews for {lang}")
            continue

        print(f"\n  📊 {lang} ({n_lang} reviews):")

        # Judgment distribution
        lang_counts = df.loc[in_lang, "judgment"].value_counts(dropna=False)
        for label in ["A", "B", "Tie", "Error", "Unclear"]:
            if label in lang_counts:
                cnt = lang_counts[label]
                pct = cnt / n_lang * 100
                print(f"    {label:8}: {cnt:>3}  ({pct:5.1f} %)")

        # Scores
        lang_a_avg = score_a[in_lang].mean()
        lang_b_avg = score_b[in_lang].mean()
        lang_diff = lang_a_avg - lang_b_avg

        print(f"\n    Scores (1-5):")
        print(f"    Translation A: {lang_a_avg:.2f}")
        print(f"    Translation B: {lang_b_avg:.2f}")
        print(f"    Difference (A-B): {lang_diff:+.2f}")

    # Distribution of score differences
    print("\n📈 SCORE DIFFERENCE DISTRIBUTION:")

    df["score_diff"] = score_a - score_b

    # Rows without both scores have a NaN difference and fall in no category
    diff_categories = {
        "A much better (≥2)": (df["score_diff"] >= 2).sum(),
        "A better (1)": (df["score_diff"] == 1).sum(),
        "Tie (0)": (df["score_diff"] == 0).sum(),
        "B better (-1)": (df["score_diff"] == -1).sum(),
        "B much better (≤-2)": (df["score_diff"] <= -2).sum()
    }

    for label, count in diff_categories.items():
        pct = count / len(df) * 100
        print(f"  {label:20}: {count:>3}  ({pct:5.1f} %)")


def evaluate_translations(
    input_file: str,
    output_file: str,
    model_name: str = "llama3:latest",
) -> Optional[pd.DataFrame]:
    """Load review/translation CSV, evaluate each pair with simple scoring, save results.

    Rows that already carry both 'score_A' and 'score_B' are skipped, so a
    previous (partial) output file can be passed back in as input to resume.
    If the run is interrupted with Ctrl-C / kernel interrupt, the rows scored
    so far are saved to `output_file` before the interrupt is re-raised.

    Args:
        input_file: CSV with at least 'content', 'language',
            'Translation A' and 'Translation B' columns.
        output_file: Destination CSV (an .xlsx with the same stem is tried
            when the CSV cannot be written).
        model_name: Ollama model used as judge.

    Returns:
        The evaluated DataFrame (with 'score_A', 'score_B', 'judgment',
        'score_diff' and, if any row failed, 'error'), or None when the
        input cannot be loaded, lacks required columns, or the results
        cannot be saved.
    """
    print(f"Reading {input_file} …")
    delimiter = detect_delimiter(input_file)
    print(f"Detected delimiter '{delimiter}'")

    try:
        df = pd.read_csv(
            input_file,
            sep=delimiter,
            quotechar='"',
            doublequote=True,
            engine="python",  # forgiving
        )
    except Exception as exc:
        print(f"❌ Could not load CSV: {exc}")
        return None

    # Required columns
    required = ["content", "language", "Translation A", "Translation B"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        print("❌ Missing required columns:", missing)
        print("Available columns:", df.columns.tolist())
        return None

    # Add score and judgment columns if not present
    for column in ("score_A", "score_B", "judgment"):
        if column not in df.columns:
            df[column] = pd.NA

    # Evaluate rows
    print(f"Evaluating {len(df)} rows with {model_name} …")
    try:
        for idx in tqdm(df.index):
            # Skip if already evaluated
            if pd.notna(df.at[idx, "score_A"]) and pd.notna(df.at[idx, "score_B"]):
                continue

            prompt = create_simple_prompt(df.loc[idx])
            response = run_ollama(prompt, model_name)

            # Parse the response
            result = parse_score_response(response)

            # Store the scores and judgment
            df.at[idx, "score_A"] = result.get("score_A", None)
            df.at[idx, "score_B"] = result.get("score_B", None)
            df.at[idx, "judgment"] = result.get("winner", "Error")

            # Save error info if there was an error
            if "error" in result:
                df.at[idx, "error"] = result["error"]
                print(f"⚠️ Row {idx}: {result['error']}")
    except KeyboardInterrupt:
        # Each row costs a model call; keep what has been scored so far
        print("\n⚠️ Interrupted – saving partial results …")
        _save_evaluation_results(df, output_file)
        raise

    # Final save
    if not _save_evaluation_results(df, output_file):
        return None

    _print_evaluation_summary(df)

    return df


# --------------------------------------------------------------------------- #
#                                 CLI hook                                    #
# --------------------------------------------------------------------------- #
if __name__ == "__main__":
    # Input: sampled reviews with both candidate translations; output: same rows plus scores
    IN_FILE = "bmw_app_analysis/translations/bmw_reviews_sampledHE.csv"
    OUT_FILE = "bmw_app_analysis/translations/bmw_reviews_sampledLLM.csv"
    MODEL = "gemma3:12b"

    print(f"Starting simple translation evaluation with {MODEL} …")
    evaluate_translations(
        input_file=IN_FILE,
        output_file=OUT_FILE,
        model_name=MODEL,
    )

Starting simple translation evaluation with gemma3:12b …
Reading bmw_app_analysis/translations/bmw_reviews_sampledHE.csv …
Detected delimiter ';'
Evaluating 120 rows with gemma3:12b …


  0%|          | 0/120 [00:00<?, ?it/s]

  3%|▎         | 4/120 [00:46<22:29, 11.63s/it]  


KeyboardInterrupt: 