In [4]:
import pandas as pd
import re

def sample_reviews_by_language(input_file='bmw_app_analysis/results/bmw_reviews_consolidated_20250504_155236.csv',
                              output_file='bmw_app_analysis/results/bmw_reviews_sampledHE.csv',
                              target_languages=("German", "Italian", "Spanish", "French"),
                              target_per_language=30,
                              min_words=5,
                              random_state=42):
    """
    Sample reviews per language from the consolidated BMW reviews CSV.

    With the defaults this draws 30 reviews EACH from German, Italian,
    Spanish and French, keeping only reviews whose original text has at
    least 5 words, and writes them to ``output_file``.

    Args:
        input_file: Path to the input CSV file. Must contain the columns
            'reviewId', 'content', 'language' and 'content_english'.
        output_file: Path to save the sampled reviews.
        target_languages: Languages to sample, in the order they should
            appear in the output.
        target_per_language: Maximum number of reviews drawn per language
            (all available reviews are taken when fewer exist).
        min_words: Minimum number of words in 'content' for a review to
            be eligible.
        random_state: Seed passed to ``DataFrame.sample`` so the draw is
            reproducible.

    Returns:
        DataFrame with columns 'reviewId', 'content', 'language' and
        'english_content', or None when required columns are missing or
        no review matches the filters (nothing is written in that case).
    """
    # Read the consolidated file
    print(f"Reading {input_file}...")
    df = pd.read_csv(input_file)

    # Verify columns exist
    required_columns = ['reviewId', 'content', 'language', 'content_english']
    missing_columns = [col for col in required_columns if col not in df.columns]

    if missing_columns:
        print(f"ERROR: Missing required columns: {missing_columns}")
        print(f"Available columns: {df.columns.tolist()}")
        return None

    target_languages = list(target_languages)

    # Apply minimum word filter (non-string content, e.g. NaN, counts as 0 words)
    print(f"Filtering reviews with at least {min_words} words...")
    word_counts = df['content'].apply(
        lambda text: len(re.findall(r'\b\w+\b', text)) if isinstance(text, str) else 0
    )
    df_filtered_by_length = df[word_counts >= min_words]

    print(f"Reviews with {min_words}+ words: {len(df_filtered_by_length)} out of {len(df)}")

    # Apply language filter
    df_filtered = df_filtered_by_length[df_filtered_by_length['language'].isin(target_languages)]

    # Report on available reviews per language
    print("\nReviews available after filtering by language and length:")
    for lang in target_languages:
        count = len(df_filtered[df_filtered['language'] == lang])
        print(f"  {lang}: {count} reviews")

    # Sample from each language group (or take all available if fewer exist)
    sampled_parts = []
    for lang in target_languages:
        lang_df = df_filtered[df_filtered['language'] == lang]

        if lang_df.empty:
            print(f"No reviews found for {lang}")
            continue

        sample_size = min(target_per_language, len(lang_df))
        sampled = lang_df.sample(sample_size, random_state=random_state)
        sampled_parts.append(sampled)
        print(f"Sampled {len(sampled)} reviews from {lang}")

    # pd.concat raises on an empty list, so bail out before touching the output file
    if not sampled_parts:
        print("ERROR: No reviews matched the language and length filters; nothing to save.")
        return None

    # Combine all samples
    sampled_df = pd.concat(sampled_parts, ignore_index=True)

    # Select only the required columns
    output_df = sampled_df[['reviewId', 'content', 'language', 'content_english']].copy()

    # Clean up the translation: a missing translation becomes an empty string
    # (not the literal text "nan") and double quotes are removed
    output_df['content_english'] = (
        output_df['content_english']
        .fillna('')
        .astype(str)
        .str.replace('"', '', regex=False)
    )

    # Rename the column
    output_df = output_df.rename(columns={'content_english': 'english_content'})

    # Sort by language in the requested order; a stable sort keeps the
    # sampled order within each language
    lang_order = {lang: i for i, lang in enumerate(target_languages)}
    output_df['lang_order'] = output_df['language'].map(lang_order)
    output_df = (
        output_df
        .sort_values('lang_order', kind='stable')
        .drop('lang_order', axis=1)
        .reset_index(drop=True)
    )

    # Save to CSV
    output_df.to_csv(output_file, index=False)
    print(f"Saved sampled reviews to {output_file}")

    print("\nFinal sample distribution:")
    for lang in target_languages:
        count = len(output_df[output_df['language'] == lang])
        print(f"  {lang}: {count} reviews")

    return output_df

# Execute the function
if __name__ == "__main__":
    sampled_reviews = sample_reviews_by_language()

    # Preview one review per language, but only when sampling produced rows
    has_rows = sampled_reviews is not None and not sampled_reviews.empty
    if has_rows:
        print("\nExample reviews from each language:")
        for lang in ["German", "Italian", "Spanish", "French"]:
            lang_samples = sampled_reviews[sampled_reviews['language'] == lang].head(1)
            if lang_samples.empty:
                continue

            print(f"\n{lang} example:")
            for _, row in lang_samples.iterrows():
                # Show at most 50 characters and flag truncation with an ellipsis
                original_tail = '...' if len(row['content']) > 50 else ''
                print(f"  Original: {row['content'][:50]}{original_tail}")
                english_tail = '...' if len(row['english_content']) > 50 else ''
                print(f"  English:  {row['english_content'][:50]}{english_tail}")

Reading bmw_app_analysis/results/bmw_reviews_consolidated_20250504_155236.csv...
Filtering reviews with at least 5 words...
Reviews with 5+ words: 13312 out of 18350

Reviews available after filtering by language and length:
  German: 3839 reviews
  Italian: 920 reviews
  Spanish: 670 reviews
  French: 1155 reviews
Sampled 30 reviews from German
Sampled 30 reviews from Italian
Sampled 30 reviews from Spanish
Sampled 30 reviews from French
Saved sampled reviews to bmw_app_analysis/results/bmw_reviews_sampledHE.csv

Final sample distribution:
  German: 30 reviews
  Italian: 30 reviews
  Spanish: 30 reviews
  French: 30 reviews

Example reviews from each language:

German example:
  Original: Die App kann schon viel, aber warum lassen sich di...
  English:  The app can already do a lot, but why can't the wi...

Italian example:
  Original: Dalla versione 2.11.0, con IDrive 6 e telefono And...
  English:  Since version 2.11.0, with iDrive 6 and an Android...

Spanish example:
  Original: A

In [None]:
# bmw_topic_cards.py  –  v6  (2025‑05‑02)
# ================================================================
# • Stratified random sampling (mirrors topic sentiment mix)
# • One paragraph summary  +  “‑ ” negative bullets  +  “+ ” positive bullets
# • Prompts stored inside TopicSummary.prompts  (audit / debugging)
# • No JSON parsing, so Ollama chatter can’t break the pipeline
# ---------------------------------------------------------------

from __future__ import annotations

import html
import logging
import subprocess
import textwrap
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from typing import Callable, Dict, List, Optional

import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk
import pandas as pd
from IPython.display import HTML, display
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm.auto import tqdm

# -------------------------------------------------------------------
#  CONFIGURATION
# -------------------------------------------------------------------
TOP_N_TOPICS = 15            # number of most frequent topics that get a card
PROMPT_SAMPLE_SIZE = 300     # upper bound on reviews quoted in each LLM prompt
N_KEYWORDS = 8               # keywords kept per topic
N_PHRASES = 5                # bigram phrases kept per topic
MAX_BULLETS = 7              # cap on issue / positive bullets per topic
BMW_BLUE = "#0066B1"         # accent colour injected into the card CSS

# Inline 14x14 gold star used for the rating display
STAR_SVG = "".join(
    [
        "<svg width='14' height='14' viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg' ",
        "style='vertical-align:-2px'>",
        "<polygon fill='#FFD700' points='12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 ",
        "12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26'/>",
        "</svg>",
    ]
)

# -------------------------------------------------------------------
#  NLTK – bootstrap stop‑words
# -------------------------------------------------------------------
# Each resource lives in its own NLTK data category: the stop-word lists are a
# corpus, while the Punkt sentence models used by word_tokenize are tokenizers.
# Looking Punkt up under "corpora/" never succeeds, which forced a download
# attempt on every run.
# NOTE(review): recent NLTK releases load "punkt_tab" instead of "punkt" for
# word_tokenize; both are requested so either release line works — confirm
# against the pinned NLTK version.
for res, res_path in (
    ("stopwords", "corpora/stopwords"),
    ("punkt", "tokenizers/punkt"),
    ("punkt_tab", "tokenizers/punkt_tab"),
):
    try:
        nltk.data.find(res_path)
    except LookupError:
        nltk.download(res, quiet=True)

# English stop-words plus domain words that appear in almost every review
STOPWORDS = set(stopwords.words("english")) | {
    "bmw", "app", "car", "please", "would", "also", "get", "use", "using"
}

# -------------------------------------------------------------------
#  OLLAMA RUNNER  (temperature 0 for determinism)
# -------------------------------------------------------------------
def _subprocess_runner(prompt: str, model: str) -> str:
    proc = subprocess.run(
        ["ollama", "run", model, "-t", "0"],
        input=prompt,
        text=True,
        capture_output=True,
        check=False,
    )
    return proc.stdout

# -------------------------------------------------------------------
#  STRATIFIED SAMPLING  (mirror sentiment mix)
# -------------------------------------------------------------------
def _stratified_sample(df: pd.DataFrame, size: int, seed: int = 0) -> List[str]:
    dist = df["sentiment"].value_counts(normalize=True)
    quota = {s: int(round(dist.get(s, 0) * size)) for s in ("negative", "positive", "neutral")}
    diff = size - sum(quota.values())
    if diff:
        quota[max(dist, key=dist.get, default="negative")] += diff

    col = "content_english" if "content_english" in df.columns else "content"
    texts: List[str] = []

    for sentiment, n in quota.items():
        if n <= 0:
            continue
        bucket = df[df["sentiment"] == sentiment]
        sample_n = min(n, len(bucket))
        texts.extend(bucket.sample(sample_n, random_state=seed)[col].tolist())

    # top‑up if any bucket ran short
    if len(texts) < size:
        short = size - len(texts)
        remainder = df.drop(df.index[df[col].isin(texts)]).sample(short, random_state=seed)
        texts.extend(remainder[col].tolist())

    return texts

# -------------------------------------------------------------------
#  BULLET‑PARSER
# -------------------------------------------------------------------
def _parse_bullets(text: str, prefix: str) -> List[str]:
    """Return list of lines starting with the prefix (‘- ’ or ‘+ ’)."""
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    bullets = [ln[len(prefix):].strip() for ln in lines if ln.startswith(prefix)]
    # fallback to comma‑sep if model ignored prefixes
    if not bullets and "," in text:
        bullets = [part.strip() for part in text.split(",") if part.strip()]
    return bullets[:MAX_BULLETS]

# -------------------------------------------------------------------
#  DATACLASS
# -------------------------------------------------------------------
@dataclass
class TopicSummary:
    """Everything rendered on one topic card, plus the prompts that produced it."""

    # LLM-generated content
    summary: str
    issues: List[str]
    positives: List[str]

    # Statistics computed from the reviews of the topic
    avg_rating: float
    review_count: int
    sentiment_dist: Dict[str, float]
    top_keywords: List[str]
    top_phrases: List[str]

    # Prompt text per kind, kept for auditing; a fresh dict per instance
    prompts: Dict[str, str] = field(default_factory=dict)

    @property
    def stars(self) -> str:
        """Star markup: one SVG per whole rating point, plus one more when
        the fractional part is at least 0.5."""
        whole_stars = int(self.avg_rating)
        markup = STAR_SVG * whole_stars
        if self.avg_rating - whole_stars >= 0.5:
            markup += STAR_SVG
        return markup

# -------------------------------------------------------------------
#  MAIN ENTRY
# -------------------------------------------------------------------
def create_topic_cards_from_classified(
    df: pd.DataFrame,
    *,
    ollama_model_name: str = "gemma3:12b",
    ollama_runner: Callable[[str, str], str] = _subprocess_runner,
) -> Dict[str, dict]:
    """
    Render topic cards and a rating chart for the most frequent topics.

    For each of the ``TOP_N_TOPICS`` most frequent topics the function
    computes rating/sentiment statistics, keywords and bigram phrases, asks
    the LLM for a summary, a list of issues and a list of positives, then
    displays HTML cards and a bar chart.

    Args:
        df: Classified reviews. Must include 'topics' (comma-separated topic
            names) and 'sentiment'. 'content_english' is used as the text
            column when present, otherwise 'content'. When 'score' is absent
            it is derived from the sentiment on an internal copy; the
            caller's frame is not modified.
        ollama_model_name: Model name passed to ``ollama_runner``.
        ollama_runner: Callable ``(prompt, model) -> raw model output``.

    Returns:
        Dict mapping each topic to a plain dict holding the fields of its
        ``TopicSummary`` (including the prompts under 'prompts').

    Raises:
        ValueError: If 'topics' or 'sentiment' is missing from ``df``.
    """
    import re  # local import: this module has no file-level `import re`

    log = _setup_logger()

    # -------- sanity checks -----------------------------------------
    if {"topics", "sentiment"} - set(df.columns):
        raise ValueError("DataFrame must include 'topics' & 'sentiment'.")

    text_col = "content_english" if "content_english" in df.columns else "content"

    if "score" not in df.columns:
        # assign() returns a new frame, so the caller's DataFrame stays untouched
        df = df.assign(
            score=df["sentiment"]
            .map({"positive": 4.5, "neutral": 3.0, "negative": 1.5})
            .fillna(3.0)
        )

    # -------- pick top topics --------------------------------------
    topics_freq = df["topics"].str.get_dummies(sep=",").sum()
    top_topics = (
        topics_freq.sort_values(ascending=False).head(TOP_N_TOPICS).index
    )

    results: Dict[str, TopicSummary] = {}

    for topic in tqdm(top_topics, desc="Topics"):
        # Match the topic literally as a whole word. Escaping keeps names such
        # as "UI (design)" or "C++" from being read as regex syntax, and the
        # look-arounds behave like \b for names that start/end with a word
        # character while still working for names that do not.
        topic_pattern = rf"(?<!\w){re.escape(topic)}(?!\w)"
        sub = df[df["topics"].str.contains(topic_pattern, case=False, na=False)]
        if sub.empty:
            continue

        avg = sub["score"].mean()
        mix = sub["sentiment"].value_counts(normalize=True).to_dict()

        tokens = word_tokenize(" ".join(sub[text_col].fillna("").str.lower()))
        tokens = [t for t in tokens if t.isalpha() and t not in STOPWORDS and len(t) > 2]
        keywords = [w for w, _ in Counter(tokens).most_common(N_KEYWORDS)]
        bigrams = [f"{a} {b}" for a, b in zip(tokens, tokens[1:])]
        phrases = [p for p, _ in Counter(bigrams).most_common(N_PHRASES)]

        sample_n = min(PROMPT_SAMPLE_SIZE, len(sub))
        sample_texts = _stratified_sample(sub, sample_n)
        sample_blob = "\n".join(f"[{i}] {txt}" for i, txt in enumerate(sample_texts))

        ctx_block = (
            f"Average score {avg:.2f} (n={len(sub)}). "
            f"Sentiment mix: neg {mix.get('negative',0):.0%}, "
            f"pos {mix.get('positive',0):.0%}, "
            f"neu {mix.get('neutral',0):.0%}.\n"
            f"Top keywords: {', '.join(keywords)}.\n"
            f"Top phrases: {', '.join(phrases)}."
        )

        prompts: Dict[str, str] = {}

        def build(kind: str) -> str:
            """Return the fully‑formed prompt for `kind` and record it in `prompts`."""
            header = (
                "You are an analytical assistant. Follow ALL rules:\n"
                f"• Focus **only** on the topic '{topic}'.\n"
                "• Ignore unrelated features.\n"
                "• Use English even if reviews were translated.\n"
                "• Be factual, concise (temperature 0).\n"
            )
            if kind == "summary":
                body = (
                    "Write one concise paragraph (3‑5 sentences) that "
                    "summarises what users say about this topic, covering "
                    "both pain points and praise."
                )
            else:
                label = "negative issues" if kind == "issues" else "positive aspects"
                prefix = "-" if kind == "issues" else "+"
                body = (
                    f"List up to {MAX_BULLETS} main {label}. "
                    f"Each line MUST start with '{prefix} '. "
                    "Use short noun phrases, no duplication, no periods."
                )

            prompt = (
                f"{header}\n\nCONTEXT:\n{ctx_block}\n\nSAMPLES:\n{sample_blob}\n\nTASK:\n{body}"
            )
            prompts[kind] = prompt
            return prompt

        # ----- call Ollama (three prompts in parallel) ----------------
        with ThreadPoolExecutor(max_workers=3) as pool:
            futures = {
                pool.submit(ollama_runner, build(k), ollama_model_name): k
                for k in ("summary", "issues", "positives")
            }
            # A runner that returns None is treated like an empty answer
            raw = {k: (f.result() or "").strip() for f, k in futures.items()}

        for kind, answer in raw.items():
            if not answer:
                log.warning(
                    "Empty %s response from model %r for topic %r",
                    kind, ollama_model_name, topic,
                )

        summary_text = raw["summary"]
        issues_list  = _parse_bullets(raw["issues"], "-")
        pos_list     = _parse_bullets(raw["positives"], "+")

        results[topic] = TopicSummary(
            summary=summary_text,
            issues=issues_list,
            positives=pos_list,
            avg_rating=avg,
            review_count=len(sub),
            sentiment_dist=mix,
            top_keywords=keywords,
            top_phrases=phrases,
            prompts=prompts,
        )

    # ------------- render & plot -------------------------------------
    _render_cards(results)
    _plot_chart(results, df)
    return {k: v.__dict__ for k, v in results.items()}

# -------------------------------------------------------------------
#  VISUAL HELPER – HTML CARDS
# -------------------------------------------------------------------
def _render_cards(data: Dict[str, TopicSummary]) -> None:
    """Display one HTML card per topic in the notebook output.

    Cards are ordered by review count (largest first); a topic named
    "other" (case-insensitive) is skipped. Every text value taken from the
    summaries is passed through ``html.escape`` before being embedded.

    Args:
        data: Mapping of topic name to its ``TopicSummary``.
    """
    # The stylesheet is a str.format template: literal CSS braces are doubled
    # and {blue} is the only placeholder. The `/* ... */` lines start at
    # column 0, so textwrap.dedent finds no common indentation to strip.
    css = textwrap.dedent(
        """
        <style>
          :root {{--blue:{blue};--neg:#E53935;--pos:#43A047;--neu:#FFA000;
                 --bg:#fff;--text:#1a1a1a;}}
          @media (prefers-color-scheme: dark) {{
              :root {{--bg:#121212;--text:#e0e0e0;}}
          }}
          body {{background:var(--bg);color:var(--text);}}

/* container */
          .cards {{max-width:1200px;margin:0 auto;font-family:'Helvetica Neue',Arial,sans-serif;}}

/* card */
          .card {{
              display:grid;grid-template-columns:220px 1fr;
              border:1px solid #bbb;border-radius:8px;margin:18px 0;
              overflow:hidden;box-shadow:0 4px 10px rgba(0,0,0,.12);
              animation:fadeIn .4s ease;
          }}
          @keyframes fadeIn {{from {{opacity:0;transform:translateY(8px);}}
                              to   {{opacity:1;transform:translateY(0);}}}}
          header {{
              grid-column:1/-1;background:linear-gradient(135deg,var(--blue) 0%,#0088cc 100%);
              color:#fff;padding:12px 18px;font-size:20px;font-weight:600;
          }}

/* sidebar */
          .side {{
              background:#f5f7fa;border-right:1px solid #ddd;
              padding:14px;display:flex;flex-direction:column;gap:14px;
          }}
          .stat .num {{font-weight:700;font-size:19px;line-height:1;color:var(--text);}}
          .stat .label{{font-size:11px;text-transform:uppercase;font-weight:600;
                       letter-spacing:.4px;color:var(--text);}}
          .bar {{display:flex;height:10px;border-radius:4px;overflow:hidden;margin-top:4px;}}
          .seg {{flex-shrink:0;transition:opacity .2s;}} .seg:hover {{opacity:.75;}}
          .neg{{background:var(--neg);}} .pos{{background:var(--pos);}} .neu{{background:var(--neu);}}

/* main */
          .main {{padding:18px;line-height:1.6;}}
          .summary {{
              background:rgba(0,102,177,.07);border-left:3px solid var(--blue);
              padding:10px;margin-bottom:16px;line-height:1.6;
          }}
          .cols {{display:flex;flex-wrap:wrap;gap:24px;margin-bottom:16px;}}
          h4 {{margin:0 0 6px;color:var(--blue);font-weight:600;}}
          ul {{margin:0;padding-left:18px;}}
          li {{margin:4px 0;}}

/* chips */
          .chips {{margin-bottom:10px;}}
          .chip {{
              display:inline-block;padding:4px 10px;border-radius:20px;
              font-size:11px;font-weight:700;margin:2px;cursor:pointer;
              transition:opacity .2s;
          }}
          .chip:hover {{opacity:.8;}}
          .kw{{background:#e3f2fd;color:var(--blue);}}
          .ph{{background:#e8f5e9;color:#2e7d32;border:1px solid #c8e6c9;}}

/* responsive */
          @media (max-width:768px){{
              .card{{grid-template-columns:1fr}}
              .side{{order:2;border:none;border-top:1px solid #ddd;
                    flex-direction:row;justify-content:space-around}}
          }}
        </style>
        """
    ).format(blue=BMW_BLUE)

    # HTML fragments are collected here and joined once at the end
    output: List[str] = [css, "<div class='cards'>"]

    # Largest topics first
    for topic, info in sorted(data.items(), key=lambda x: x[1].review_count, reverse=True):
        if topic.lower() == "other":
            continue

        # One coloured segment per sentiment label, in the order stored in sentiment_dist
        bar = "".join(_bar_seg(name, frac) for name, frac in info.sentiment_dist.items())
        kw_html = "".join(f"<span class='chip kw'>{html.escape(k)}</span>" for k in info.top_keywords)
        ph_html = "".join(f"<span class='chip ph'>{html.escape(p)}</span>" for p in info.top_phrases)
        # Empty bullet lists fall back to a placeholder item
        issues_html = "".join(f"<li>{html.escape(i)}</li>" for i in info.issues) or "<li>No major issues</li>"
        pos_html = "".join(f"<li>{html.escape(p)}</li>" for p in info.positives) or "<li>No specific positives</li>"

        output.append(
            f"<div class='card'>"
            f"<header>{html.escape(topic)}</header>"

            f"<section class='side'>"
            f"<div class='stat'><span class='num'>{info.review_count}</span><span class='label'>Reviews</span></div>"
            f"<div class='stat'><span class='num'>{info.avg_rating:.2f}/5 {info.stars}</span>"
            f"<span class='label'>Average</span></div>"
            f"<div class='stat'><span class='label'>Sentiment</span><div class='bar'>{bar}</div></div>"
            "</section>"

            "<section class='main'>"
            f"<p class='summary'>{html.escape(info.summary)}</p>"
            "<div class='cols'>"
            f"<div><h4>Issues</h4><ul>{issues_html}</ul></div>"
            f"<div><h4>Positives</h4><ul>{pos_html}</ul></div>"
            "</div>"
            f"<div class='chips'><h4>Keywords</h4>{kw_html}</div>"
            f"<div class='chips'><h4>Phrases</h4>{ph_html}</div>"
            "</section>"
            "</div>"
        )

    output.append("</div>")
    display(HTML("\n".join(output)))

def _bar_seg(name: str, frac: float) -> str:
    cls = "pos" if name == "positive" else "neg" if name == "negative" else "neu"
    pct = max(frac * 100, 1)
    return f"<span class='seg {cls}' style='width:{pct}%' title='{name}: {frac:.1%}'></span>"

# -------------------------------------------------------------------
#  VISUAL HELPER – MATPLOTLIB CHART
# -------------------------------------------------------------------
def _plot_chart(data: Dict[str, TopicSummary], df: pd.DataFrame) -> None:
    """Show a horizontal bar chart of the average rating per topic.

    Topics are ordered by ascending rating, each bar is annotated with its
    review count, and a dashed line marks the mean of ``df['score']``.
    A topic named "other" (case-insensitive) is left out; nothing is drawn
    when no topic remains.
    """
    entries = [
        (name, info.avg_rating, info.review_count)
        for name, info in data.items()
        if name.lower() != "other"
    ]
    if not entries:
        return

    # Lowest rating first; the sort is stable, so ties keep their input order
    entries.sort(key=lambda entry: entry[1])
    topics = [entry[0] for entry in entries]
    ratings = [entry[1] for entry in entries]
    counts = [entry[2] for entry in entries]

    palette = mpl.colormaps.get_cmap("viridis").resampled(len(topics))
    bar_colors = [palette(i) for i in range(len(topics))]

    plt.figure(figsize=(12, 10))
    bars = plt.barh(topics, ratings, color=bar_colors)

    for bar, cnt, rating in zip(bars, counts, ratings):
        # Short bars get a black label just past their end; longer bars get
        # a white label inside the bar
        if rating < 2.5:
            x, clr = rating + 0.05, "black"
        else:
            x, clr = rating - 0.6, "white"
        plt.text(x, bar.get_y() + bar.get_height() / 2, f"n={cnt}", va="center",
                 color=clr, fontweight="bold")

    mean = df["score"].mean()
    plt.axvline(mean, linestyle="--", linewidth=2, label=f"Overall {mean:.2f}")
    star_labels = ["★" * i for i in range(1, 6)]
    plt.xticks(range(1, 6), star_labels)
    plt.xlim(0, 5.2)
    plt.xlabel("Star Rating")
    plt.ylabel("Topic")
    plt.title("BMW App – Average Rating by Topic")
    plt.legend()
    plt.grid(axis="x", linestyle="--", alpha=0.5)
    plt.tight_layout()
    plt.show()

# -------------------------------------------------------------------
#  MISC HELPERS
# -------------------------------------------------------------------
def _setup_logger() -> logging.Logger:
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
    return logging.getLogger("bmw_topic_cards")

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Evaluate two candidate translations of app reviews by giving a simple 
numerical score (1-5) to each translation compared to the original.

Key features
----------------
* Simple 1-5 scoring for each translation
* Shows model output for each evaluation (truncated)
* Automatic delimiter detection (handles ';', ',', or '\t')
* Better error handling with fallback scores
* Summary statistics broken down by language

Author: <your name>
Date: 2025‑05‑04
"""

import csv
import os
import subprocess
import time
import json
import re
from typing import Optional, Dict, Any

import pandas as pd
from tqdm import tqdm


# --------------------------------------------------------------------------- #
#                              Helper functions                               #
# --------------------------------------------------------------------------- #
def detect_delimiter(file_path: str, sample_bytes: int = 2048) -> str:
    """Return the most likely delimiter in the CSV file.

    The decision is based on the header line only: review text further down
    routinely contains semicolons, commas or tabs, so looking at the whole
    sample would let ordinary punctuation override the real delimiter.

    Args:
        file_path: Path of the CSV file (read as UTF-8).
        sample_bytes: Number of characters read from the start of the file.

    Returns:
        '\\t', ';' or ',' — whichever occurs most often in the header outside
        double-quoted fields (ties resolve in that order). If none occurs,
        the result of ``csv.Sniffer`` on the sample, or ',' when sniffing
        fails.
    """
    candidates = ("\t", ";", ",")  # also the tie-break order

    with open(file_path, "r", encoding="utf-8") as f:
        sample = f.read(sample_bytes)

    # First non-blank line, with quoted fields removed so a delimiter inside
    # a quoted column name is not counted
    header = next((ln for ln in sample.splitlines() if ln.strip()), "")
    unquoted_header = re.sub(r'"[^"]*"', "", header)

    counts = {delim: unquoted_header.count(delim) for delim in candidates}
    best = max(candidates, key=counts.get)  # max() keeps the first of equal counts
    if counts[best] > 0:
        return best

    # No candidate in the header (e.g. another delimiter): fall back to csv.Sniffer
    try:
        return csv.Sniffer().sniff(sample).delimiter
    except Exception:
        # Best-effort by design: any sniffing failure falls back to the default
        return ","


def run_ollama(prompt: str, model_name: str, max_retries: int = 2, timeout: float = 120) -> str:
    """Call `ollama run` with retries and a timeout; return stripped stdout.

    Args:
        prompt: Prompt text, sent on the process's standard input.
        model_name: Ollama model to run.
        max_retries: Number of additional attempts after the first failure.
        timeout: Seconds to wait for each attempt.

    Returns:
        The model output with surrounding whitespace removed, or a string
        starting with "ERROR:" when every attempt failed (timeout, exception,
        or a non-zero exit status that produced no output).
    """
    attempts = max(max_retries, 0) + 1
    last_error = "no attempt was made"

    for attempt in range(attempts):
        try:
            proc = subprocess.run(
                ["ollama", "run", model_name],
                input=prompt,
                text=True,
                capture_output=True,
                timeout=timeout,
            )
            output = (proc.stdout or "").strip()
            # Any produced output is returned as before; only a failed run
            # with nothing on stdout is treated as an error.
            if proc.returncode == 0 or output:
                return output
            detail = (proc.stderr or "").strip() or "no output"
            last_error = f"ollama exited with code {proc.returncode}: {detail}"
        except subprocess.TimeoutExpired:
            last_error = "Timeout"
        except FileNotFoundError as exc:
            # The executable is missing; retrying cannot help
            return f"ERROR: {exc}"
        except Exception as exc:
            last_error = str(exc)

        if attempt < attempts - 1:
            print(f"{last_error}. Retrying {attempt + 1}/{max_retries} …")
            time.sleep(5)  # cooling period between retries

    return f"ERROR: {last_error}"


def create_simple_prompt(row: pd.Series) -> str:
    """Assemble the 1-5 scoring prompt for one review and its two translations.

    Reads 'language', 'content', 'Translation A' and 'Translation B' from
    ``row`` and asks the model to answer with only ``score_A`` / ``score_B``.
    """
    language = row['language']
    source_text = row['content']
    candidate_a = row['Translation A']
    candidate_b = row['Translation B']

    prompt_lines = [
        f"Rate these two English translations of this {language} text:",
        "",
        f"SOURCE TEXT ({language}):",
        f"{source_text}",
        "",
        "TRANSLATION A:",
        f"{candidate_a}",
        "",
        "TRANSLATION B:",
        f"{candidate_b}",
        "",
        "SCORING SCALE:",
        "1 = Very poor (major errors, missing information, or incomprehensible)",
        "2 = Poor (significant errors or awkward phrasing)",
        "3 = Acceptable (gets the main meaning across with minor issues)",
        "4 = Good (fluent and accurate with very minor issues)",
        "5 = Excellent (perfect or near-perfect translation)",
        "",
        "IMPORTANT: Rate each translation against the source, not against each other.",
        "",
        "RESPONSE FORMAT:",
        "Please respond with ONLY:",
        "score_A: [number]",
        "score_B: [number]",
        "",
        "No explanations or additional text.",
        "",  # the prompt ends with a newline
    ]
    return "\n".join(prompt_lines)


def _coerce_score(value: Any) -> Any:
    """Return ``value`` as a score on the 1-5 scale, or None if it is not one."""
    if isinstance(value, bool):
        return None
    if isinstance(value, str):
        value = value.strip()
        if not value.isdigit():
            return None
        value = int(value)
    if isinstance(value, (int, float)) and 1 <= value <= 5:
        return value
    return None


def _pick_winner(score_a: Any, score_b: Any) -> str:
    """Return "A", "B" or "Tie" for two valid scores."""
    if score_a > score_b:
        return "A"
    if score_b > score_a:
        return "B"
    return "Tie"


def parse_score_response(response: str) -> Dict[str, Any]:
    """
    Parse the LLM response containing simple scores.

    Recognised forms include ``score_A: 4``, ``score_A: [4]``,
    ``Score A = 4`` and ``**score_A:** 4``; a JSON object with the keys
    ``score_A`` / ``score_B`` is tried as a fallback. Only scores on the
    1-5 scale are accepted.

    Returns:
        ``{"score_A", "score_B", "winner"}`` with winner "A", "B" or "Tie"
        on success; otherwise a dict with an "error" message and winner
        "Error" or "Unclear".
    """
    # A missing response is handled like an empty one instead of raising
    response = "" if response is None else str(response)

    if response.startswith("ERROR:"):
        return {"error": response, "score_A": None, "score_B": None, "winner": "Error"}

    # "score", optional space/underscore, the label, then up to 20 separator
    # characters (colon, spaces, brackets, asterisks, quotes ...) before the
    # digits. Letters and '-' are not separators, so the echoed template
    # "score_A: [number]" and negative numbers do not match.
    scores = {}
    for label in ("A", "B"):
        match = re.search(
            rf"score[ _]?{label}[^0-9A-Za-z\-]{{0,20}}(\d+)",
            response,
            re.IGNORECASE,
        )
        scores[label] = _coerce_score(match.group(1)) if match else None

    score_a, score_b = scores["A"], scores["B"]
    if score_a is not None and score_b is not None:
        return {"score_A": score_a, "score_B": score_b, "winner": _pick_winner(score_a, score_b)}

    # Try to extract JSON from the response as fallback
    try:
        start_idx = response.find("{")
        end_idx = response.rfind("}")

        if start_idx >= 0 and end_idx > start_idx:
            data = json.loads(response[start_idx:end_idx + 1])

            # Validate like the regex path so both routes enforce the 1-5 scale
            score_a = _coerce_score(data.get("score_A"))
            score_b = _coerce_score(data.get("score_B"))

            if score_a is not None and score_b is not None:
                return {
                    "score_A": score_a,
                    "score_B": score_b,
                    "winner": _pick_winner(score_a, score_b),
                }
            return {"error": "Missing scores in response", "winner": "Unclear"}

        # No JSON found, and the pattern search above already failed
        return {"error": "Could not extract scores", "winner": "Unclear"}

    except Exception as e:
        return {"error": str(e), "winner": "Error"}


# --------------------------------------------------------------------------- #
#                             Main entry‑point                                #
# --------------------------------------------------------------------------- #
def _print_evaluation_summary(df: pd.DataFrame) -> None:
    """Print overall, per-language and score-difference statistics.

    Adds a numeric ``score_diff`` column (score_A - score_B) to ``df`` in place.

    Rows whose judgment is "Error" carry placeholder scores (3/3) rather than
    real model ratings. They are still listed in the judgment counts, but are
    left out of the averages and the score-difference distribution so they do
    not pull the means toward 3 or inflate the "Tie (0)" bucket.

    Args:
        df: Evaluated frame with ``language``, ``judgment``, ``score_A`` and
            ``score_B`` columns.
    """
    total_rows = len(df)

    print("\n====== EVALUATION SUMMARY ======")
    if total_rows == 0:
        print("  (no rows to summarise)")
        return

    # Score columns may be object dtype (they start as pd.NA and the JSON
    # fallback parser can hand back non-int values), so coerce before doing math.
    score_a = pd.to_numeric(df["score_A"], errors="coerce")
    score_b = pd.to_numeric(df["score_B"], errors="coerce")
    df["score_diff"] = score_a - score_b

    is_scored = (
        df["judgment"].astype(str).ne("Error") & score_a.notna() & score_b.notna()
    )
    n_scored = int(is_scored.sum())
    n_excluded = total_rows - n_scored

    # ---- Overall summary ------------------------------------------------
    print("\n🌐 OVERALL RESULTS:")
    for label, cnt in df["judgment"].value_counts(dropna=False).items():
        pct = cnt / total_rows * 100
        print(f"  {str(label):8}: {cnt:>4}  ({pct:5.1f} %)")

    if n_excluded:
        print(
            f"\n  Note: {n_excluded} row(s) without a real model score "
            "are excluded from the score statistics below."
        )

    a_avg = score_a[is_scored].mean()
    b_avg = score_b[is_scored].mean()

    print("\n  Average Scores (1-5):")
    print(f"  Translation A: {a_avg:.2f}")
    print(f"  Translation B: {b_avg:.2f}")
    print(f"  Difference (A-B): {a_avg - b_avg:+.2f}")

    # ---- Per-language statistics ----------------------------------------
    print("\n🔍 RESULTS BY LANGUAGE:")

    # NaN languages are dropped and values stringified so sorting cannot fail
    # on mixed str/float entries.
    language_col = df["language"].astype(str)
    present_languages = sorted(str(lang) for lang in df["language"].dropna().unique())
    print(f"  Languages in dataset: {', '.join(present_languages)}")

    # Report the expected languages first (fixed order), then any others that
    # appear in the data so nothing is silently left out.
    preferred_order = ["German", "Italian", "French", "Spanish"]
    languages = preferred_order + [
        lang for lang in present_languages if lang not in preferred_order
    ]

    for lang in languages:
        lang_mask = language_col == lang
        n_lang = int(lang_mask.sum())

        if n_lang == 0:
            print(f"\n  ❌ No reviews for {lang}")
            continue

        print(f"\n  📊 {lang} ({n_lang} reviews):")

        # Judgment distribution
        lang_counts = df.loc[lang_mask, "judgment"].value_counts(dropna=False)
        for label in ["A", "B", "Tie", "Error", "Unclear"]:
            if label in lang_counts:
                cnt = lang_counts[label]
                pct = cnt / n_lang * 100
                print(f"    {label:8}: {cnt:>3}  ({pct:5.1f} %)")

        # Scores (real model ratings only)
        lang_scored = lang_mask & is_scored
        lang_a_avg = score_a[lang_scored].mean()
        lang_b_avg = score_b[lang_scored].mean()

        print("\n    Scores (1-5):")
        print(f"    Translation A: {lang_a_avg:.2f}")
        print(f"    Translation B: {lang_b_avg:.2f}")
        print(f"    Difference (A-B): {lang_a_avg - lang_b_avg:+.2f}")

    # ---- Distribution of score differences ------------------------------
    print("\n📈 SCORE DIFFERENCE DISTRIBUTION:")

    scored_diff = df.loc[is_scored, "score_diff"]
    diff_categories = {
        "A much better (≥2)": (scored_diff >= 2).sum(),
        "A better (1)": (scored_diff == 1).sum(),
        "Tie (0)": (scored_diff == 0).sum(),
        "B better (-1)": (scored_diff == -1).sum(),
        "B much better (≤-2)": (scored_diff <= -2).sum(),
    }

    for label, count in diff_categories.items():
        pct = count / n_scored * 100 if n_scored else 0.0
        print(f"  {label:20}: {count:>3}  ({pct:5.1f} %)")

    # ---- Scale meaning reminder -----------------------------------------
    print("\n📋 SCORING SCALE REFERENCE:")
    print("  1 = Very poor (major errors, missing information, or incomprehensible)")
    print("  2 = Poor (significant errors or awkward phrasing)")
    print("  3 = Acceptable (gets the main meaning across with minor issues)")
    print("  4 = Good (fluent and accurate with very minor issues)")
    print("  5 = Excellent (perfect or near-perfect translation)")


def evaluate_translations(
    input_file: str,
    output_file: str,
    model_name: str = "mistral:7b",  # Changed default to faster model
) -> Optional[pd.DataFrame]:
    """Load review/translation CSV, evaluate each pair with simple scoring, save results.

    Each row is turned into a prompt, sent to the model via ``run_ollama`` and
    the reply is parsed with ``parse_score_response``. Scores and the winner
    are written to the ``score_A``, ``score_B`` and ``judgment`` columns; parse
    failures get placeholder scores of 3/3, judgment "Error" and the message in
    an ``error`` column. Rows that already have both scores are skipped.

    Args:
        input_file: Path to the CSV holding ``content``, ``language``,
            ``Translation A`` and ``Translation B`` columns.
        output_file: Path of the CSV to write. If writing the CSV fails, an
            ``.xlsx`` file next to it is attempted instead.
        model_name: Model identifier passed through to ``run_ollama``.

    Returns:
        The evaluated DataFrame (with an extra ``score_diff`` column), the
        unmodified empty frame when the input has no rows, or ``None`` when the
        input cannot be loaded, required columns are missing, or the results
        cannot be saved in either format.
    """
    print(f"Reading {input_file} …")
    delimiter = detect_delimiter(input_file)
    print(f"Detected delimiter '{delimiter}'")

    try:
        df = pd.read_csv(
            input_file,
            sep=delimiter,
            quotechar='"',
            doublequote=True,
            engine="python",  # forgiving
        )
    except Exception as exc:
        print(f"❌ Could not load CSV: {exc}")
        return None

    # Required columns
    required = ["content", "language", "Translation A", "Translation B"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        print("❌ Missing required columns:", missing)
        print("Available columns:", df.columns.tolist())
        return None

    # A header-only file would otherwise crash on df.iloc[0] below and on the
    # percentage calculations in the summary.
    if df.empty:
        print("⚠️ Input file contains no rows; nothing to evaluate.")
        return df

    # Display sample data to verify it loaded correctly
    print("\nSample data from first row:")
    first_row = df.iloc[0]
    for col in required:
        val = first_row[col]
        # Truncate for display
        if isinstance(val, str) and len(val) > 50:
            val = val[:50] + "..."
        print(f"  {col}: {val}")

    # Add score and judgment columns if not present
    for col in ("score_A", "score_B", "judgment"):
        if col not in df.columns:
            df[col] = pd.NA

    # Evaluate rows
    print(f"\nEvaluating {len(df)} rows with {model_name} …")

    # Track success/error counts
    success_count = 0
    error_count = 0

    for idx in tqdm(df.index):
        # Skip if already evaluated
        if pd.notna(df.at[idx, "score_A"]) and pd.notna(df.at[idx, "score_B"]):
            print(f"Skipping row {idx} (already evaluated)")
            success_count += 1
            continue

        print(f"\n--- Evaluating row {idx} ({df.at[idx, 'language']}) ---")

        prompt = create_simple_prompt(df.loc[idx])
        response = run_ollama(prompt, model_name)

        # Display truncated response
        truncated_response = response[:150] + "..." if len(response) > 150 else response
        print(f"Model output: {truncated_response}")

        # Parse the response
        result = parse_score_response(response)

        # Store the scores and judgment
        if "error" in result and result.get("score_A") is None:
            # Apply default scores on error; the "Error" judgment marks them as
            # placeholders so the summary can leave them out of score statistics.
            df.at[idx, "score_A"] = 3  # Default neutral score
            df.at[idx, "score_B"] = 3  # Default neutral score
            df.at[idx, "judgment"] = "Error"
            df.at[idx, "error"] = result["error"]
            print(f"⚠️ Using default scores due to error: {result['error']}")
            error_count += 1
            time.sleep(5)  # Longer cooling after error
        else:
            score_a = result.get("score_A")
            score_b = result.get("score_B")
            winner = result.get("winner", "Unclear")

            df.at[idx, "score_A"] = score_a
            df.at[idx, "score_B"] = score_b
            df.at[idx, "judgment"] = winner

            print(f"✅ Scores: A={score_a}, B={score_b}, Winner={winner}")

            if "error" in result:
                df.at[idx, "error"] = result["error"]
                print(f"⚠️ {result['error']} (but got scores)")

            success_count += 1

        # Add cooling period between all calls
        time.sleep(2)  # Prevent rate limiting

    # Final save
    try:
        df.to_csv(output_file, index=False, quoting=csv.QUOTE_ALL)
        print(f"✅ Results written to {output_file}")
    except Exception as exc:
        print(f"⚠️ CSV save failed ({exc}); trying Excel …")
        # Swap only a trailing ".csv"; str.replace would also rewrite ".csv"
        # inside directory names and leave suffix-less paths unchanged.
        if output_file.lower().endswith(".csv"):
            excel_path = output_file[:-4] + ".xlsx"
        else:
            excel_path = output_file + ".xlsx"
        try:
            df.to_excel(excel_path, index=False)
            print("✅ Saved as Excel instead")
        except Exception as exc2:
            print(f"❌ Could not save results at all: {exc2}")
            return None

    # Process success/error stats
    print(f"\n📊 Processing stats: {success_count} successes, {error_count} errors")

    _print_evaluation_summary(df)

    return df


# --------------------------------------------------------------------------- #
#                                 CLI hook                                    #
# --------------------------------------------------------------------------- #
if __name__ == "__main__":
    # Model used as the judge for this run.
    # Alternatives: mistral:7b, phi3:mini, llama3:8b
    MODEL = "qwen3:8b"

    # Source CSV with the two candidate translations, and the destination
    # CSV that receives the same rows plus scores and judgments.
    IN_FILE, OUT_FILE = (
        "bmw_app_analysis/translations/bmw_reviews_sampledHE.csv",
        "bmw_app_analysis/translations/bmw_reviews_sampledLLM.csv",
    )

    print(f"Starting simple translation evaluation with {MODEL} …")
    evaluate_translations(
        input_file=IN_FILE,
        output_file=OUT_FILE,
        model_name=MODEL,
    )

Starting simple translation evaluation with qwen3:8b …
Reading bmw_app_analysis/translations/bmw_reviews_sampledHE.csv …
Detected delimiter ';'

Sample data from first row:
  content: Die App kann schon viel, aber warum lassen sich di...
  language: German
  Translation A: The app can already do a lot, but why can't the wi...
  Translation B: The app can do a lot, but why can't the windows or...

Evaluating 120 rows with qwen3:8b …


  0%|          | 0/120 [00:00<?, ?it/s]


--- Evaluating row 0 (German) ---
Model output: <think>
Okay, let's tackle this. The user wants me to rate two translations of a German text. The source is a complaint about an app not allowing cert...
✅ Scores: A=4, B=4, Winner=Tie


  1%|          | 1/120 [00:27<54:01, 27.24s/it]


--- Evaluating row 1 (German) ---
Model output: <think>
Okay, let's take a look at these two translations. The source text is in German, and I need to compare each translation against the original. ...
✅ Scores: A=3, B=4, Winner=B


  2%|▏         | 2/120 [00:53<51:59, 26.44s/it]


--- Evaluating row 2 (German) ---
Model output: <think>
Okay, let me tackle this. The user wants me to rate two translations of a German text. The source is "Top App, verständlich, übersichtlich und...
✅ Scores: A=5, B=4, Winner=A


  2%|▎         | 3/120 [01:58<1:26:07, 44.17s/it]


--- Evaluating row 3 (German) ---
Model output: <think>
Okay, let's tackle this. The user wants me to rate two translations of a German text. First, I need to understand the original German sentence...
✅ Scores: A=4, B=4, Winner=Tie


  3%|▎         | 4/120 [02:52<1:33:01, 48.12s/it]


--- Evaluating row 4 (German) ---
Model output: <think>
Okay, let's tackle this translation scoring. First, I need to compare each translation to the source text. The source is in German, and there ...
✅ Scores: A=5, B=5, Winner=Tie


  4%|▍         | 5/120 [03:43<1:34:07, 49.11s/it]


--- Evaluating row 5 (German) ---
Model output: <think>
Okay, let's tackle this. The user wants me to rate two translations of a German sentence. The source is "Einfach eine sehr gute und hilfreiche...
✅ Scores: A=5, B=5, Winner=Tie


  5%|▌         | 6/120 [03:59<1:12:14, 38.02s/it]


--- Evaluating row 6 (German) ---
Model output: <think>
Okay, let me tackle this. The user wants me to rate two translations of a German text. First, I need to understand the source text and then co...
✅ Scores: A=4, B=4, Winner=Tie


  6%|▌         | 7/120 [04:37<1:11:19, 37.88s/it]


--- Evaluating row 7 (German) ---
Model output: <think>
Okay, let me tackle this. The user wants me to rate two translations of a German text. First, I need to understand the original German text.

...
✅ Scores: A=3, B=4, Winner=B


  7%|▋         | 8/120 [05:23<1:15:25, 40.41s/it]


--- Evaluating row 8 (German) ---
Model output: <think>
Okay, let's tackle this. The user wants me to rate two translations of a German sentence. The source is "Alles wichtige ist schnell abrufbar."...
✅ Scores: A=5, B=5, Winner=Tie


  8%|▊         | 9/120 [06:36<1:33:26, 50.51s/it]


--- Evaluating row 9 (German) ---
Model output: <think>
Okay, let's tackle this. The user wants me to rate two translations of a German text. The source is "Funktioniert einwandfrei. Bin bis jetzt v...
✅ Scores: A=5, B=5, Winner=Tie


  8%|▊         | 10/120 [06:59<1:17:08, 42.08s/it]


--- Evaluating row 10 (German) ---
Model output: <think>
Okay, let me look at these translations. The source text is in German. The user wants me to rate each translation against the original, not ag...
✅ Scores: A=4, B=3, Winner=A


  9%|▉         | 11/120 [08:00<1:27:04, 47.93s/it]


--- Evaluating row 11 (German) ---
Model output: <think>
Okay, let's tackle this. The user wants me to rate two translations of a German text. First, I need to understand the source text and then com...
✅ Scores: A=3, B=4, Winner=B


 10%|█         | 12/120 [08:27<1:14:49, 41.57s/it]


--- Evaluating row 12 (German) ---
Model output: <think>
Okay, let me tackle this. The user wants me to rate two translations of a German text. The source is about an app crashing on a new phone afte...
✅ Scores: A=4, B=4, Winner=Tie


 11%|█         | 13/120 [09:03<1:11:05, 39.86s/it]


--- Evaluating row 13 (German) ---
Model output: <think>
Okay, let me tackle this. The user wants me to rate two translations of a German text. The source is about missing info on fuel and mileage fo...
✅ Scores: A=4, B=3, Winner=A


 12%|█▏        | 14/120 [09:24<1:00:28, 34.23s/it]


--- Evaluating row 14 (German) ---
Model output: <think>
Okay, let me tackle this. The user wants me to rate two translations of a German text. The source is "schade dass der Concierge Service einges...
✅ Scores: A=4, B=5, Winner=B


 12%|█▎        | 15/120 [09:58<59:48, 34.18s/it]  


--- Evaluating row 15 (German) ---
Model output: <think>
Okay, I need to rate these two translations of the German text. Let's start by understanding the source text. The German says, "Die App hat im...
✅ Scores: A=4, B=3, Winner=A


 13%|█▎        | 16/120 [10:20<52:57, 30.55s/it]


--- Evaluating row 16 (German) ---
Model output: <think>
Okay, let's tackle this. The user wants me to rate two translations of a German text. First, I need to understand the original German text and...
✅ Scores: A=4, B=5, Winner=B


 14%|█▍        | 17/120 [11:19<1:06:57, 39.00s/it]


--- Evaluating row 17 (German) ---
Model output: <think>
Okay, let's tackle this. The user wants me to rate two translations of a German sentence. The source is "Remote Parking fehlt obwohl Fahrzeug ...
✅ Scores: A=5, B=5, Winner=Tie


 15%|█▌        | 18/120 [11:53<1:03:48, 37.53s/it]


--- Evaluating row 18 (German) ---
Model output: <think>
Okay, let's start by looking at the source text. The German says: "Es fehlen einfache Features die heute eigentlich Standart sind: -Kein Dark ...
✅ Scores: A=5, B=5, Winner=Tie


 16%|█▌        | 19/120 [12:30<1:02:56, 37.39s/it]


--- Evaluating row 19 (German) ---
Model output: <think>
Okay, let me try to figure out the scores for these two translations. First, I need to compare each translation against the original German te...
✅ Scores: A=4, B=3, Winner=A


 17%|█▋        | 20/120 [13:16<1:06:40, 40.00s/it]


--- Evaluating row 20 (German) ---
Model output: <think>
Okay, let's tackle this translation scoring. The user wants me to rate two translations of a German text. Let me start by understanding the so...
⚠️ Using default scores due to error: Could not extract scores


 18%|█▊        | 21/120 [14:42<1:28:27, 53.61s/it]


--- Evaluating row 21 (German) ---
Model output: <think>
Okay, let me tackle this. The user wants me to rate two translations of a German text. The source is about someone being happy with an app tha...
✅ Scores: A=4, B=4, Winner=Tie


 18%|█▊        | 22/120 [15:15<1:17:37, 47.52s/it]


--- Evaluating row 22 (German) ---
Model output: <think>
Okay, let's tackle this. The user wants me to rate two translations of a German text. The source is about updating a BMW vehicle's status thro...
✅ Scores: A=5, B=1, Winner=A


 19%|█▉        | 23/120 [15:43<1:07:35, 41.81s/it]


--- Evaluating row 23 (German) ---
Model output: <think>
Okay, let's tackle this translation scoring. First, I need to compare each translation against the original German text. Let me start with Tra...
⚠️ Using default scores due to error: Could not extract scores


 20%|██        | 24/120 [17:18<1:32:01, 57.51s/it]


--- Evaluating row 24 (German) ---
Model output: <think>
Okay, let's tackle this. The user wants me to rate two translations of a German sentence. The source is "Bis jetzt funktionierte alles einwand...
✅ Scores: A=3, B=4, Winner=B


 21%|██        | 25/120 [17:40<1:14:21, 46.96s/it]


--- Evaluating row 25 (German) ---
Model output: <think>
Okay, let me tackle this translation scoring. The source text is in German: "is von BMW...muss also gut sein ;) tolle App funktioniert und mac...
✅ Scores: A=4, B=3, Winner=A


 22%|██▏       | 26/120 [18:08<1:04:50, 41.39s/it]


--- Evaluating row 26 (German) ---
Model output: <think>
Okay, let's tackle this translation evaluation. The source text is German: "Nach jedem Update wieder neu anmelden - genial." The user provided...
✅ Scores: A=4, B=5, Winner=B


 22%|██▎       | 27/120 [18:36<57:41, 37.23s/it]  


--- Evaluating row 27 (German) ---
Model output: <think>
Okay, let's tackle this translation comparison. The source is in German: "BMW ist besser als mein RangeRover. Vor allem was die digitale Techn...
✅ Scores: A=3, B=4, Winner=B


 23%|██▎       | 28/120 [19:09<55:25, 36.14s/it]


--- Evaluating row 28 (German) ---
Model output: <think>
Okay, let's tackle this. I need to rate the two translations of the German text. The source is "Einfach genial wenn man stets in ein temperier...
✅ Scores: A=4, B=5, Winner=B


 24%|██▍       | 29/120 [19:46<55:07, 36.34s/it]


--- Evaluating row 29 (German) ---
Model output: <think>
Okay, let's take a look at these translations. The source text is in German, and I need to evaluate both translations against it. 

First, the...
✅ Scores: A=5, B=5, Winner=Tie


 25%|██▌       | 30/120 [20:25<55:29, 36.99s/it]


--- Evaluating row 30 (Italian) ---
Model output: <think>
Okay, I need to rate these two translations of the Italian text. Let me start by understanding the source text thoroughly. The Italian text me...
✅ Scores: A=3, B=4, Winner=B


 26%|██▌       | 31/120 [21:19<1:02:42, 42.28s/it]


--- Evaluating row 31 (Italian) ---
Model output: <think>
Okay, let's tackle this. I need to rate the two translations of the Italian text based on the given scoring scale. Let me start by understandi...
✅ Scores: A=5, B=5, Winner=Tie


 27%|██▋       | 32/120 [22:09<1:05:21, 44.56s/it]


--- Evaluating row 32 (Italian) ---
Model output: <think>
Okay, let me tackle this. The user wants me to rate two translations of an Italian text. The source is in Italian, and there are two English t...
✅ Scores: A=4, B=5, Winner=B


 28%|██▊       | 33/120 [22:57<1:06:09, 45.62s/it]


--- Evaluating row 33 (Italian) ---
Model output: <think>
Okay, let's tackle these translations. The source text is in Italian, and I need to compare the two English translations. Let me start by read...
✅ Scores: A=5, B=5, Winner=Tie


 28%|██▊       | 34/120 [23:48<1:07:45, 47.28s/it]


--- Evaluating row 34 (Italian) ---
Model output: <think>
Okay, let's tackle this. The source text is "Comandi da remoto verso la macchina molto lenti." First, I need to understand the Italian sentenc...
✅ Scores: A=4, B=2, Winner=A


 29%|██▉       | 35/120 [24:03<53:16, 37.61s/it]  


--- Evaluating row 35 (Italian) ---
Model output: <think>
Okay, let's tackle this. The user wants me to rate two translations of an Italian sentence. The source is "Utile, peccato che a volte non fa i...
✅ Scores: A=3, B=4, Winner=B


 30%|███       | 36/120 [24:45<54:15, 38.76s/it]


--- Evaluating row 36 (Italian) ---
Model output: <think>
Okay, let's tackle this. I need to rate the two translations of the Italian text. First, I'll read the source text carefully to understand the...
✅ Scores: A=4, B=3, Winner=A


 31%|███       | 37/120 [25:14<49:34, 35.84s/it]


--- Evaluating row 37 (Italian) ---
Model output: <think>
Okay, let me tackle this. The user wants me to rate two translations of an Italian sentence. The source is "sarebbe molto utile connessione co...
✅ Scores: A=3, B=4, Winner=B


 32%|███▏      | 38/120 [25:46<47:21, 34.65s/it]


--- Evaluating row 38 (Italian) ---
Model output: <think>
Okay, let me tackle this. The user wants me to rate two translations of an Italian text. The source is "Digital key non compatibile con s22, c...
✅ Scores: A=4, B=4, Winner=Tie


 32%|███▎      | 39/120 [26:32<51:24, 38.08s/it]


--- Evaluating row 39 (Italian) ---
Model output: <think>
Okay, let's tackle this translation comparison. The source is in Italian: "decisamente un grande aiuto per avere sempre l'autovettura sotto co...
✅ Scores: A=4, B=4, Winner=Tie


 33%|███▎      | 40/120 [26:58<46:09, 34.62s/it]


--- Evaluating row 40 (Italian) ---
Model output: <think>
Okay, I need to rate these two translations of the Italian text. Let me start by understanding the source text. The user is talking about the ...
✅ Scores: A=4, B=4, Winner=Tie


 34%|███▍      | 41/120 [27:35<46:31, 35.34s/it]


--- Evaluating row 41 (Italian) ---
Model output: <think>
Okay, let me look at these translations. The source text is in Italian: "facile da usare, efficente e utile per avere il controllo dell'auto d...
✅ Scores: A=5, B=5, Winner=Tie


 35%|███▌      | 42/120 [28:06<44:04, 33.91s/it]


--- Evaluating row 42 (Italian) ---
Model output: <think>
Okay, let's tackle this. The user wants me to rate two translations of an Italian text. The source is about a problem with trip recording that...
✅ Scores: A=2, B=4, Winner=B


 36%|███▌      | 43/120 [28:37<42:26, 33.07s/it]


--- Evaluating row 43 (Italian) ---
Model output: <think>
Okay, let me tackle this. The user wants me to rate two translations of an Italian text. The source is "precisione e modalità di avviso sempre...
✅ Scores: A=2, B=4, Winner=B


 37%|███▋      | 44/120 [29:19<45:02, 35.56s/it]


--- Evaluating row 44 (Italian) ---
