In [2]:
# Cell [0]
# ======================================
# 1. Install/Import Required Libraries
# ======================================

import pandas as pd
from google_play_scraper import Sort, reviews
import subprocess
from tqdm import tqdm

In [3]:
# Cell [1]
# ======================================
# 2. Define Parameters
# ======================================

# Package name of the BMW app on Google Play (used as the app id when fetching reviews):
app_id = "de.bmw.connected.mobile20.row"

# Ollama model tag handed to the translation step (ends up in `ollama run <model>`)
ollama_model_name = "gemma3:12b"

In [3]:
# Cell [2]
# ======================================
# 3. Fetch Reviews from Google Play Store
# ======================================

# Languages to query, as (language code, label stored on each review) pairs
languages = [
    ('en', 'English'), ('de', 'German'), ('fr', 'French'), ('it', 'Italian'),
    ('es', 'Spanish'), ('nl', 'Dutch'), ('sv', 'Swedish'), ('da', 'Danish'),
    ('no', 'Norwegian'), ('fi', 'Finnish'), ('pl', 'Polish'), ('cs', 'Czech'),
    ('pt', 'Portuguese'), ('zh', 'Chinese'), ('ja', 'Japanese'), ('ko', 'Korean'),
    ('ar', 'Arabic'), ('tr', 'Turkish'), ('ru', 'Russian'), ('he', 'Hebrew'),
    ('th', 'Thai'), ('vi', 'Vietnamese'), ('hi', 'Hindi'), ('el', 'Greek'),
    ('hu', 'Hungarian'), ('ro', 'Romanian'), ('sk', 'Slovak'), ('bg', 'Bulgarian'),
    ('hr', 'Croatian'), ('sr', 'Serbian'), ('uk', 'Ukrainian'), ('id', 'Indonesian'),
    ('ms', 'Malay'), ('fa', 'Persian'), ('ur', 'Urdu'), ('bn', 'Bengali'),
    ('ta', 'Tamil'), ('te', 'Telugu'), ('ml', 'Malayalam'), ('et', 'Estonian'),
    ('lv', 'Latvian'), ('lt', 'Lithuanian'), ('sl', 'Slovenian'),
]

# Accumulator for the reviews of every language
all_reviews = []

# Page through the newest reviews of each language, 100 at a time
for lang_code, lang_label in languages:
    continuation_token = None
    has_more = True

    while has_more:
        page, continuation_token = reviews(
            app_id,
            lang=lang_code,
            sort=Sort.NEWEST,
            count=100,
            continuation_token=continuation_token,
        )

        # Tag each review with the label of the language it was fetched under
        for entry in page:
            entry['language'] = lang_label
        all_reviews += page

        # Keep paging only while a token came back and the page was full
        has_more = bool(continuation_token) and len(page) >= 100

# One row per review
df = pd.DataFrame(all_reviews)
language_counts = df['language'].value_counts()
divider = "-" * 50

print("\nReview Statistics:")
print("=" * 50)
print(f"Total number of reviews collected: {len(df)}")
print("\nBreakdown by language:")
print(divider)
print(language_counts)
print(divider)
print(f"Number of languages with reviews: {len(language_counts)}")
# Show the first rows as the cell's rich output
df.head()


Review Statistics:
Total number of reviews collected: 18350

Breakdown by language:
--------------------------------------------------
language
German        4914
English       4617
French        1711
Italian       1315
Dutch          974
Spanish        967
Polish         641
Portuguese     576
Russian        473
Romanian       286
Swedish        261
Norwegian      205
Japanese       201
Finnish        175
Czech          158
Greek          117
Hungarian      114
Danish         102
Thai            99
Turkish         65
Croatian        65
Slovak          58
Slovenian       53
Chinese         47
Bulgarian       43
Arabic          32
Serbian         21
Ukrainian       14
Lithuanian      12
Estonian         9
Indonesian       7
Latvian          7
Korean           4
Hebrew           4
Malay            2
Persian          1
Name: count, dtype: int64
--------------------------------------------------
Number of languages with reviews: 36


Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,language
0,d3f785ff-3bf8-49da-9313-6763ffeacf2a,ALEX TEO,https://play-lh.googleusercontent.com/a-/ALV-U...,BEWARE!! absolutely useless. Everytime i locke...,1,0,5.3.4,2025-04-09 15:13:37,,NaT,5.3.4,English
1,2bedd865-d385-45e2-83f8-bbb3317587ba,Jean Richards,https://play-lh.googleusercontent.com/a/ACg8oc...,my bmw is just best super to drive so comforta...,5,0,,2025-04-09 14:54:20,,NaT,,English
2,54d14726-aa2d-4587-8647-b50b3bb316e9,Imran Ali Jamal,https://play-lh.googleusercontent.com/a/ACg8oc...,Great app for keeping track of your BMW.,4,0,5.3.3,2025-04-09 13:32:29,,NaT,5.3.3,English
3,73532596-6628-4b66-ab82-14af6d3f29ab,KENNETH ORJI,https://play-lh.googleusercontent.com/a-/ALV-U...,I love the fact that it is free to download an...,5,0,5.3.3,2025-04-09 11:24:57,,NaT,5.3.3,English
4,3d7d6de6-1f01-42f2-865c-49124326b268,tihomir jarnjevic,https://play-lh.googleusercontent.com/a/ACg8oc...,"no limit for charging, or timer",4,0,5.3.3,2025-04-09 07:46:06,,NaT,5.3.3,English


In [4]:
import os
import time
import json
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import subprocess
import atexit
import signal
import warnings

# Suppress the pandas FutureWarning about concatenation
# NOTE(review): this filter is process-wide and matches every FutureWarning,
# not only the pandas one - consider narrowing it (e.g. by module or message).
warnings.filterwarnings('ignore', category=FutureWarning)

def translate_text(text, source_lang, model_name):
    """
    Translate text to English using Ollama with an enhanced prompt.

    Args:
        text: Review text to translate.
        source_lang: Human-readable name of the source language; it is
            interpolated into the prompt.
        model_name: Ollama model tag passed to ``ollama run``.

    Returns:
        str: Whatever the model wrote to stdout, stripped of surrounding
        whitespace. If ``ollama`` exits with a non-zero status a
        ``RuntimeWarning`` is emitted and the (usually empty) stdout is still
        returned, so callers can decide how to treat the row.
    """
    prompt = f"""You are a professional translator specialized in automotive app reviews. Translate the following review written in {source_lang} into English.

TASK: Translate this BMW app review accurately while preserving:
- The original tone and sentiment
- Technical terminology (e.g., Connected Drive, MyBMW App, iDrive, Digital Key)
- App-specific or BMW-specific expressions and informal language

Please observe the following guidelines:
1. Preserve technical terms as given.
2. Use consistent terminology (e.g., "Ladestation/Borne de recharge" → "charging station").
3. If text is unclear, translate literally rather than interpreting.
4. Keep numbers, percentages, units, error codes, emojis, and model numbers intact.

IMPORTANT:
- Return only the translated text, do not add any additional information, explanations, or comments.
- Maintain the original paragraph structure and tone.
- Do not extend or elaborate beyond what is in the original text.

Review:
"{text}"

Translation:"""

    # Pin UTF-8 for both directions: the reviews are multilingual and the
    # platform default codec (e.g. cp1252 on Windows) cannot encode them all.
    # This matches what run_ollama() does.
    process = subprocess.run(
        ["ollama", "run", model_name],
        input=prompt,
        text=True,
        capture_output=True,
        encoding="utf-8",
    )

    # Surface failures instead of silently recording an empty translation.
    if process.returncode != 0:
        stderr_output = (process.stderr or "").strip() or "No stderr output."
        warnings.warn(
            f"ollama exited with code {process.returncode} while translating: {stderr_output}",
            RuntimeWarning,
        )

    return (process.stdout or "").strip()

def translate_all_reviews(df, model_name, base_dir="bmw_app_analysis", checkpoint_interval=100):
    """
    Translate all non-English reviews in a DataFrame to English with robust checkpointing.

    English reviews are written to ``batch000.csv``; every completed batch of
    ``checkpoint_interval`` reviews is written to its own ``batchNNN.csv`` and
    ``progress.json`` is updated, so an interrupted run can be resumed. The
    function is interactive: it asks (via ``input``) whether to resume, whether
    to proceed, and whether to continue after each batch.

    If the run is interrupted (Ctrl+C / kernel interrupt, SIGTERM, or an
    unexpected exception) the partially translated batch is saved both as
    ``emergency_NNN.csv`` and as the next numbered ``batchNNN.csv`` so that a
    later resume and the final merge pick those rows up.

    Args:
        df: DataFrame containing reviews with 'content' and 'language' columns
        model_name: Name of the Ollama model to use
        base_dir: Base directory for saving files
        checkpoint_interval: Number of reviews per batch / checkpoint file

    Returns:
        DataFrame with all reviews and added 'content_english' column. When the
        run is cancelled, paused or interrupted, the reviews gathered so far
        (English plus completed batches) are returned instead.
    """
    # Create a single translations directory for all files
    translations_dir = os.path.join(base_dir, "translations")
    os.makedirs(translations_dir, exist_ok=True)

    # Identify this run in the progress file
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_id = f"run_{timestamp}"
    progress_file = os.path.join(translations_dir, "progress.json")

    # State read by the emergency-save closure below. It is initialised up
    # front so the closure never sees an unbound name.
    _current_batch_df = pd.DataFrame()
    _combined_df = None
    _translated_indices = []
    _last_checkpoint_num = 0
    _progress_pct = 0

    def write_progress(progress_data):
        """Persist the progress dictionary as JSON."""
        with open(progress_file, 'w') as f:
            json.dump(progress_data, f, indent=4)

    def save_current_progress():
        """Persist the partially translated batch and mark the run as paused."""
        if len(_translated_indices) == 0:
            print("\nNo progress to save.")
            return

        checkpoint_num = _last_checkpoint_num
        emergency_file = None

        # If we have unsaved translations in the current batch, save them
        if not _current_batch_df.empty:
            checkpoint_num = _last_checkpoint_num + 1
            emergency_file = os.path.join(translations_dir, f"emergency_{checkpoint_num:03d}.csv")
            _current_batch_df.to_csv(emergency_file, index=False)

            # Resume and merge only read numbered batch files, and the indices
            # of these rows are recorded as translated below - so the partial
            # batch must also exist under its batch number or it would be lost.
            partial_batch_file = os.path.join(translations_dir, f"batch{checkpoint_num:03d}.csv")
            _current_batch_df.to_csv(partial_batch_file, index=False)

            # Also save a combined file for recovery
            if _combined_df is not None:
                full_emergency = os.path.join(translations_dir, "emergency_all.csv")
                _combined_df.to_csv(full_emergency, index=False)
                print(f"Full emergency backup saved to: {full_emergency}")

        # Update progress file
        write_progress({
            'run_id': run_id,
            'model_name': model_name,
            'total_reviews': len(df),
            'english_reviews': len(english_df),
            'non_english_reviews': len(non_english_df),
            'translated_reviews': len(_translated_indices),
            'progress_percent': _progress_pct,
            'checkpoint_number': checkpoint_num,
            'last_checkpoint': emergency_file,
            'translated_indices': [str(i) for i in _translated_indices],
            'status': 'paused'
        })

        print(f"\nEmergency progress saved.")
        print(f"Progress: {len(_translated_indices)} reviews translated ({_progress_pct}%)")
        print("You can resume translation later by running the function again.")

    # Create a working copy of the DataFrame
    working_df = df.copy()

    # Ensure 'content_english' column exists
    if 'content_english' not in working_df.columns:
        working_df['content_english'] = working_df['content']

    # Separate English and non-English reviews
    english_df = working_df[working_df['language'] == 'English'].copy()
    non_english_df = working_df[working_df['language'] != 'English'].copy()

    print(f"Total reviews: {len(working_df)}")
    print(f"English reviews: {len(english_df)}")
    print(f"Non-English reviews: {len(non_english_df)}")

    # Save English-only reviews with simple name: batch000.csv
    english_checkpoint = os.path.join(translations_dir, "batch000.csv")
    english_df.to_csv(english_checkpoint, index=False)
    print(f"Saved English-only reviews as: {english_checkpoint}")

    # Create a combined DataFrame for tracking progress
    combined_df = english_df.copy()  # Start with English reviews
    _combined_df = combined_df

    # Check for existing progress
    last_checkpoint_num = 0
    translated_indices = []

    if os.path.exists(progress_file):
        try:
            with open(progress_file, 'r') as f:
                progress_data = json.load(f)
            last_checkpoint_num = progress_data.get('checkpoint_number', 0)
            translated_indices = [int(idx) for idx in progress_data.get('translated_indices', [])]

            if translated_indices:
                print(f"Found previous progress: {len(translated_indices)}/{len(non_english_df)} reviews translated")
                print(f"Last checkpoint: {last_checkpoint_num}")

                resume = input("Resume from last checkpoint? (y/n): ")
                if resume.lower() == 'y':
                    # Load all existing batch files and reconstruct the combined DataFrame
                    print("Loading existing batches...")
                    loaded_frames = [english_df.copy()]  # Start with English reviews

                    # Load each numbered batch
                    for i in range(1, last_checkpoint_num + 1):
                        batch_file = os.path.join(translations_dir, f"batch{i:03d}.csv")
                        if os.path.exists(batch_file):
                            batch = pd.read_csv(batch_file)
                            print(f"Loading batch{i:03d}.csv ({len(batch)} reviews)")
                            loaded_frames.append(batch)

                    combined_df = pd.concat(loaded_frames)
                    _combined_df = combined_df
                    print(f"Loaded {len(combined_df) - len(english_df)} translated reviews from checkpoints")
                else:
                    print("Starting fresh, but keeping English-only checkpoint")
                    translated_indices = []
                    last_checkpoint_num = 0
        except Exception as e:
            # An unreadable progress file means starting over; reset *all*
            # resume state so the batch numbering cannot disagree with the
            # (now empty) list of translated indices.
            print(f"Error reading progress file: {e}")
            translated_indices = []
            last_checkpoint_num = 0
            combined_df = english_df.copy()
            _combined_df = combined_df

    # Publish the resume state to the emergency-save closure
    _translated_indices = translated_indices
    _last_checkpoint_num = last_checkpoint_num

    # Get remaining reviews to translate (set lookup keeps this linear)
    already_translated = set(translated_indices)
    remaining_indices = [idx for idx in non_english_df.index if idx not in already_translated]
    print(f"Translating {len(remaining_indices)} remaining reviews...")

    # Option to stop before starting (in case they loaded the wrong checkpoint)
    if remaining_indices:
        proceed = input("Proceed with translation? (y/n): ")
        if proceed.lower() != 'y':
            print("Translation canceled. All loaded data is preserved.")
            return combined_df

    # Turn SIGTERM into SystemExit so the handler below can save progress.
    # SIGINT is deliberately left alone: its default handler already raises
    # KeyboardInterrupt, which is handled below (and keeps notebook kernels alive).
    def _terminate(signum, frame):
        raise SystemExit(0)

    previous_sigterm = None
    sigterm_hooked = False
    try:
        previous_sigterm = signal.signal(signal.SIGTERM, _terminate)
        sigterm_hooked = True
    except ValueError:
        # signal.signal() only works in the main thread; run without the hook.
        pass

    # Main translation loop
    progress_bar = None
    try:
        # Process reviews in batches
        total_batches = (len(remaining_indices) + checkpoint_interval - 1) // checkpoint_interval

        for batch_idx in range(total_batches):
            next_checkpoint_num = last_checkpoint_num + 1

            print(f"\n========== BATCH {next_checkpoint_num:03d} ==========")
            print(f"Processing reviews {batch_idx * checkpoint_interval + 1} to {min((batch_idx + 1) * checkpoint_interval, len(remaining_indices))}")

            # Get the indices for this batch
            start_idx = batch_idx * checkpoint_interval
            end_idx = min((batch_idx + 1) * checkpoint_interval, len(remaining_indices))
            batch_size = end_idx - start_idx
            batch_indices = remaining_indices[start_idx:end_idx]

            # Create batch DataFrame
            batch_df = pd.DataFrame(columns=working_df.columns)
            _current_batch_df = batch_df

            # Process reviews in this batch
            progress_bar = tqdm(
                total=batch_size,
                desc=f"Batch {next_checkpoint_num:03d}",
                ncols=100,
                bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
            )

            batch_translated = 0

            for idx in batch_indices:
                # Get review text and language
                original_text = working_df.loc[idx, 'content']
                source_lang = working_df.loc[idx, 'language']

                # Skip if empty
                if pd.isna(original_text) or not original_text.strip():
                    progress_bar.update(1)
                    continue

                # Translate the text
                translated_text = translate_text(original_text, source_lang, model_name)

                # Create a row to add
                row = working_df.loc[[idx]].copy()
                row['content_english'] = translated_text

                # Add row to the batch DataFrame (using a method that avoids the warning)
                if batch_df.empty:
                    batch_df = row.copy()
                else:
                    batch_df = pd.concat([batch_df, row], ignore_index=False)

                # Update for emergency saves
                _current_batch_df = batch_df

                # Add to translated indices (same list object the closure reads)
                translated_indices.append(idx)
                batch_translated += 1

                # Update progress bar
                progress_bar.update(1)

                # Show occasional status updates within the batch
                if batch_translated % 10 == 0:
                    progress_pct = round(len(translated_indices) / len(non_english_df) * 100, 1)
                    _progress_pct = progress_pct
                    progress_bar.set_postfix({"Total": f"{len(translated_indices)}/{len(non_english_df)}", "Progress": f"{progress_pct}%"})

            # Close progress bar for this batch
            progress_bar.close()

            # Add batch to combined DataFrame (for tracking only)
            combined_df = pd.concat([combined_df, batch_df])
            _combined_df = combined_df

            # Calculate progress percentage
            progress_pct = round(len(translated_indices) / len(non_english_df) * 100, 1)
            _progress_pct = progress_pct

            # Save ONLY THIS BATCH with simple name: batch001.csv, batch002.csv, etc.
            checkpoint_file = os.path.join(translations_dir, f"batch{next_checkpoint_num:03d}.csv")
            batch_df.to_csv(checkpoint_file, index=False)

            # Update progress file
            progress_data = {
                'run_id': run_id,
                'model_name': model_name,
                'total_reviews': len(working_df),
                'english_reviews': len(english_df),
                'non_english_reviews': len(non_english_df),
                'translated_reviews': len(translated_indices),
                'progress_percent': progress_pct,
                'checkpoint_number': next_checkpoint_num,
                'last_checkpoint': checkpoint_file,
                'translated_indices': [str(i) for i in translated_indices],
                'status': 'in_progress'
            }
            write_progress(progress_data)

            # The batch is now durable: advance the checkpoint counter and clear
            # the emergency buffer so a later interrupt (e.g. at the prompt
            # below) cannot write this batch a second time under a new number.
            last_checkpoint_num = next_checkpoint_num
            _current_batch_df, _last_checkpoint_num = pd.DataFrame(), last_checkpoint_num

            # Print batch summary
            print(f"\nBatch {next_checkpoint_num:03d} complete!")
            print(f"Saved batch with {len(batch_df)} translations: {checkpoint_file}")
            print(f"Overall progress: {len(translated_indices)}/{len(non_english_df)} reviews ({progress_pct}%)")

            # Show sample translations
            if not batch_df.empty:
                print("\nSample translations from this batch:")
                sample_count = min(3, len(batch_df))
                sample_indices = batch_df.index[-sample_count:]

                for idx in sample_indices:
                    lang = batch_df.loc[idx, 'language']
                    orig = batch_df.loc[idx, 'content']
                    trans = batch_df.loc[idx, 'content_english']
                    print(f"\n[{lang}] Original: {orig[:100]}..." if len(orig) > 100 else f"\n[{lang}] Original: {orig}")
                    print(f"[English] Translation: {trans[:100]}..." if len(trans) > 100 else f"[English] Translation: {trans}")
                    print("---")

            # Ask to continue if not the last batch
            if batch_idx < total_batches - 1:
                continue_translation = input("\nContinue to next batch? (y/n): ")
                if continue_translation.lower() != 'y':
                    # Update progress status to paused
                    progress_data['status'] = 'paused'
                    write_progress(progress_data)

                    print(f"\nTranslation paused at {progress_pct}% complete.")
                    print(f"To resume later, run the function again and select 'y' when prompted to resume.")
                    return combined_df

    except KeyboardInterrupt:
        print("\nTranslation interrupted by user")
        if progress_bar is not None:
            progress_bar.close()
        # Save right away instead of relying on an exit hook
        save_current_progress()
        return combined_df
    except BaseException:
        # SIGTERM (SystemExit) or an unexpected failure: keep what we have,
        # then let the original exception continue.
        save_current_progress()
        raise
    finally:
        # Never leave our handler installed after the function returns
        if sigterm_hooked:
            signal.signal(
                signal.SIGTERM,
                signal.SIG_DFL if previous_sigterm is None else previous_sigterm,
            )

    # Translation complete - save final merged result
    print("\nMerging all batches into final file...")

    # Start with just English reviews
    final_frames = [english_df.copy()]

    # Load and merge all batch files
    for i in range(1, last_checkpoint_num + 1):
        batch_file = os.path.join(translations_dir, f"batch{i:03d}.csv")
        if os.path.exists(batch_file):
            batch = pd.read_csv(batch_file)
            print(f"Adding batch{i:03d}.csv ({len(batch)} reviews)")
            final_frames.append(batch)

    final_df = pd.concat(final_frames)

    final_file = os.path.join(translations_dir, "final_translated.csv")
    final_df.to_csv(final_file, index=False)

    # Update progress file
    write_progress({
        'run_id': run_id,
        'model_name': model_name,
        'total_reviews': len(working_df),
        'english_reviews': len(english_df),
        'non_english_reviews': len(non_english_df),
        'translated_reviews': len(translated_indices),
        'progress_percent': 100,
        'status': 'completed',
        'checkpoint_number': last_checkpoint_num,
        'batch_count': last_checkpoint_num,
        'final_output': final_file,
        'translated_indices': [str(i) for i in translated_indices]
    })

    print(f"\nTranslation complete! All {len(translated_indices)} non-English reviews translated")
    print(f"Final merged output saved to: {final_file}")
    print(f"Final file contains {len(final_df)} total reviews (English + translated)")

    return final_df

# Helper function to merge all checkpoint files
def merge_translation_batches(base_dir="bmw_app_analysis"):
    """
    Merge all translation batch files into a single DataFrame.

    Reads ``batch000.csv`` (the English reviews) followed by every
    ``batch<number>.csv`` in ascending numeric order from
    ``<base_dir>/translations``, writes the result to a timestamped
    ``merged_translations_*.csv`` in the same directory and returns it.
    Files that merely start with "batch" but are not ``batch<digits>.csv``
    are ignored.

    Args:
        base_dir: Base directory that contains the ``translations`` folder.

    Returns:
        The merged DataFrame, or None when the translations directory or
        ``batch000.csv`` is missing.
    """
    translations_dir = os.path.join(base_dir, "translations")

    # Ensure the directory exists
    if not os.path.exists(translations_dir):
        print(f"Error: Translations directory {translations_dir} not found!")
        return None

    # Start with English reviews
    english_file = os.path.join(translations_dir, "batch000.csv")
    if not os.path.exists(english_file):
        print(f"Error: English file {english_file} not found!")
        return None

    english_reviews = pd.read_csv(english_file)
    print(f"Loaded English reviews: {len(english_reviews)}")

    # Collect (number, filename) for real batch files only, so that stray
    # files such as "batch_notes.txt" cannot break the numeric sort.
    prefix, suffix = "batch", ".csv"
    numbered_batches = []
    for filename in os.listdir(translations_dir):
        if filename == "batch000.csv":
            continue
        if not (filename.startswith(prefix) and filename.endswith(suffix)):
            continue
        number_part = filename[len(prefix):-len(suffix)]
        if number_part.isdecimal():
            numbered_batches.append((int(number_part), filename))
    numbered_batches.sort()

    # Read each batch, then concatenate once
    frames = [english_reviews]
    for _, batch_file in numbered_batches:
        file_path = os.path.join(translations_dir, batch_file)
        batch_df = pd.read_csv(file_path)
        print(f"Adding {batch_file} ({len(batch_df)} reviews)")
        frames.append(batch_df)
    merged_df = pd.concat(frames)

    # Save the merged result
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    merged_file = os.path.join(translations_dir, f"merged_translations_{timestamp}.csv")
    merged_df.to_csv(merged_file, index=False)

    print(f"Merged all batches successfully!")
    print(f"Total reviews: {len(merged_df)}")
    print(f"Saved to: {merged_file}")

    return merged_df

In [None]:
# Execute the translation. The call is interactive (it prompts via input()) and
# writes one checkpoint CSV per batch; here a batch is 1000 reviews.
df_translated = translate_all_reviews(df, ollama_model_name, checkpoint_interval=1000)

# Print dimensions of the dataframe before and after translation
# (df_translated gains a 'content_english' column)
print(f"Original DataFrame dimensions: {df.shape} (rows, columns)")
print(f"Translated DataFrame dimensions: {df_translated.shape} (rows, columns)")

Total reviews: 18350
English reviews: 4617
Non-English reviews: 13733
Saved English-only reviews as: bmw_app_analysis/translations/batch000.csv
Found previous progress: 7000/13733 reviews translated
Last checkpoint: 7
Loading existing batches...
Loading batch001.csv (1000 reviews)
Loading batch002.csv (1000 reviews)
Loading batch003.csv (1000 reviews)
Loading batch004.csv (1000 reviews)
Loading batch005.csv (1000 reviews)
Loading batch006.csv (1000 reviews)
Loading batch007.csv (1000 reviews)
Loaded 7000 translated reviews from checkpoints
Translating 6733 remaining reviews...

Processing reviews 1 to 1000


Batch 008: 100%|███████████████████████████████████████████████████████████| 1000/1000 [31:46<00:00]



Batch 008 complete!
Saved batch with 1000 translations: bmw_app_analysis/translations/batch008.csv
Overall progress: 8000/13733 reviews (58.3%)

Sample translations from this batch:

[Spanish] Original: buena app para saber cómo está tu vehículo
[English] Translation: "Good app for knowing how your vehicle is."
---

[Spanish] Original: Fácil de manejar, intuitiva, controlas muchos aspectos del automóvil desde el celular
[English] Translation: Easy to use, intuitive, you control many aspects of the car from your cell phone.
---

[Spanish] Original: esta muy bien, tienes información de tu bmw todo el tiempo
[English] Translation: This is very good, you have information about your BMW all the time.
---

Processing reviews 1001 to 2000


Batch 009:  14%|████████▍                                                   | 141/1000 [03:22<19:35]

In [5]:
# ======================================
# BMW App Review Analysis - Single-Task Classification
# ======================================
import os
import glob
import time
import subprocess
import pandas as pd
import json
import logging
from tqdm import tqdm
from typing import Dict, List, Optional, Union
import traceback
from datetime import datetime

# Create organized folder structure
BASE_DIR = "bmw_app_analysis"
CHECKPOINT_DIR = os.path.join(BASE_DIR, "checkpoints")
RESULTS_DIR = os.path.join(BASE_DIR, "results")
LOGS_DIR = os.path.join(BASE_DIR, "logs")

# Create all directories
for directory in [BASE_DIR, CHECKPOINT_DIR, RESULTS_DIR, LOGS_DIR]:
    os.makedirs(directory, exist_ok=True)

# File to track progress
PROGRESS_FILE = os.path.join(BASE_DIR, "analysis_progress.json")

# Ensure display is imported for notebooks
try:
    from IPython.display import display
except ImportError:
    display = print  # Fallback for non-notebook environments

# Set up logging: one timestamped log file per execution of this cell.
# force=True replaces handlers left over from an earlier run of the cell;
# without it basicConfig() does nothing once the root logger is configured,
# and messages would keep going to the previous log file.
log_file = os.path.join(LOGS_DIR, f"analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Log records can contain model responses, so write the file as UTF-8
        # instead of the platform default encoding.
        logging.FileHandler(log_file, encoding='utf-8'),
        logging.StreamHandler()  # Also log to console
    ],
    force=True
)

# Utility function to run Ollama (reused from original code)
def run_ollama(prompt: str, model_name: str) -> str:
    """
    Send ``prompt`` to ``ollama run <model_name>`` on stdin and return its answer.

    Returns:
        The command's stdout with surrounding whitespace removed, or an empty
        string when the command exits non-zero or cannot be run at all (the
        failure is logged in both cases).
    """
    command = ["ollama", "run", model_name]
    try:
        completed = subprocess.run(
            command,
            input=prompt,
            text=True,
            capture_output=True,
            check=True,
            encoding='utf-8',
        )
        return completed.stdout.strip()
    except subprocess.CalledProcessError as failure:
        # check=True turns a non-zero exit status into this exception
        if failure.stderr:
            stderr_output = failure.stderr.strip()
        else:
            stderr_output = "No stderr output."
        logging.error(f"Ollama command failed with exit code {failure.returncode}. Stderr: {stderr_output}")
    except Exception as failure:
        logging.error(f"An unexpected error occurred running Ollama: {failure}")
    # Both failure paths fall through to the empty answer
    return ""

# ======================================
# Individual Classification Functions
# ======================================

def classify_sentiment(review_text: str, model_name: str) -> str:
    """
    Label a review as positive, negative, or neutral using an Ollama model.

    Args:
        review_text: The text of the review
        model_name: Name of the Ollama model to use

    Returns:
        str: "positive", "negative", or "neutral" (lowercase). Empty or
        non-string input, an unusable model response, and any error all
        yield "neutral".
    """
    has_usable_text = bool(review_text) and isinstance(review_text, str)
    if not has_usable_text:
        return "neutral"

    prompt = f"""You are an expert at analyzing BMW app reviews. Classify the sentiment of this review:

"{review_text}"

CLASSIFICATION TASK:
Determine if the sentiment is positive, negative, or neutral.

DETAILED GUIDELINES:
- Positive: 
  * User explicitly expresses satisfaction, appreciation, praise
  * Uses positive adjectives (great, excellent, amazing, love)
  * Shows enthusiasm about features or performance
  * Reports problems being fixed or improvements made
  * Explicitly recommends the app to others
  * Contains predominantly positive language despite minor issues

- Negative:
  * User explicitly expresses dissatisfaction, frustration, anger
  * Reports bugs, crashes, failures, or malfunctions
  * Uses negative adjectives (terrible, awful, useless, poor)
  * States the app doesn't work as expected or advertised
  * User had to find workarounds for basic functionality
  * Contains predominantly critical language despite minor praise

- Neutral:
  * Balance of positive and negative points with neither dominating
  * Factual descriptions without emotional language
  * Questions about functionality without clear satisfaction/dissatisfaction
  * Suggestions for improvements without expressing frustration
  * Simple factual statements about the app's functions
  * Too vague to determine sentiment clearly
 
IMPORTANT DECISION RULES:
- If review mentions both positives and negatives, focus on:
  1. The strongest emotional language (which sentiment has stronger expressions?)
  2. The most recent experience (latest update/version)
  3. Core functionality issues outweigh minor aesthetic praise
  4. Essential features working outweighs minor inconveniences
- Very short reviews with just "good" = positive, "bad" = negative, "ok" = neutral
- Sarcasm should be interpreted for the underlying sentiment ("Great, another crash" = negative)
- If review is exceptionally ambiguous, default to "neutral"

EXAMPLES:
1. "Great app, works perfectly every time!" → positive
2. "App keeps crashing when I try to check my car status." → negative
3. "The app is okay but could use some improvements." → neutral
4. "Used to crash constantly but recent update fixed most issues." → positive (most recent experience)
5. "Nice design but completely useless as it fails to connect to my car." → negative (core functionality issue)
6. "The app has some bugs but generally works well enough for what I need." → neutral (balanced)
7. "I'm impressed with the range of features, though it occasionally lags." → positive (stronger positive than negative)
8. "Loading times are frustratingly slow but at least it doesn't crash anymore." → neutral (balanced positives/negatives)

RESPONSE FORMAT:
Respond with ONLY ONE WORD: positive, negative, or neutral (lowercase, no punctuation).
"""

    # Labels in the order they are tried when the answer is not an exact match
    allowed_labels = ("positive", "negative", "neutral")

    try:
        answer = run_ollama(prompt, model_name).strip().lower()

        # Exact one-word answer: the expected case
        if answer in allowed_labels:
            return answer

        # The model added extra text: take the first label found inside it
        embedded = next((label for label in allowed_labels if label in answer), None)
        if embedded is not None:
            logging.warning(f"Extracted '{embedded}' from response: '{answer}'")
            return embedded

        # Nothing recognisable came back
        logging.warning(f"Invalid sentiment response: '{answer}'. Defaulting to 'neutral'")
        return "neutral"
    except Exception as e:
        logging.error(f"Sentiment classification failed: {str(e)}")
        return "neutral"


def classify_topics(review_text: str, model_name: str) -> str:
    """
    Identify the relevant topics in a review from a predefined list.

    The model is shown a numbered topic catalogue and asked for a
    comma-separated answer. The raw answer is lower-cased, stripped of quotes
    and sentence punctuation, split into candidates, and filtered so that only
    names present in ``valid_topics`` survive.

    Args:
        review_text: The text of the review
        model_name: Name of the Ollama model to use

    Returns:
        str: Validated topics joined with ", " (first-seen order, duplicates
        removed). "other" is returned when the review text is empty or not a
        string, when the model answer is empty, longer than 200 characters or
        contains no valid topic, and when the model call raises.
    """
    if not review_text or not isinstance(review_text, str):
        return "other"

    # Valid topics list - used for verification
    valid_topics = [
        "ui/ux", "performance", "connectivity", "authentication",
        "vehicle status", "remote controls", "trip planning",
        "charging management", "map/navigation", "mobile features",
        "data & privacy", "updates", "customer support",
        "connected store", "bmw digital premium", "digital key/mobile key",
        "vehicle configuration & personalization", "multimedia integration",
        "smartphone integration", "service & maintenance", "parking solutions",
        "voice assistant", "my garage/vehicle management", "localization & language",
        "bmw connected ecosystem", "ev-specific features", "notification management",
        "usage statistics", "tutorial/help section", "other"
    ]

    # BMW review topics list (abbreviated for prompt space)
    topics_list = """
    1. ui/ux - design, usability, navigation, visual appeal
    2. performance - speed, crashes, bugs, stability, battery drain
    3. connectivity - connection issues, bluetooth, server integration
    4. authentication - login, account issues, multi-factor
    5. vehicle status - battery/fuel level, location, diagnostics
    6. remote controls - lock/unlock, climate, remote start
    7. trip planning - route optimization, scheduling
    8. charging management - status, stations, scheduling
    9. map/navigation - maps, route planning, gps accuracy
    10. mobile features - widgets, notifications, interactions
    11. data & privacy - data handling, security concerns
    12. updates - app updates, version issues, bugs
    13. customer support - support experience, response time
    14. connected store - in-app store, purchases, products
    15. bmw digital premium - subscription services, premium features
    16. digital key/mobile key - phone as key, sharing, access
    17. vehicle configuration & personalization - profiles, settings
    18. multimedia integration - music control, media streaming
    19. smartphone integration - carplay, android auto
    20. service & maintenance - scheduling, alerts, history
    21. parking solutions - location, payments, availability
    22. voice assistant - voice commands, assistant functionality
    23. my garage/vehicle management - multiple vehicles, profiles
    24. localization & language - translations, regional features
    25. bmw connected ecosystem - integration with other bmw services
    26. ev-specific features - range, charging, battery features
    27. notification management - app alerts, push notifications, alert settings
    28. usage statistics - mileage tracking, fuel/energy consumption, driving history
    29. tutorial/help section - in-app guidance, manuals, feature explanations
    """

    # NOTE: every "Valid response" example below must use only names from the
    # list above; an example with an unlisted name teaches the model to emit
    # topics that the validation step then throws away.
    prompt = f"""You are an expert at analyzing BMW app reviews. Identify the main topics discussed in this review:

"{review_text}"

CLASSIFICATION TASK:
Identify 1-5 most relevant topics from this list:
{topics_list}

STRICT RESPONSE RULES:
1. RESPOND ONLY with topic names from the list, separated by commas
2. DO NOT write any explanations, introductions, or reasoning
3. DO NOT write complete sentences
4. USE ONLY the exact topic names listed above
5. If no topics apply, just respond with "other"
6. Use lowercase only

EXAMPLES:
Review: "The app crashes every time I try to check my battery level"
Valid response: performance, vehicle status

Review: "Love the new design! Very easy to use."
Valid response: ui/ux

Review: "Can't connect to my car. Bluetooth always fails."
Valid response: connectivity

Review: "ok"
Valid response: other

Review: "Would be nice if it had a way to schedule charging on my i4"
Valid response: charging management, ev-specific features

IMPORTANT: Your entire response must be ONLY the topic names, nothing else. No explanations or additional text allowed.
"""

    try:
        response = run_ollama(prompt, model_name).strip().lower()

        # Clean basic things like quotes and periods
        response = response.replace('"', '').replace("'", '').replace(".", "").replace("!", "").replace("?", "")

        # VALIDATION: reject empty or suspiciously long (explanatory) answers
        if not response or len(response) > 200:
            return "other"

        # Models sometimes answer one topic per line or separate with ';'
        # instead of commas; treat all three as separators.
        candidates = response.replace("\n", ",").replace(";", ",").split(",")

        # Keep only valid topics, once each, in the order the model gave them
        valid_results = []
        for candidate in candidates:
            # Drop list markers such as "1 ", "- " or "* " (the period of
            # "1." was already removed above). No valid topic starts with
            # any of these characters.
            topic = candidate.strip().lstrip("0123456789)-*\u2022 \t").strip()
            if topic in valid_topics and topic not in valid_results:
                valid_results.append(topic)

        # If no valid topics remain, return "other"
        if not valid_results:
            return "other"

        # Return validated topics
        return ", ".join(valid_results)

    except Exception as e:
        logging.error(f"Topic classification failed: {str(e)}")
        return "other"

def classify_vehicle_type(review_text: str, model_name: str) -> str:
    """
    Classify which kind of drivetrain the reviewer's car has, judging only
    from the review text.

    Args:
        review_text: The text of the review
        model_name: Name of the Ollama model to use

    Returns:
        str: "ev_hybrid", "combustion", or "unclear". "unclear" is also the
        result for empty or non-string input, for an answer that contains none
        of the three labels, and when the model call raises.
    """
    # Nothing to classify: answer without calling the model.
    if not (review_text and isinstance(review_text, str)):
        return "unclear"

    prompt = f"""You are an expert at analyzing BMW app reviews. Determine what type of vehicle the user has based on this review:

"{review_text}"

CLASSIFICATION TASK:
Determine if the user has an electric/hybrid vehicle (BMW EV or PHEV), a combustion engine vehicle, or if it's unclear.

GUIDELINES:
- Classify as "ev_hybrid" if the review mentions:
  * Charging or battery level (in %, kWh)
  * Electric range or range anxiety
  * Charging stations or charging schedule
  * Preconditioning related to battery (warming up the battery)
  * Regenerative braking
  * Any BMW EV or hybrid model names (i3, i4, i5, i7, iX, 330e, 530e, X5 45e/50e)
  * Explicitly says "electric" or "EV" or "plug-in hybrid"

- Classify as "combustion" if the review mentions:
  * Fuel level, gas, petrol, diesel explicitly
  * Engine sounds or non-electric engine characteristics
  * MPG, l/100km in context of fuel
  * Explicitly mentions combustion-only models (e.g., "my M3", "my 330i")
  * Refers to refueling or gas stations

- Classify as "unclear" if:
  * No specific vehicle type indicators are present
  * Only mentions general features that apply to both types
  * Only mentions "my BMW" with no specific model
  * Cannot determine confidently from the text

EXAMPLES:
1. "App shows my battery at 80% but my actual i4 shows 75%" → ev_hybrid
2. "I can't find where to set up my charging schedule" → ev_hybrid
3. "Fuel gauge is incorrect, shows half tank when I just filled up my X3" → combustion
4. "Can't connect to my car at all" → unclear
5. "Love that I can precondition the cabin" → unclear (both vehicle types have this)
6. "Shows range but not how much gas is left" → combustion
7. "The range estimation is way off on my 330e" → ev_hybrid
8. "Where is the button to lock my car?" → unclear

RESPONSE FORMAT:
Respond with ONLY ONE WORD: ev_hybrid, combustion, or unclear (lowercase, no punctuation).
"""

    try:
        response = run_ollama(prompt, model_name).strip().lower()
        valid_types = ("ev_hybrid", "combustion", "unclear")

        if response not in valid_types:
            # The model added extra text: take the first label (in the
            # priority order above) that occurs anywhere in the answer.
            vehicle_type = next(
                (label for label in valid_types if label in response),
                None,
            )
            if vehicle_type is None:
                logging.warning(f"Invalid vehicle type response: '{response}'. Defaulting to 'unclear'")
                return "unclear"

            logging.warning(f"Extracted '{vehicle_type}' from response: '{response}'")
            return vehicle_type

        # Clean single-word answer.
        return response
    except Exception as e:
        logging.error(f"Vehicle type classification failed: {str(e)}")
        return "unclear"


def classify_user_experience(review_text: str, model_name: str) -> str:
    """
    Determine if the user is new to the app, an experienced user, or if it's unclear.

    The model is asked for a one-word label. When it adds extra text, the
    answer is split into words and interpreted in two passes: first a complete
    label ("new_user", "experienced_user", "unclear"), then the loose
    variations "new" / "experienced" / "experience". Matching whole words
    (instead of substrings) keeps answers such as
    "unclear - nothing new is mentioned" from being read as "new_user".

    Args:
        review_text: The text of the review
        model_name: Name of the Ollama model to use

    Returns:
        str: "new_user", "experienced_user", or "unclear". "unclear" is also
        returned for empty or non-string input, for an unrecognised answer,
        and when the model call raises.
    """
    import re  # local import: only needed for tokenising noisy model answers

    if not review_text or not isinstance(review_text, str):
        return "unclear"

    prompt = f"""You are an expert at analyzing BMW app reviews. Determine how experienced the user is with the BMW app based on this review:

"{review_text}"

CLASSIFICATION TASK:
Classify whether the user is new to the app, experienced with it, or if it's unclear.

GUIDELINES:
- Classify as "new_user" if the review mentions:
  * Just downloaded or installed
  * First impressions
  * Just got the car
  * Recently purchased
  * Setting up for the first time
  * Initial experience
  * New to BMW or the app

- Classify as "experienced_user" if the review mentions:
  * Using the app for a period of time (months/years)
  * References to previous versions of the app
  * Comparisons to how the app used to work
  * Updates changing functionality they're familiar with
  * Being a long-time BMW owner
  * Historical perspective on app changes

- Classify as "unclear" if:
  * No time references or experience level indicators
  * Cannot determine confidently from the text
  * Only gives current impression without historical context

EXAMPLES:
1. "Just got my new BMW and can't figure out how to set up the app" → new_user
2. "Been using this app for 3 years and the latest update broke everything" → experienced_user
3. "The app keeps crashing when I check vehicle status" → unclear
4. "The old version was much better, this redesign is terrible" → experienced_user
5. "First day using it and I'm already impressed with the features" → new_user
6. "This app sucks" → unclear

RESPONSE FORMAT:
Respond with ONLY ONE WORD: new_user, experienced_user, or unclear (lowercase, no punctuation).
"""

    try:
        response = run_ollama(prompt, model_name).strip().lower()

        # Validate response
        valid_types = ["new_user", "experienced_user", "unclear"]
        if response in valid_types:
            return response

        # Break the answer into words; underscores stay inside a word so full
        # labels survive, while surrounding markdown underscores are trimmed.
        words = [word.strip("_") for word in re.findall(r"[a-z_]+", response)]

        # Pass 1: the first complete label mentioned wins (includes "unclear").
        for word in words:
            if word in valid_types:
                logging.warning(f"Extracted '{word}' from response: '{response}'")
                return word

        # Pass 2: common loose variations such as "New user" or "Experienced."
        variations = {
            "new": "new_user",
            "experienced": "experienced_user",
            "experience": "experienced_user",
        }
        for word in words:
            if word in variations:
                logging.warning(f"Extracted '{variations[word]}' from response: '{response}'")
                return variations[word]

        # Default if response is invalid
        logging.warning(f"Invalid user experience response: '{response}'. Defaulting to 'unclear'")
        return "unclear"
    except Exception as e:
        logging.error(f"User experience classification failed: {str(e)}")
        return "unclear"


def classify_usage_profile(review_text: str, model_name: str) -> str:
    """
    Determine if the user is a power user, casual user, or if it's unclear.

    The model is asked for a one-word label. When it adds extra text, the
    answer is split into words and interpreted in two passes: first a complete
    label ("power_user", "casual_user", "unclear"), then the loose variations
    "power" / "casual". Matching whole words (instead of substrings) keeps
    answers such as "unclear - no power features mentioned" from being read as
    "power_user".

    Args:
        review_text: The text of the review
        model_name: Name of the Ollama model to use

    Returns:
        str: "power_user", "casual_user", or "unclear". "unclear" is also
        returned for empty or non-string input, for an unrecognised answer,
        and when the model call raises.
    """
    import re  # local import: only needed for tokenising noisy model answers

    if not review_text or not isinstance(review_text, str):
        return "unclear"

    prompt = f"""You are an expert at analyzing BMW app reviews. Determine the user's usage pattern based on this review:

"{review_text}"

CLASSIFICATION TASK:
Classify whether the user is a power user who uses advanced features, a casual user who uses basic features, or if it's unclear.

GUIDELINES:
- Classify as "power_user" if the review mentions:
  * Multiple advanced features (trip planning, automation, custom settings)
  * Integration with smart home or other services
  * Technical details about functionality
  * Complex use cases beyond basic car controls
  * Digital key plus or advanced features
  * Detailed technical feedback suggesting deep engagement
  * Regular/daily use of multiple features

- Classify as "casual_user" if the review mentions:
  * Only basic features (lock/unlock, climate control, basic status)
  * Simple use cases like checking fuel/charge or location
  * General non-technical feedback
  * Occasional or infrequent use
  * Focuses on core simple functions only

- Classify as "unclear" if:
  * No specific features or usage patterns mentioned
  * Cannot determine usage depth from the text
  * General comments that don't indicate how they use the app

EXAMPLES:
1. "Can't get the digital key to work with my smart home automation" → power_user
2. "I just use it to check my fuel level and lock the doors occasionally" → casual_user
3. "App keeps crashing" → unclear
4. "Love how I can set up charging schedules and have the climate start automatically based on my calendar" → power_user
5. "It's annoying that I have to restart it every time I want to check where I parked" → casual_user
6. "Decent app but needs improvement" → unclear

RESPONSE FORMAT:
Respond with ONLY ONE WORD: power_user, casual_user, or unclear (lowercase, no punctuation).
"""

    try:
        response = run_ollama(prompt, model_name).strip().lower()

        # Validate response
        valid_types = ["power_user", "casual_user", "unclear"]
        if response in valid_types:
            return response

        # Break the answer into words; underscores stay inside a word so full
        # labels survive, while surrounding markdown underscores are trimmed.
        words = [word.strip("_") for word in re.findall(r"[a-z_]+", response)]

        # Pass 1: the first complete label mentioned wins (includes "unclear").
        for word in words:
            if word in valid_types:
                logging.warning(f"Extracted '{word}' from response: '{response}'")
                return word

        # Pass 2: common loose variations such as "Power user" or "Casual."
        variations = {"power": "power_user", "casual": "casual_user"}
        for word in words:
            if word in variations:
                logging.warning(f"Extracted '{variations[word]}' from response: '{response}'")
                return variations[word]

        # Default if response is invalid
        logging.warning(f"Invalid usage profile response: '{response}'. Defaulting to 'unclear'")
        return "unclear"
    except Exception as e:
        logging.error(f"Usage profile classification failed: {str(e)}")
        return "unclear"


def classify_pain_point(review_text: str, model_name: str) -> str:
    """
    Determine if the review mentions a pain point (yes/no).

    The model is asked for a single word. When it adds extra text, the first
    standalone "yes" or "no" word in the answer decides. Whole-word matching
    is used because a substring test reads "no, installed it yesterday" as
    "yes" (via "yesterday").

    Args:
        review_text: The text of the review
        model_name: Name of the Ollama model to use

    Returns:
        str: "yes" or "no". "no" is also returned for empty or non-string
        input, for an answer with neither word, and when the model call raises.
    """
    import re  # local import: only needed for tokenising noisy model answers

    if not review_text or not isinstance(review_text, str):
        return "no"

    prompt = f"""You are an expert at analyzing BMW app reviews. Determine if this review mentions any pain points:

"{review_text}"

CLASSIFICATION TASK:
Determine if the user mentions any pain points, issues, or problems with the app.

GUIDELINES:
Classify as "yes" if the review mentions:
- Crashes, bugs, glitches, or technical issues
- Features not working as expected
- Frustration or difficulty using the app
- Complaints about design or performance
- Connection failures or syncing problems
- Missing expected functionality
- Errors or unexpected behavior
- Battery drain or other resource issues
- Anything the user clearly finds problematic

Classify as "no" if:
- The review is generally positive
- No specific issues or problems are mentioned
- The user is only making general comments or asking questions
- The review only contains feature requests without complaints

EXAMPLES:
1. "App keeps crashing when I try to check status" → yes
2. "Works great every time!" → no
3. "Why is it so hard to find the charging settings?" → yes
4. "Would be nice if you could add a widget" → no (feature request without complaint)
5. "Can't connect to my car half the time. Very frustrating!" → yes
6. "Just got the app. Looking forward to using it." → no

RESPONSE FORMAT:
Respond with ONLY ONE WORD: yes or no (lowercase, no punctuation).
"""

    try:
        response = run_ollama(prompt, model_name).strip().lower()

        # Validate response
        if response in ["yes", "no"]:
            return response

        # Handle potential extra text: the first whole word that is a valid
        # answer wins, so the verdict is not overridden by its explanation.
        for word in re.findall(r"[a-z]+", response):
            if word in ("yes", "no"):
                logging.warning(f"Extracted '{word}' from response: '{response}'")
                return word

        # Default if response is invalid
        logging.warning(f"Invalid pain point response: '{response}'. Defaulting to 'no'")
        return "no"
    except Exception as e:
        logging.error(f"Pain point classification failed: {str(e)}")
        return "no"


def classify_feature_request(review_text: str, model_name: str) -> str:
    """
    Determine if the review contains a feature request (yes/no).

    The model is asked for a single word. When it adds extra text, the first
    standalone "yes" or "no" word in the answer decides. Whole-word matching
    is used because a substring test lets the explanation override the
    verdict (e.g. "no - they only started using it yesterday" contains "yes").

    Args:
        review_text: The text of the review
        model_name: Name of the Ollama model to use

    Returns:
        str: "yes" or "no". "no" is also returned for empty or non-string
        input, for an answer with neither word, and when the model call raises.
    """
    import re  # local import: only needed for tokenising noisy model answers

    if not review_text or not isinstance(review_text, str):
        return "no"

    prompt = f"""You are an expert at analyzing BMW app reviews. Determine if this review contains a feature request:

"{review_text}"

CLASSIFICATION TASK:
Determine if the user is explicitly asking for or suggesting new features or improvements.

GUIDELINES:
Classify as "yes" if the review:
- Explicitly asks for a new feature to be added
- Suggests improvements to existing functionality
- Uses phrases like "would be nice if", "wish it had", "please add"
- Compares to missing features in other apps they want implemented
- Describes functionality they want but that doesn't exist yet
- Makes specific suggestions for changes or additions
- Expresses desire for missing capabilities

Classify as "no" if:
- The review doesn't suggest any improvements or new features
- The user is only reporting bugs or issues with existing features
- The user is only describing current functionality
- The review only contains complaints without suggesting solutions

EXAMPLES:
1. "Would be great if you could add Apple Watch support" → yes
2. "The app keeps crashing" → no
3. "Please add the ability to schedule charging" → yes
4. "Why can't I see my trip history like in the Mercedes app?" → yes
5. "The UI is terrible and confusing" → no (complaint without suggestion)
6. "You should include a way to share my location with family members" → yes
7. "I wish there was a widget for quick access" → yes
8. "Works great" → no

RESPONSE FORMAT:
Respond with ONLY ONE WORD: yes or no (lowercase, no punctuation).
"""

    try:
        response = run_ollama(prompt, model_name).strip().lower()

        # Validate response
        if response in ["yes", "no"]:
            return response

        # Handle potential extra text: the first whole word that is a valid
        # answer wins, so the verdict is not overridden by its explanation.
        for word in re.findall(r"[a-z]+", response):
            if word in ("yes", "no"):
                logging.warning(f"Extracted '{word}' from response: '{response}'")
                return word

        # Default if response is invalid
        logging.warning(f"Invalid feature request response: '{response}'. Defaulting to 'no'")
        return "no"
    except Exception as e:
        logging.error(f"Feature request classification failed: {str(e)}")
        return "no"


def extract_competitor(review_text: str, model_name: str) -> str:
    """
    Extract which competitor brands are mentioned in the review.

    The model proposes brand names; each proposal is then verified against the
    review text (directly, or through a small table of spelling variations) so
    that brands the model invented are discarded.

    Args:
        review_text: The text of the review
        model_name: Name of the Ollama model to use

    Returns:
        str: Verified brand names in lowercase joined with "," (model order,
        no duplicates), or "none" when the input is empty or not a string,
        when the answer contains "none" or is empty, when no proposed brand
        can be found in the review, or when the model call raises.
    """
    if not review_text or not isinstance(review_text, str):
        return "none"

    prompt = f"""You are an expert at analyzing BMW app reviews. Identify any competitor car brands mentioned in this review:

"{review_text}"

CLASSIFICATION TASK:
Determine if the user mentions any BMW competitors and extract the specific competitor brand name(s).

DETAILED GUIDELINES:
- Extract ONLY car manufacturer brands EXPLICITLY mentioned in the review
- DO NOT infer or guess competitors that aren't directly mentioned
- If NO competitor is mentioned, return "none"
- Respond with ONLY the brand name(s) in lowercase, separated by commas, with no explanations

CRITICAL INSTRUCTION:
Only return a competitor name if it EXPLICITLY appears in the review text. Do not hallucinate brands.
"""

    # Common name variations: canonical brand -> spellings accepted as evidence
    variations = {
        "mercedes": ["mercedes", "merc", "mercedes-benz", "mercedes benz"],
        "volkswagen": ["volkswagen", "vw", "volkswagon"],
        "chevrolet": ["chevrolet", "chevy"]
    }

    try:
        response = run_ollama(prompt, model_name).strip().lower()

        # Clean response - remove any punctuation except commas
        response = response.replace('"', '').replace("'", '').replace(".", "").replace("!", "").replace("?", "")

        if "none" in response or not response:
            return "none"

        # VERIFICATION: Check if the response actually appears in the original text
        review_lower = review_text.lower()

        verified_competitors = []
        for raw_name in response.split(','):
            # Strip the blank that follows each comma; without it " audi"
            # would never match the variations table.
            competitor = raw_name.strip()

            # Skip empty fragments (e.g. from a trailing comma): an empty
            # string is a substring of every review and would always "verify".
            if not competitor or competitor in verified_competitors:
                continue

            # Accept the brand if it, or any known spelling of it, is in the text
            evidence = [competitor] + variations.get(competitor, [])
            if any(spelling in review_lower for spelling in evidence):
                verified_competitors.append(competitor)

        if verified_competitors:
            return ",".join(verified_competitors)
        else:
            return "none"

    except Exception as e:
        logging.error(f"Competitor extraction failed: {str(e)}")
        return "none"


def analyze_review_step_by_step(review_text: str, model_name: str) -> Dict:
    """
    Run every individual classifier on one review and collect the answers.

    Args:
        review_text: The text of the review; ``None`` is treated as an empty
            string and any other non-string value is converted with ``str()``
        model_name: Name of the Ollama model to use

    Returns:
        Dict: One entry per classifier, keyed "sentiment", "topics",
        "vehicle_type", "user_experience", "usage_profile", "is_pain_point",
        "is_feature_request" and "competitor_mentioned"
    """
    # Normalise the input so every classifier receives a string
    if review_text is None:
        review_text = ""
    elif not isinstance(review_text, str):
        review_text = str(review_text)

    # Result key -> classifier, listed in the order they are executed
    pipeline = (
        ("sentiment", classify_sentiment),
        ("topics", classify_topics),
        ("vehicle_type", classify_vehicle_type),
        ("user_experience", classify_user_experience),
        ("usage_profile", classify_usage_profile),
        ("is_pain_point", classify_pain_point),
        ("is_feature_request", classify_feature_request),
        ("competitor_mentioned", extract_competitor),
    )

    return {
        field: classifier(review_text, model_name)
        for field, classifier in pipeline
    }


def process_reviews_step_by_step(df: pd.DataFrame, model_name: str, batch_size: int = 10, 
                                start_batch: int = 1) -> pd.DataFrame:
    """
    Process all reviews with step-by-step individual classifications.

    Reviews are handled in batches. After every batch the processed rows are
    written to a CSV in CHECKPOINT_DIR and a JSON progress record is written
    to PROGRESS_FILE. Between batches the user is asked on stdin whether to
    continue; any answer other than "y" returns the partially processed frame.
    When the last batch finishes, merge_checkpoints() is called.

    Args:
        df: DataFrame with reviews in 'content_english' column
        model_name: Name of the Ollama model to use
        batch_size: Number of reviews to process per batch (must be >= 1)
        start_batch: Which batch to start processing from (for resuming)

    Returns:
        DataFrame with all classification results added. The input frame is
        not modified. Rows that were skipped (empty review), failed, or not
        reached keep the default labels.

    Raises:
        ValueError: If 'content_english' is missing or batch_size is < 1.
    """
    if 'content_english' not in df.columns:
        raise ValueError("Input DataFrame must contain 'content_english' column")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Result columns and the label used when a review cannot be classified
    default_values = {
        'sentiment': 'neutral',
        'topics': 'other',
        'vehicle_type': 'unclear',
        'user_experience': 'unclear',
        'usage_profile': 'unclear',
        'is_pain_point': 'no',
        'is_feature_request': 'no',
        'competitor_mentioned': 'none',
    }

    # Create a copy of the DataFrame to avoid modifying the original
    result_df = df.copy()
    total_reviews = len(result_df)
    total_batches = (total_reviews + batch_size - 1) // batch_size

    # A fresh run resets every result column. A resumed run keeps results the
    # caller passed in but still creates any column that is missing, so that
    # skipped or failed reviews end up with defaults instead of NaN (or no
    # column at all) in the checkpoint files.
    for column, default in default_values.items():
        if start_batch == 1 or column not in result_df.columns:
            result_df[column] = default

    logging.info(f"Starting step-by-step analysis from batch {start_batch}/{total_batches}")

    # Process in batches
    for batch_num in range(start_batch, total_batches + 1):
        start_idx = (batch_num - 1) * batch_size
        end_idx = min(start_idx + batch_size, total_reviews)
        batch_indices = result_df.index[start_idx:end_idx]

        logging.info(f"Processing batch {batch_num}/{total_batches} (reviews {start_idx+1}-{end_idx})")

        # Process each review in the batch
        for idx in tqdm(batch_indices, desc=f"Batch {batch_num}", unit="review"):
            review_text = result_df.loc[idx, 'content_english']

            # Skip empty reviews
            if pd.isna(review_text) or not str(review_text).strip():
                logging.warning(f"Skipping empty review at index {idx}")
                continue

            # Run all classifications
            try:
                results = analyze_review_step_by_step(str(review_text), model_name)

                # Update the DataFrame with results
                for column, default in default_values.items():
                    result_df.loc[idx, column] = results.get(column, default)

            except Exception as e:
                logging.error(f"Error processing review at index {idx}: {e}")
                # Keep default values for this review

        # Save checkpoint after each batch (only this batch's rows)
        checkpoint_filename = os.path.join(CHECKPOINT_DIR, f"batch_{batch_num}_of_{total_batches}.csv")
        batch_df = result_df.iloc[start_idx:end_idx].copy()
        batch_df.to_csv(checkpoint_filename, index=False)
        logging.info(f"Saved checkpoint to {checkpoint_filename}")

        # Save progress information
        progress = {
            "last_completed_batch": batch_num,
            "total_batches": total_batches,
            "batch_size": batch_size,
            "total_reviews": total_reviews,
            "model_name": model_name,
            "last_processed_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }
        with open(PROGRESS_FILE, 'w') as f:
            json.dump(progress, f, indent=4)

        # Ask user if they want to continue
        if batch_num < total_batches:
            continue_processing = input(f"\nBatch {batch_num}/{total_batches} completed. Continue to next batch? (y/n): ")
            if continue_processing.lower() != 'y':
                logging.info(f"Processing paused after batch {batch_num}. Run again to continue from batch {batch_num + 1}.")
                return result_df  # Return the partially processed DataFrame

    # All batches completed, merge results
    logging.info("All batches completed. Merging results...")
    merge_checkpoints()

    return result_df

def merge_checkpoints():
    """
    Merge all checkpoint files into one consolidated file.

    Reads every "batch_*.csv" in CHECKPOINT_DIR in ascending batch-number
    order, concatenates them, writes the result to a timestamped
    "bmw_reviews_consolidated_*.csv" in RESULTS_DIR and prints the value
    counts of each classification column.

    Returns:
        None. If no checkpoint exists or none can be read, a message is
        logged and nothing is written.
    """
    import re  # local import: only needed to parse batch numbers from file names

    def _batch_order(path):
        # Sort by the numeric batch number; a plain string sort would put
        # "batch_10_..." before "batch_2_...". Files that do not follow the
        # "batch_<n>_of_<m>.csv" pattern go last, ordered by name.
        match = re.match(r"batch_(\d+)_of_(\d+)\.csv$", os.path.basename(path))
        if match:
            return (0, int(match.group(1)), path)
        return (1, 0, path)

    checkpoint_files = sorted(
        glob.glob(os.path.join(CHECKPOINT_DIR, "batch_*.csv")),
        key=_batch_order,
    )

    if not checkpoint_files:
        logging.warning("No checkpoint files found to merge")
        return

    # Read and combine all checkpoint files; an unreadable file is logged and
    # skipped so the remaining batches can still be merged
    dfs = []
    for file in checkpoint_files:
        try:
            df = pd.read_csv(file)
            dfs.append(df)
            logging.info(f"Added {file} to merge list ({len(df)} rows)")
        except Exception as e:
            logging.error(f"Error reading {file}: {e}")

    if not dfs:
        logging.error("No valid checkpoint files could be read")
        return

    # Concatenate all dataframes
    merged_df = pd.concat(dfs, ignore_index=True)

    # Save consolidated file
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    consolidated_filename = os.path.join(RESULTS_DIR, f"bmw_reviews_consolidated_{timestamp}.csv")

    # Save full results
    merged_df.to_csv(consolidated_filename, index=False)
    logging.info(f"Saved consolidated results with {len(merged_df)} reviews to {consolidated_filename}")

    # Print quick summary
    print(f"\n=== Classification Summary ({len(merged_df)} reviews) ===")
    for col in ['sentiment', 'vehicle_type', 'user_experience', 'usage_profile', 
               'is_pain_point', 'is_feature_request', 'competitor_mentioned']:
        # The file is already saved; a missing column should not turn the
        # summary into a KeyError.
        if col not in merged_df.columns:
            logging.warning(f"Column '{col}' not found in merged results; skipping its summary")
            continue
        print(f"\n{col.replace('_', ' ').title()} distribution:")
        print(merged_df[col].value_counts())

def run_analysis(df, model_name, batch_size=50):
    """Run the classification from the start, or resume a previous run.

    If ``PROGRESS_FILE`` exists and records an unfinished run, the user is
    asked which batch to start from; otherwise processing starts at batch 1.
    The actual work is delegated to ``process_reviews_step_by_step``.

    Args:
        df: DataFrame of reviews, passed through to the batch processor.
        model_name: Model identifier, passed through to the batch processor.
        batch_size: Number of rows per batch; must be a positive integer.

    Returns:
        Whatever ``process_reviews_step_by_step`` returns, or ``None`` when
        the user quits at the batch-selection prompt.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    start_batch = 1
    total_batches = (len(df) + batch_size - 1) // batch_size

    # ---- read saved progress (file problems only; prompts happen below) ----
    have_progress = False
    last_batch = 0
    saved_total_batches = 0
    prev_model = ""
    last_time = "unknown time"

    if os.path.exists(PROGRESS_FILE):
        try:
            with open(PROGRESS_FILE, 'r') as f:
                progress = json.load(f)

            last_batch = int(progress.get("last_completed_batch", 0))
            saved_total_batches = progress.get("total_batches", 0)
            prev_model = progress.get("model_name", "")
            last_time = progress.get("last_processed_time", "unknown time")
            have_progress = True
        except (OSError, ValueError, TypeError, AttributeError) as e:
            # Unreadable or malformed progress file: fall back to batch 1.
            logging.error(f"Error reading progress file: {e}")
            print(f"Error reading progress file: {e}")
    else:
        print("No previous run found. Starting from the beginning.")

    # ---- let the user pick where to resume ---------------------------------
    if have_progress:
        if last_batch < total_batches:
            print(f"Previous run found (completed {last_batch}/{saved_total_batches} batches at {last_time}).")

            if saved_total_batches != total_batches:
                # Different data size or batch_size than the saved run: the
                # stored batch numbers no longer describe the same rows.
                logging.warning(
                    f"Saved progress refers to {saved_total_batches} total batches, "
                    f"but the current run has {total_batches}; batch numbers may not line up."
                )

            # Let user choose which batch to start from
            print(f"\nBatch information:")
            print(f"- Total batches: {total_batches}")
            print(f"- Completed batches: 1 to {last_batch}")
            print(f"- Remaining batches: {last_batch + 1} to {total_batches}")

            while True:
                batch_input = input(
                    f"\nEnter the batch number to start from (1-{total_batches}) or 'q' to quit: "
                ).strip()

                if batch_input.lower() == 'q':
                    logging.info("Analysis cancelled by user")
                    return None

                try:
                    selected_batch = int(batch_input)
                except ValueError:
                    print("Please enter a valid number.")
                    continue

                if not 1 <= selected_batch <= total_batches:
                    print(f"Invalid batch number. Please enter a number between 1 and {total_batches}.")
                    continue

                # Confirm before redoing a batch that was already completed
                if selected_batch <= last_batch:
                    overwrite = input(
                        f"Batch {selected_batch} was already completed. Reprocess this batch? (y/n): "
                    ).strip()
                    if overwrite.lower() != 'y':
                        # User changed their mind - ask again
                        continue

                # Only commit (and log) the choice once it has been confirmed
                start_batch = selected_batch
                logging.info(f"Starting from user-selected batch {start_batch}")

                # Warn if model changed
                if prev_model and prev_model != model_name:
                    logging.warning(f"Using a different model ({model_name}) than previous run ({prev_model})")

                break
        else:
            print("Previous run completed all batches. Starting over.")

    # Start or resume processing
    print(f"Starting classification from batch {start_batch}...")
    start_time = time.time()

    df_classified = process_reviews_step_by_step(
        df=df,
        model_name=model_name,
        batch_size=batch_size,
        start_batch=start_batch
    )

    # Calculate elapsed time (wall clock, so it includes time spent waiting
    # at any interactive prompts inside the batch processor)
    total_time = time.time() - start_time
    hours, remainder = divmod(total_time, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(f"Classification complete! Time taken: {int(hours)}h {int(minutes)}m {int(seconds)}s")

    return df_classified

In [5]:
# Load the translated reviews; this frame is the input to run_analysis() below.
df_translated = pd.read_csv("bmw_app_analysis/translations/final_translated.csv")

In [9]:
# Run the full analysis (with resume capability)
df_classified = run_analysis(
    df=df_translated,
    model_name=ollama_model_name,
    batch_size=1000 # Process in batches of 1000 reviews
)

# Final output is already saved as part of run_analysis
# NOTE: this message prints whenever run_analysis() returns, including when the
# run was paused after a batch or cancelled at the prompt - check the log lines
# above it to see whether all batches actually finished.
print("Classification complete!")

Previous run found (completed 3/19 batches at 2025-04-14 01:26:44).

Batch information:
- Total batches: 19
- Completed batches: 1 to 3
- Remaining batches: 4 to 19


2025-04-14 01:31:52,291 - INFO - Starting from user-selected batch 4
2025-04-14 01:31:52,308 - INFO - Starting step-by-step analysis from batch 4/19
2025-04-14 01:31:52,308 - INFO - Processing batch 4/19 (reviews 3001-4000)


Starting classification from batch 4...


Batch 4: 100%|██████████| 1000/1000 [6:41:22<00:00, 24.08s/review] 
2025-04-14 08:13:14,718 - INFO - Saved checkpoint to bmw_app_analysis/checkpoints/batch_4_of_19.csv
2025-04-14 11:17:15,986 - INFO - Processing paused after batch 4. Run again to continue from batch 5.


Classification complete! Time taken: 9h 45m 23s
Classification complete!


In [1]:
import pandas as pd
import glob
import os
import re

def merge_checkpoint_batches(checkpoint_dir="bmw_app_analysis/checkpoints",
                             output_file="bmw_app_analysis/results/merged_classification.csv"):
    """
    Merge all checkpoint batch files into a single DataFrame.

    Args:
        checkpoint_dir: Directory searched for ``batch_<n>_of_<m>.csv`` files.
        output_file: Path the merged CSV is written to; its parent directory
            is created when missing.

    Returns:
        The merged DataFrame (batches concatenated in numeric order), or
        ``None`` when no batch file exists or none could be read.
    """
    # Get all batch files
    pattern = os.path.join(checkpoint_dir, "batch_*_of_*.csv")
    checkpoint_files = glob.glob(pattern)

    def get_batch_num(filename):
        # Extract number between "batch_" and "_of". Only the file name is
        # inspected so that a directory called e.g. "batch_7_of_9" cannot be
        # mistaken for the batch number.
        match = re.search(r'batch_(\d+)_of', os.path.basename(filename))
        if match:
            return int(match.group(1))
        return 0

    # Sort numerically by batch number (file name breaks ties deterministically)
    checkpoint_files = sorted(
        checkpoint_files,
        key=lambda path: (get_batch_num(path), os.path.basename(path)),
    )

    if not checkpoint_files:
        print(f"No batch files found in {checkpoint_dir}")
        return None

    print(f"Found {len(checkpoint_files)} batch files:")
    for file in checkpoint_files:
        print(f"  - {os.path.basename(file)}")

    # Read and combine all batch files
    dfs = []
    for file in checkpoint_files:
        try:
            df = pd.read_csv(file)
            batch_num = get_batch_num(file)
            print(f"Added batch {batch_num} ({len(df)} reviews)")
            dfs.append(df)
        except Exception as e:
            # Best effort: report the unreadable file and keep merging the rest.
            print(f"Error reading {file}: {e}")

    if not dfs:
        print("No valid checkpoint files could be read")
        return None

    # Concatenate all dataframes
    merged_df = pd.concat(dfs, ignore_index=True)
    print(f"Successfully merged {len(merged_df)} total reviews")

    # Save the merged result
    merged_file = output_file
    parent_dir = os.path.dirname(merged_file)
    if parent_dir:
        # os.makedirs("") raises, so only create a directory when there is one
        os.makedirs(parent_dir, exist_ok=True)
    merged_df.to_csv(merged_file, index=False)
    print(f"Saved merged results to: {merged_file}")

    return merged_df

# Run the function
if __name__ == "__main__":
    # Merge every checkpoint found in the default directory.
    df_merged = merge_checkpoint_batches()

    # Nothing merged -> nothing to summarise.
    if df_merged is not None:
        print("\n=== Classification Summary ===")
        for col in ('sentiment', 'vehicle_type', 'user_experience', 'usage_profile',
                    'is_pain_point', 'is_feature_request', 'competitor_mentioned'):
            # Columns that are absent from the merged data are skipped.
            if col not in df_merged.columns:
                continue
            print("\n" + col.replace('_', ' ').title() + " distribution:")
            print(df_merged[col].value_counts())

Found 14 batch files:
  - batch_1_of_19.csv
  - batch_2_of_19.csv
  - batch_3_of_19.csv
  - batch_4_of_19.csv
  - batch_5_of_19.csv
  - batch_6_of_19.csv
  - batch_7_of_19.csv
  - batch_8_of_19.csv
  - batch_9_of_19.csv
  - batch_10_of_19.csv
  - batch_11_of_19.csv
  - batch_12_of_19.csv
  - batch_13_of_19.csv
  - batch_14_of_19.csv
Added batch 1 (1000 reviews)
Added batch 2 (1000 reviews)
Added batch 3 (1000 reviews)
Added batch 4 (1000 reviews)
Added batch 5 (1000 reviews)
Added batch 6 (1000 reviews)
Added batch 7 (1000 reviews)
Added batch 8 (1000 reviews)
Added batch 9 (1000 reviews)
Added batch 10 (1000 reviews)
Added batch 11 (1000 reviews)
Added batch 12 (1000 reviews)
Added batch 13 (1000 reviews)
Added batch 14 (1000 reviews)
Successfully merged 14000 total reviews
Saved merged results to: bmw_app_analysis/results/merged_classification.csv

=== Classification Summary ===

Sentiment distribution:
sentiment
positive    6422
negative    6398
neutral     1179
Name: count, dtype: 

In [2]:
# Reload the merged classification results from the CSV that
# merge_checkpoint_batches() writes by default.
df_classified = pd.read_csv("bmw_app_analysis/results/merged_classification.csv")

In [4]:
# Directory that the classified-results CSV is written to at the end of this cell.
RESULTS_DIR = "bmw_app_analysis/results"

def analyze_classification_results(df_classified, display_specific_review=None):
    """
    Analyze and display classification results with summary statistics
    and optionally show specific reviews.

    Columns that are expected but absent from ``df_classified`` are reported
    once and then skipped, so a partially classified frame can still be
    inspected.

    Args:
        df_classified: DataFrame with classified reviews
        display_specific_review: Positional index of a specific review to
            display (optional)

    Returns:
        None. Everything is printed / displayed.
    """
    # Import display from IPython if available, otherwise use print as fallback
    try:
        from IPython.display import display
    except ImportError:
        # Define a simple display function if not in a notebook environment
        def display(obj):
            print(obj)

    if df_classified is None or len(df_classified) == 0:
        print("No classification results to analyze.")
        return

    # Define the display columns for consistency
    expected_columns = [
        'content_english', 'sentiment', 'topics',
        'is_pain_point', 'is_feature_request', 'competitor_mentioned',
        'vehicle_type', 'user_experience', 'usage_profile'
    ]
    display_columns = [col for col in expected_columns if col in df_classified.columns]
    missing_columns = [col for col in expected_columns if col not in df_classified.columns]
    if missing_columns:
        print(f"Note: skipping columns not present in the data: {', '.join(missing_columns)}")

    # If a specific review is requested, display it first
    if display_specific_review is not None:
        try:
            review_idx = int(display_specific_review)
        except (TypeError, ValueError):
            print("Please provide a valid review index (number).")
        else:
            if 0 <= review_idx < len(df_classified):
                print(f"\n=== Review #{review_idx+1} (Index {review_idx}) ===")
                selected_review = df_classified.iloc[review_idx]
                display(selected_review[display_columns])

                print(f"\n=== Review #{review_idx+1} (Detailed) ===")
                for col in display_columns:
                    print(f"{col}: {selected_review[col]}")
            else:
                print(f"Review index {review_idx} is out of range (0-{len(df_classified)-1}).")

    # Display sample of classified reviews (first 5 by default)
    print("\n=== Sample Classified Reviews ===")
    display(df_classified[display_columns].head())

    # Display classification summary statistics
    print(f"\n=== Classification Summary ({len(df_classified)} reviews) ===")

    # Helper function to show value distributions with percentages
    def show_distribution(df, column):
        counts = df[column].value_counts()
        # Same figures as value_counts(normalize=True), without a second pass
        percentages = counts / counts.sum() * 100
        distribution = pd.DataFrame({
            'Count': counts,
            'Percentage': percentages.round(1)
        })
        print(f"\n--- {column.replace('_', ' ').title()} Distribution ---")
        display(distribution)

    # Show distributions for each classification column that exists
    for column in ('sentiment', 'vehicle_type', 'user_experience', 'usage_profile',
                   'is_pain_point', 'is_feature_request', 'competitor_mentioned'):
        if column in df_classified.columns:
            show_distribution(df_classified, column)

    # For topics, we need to split and count individually
    if 'topics' in df_classified.columns:
        print("\n--- Topics Distribution ---")
        # str() guards against non-string cells (e.g. numbers read from CSV)
        all_topics = [topic.strip() for topics_str in df_classified['topics'].dropna()
                      for topic in str(topics_str).split(',') if topic.strip()]
        topic_counts = pd.Series(all_topics).value_counts()
        display(topic_counts)

    print("\n=== Review Lookup ===")
    print("To view a specific review, run: analyze_classification_results(df_classified, review_index)")
    print("Where review_index is the index of the review you want to see (0 to", len(df_classified)-1, ")")

# Persist the classified reviews and print the summary; skipped entirely when
# no classification results are loaded.
if df_classified is not None:
    # Save results to CSV
    # NOTE(review): `os` is not imported in this cell - it relies on the
    # `import os` executed in an earlier cell of the notebook.
    output_file = os.path.join(RESULTS_DIR, "bmw_reviews_classified.csv")
    df_classified.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")
    
    # Show initial summary statistics
    analyze_classification_results(df_classified)

Results saved to bmw_app_analysis/results/bmw_reviews_classified.csv

=== Sample Classified Reviews ===


Unnamed: 0,content_english,sentiment,topics,is_pain_point,is_feature_request,competitor_mentioned,vehicle_type,user_experience,usage_profile
0,BEWARE!! absolutely useless. Everytime i locke...,negative,"connectivity, remote controls",yes,no,none,unclear,unclear,casual_user
1,my bmw is just best super to drive so comforta...,positive,"performance, mobile features",no,no,none,unclear,unclear,casual_user
2,Great app for keeping track of your BMW.,positive,vehicle status,no,no,none,unclear,unclear,casual_user
3,I love the fact that it is free to download an...,positive,"charging management, remote controls, vehicle ...",no,no,none,ev_hybrid,unclear,casual_user
4,"no limit for charging, or timer",negative,"charging management, ev-specific features",yes,yes,none,ev_hybrid,unclear,casual_user



=== Classification Summary (14000 reviews) ===

--- Sentiment Distribution ---


Unnamed: 0_level_0,Count,Percentage
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1
positive,6422,45.9
negative,6398,45.7
neutral,1179,8.4



--- Vehicle Type Distribution ---


Unnamed: 0_level_0,Count,Percentage
vehicle_type,Unnamed: 1_level_1,Unnamed: 2_level_1
unclear,11965,85.5
ev_hybrid,1589,11.4
combustion,445,3.2



--- User Experience Distribution ---


Unnamed: 0_level_0,Count,Percentage
user_experience,Unnamed: 1_level_1,Unnamed: 2_level_1
unclear,8817,63.0
experienced_user,4918,35.1
new_user,264,1.9



--- Usage Profile Distribution ---


Unnamed: 0_level_0,Count,Percentage
usage_profile,Unnamed: 1_level_1,Unnamed: 2_level_1
casual_user,7945,56.8
unclear,3577,25.6
power_user,2477,17.7



--- Is Pain Point Distribution ---


Unnamed: 0_level_0,Count,Percentage
is_pain_point,Unnamed: 1_level_1,Unnamed: 2_level_1
yes,7806,55.8
no,6193,44.2



--- Is Feature Request Distribution ---


Unnamed: 0_level_0,Count,Percentage
is_feature_request,Unnamed: 1_level_1,Unnamed: 2_level_1
no,10680,76.3
yes,3319,23.7



--- Competitor Mentioned Distribution ---


Unnamed: 0_level_0,Count,Percentage
competitor_mentioned,Unnamed: 1_level_1,Unnamed: 2_level_1
none,13423,95.9
bmw,104,0.7
tesla,45,0.3
mercedes,42,0.3
samsung,40,0.3
...,...,...
korean manufacturers,1,0.0
ios,1,0.0
tomtom,1,0.0
chevy,1,0.0



--- Topics Distribution ---


performance                                4211
vehicle status                             3601
ui/ux                                      3579
connectivity                               2713
other                                      2211
updates                                    1880
authentication                             1641
remote controls                            1612
mobile features                            1227
ev-specific features                        876
bmw connected ecosystem                     874
charging management                         761
map/navigation                              648
customer support                            464
trip planning                               456
usage statistics                            372
bmw digital premium                         320
service & maintenance                       320
notification management                     278
digital key/mobile key                      254
localization & language                 


=== Review Lookup ===
To view a specific review, run: analyze_classification_results(df_classified, review_index)
Where review_index is the index of the review you want to see (0 to 13999 )


In [26]:
# bmw_topic_cards.py  –  v6  (2025‑05‑02)
# ================================================================
# • Stratified random sampling (mirrors topic sentiment mix)
# • One paragraph summary  +  “‑ ” negative bullets  +  “+ ” positive bullets
# • Prompts stored inside TopicSummary.prompts  (audit / debugging)
# • No JSON parsing, so Ollama chatter can’t break the pipeline
# ---------------------------------------------------------------

from __future__ import annotations

import html
import logging
import subprocess
import textwrap
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from typing import Callable, Dict, List, Optional

import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk
import pandas as pd
from IPython.display import HTML, display
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm.auto import tqdm

# -------------------------------------------------------------------
#  CONFIGURATION
# -------------------------------------------------------------------
TOP_N_TOPICS        = 15          # how many topics to visualise
PROMPT_SAMPLE_SIZE  = 300         # reviews fed to each LLM prompt
N_KEYWORDS          = 8           # most frequent single tokens kept per topic
N_PHRASES           = 5           # most frequent bigrams kept per topic
MAX_BULLETS         = 7           # cap on issue / positive bullets (prompt and parser)
BMW_BLUE            = "#0066B1"   # accent colour substituted into the card CSS (--blue)

# Inline SVG for a single gold star; TopicSummary.stars repeats it once per star.
STAR_SVG = (
    "<svg width='14' height='14' viewBox='0 0 24 24' "
    "xmlns='http://www.w3.org/2000/svg' style='vertical-align:-2px'>"
    "<polygon fill='#FFD700' "
    "points='12 2 15.09 8.26 22 9.27 17 14.14 "
    "18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26'/></svg>"
)

# -------------------------------------------------------------------
#  NLTK – bootstrap stop‑words
# -------------------------------------------------------------------
# NLTK stores each resource under a category directory, so the lookup path
# differs per resource: stop-words are a corpus, the punkt models are
# tokenizers. Looking up "corpora/punkt" never succeeds and forces a download
# attempt on every run.
# `punkt_tab` is the tokenizer data that newer NLTK releases load for
# word_tokenize(); a failed download of it is non-fatal (quiet=True).
for _nltk_path, _nltk_package in (
    ("corpora/stopwords", "stopwords"),
    ("tokenizers/punkt", "punkt"),
    ("tokenizers/punkt_tab", "punkt_tab"),
):
    try:
        nltk.data.find(_nltk_path)
    except LookupError:
        nltk.download(_nltk_package, quiet=True)

# English stop-words plus domain words that would otherwise dominate every
# topic's keyword list.
STOPWORDS = set(stopwords.words("english")) | {
    "bmw", "app", "car", "please", "would", "also", "get", "use", "using"
}

# -------------------------------------------------------------------
#  OLLAMA RUNNER  (temperature 0 for determinism)
# -------------------------------------------------------------------
def _subprocess_runner(prompt: str, model: str) -> str:
    proc = subprocess.run(
        ["ollama", "run", model, "-t", "0"],
        input=prompt,
        text=True,
        capture_output=True,
        check=False,
    )
    return proc.stdout

# -------------------------------------------------------------------
#  STRATIFIED SAMPLING  (mirror sentiment mix)
# -------------------------------------------------------------------
def _stratified_sample(df: pd.DataFrame, size: int, seed: int = 0) -> List[str]:
    dist = df["sentiment"].value_counts(normalize=True)
    quota = {s: int(round(dist.get(s, 0) * size)) for s in ("negative", "positive", "neutral")}
    diff = size - sum(quota.values())
    if diff:
        quota[max(dist, key=dist.get, default="negative")] += diff

    col = "content_english" if "content_english" in df.columns else "content"
    texts: List[str] = []

    for sentiment, n in quota.items():
        if n <= 0:
            continue
        bucket = df[df["sentiment"] == sentiment]
        sample_n = min(n, len(bucket))
        texts.extend(bucket.sample(sample_n, random_state=seed)[col].tolist())

    # top‑up if any bucket ran short
    if len(texts) < size:
        short = size - len(texts)
        remainder = df.drop(df.index[df[col].isin(texts)]).sample(short, random_state=seed)
        texts.extend(remainder[col].tolist())

    return texts

# -------------------------------------------------------------------
#  BULLET‑PARSER
# -------------------------------------------------------------------
def _parse_bullets(text: str, prefix: str) -> List[str]:
    """Return list of lines starting with the prefix (‘- ’ or ‘+ ’)."""
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    bullets = [ln[len(prefix):].strip() for ln in lines if ln.startswith(prefix)]
    # fallback to comma‑sep if model ignored prefixes
    if not bullets and "," in text:
        bullets = [part.strip() for part in text.split(",") if part.strip()]
    return bullets[:MAX_BULLETS]

# -------------------------------------------------------------------
#  DATACLASS
# -------------------------------------------------------------------
@dataclass
class TopicSummary:
    """Everything shown on one topic card, plus the prompts that produced it."""

    summary: str                      # paragraph returned by the "summary" prompt
    issues: List[str]                 # parsed bullets from the "issues" prompt
    positives: List[str]              # parsed bullets from the "positives" prompt
    avg_rating: float                 # mean score of the topic's reviews
    review_count: int                 # number of reviews in the topic
    sentiment_dist: Dict[str, float]  # sentiment label -> share of reviews
    top_keywords: List[str]
    top_phrases: List[str]
    prompts: Dict[str, str] = field(default_factory=dict)

    @property
    def stars(self) -> str:
        """Star icons for ``avg_rating``: one per whole point, plus one more
        when the fractional part is at least 0.5."""
        whole = int(self.avg_rating)
        rounds_up = (self.avg_rating - whole) >= 0.5
        if rounds_up:
            return STAR_SVG * (whole + 1)
        return STAR_SVG * whole

# -------------------------------------------------------------------
#  MAIN ENTRY
# -------------------------------------------------------------------
def _split_topics(value) -> List[str]:
    """Split one comma-separated ``topics`` cell into clean, unique topic names.

    Names are stripped and lower-cased so that "a, b" and "a,b" (and case
    variants) refer to the same topics; non-string cells (NaN) yield ``[]``.
    """
    if not isinstance(value, str):
        return []
    names = (part.strip().lower() for part in value.split(","))
    return list(dict.fromkeys(name for name in names if name))


def _build_topic_prompt(kind: str, topic: str, ctx_block: str, sample_blob: str) -> str:
    """Assemble the prompt for one task: 'summary', 'issues' or 'positives'."""
    header = (
        "You are an analytical assistant. Follow ALL rules:\n"
        f"• Focus **only** on the topic '{topic}'.\n"
        "• Ignore unrelated features.\n"
        "• Use English even if reviews were translated.\n"
        "• Be factual, concise (temperature 0).\n"
    )
    if kind == "summary":
        body = (
            "Write one concise paragraph (3‑5 sentences) that "
            "summarises what users say about this topic, covering "
            "both pain points and praise."
        )
    else:
        label = "negative issues" if kind == "issues" else "positive aspects"
        prefix = "-" if kind == "issues" else "+"
        body = (
            f"List up to {MAX_BULLETS} main {label}. "
            f"Each line MUST start with '{prefix} '. "
            "Use short noun phrases, no duplication, no periods."
        )

    return f"{header}\n\nCONTEXT:\n{ctx_block}\n\nSAMPLES:\n{sample_blob}\n\nTASK:\n{body}"


def create_topic_cards_from_classified(
    df: pd.DataFrame,
    *,
    ollama_model_name: str = "gemma3:12b",
    ollama_runner: Callable[[str, str], str] = _subprocess_runner,
) -> Dict[str, dict]:
    """
    Render topic cards & a rating chart for the most frequent topics.

    Args:
        df: Classified reviews. Needs 'topics' (comma-separated), 'sentiment'
            and a text column ('content_english' or 'content'). When 'score'
            is absent it is derived from the sentiment on a copy; the
            caller's frame is not modified.
        ollama_model_name: Model name handed to ``ollama_runner``.
        ollama_runner: Callable ``(prompt, model) -> str`` that runs the model.

    Returns:
        dict[topic] -> dict of the ``TopicSummary`` fields for that topic
        (the instances' ``__dict__``); prompts are under the 'prompts' key.

    Raises:
        ValueError: If a required column is missing.
    """
    log = _setup_logger()

    # -------- sanity checks -----------------------------------------
    if {"topics", "sentiment"} - set(df.columns):
        raise ValueError("DataFrame must include 'topics' & 'sentiment'.")

    text_col = "content_english" if "content_english" in df.columns else "content"
    if text_col not in df.columns:
        raise ValueError("DataFrame must include 'content_english' or 'content'.")

    if "score" not in df.columns:
        # assign() returns a new frame, leaving the caller's DataFrame untouched
        df = df.assign(
            score=df["sentiment"]
            .map({"positive": 4.5, "neutral": 3.0, "negative": 1.5})
            .fillna(3.0)
        )

    # -------- pick top topics --------------------------------------
    # Topics are stored as "a, b, c"; split and strip them explicitly so that
    # " b" and "b" are not counted as two different topics.
    topic_lists = df["topics"].map(_split_topics)
    topics_freq = Counter(name for names in topic_lists for name in names)
    top_topics = [name for name, _ in topics_freq.most_common(TOP_N_TOPICS)]

    results: Dict[str, TopicSummary] = {}

    for topic in tqdm(top_topics, desc="Topics"):
        # Exact membership instead of a regex built from the topic text
        # (topic names may contain regex metacharacters such as '/', '&', '+').
        in_topic = [topic in names for names in topic_lists]
        sub = df[in_topic]
        if sub.empty:
            continue

        avg = sub["score"].mean()
        mix = sub["sentiment"].value_counts(normalize=True).to_dict()

        tokens = word_tokenize(" ".join(sub[text_col].fillna("").str.lower()))
        tokens = [t for t in tokens if t.isalpha() and t not in STOPWORDS and len(t) > 2]
        keywords = [w for w, _ in Counter(tokens).most_common(N_KEYWORDS)]
        bigrams = [f"{a} {b}" for a, b in zip(tokens, tokens[1:])]
        phrases = [p for p, _ in Counter(bigrams).most_common(N_PHRASES)]

        sample_n = min(PROMPT_SAMPLE_SIZE, len(sub))
        sample_texts = _stratified_sample(sub, sample_n)
        sample_blob = "\n".join(f"[{i}] {txt}" for i, txt in enumerate(sample_texts))

        ctx_block = (
            f"Average score {avg:.2f} (n={len(sub)}). "
            f"Sentiment mix: neg {mix.get('negative',0):.0%}, "
            f"pos {mix.get('positive',0):.0%}, "
            f"neu {mix.get('neutral',0):.0%}.\n"
            f"Top keywords: {', '.join(keywords)}.\n"
            f"Top phrases: {', '.join(phrases)}."
        )

        # One prompt per task; kept so they can be audited afterwards
        prompts: Dict[str, str] = {
            kind: _build_topic_prompt(kind, topic, ctx_block, sample_blob)
            for kind in ("summary", "issues", "positives")
        }

        # ----- call Ollama (three prompts in parallel) ----------------
        with ThreadPoolExecutor(max_workers=3) as pool:
            futures = {
                kind: pool.submit(ollama_runner, prompt, ollama_model_name)
                for kind, prompt in prompts.items()
            }
            raw = {kind: (fut.result() or "").strip() for kind, fut in futures.items()}

        for kind, answer in raw.items():
            if not answer:
                log.warning("Empty '%s' response from the model for topic '%s'", kind, topic)

        summary_text = raw["summary"]
        issues_list  = _parse_bullets(raw["issues"], "-")
        pos_list     = _parse_bullets(raw["positives"], "+")

        results[topic] = TopicSummary(
            summary=summary_text,
            issues=issues_list,
            positives=pos_list,
            avg_rating=avg,
            review_count=len(sub),
            sentiment_dist=mix,
            top_keywords=keywords,
            top_phrases=phrases,
            prompts=prompts,
        )

    # ------------- render & plot -------------------------------------
    _render_cards(results)
    _plot_chart(results, df)
    return {k: v.__dict__ for k, v in results.items()}

# -------------------------------------------------------------------
#  VISUAL HELPER – HTML CARDS
# -------------------------------------------------------------------
def _render_cards(data: Dict[str, TopicSummary]) -> None:
    """Display one HTML card per topic (largest topic first, 'other' omitted).

    Each card shows the review count, average rating with stars, a sentiment
    bar, the summary paragraph, issue / positive lists and keyword / phrase
    chips. All text taken from the summaries is HTML-escaped.
    """
    stylesheet = textwrap.dedent(
        """
        <style>
          :root {{--blue:{blue};--neg:#E53935;--pos:#43A047;--neu:#FFA000;
                 --bg:#fff;--text:#1a1a1a;}}
          @media (prefers-color-scheme: dark) {{
              :root {{--bg:#121212;--text:#e0e0e0;}}
          }}
          body {{background:var(--bg);color:var(--text);}}

/* container */
          .cards {{max-width:1200px;margin:0 auto;font-family:'Helvetica Neue',Arial,sans-serif;}}

/* card */
          .card {{
              display:grid;grid-template-columns:220px 1fr;
              border:1px solid #bbb;border-radius:8px;margin:18px 0;
              overflow:hidden;box-shadow:0 4px 10px rgba(0,0,0,.12);
              animation:fadeIn .4s ease;
          }}
          @keyframes fadeIn {{from {{opacity:0;transform:translateY(8px);}}
                              to   {{opacity:1;transform:translateY(0);}}}}
          header {{
              grid-column:1/-1;background:linear-gradient(135deg,var(--blue) 0%,#0088cc 100%);
              color:#fff;padding:12px 18px;font-size:20px;font-weight:600;
          }}

/* sidebar */
          .side {{
              background:#f5f7fa;border-right:1px solid #ddd;
              padding:14px;display:flex;flex-direction:column;gap:14px;
          }}
          .stat .num {{font-weight:700;font-size:19px;line-height:1;color:var(--text);}}
          .stat .label{{font-size:11px;text-transform:uppercase;font-weight:600;
                       letter-spacing:.4px;color:var(--text);}}
          .bar {{display:flex;height:10px;border-radius:4px;overflow:hidden;margin-top:4px;}}
          .seg {{flex-shrink:0;transition:opacity .2s;}} .seg:hover {{opacity:.75;}}
          .neg{{background:var(--neg);}} .pos{{background:var(--pos);}} .neu{{background:var(--neu);}}

/* main */
          .main {{padding:18px;line-height:1.6;}}
          .summary {{
              background:rgba(0,102,177,.07);border-left:3px solid var(--blue);
              padding:10px;margin-bottom:16px;line-height:1.6;
          }}
          .cols {{display:flex;flex-wrap:wrap;gap:24px;margin-bottom:16px;}}
          h4 {{margin:0 0 6px;color:var(--blue);font-weight:600;}}
          ul {{margin:0;padding-left:18px;}}
          li {{margin:4px 0;}}

/* chips */
          .chips {{margin-bottom:10px;}}
          .chip {{
              display:inline-block;padding:4px 10px;border-radius:20px;
              font-size:11px;font-weight:700;margin:2px;cursor:pointer;
              transition:opacity .2s;
          }}
          .chip:hover {{opacity:.8;}}
          .kw{{background:#e3f2fd;color:var(--blue);}}
          .ph{{background:#e8f5e9;color:#2e7d32;border:1px solid #c8e6c9;}}

/* responsive */
          @media (max-width:768px){{
              .card{{grid-template-columns:1fr}}
              .side{{order:2;border:none;border-top:1px solid #ddd;
                    flex-direction:row;justify-content:space-around}}
          }}
        </style>
        """
    ).format(blue=BMW_BLUE)

    def _chips(values: List[str], variant: str) -> str:
        # One escaped <span> chip per value, concatenated without separators
        return "".join(
            f"<span class='chip {variant}'>{html.escape(value)}</span>" for value in values
        )

    def _list_items(values: List[str], fallback: str) -> str:
        # Escaped <li> items, or the fallback item when the list is empty
        return "".join(f"<li>{html.escape(value)}</li>" for value in values) or fallback

    parts: List[str] = [stylesheet, "<div class='cards'>"]

    # largest topics first
    ranked = sorted(data.items(), key=lambda pair: pair[1].review_count, reverse=True)
    for topic, info in ranked:
        # the catch-all bucket gets no card
        if topic.lower() == "other":
            continue

        sentiment_bar = "".join(
            _bar_seg(label, share) for label, share in info.sentiment_dist.items()
        )
        keyword_chips = _chips(info.top_keywords, "kw")
        phrase_chips = _chips(info.top_phrases, "ph")
        issue_items = _list_items(info.issues, "<li>No major issues</li>")
        positive_items = _list_items(info.positives, "<li>No specific positives</li>")

        sidebar = "".join([
            "<section class='side'>",
            f"<div class='stat'><span class='num'>{info.review_count}</span><span class='label'>Reviews</span></div>",
            f"<div class='stat'><span class='num'>{info.avg_rating:.2f}/5 {info.stars}</span>",
            "<span class='label'>Average</span></div>",
            f"<div class='stat'><span class='label'>Sentiment</span><div class='bar'>{sentiment_bar}</div></div>",
            "</section>",
        ])

        main = "".join([
            "<section class='main'>",
            f"<p class='summary'>{html.escape(info.summary)}</p>",
            "<div class='cols'>",
            f"<div><h4>Issues</h4><ul>{issue_items}</ul></div>",
            f"<div><h4>Positives</h4><ul>{positive_items}</ul></div>",
            "</div>",
            f"<div class='chips'><h4>Keywords</h4>{keyword_chips}</div>",
            f"<div class='chips'><h4>Phrases</h4>{phrase_chips}</div>",
            "</section>",
        ])

        parts.append(
            "<div class='card'>"
            + f"<header>{html.escape(topic)}</header>"
            + sidebar
            + main
            + "</div>"
        )

    parts.append("</div>")
    display(HTML("\n".join(parts)))

def _bar_seg(name: str, frac: float) -> str:
    cls = "pos" if name == "positive" else "neg" if name == "negative" else "neu"
    pct = max(frac * 100, 1)
    return f"<span class='seg {cls}' style='width:{pct}%' title='{name}: {frac:.1%}'></span>"

# -------------------------------------------------------------------
#  VISUAL HELPER – MATPLOTLIB CHART
# -------------------------------------------------------------------
def _plot_chart(data: Dict[str, TopicSummary], df: pd.DataFrame) -> None:
    """Draw a horizontal bar chart of the average rating per topic.

    Topics are ordered by ascending rating, 'other' is left out, every bar is
    annotated with its review count, and a dashed vertical line marks the mean
    of ``df["score"]``. Nothing is drawn when no topic remains.
    """
    entries = [
        (name, item.avg_rating, item.review_count)
        for name, item in data.items()
        if name.lower() != "other"
    ]
    if not entries:
        return

    # stable sort on the rating keeps the incoming order for equal ratings
    entries.sort(key=lambda entry: entry[1])
    topics = [entry[0] for entry in entries]
    ratings = [entry[1] for entry in entries]
    counts = [entry[2] for entry in entries]

    palette = mpl.colormaps.get_cmap("viridis").resampled(len(topics))
    bar_colours = [palette(position) for position in range(len(topics))]

    plt.figure(figsize=(12, 10))
    bars = plt.barh(topics, ratings, color=bar_colours)

    for bar, review_total, rating in zip(bars, counts, ratings):
        # short bars: dark label to the right of the bar; long bars: white label inside
        if rating < 2.5:
            label_x, label_colour = rating + 0.05, "black"
        else:
            label_x, label_colour = rating - 0.6, "white"
        plt.text(label_x, bar.get_y() + bar.get_height() / 2, f"n={review_total}",
                 va="center", color=label_colour, fontweight="bold")

    overall = df["score"].mean()
    plt.axvline(overall, linestyle="--", linewidth=2, label=f"Overall {overall:.2f}")
    plt.xticks(range(1, 6), ["★" * stars for stars in range(1, 6)])
    plt.xlim(0, 5.2)
    plt.xlabel("Star Rating")
    plt.ylabel("Topic")
    plt.title("BMW App – Average Rating by Topic")
    plt.legend()
    plt.grid(axis="x", linestyle="--", alpha=0.5)
    plt.tight_layout()
    plt.show()

# -------------------------------------------------------------------
#  MISC HELPERS
# -------------------------------------------------------------------
def _setup_logger() -> logging.Logger:
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
    return logging.getLogger("bmw_topic_cards")

In [27]:
# Create visual topic cards
# (renders the HTML cards and the rating chart as a side effect; the returned
# dict is keyed by topic name)
topic_summaries = create_topic_cards_from_classified(df_classified)

Topics:   0%|          | 0/15 [00:00<?, ?it/s]

TypeError: '>' not supported between instances of 'NoneType' and 'NoneType'