Setup and Imports

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Add parent directory to path for imports
sys.path.append(os.path.dirname(os.path.abspath('')))

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, classification_report,
    confusion_matrix
)

# Import local modules
from utils.preprocess import (
    normalize_case, clean_punct_and_numbers, tokenize,
    remove_stopwords, lemmatize, stem_tokens, identity,
    preprocess_text, get_vectorizer
)
from llm.gpt_zero_shot_classifier import classify_zero_shot, classify_zero_shot_batch
from llm.mistral_classifier import classify_mistral, classify_mistral_batch

# Dataset file and output directory for serialized models (both relative paths).
DATA_PATH = os.path.join("data", "klikšķēsma.txt")
MODELS_DIR = "models"
os.makedirs(MODELS_DIR, exist_ok=True)  # no error if the directory already exists

# Plot styling: the matplotlib style is applied first, the seaborn palette second.
# NOTE(review): presumably the order matters (applying the style later could
# override the palette) - keep the two calls in this order.
plt.style.use('default')
sns.set_palette("husl")

print("Setup complete!")

Load and Analyze Data

In [None]:
# Read the tab-separated file: each row holds a headline and its numeric class label.
df = pd.read_csv(DATA_PATH, sep="\t", names=["title", "label"], encoding="utf-8")

n_rows = len(df)
print(f"Total samples: {n_rows}")
assert n_rows == 4930, f"Expected 4930 samples, got {n_rows}"

# Absolute and relative class frequencies, ordered by label.
print("\nClass distribution:")
print(df['label'].value_counts().sort_index())
print("\nClass percentages:")
label_share = df['label'].value_counts(normalize=True).sort_index()
print((label_share * 100).round(1))

# Compare the observed per-class counts with the expected ones.
expected_counts = {1: 3306, 2: 440, 3: 1184}
actual_counts = df['label'].value_counts().to_dict()
print("\nData validation:")
for label in expected_counts:
    expected = expected_counts[label]
    actual = actual_counts.get(label, 0)
    status = "✗" if expected != actual else "✓"
    print(f"Class {label}: Expected {expected}, Got {actual} {status}")

# Print three reproducibly sampled headlines for every class.
class_names = {1: 'Nav klikšķēsma', 2: 'Daļēja klikšķēsma', 3: 'Ir klikšķēsma'}
separator = "=" * 80
print("\n" + separator)
print("Sample headlines by class:")
print(separator)
for label, class_name in class_names.items():
    print(f"\nClass {label} ({class_name}):")
    samples = df[df['label'] == label].sample(3, random_state=42)
    for idx, title in enumerate(samples['title'], start=1):
        print(f"  {idx}. {title}")

Text Analysis

In [None]:
# Add simple surface features of every headline to the frame.
df['word_count'] = df['title'].str.split().str.len()
df['char_count'] = df['title'].str.len()
df['has_number'] = df['title'].str.contains(r'\d+', regex=True)
df['has_exclamation'] = df['title'].str.contains('!')
# Raw string: '\?' in a plain literal is an invalid escape sequence
# (a warning today, an error in future Python versions).
df['has_question'] = df['title'].str.contains(r'\?')

# Statistical analysis by class
print("Statistical analysis by class:")
print("="*60)
# The aggregate is deliberately NOT rounded here: rounding the proportions to
# two decimals before multiplying by 100 made every percentage print as "x.0%".
# The format specs below do the only rounding.
stats = df.groupby('label').agg({
    'word_count': ['mean', 'std'],
    'char_count': ['mean', 'std'],
    'has_number': 'mean',
    'has_exclamation': 'mean',
    'has_question': 'mean'
})

for label in [1, 2, 3]:
    class_name = {1: 'Nav klikšķēsma', 2: 'Daļēja klikšķēsma', 3: 'Ir klikšķēsma'}[label]
    print(f"\nClass {label} ({class_name}):")
    print(f"  Average word count: {stats.loc[label, ('word_count', 'mean')]:.1f} (±{stats.loc[label, ('word_count', 'std')]:.1f})")
    print(f"  Average char count: {stats.loc[label, ('char_count', 'mean')]:.1f} (±{stats.loc[label, ('char_count', 'std')]:.1f})")
    print(f"  Contains numbers: {stats.loc[label, ('has_number', 'mean')]*100:.1f}%")
    print(f"  Contains exclamation: {stats.loc[label, ('has_exclamation', 'mean')]*100:.1f}%")
    print(f"  Contains question: {stats.loc[label, ('has_question', 'mean')]*100:.1f}%")

Data Visualization

In [None]:
# Four-panel overview of the dataset: word-count histogram, mean word count,
# surface-feature rates and the class distribution.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Clickbait Data Analysis', fontsize=16)

# Short display names for the three classes, in label order 1, 2, 3.
class_tick_labels = ['1-Nav', '2-Daļēja', '3-Ir']

# Word count distribution
for label in [1, 2, 3]:
    data = df[df['label'] == label]['word_count']
    axes[0, 0].hist(data, alpha=0.6, bins=20, label=f'Class {label}', edgecolor='black')
axes[0, 0].set_xlabel('Word Count')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Word Count Distribution by Class')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Average word count bar chart
avg_words = df.groupby('label')['word_count'].mean()
bars = axes[0, 1].bar(class_tick_labels, avg_words.values,
                      color=['green', 'orange', 'red'], alpha=0.7)
axes[0, 1].set_ylabel('Average Word Count')
axes[0, 1].set_title('Average Word Count by Class')
axes[0, 1].grid(True, alpha=0.3, axis='y')

# Annotate every bar with its value.
for bar in bars:
    height = bar.get_height()
    axes[0, 1].text(bar.get_x() + bar.get_width()/2., height,
                    f'{height:.1f}', ha='center', va='bottom')

# Feature presence comparison (grouped bars, one group per class)
features = ['has_number', 'has_exclamation', 'has_question']
feature_means = df.groupby('label')[features].mean()
x = np.arange(len(feature_means.index))
width = 0.25

for i, feature in enumerate(features):
    offset = (i - 1) * width  # centre the three bars on the class tick
    feature_label = {'has_number': 'Numbers',
                    'has_exclamation': 'Exclamation',
                    'has_question': 'Question'}[feature]
    axes[1, 0].bar(x + offset, feature_means[feature], width,
                   label=feature_label, alpha=0.8)

axes[1, 0].set_xlabel('Class')
axes[1, 0].set_ylabel('Proportion')
axes[1, 0].set_title('Feature Presence by Class')
axes[1, 0].set_xticks(x)
axes[1, 0].set_xticklabels(class_tick_labels)
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3, axis='y')

# Class distribution pie chart
class_counts = df['label'].value_counts().sort_index()
total_count = class_counts.sum()
colors = ['#90EE90', '#FFD700', '#FF6B6B']
explode = (0.05, 0.05, 0.05)
# Percentages are computed from the data instead of being hard-coded in the
# labels, so the chart stays correct if the dataset changes.
pie_labels = [
    f"{name}\n({count / total_count * 100:.1f}%)"
    for name, count in zip(class_tick_labels, class_counts.values)
]
# The wedge annotation shows the absolute sample count; the former '%d' format
# printed a truncated percentage that merely repeated the label.
axes[1, 1].pie(class_counts.values, labels=pie_labels,
               autopct=lambda pct: f"{int(round(pct * total_count / 100))}",
               startangle=90, colors=colors, explode=explode)
axes[1, 1].set_title('Class Distribution')

plt.tight_layout()
plt.show()

Data Preprocessing

In [None]:
# Stratified 70/30 split on the raw headlines.
X_train, X_test, y_train, y_test = train_test_split(
    df["title"],
    df["label"],
    test_size=0.3,
    random_state=42,
    stratify=df["label"]
)

print(f"Train samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
assert len(X_test) == 1479, f"Expected 1479 test samples, got {len(X_test)}"

# Preprocessing options, kept in one place so that train and test are always
# processed identically. The Save Results cell also stores this dict.
preprocessing_config = {"casefold": False, "morpho_method": "lemmatize"}

# Preprocess texts
print("\nPreprocessing texts...")
X_train_clean = X_train.apply(lambda t: preprocess_text(t, **preprocessing_config))
X_test_clean = X_test.apply(lambda t: preprocess_text(t, **preprocessing_config))

# Vectorizer options; also stored by the Save Results cell.
vectorizer_config = {
    "max_features": 1000,
    "ngram_range": (1, 2),
    "min_df": 2,
    "sublinear_tf": True,
}

# Vectorize with TF-IDF: fit on the training split only, then reuse for the test split.
print("\nVectorizing with TF-IDF...")
print("Parameters: max_features=1000, ngram_range=(1,2), min_df=2, sublinear_tf=True")
vectorizer = get_vectorizer(**vectorizer_config)
X_train_vec = vectorizer.fit_transform(X_train_clean)
X_test_vec = vectorizer.transform(X_test_clean)

print(f"\nTF-IDF matrix shapes:")
print(f"Train: {X_train_vec.shape}")
print(f"Test: {X_test_vec.shape}")

Train Classical Models

In [None]:
# Classical classifiers to compare, keyed by the name used for reporting and
# for the saved model file.
classifiers = {
    "logistic_regression": LogisticRegression(max_iter=1000, random_state=42),
    "naive_bayes": MultinomialNB(),
    "svm": LinearSVC(random_state=42, max_iter=2000),
    "random_forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "knn": KNeighborsClassifier(n_neighbors=5)
}

# Train and evaluate every classifier on the same TF-IDF features.
results = {}
print("\nTraining Classical Models...")
print("="*80)
header = f"{'Model':<20} {'Acc':<6} {'Prec':<6} {'Rec':<6} {'F1':<6} {'ROC-AUC':<8}"
print(header)
print("-" * 80)

for name, clf in classifiers.items():
    # Train model
    clf.fit(X_train_vec, y_train)
    y_pred = clf.predict(X_test_vec)

    # Macro-averaged metrics; zero_division=0 is passed to all three
    # (f1 previously lacked it) so an unpredicted class counts as 0.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average="macro", zero_division=0)
    rec = recall_score(y_test, y_pred, average="macro", zero_division=0)
    f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)

    # ROC-AUC needs per-class scores; it stays None when the model offers none.
    roc_auc = None
    if hasattr(clf, "predict_proba"):
        y_score = clf.predict_proba(X_test_vec)
        roc_auc = roc_auc_score(y_test, y_score, multi_class="ovo", average="macro")
    elif hasattr(clf, "decision_function"):
        y_score = clf.decision_function(X_test_vec)
        if len(y_score.shape) > 1:
            # Softmax turns the decision scores into rows that sum to 1.
            # Subtracting the row maximum first avoids overflow in exp()
            # without changing the result.
            shifted = y_score - np.max(y_score, axis=1, keepdims=True)
            exp_scores = np.exp(shifted)
            y_score = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
            roc_auc = roc_auc_score(y_test, y_score, multi_class="ovo", average="macro")

    # Store results
    results[name] = {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1': f1,
        'roc_auc': roc_auc,
        'predictions': y_pred,
        'model': clf
    }

    # Explicit None check: a legitimate score of 0.0 must not print as "N/A".
    roc_str = f"{roc_auc:.3f}" if roc_auc is not None else "N/A"
    print(f"{name:<20} {acc:<6.3f} {prec:<6.3f} {rec:<6.3f} {f1:<6.3f} {roc_str:<8}")

    # Save model
    joblib.dump(clf, os.path.join(MODELS_DIR, f"{name}.joblib"))

# Save vectorizer
joblib.dump(vectorizer, os.path.join(MODELS_DIR, "tfidf_vectorizer.joblib"))
print("\nModels and vectorizer saved to:", MODELS_DIR)

Text Analysis

In [None]:
# Recompute the surface features of every headline and print per-class statistics.
df['word_count'] = df['title'].str.split().str.len()
df['char_count'] = df['title'].str.len()
df['has_number'] = df['title'].str.contains(r'\d+', regex=True)
df['has_exclamation'] = df['title'].str.contains('!')
# Raw string: '\?' in a plain literal is an invalid escape sequence
# (a warning today, an error in future Python versions).
df['has_question'] = df['title'].str.contains(r'\?')

# Statistical analysis by class
print("Statistical analysis by class:")
print("="*60)
# The aggregate is deliberately NOT rounded here: rounding the proportions to
# two decimals before multiplying by 100 made every percentage print as "x.0%".
# The format specs below do the only rounding.
stats = df.groupby('label').agg({
    'word_count': ['mean', 'std'],
    'char_count': ['mean', 'std'],
    'has_number': 'mean',
    'has_exclamation': 'mean',
    'has_question': 'mean'
})

for label in [1, 2, 3]:
    class_name = {1: 'Nav klikšķēsma', 2: 'Daļēja klikšķēsma', 3: 'Ir klikšķēsma'}[label]
    print(f"\nClass {label} ({class_name}):")
    print(f"  Average word count: {stats.loc[label, ('word_count', 'mean')]:.1f} (±{stats.loc[label, ('word_count', 'std')]:.1f})")
    print(f"  Average char count: {stats.loc[label, ('char_count', 'mean')]:.1f} (±{stats.loc[label, ('char_count', 'std')]:.1f})")
    print(f"  Contains numbers: {stats.loc[label, ('has_number', 'mean')]*100:.1f}%")
    print(f"  Contains exclamation: {stats.loc[label, ('has_exclamation', 'mean')]*100:.1f}%")
    print(f"  Contains question: {stats.loc[label, ('has_question', 'mean')]*100:.1f}%")

Results Visualization

In [None]:
# Two panels for the classical models: macro F1 alone (left) and all four
# metrics as grouped bars (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

models = list(results.keys())
model_titles = [m.replace('_', ' ').title() for m in models]

# Left panel: one bar per model, coloured along the viridis colormap.
f1_scores = [results[m]['f1'] for m in models]
colors = plt.cm.viridis(np.linspace(0, 1, len(models)))
bars = ax1.bar(range(len(models)), f1_scores, color=colors, alpha=0.8)
ax1.set_xticks(range(len(models)))
ax1.set_xticklabels(model_titles, rotation=45, ha='right')
ax1.set(ylabel='F1 Score', title='Classical Models F1 Score Comparison', ylim=(0, 1))
ax1.grid(True, alpha=0.3, axis='y')

# Write each score just above its bar.
for bar, score in zip(bars, f1_scores):
    label_x = bar.get_x() + bar.get_width()/2.
    label_y = bar.get_height() + 0.01
    ax1.text(label_x, label_y, f'{score:.3f}', ha='center', va='bottom')

# Right panel: four bars per model, centred on the model's tick.
metrics = ['accuracy', 'precision', 'recall', 'f1']
x = np.arange(len(models))
width = 0.2
for i, metric in enumerate(metrics):
    values = [results[m][metric] for m in models]
    ax2.bar(x + i*width - 1.5*width, values, width, label=metric.capitalize(), alpha=0.8)

ax2.set(xlabel='Models', ylabel='Score', title='All Metrics Comparison', ylim=(0, 1))
ax2.set_xticks(x)
ax2.set_xticklabels(model_titles, rotation=45, ha='right')
ax2.legend()
ax2.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

Best Model Analysis

In [None]:
# Pick the classical model with the highest macro F1 (first one wins on ties).
best_model_name = max(results, key=lambda name: results[name]['f1'])
best_entry = results[best_model_name]
best_model = best_entry['model']
best_predictions = best_entry['predictions']

best_title = best_model_name.replace('_', ' ').title()
class_labels = ['1-Nav', '2-Daļēja', '3-Ir']

print(f"Best Classical Model: {best_title}")
print("="*60)

# Per-class precision / recall / F1 on the test split.
print("\nClassification Report:")
print(classification_report(y_test, best_predictions, target_names=class_labels))

# Confusion matrix heatmap.
cm = confusion_matrix(y_test, best_predictions)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title(f'Confusion Matrix - {best_title}')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.tight_layout()
plt.show()

# Count the misclassified test headlines.
print("\nError Analysis:")
errors = y_test != best_predictions
error_count = errors.sum()
n_test = len(y_test)
print(f"Total errors: {error_count} out of {n_test} ({error_count/n_test*100:.1f}%)")

Results Visualization


In [None]:
# Performance comparison of the classical models (F1 on the left, all metrics on the right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

models = list(results.keys())
n_models = len(models)
tick_names = [m.replace('_', ' ').title() for m in models]

# --- F1 panel ---
f1_scores = [results[m]['f1'] for m in models]
colors = plt.cm.viridis(np.linspace(0, 1, n_models))
bars = ax1.bar(range(n_models), f1_scores, color=colors, alpha=0.8)

ax1.set_title('Classical Models F1 Score Comparison')
ax1.set_ylabel('F1 Score')
ax1.set_xticks(range(n_models))
ax1.set_xticklabels(tick_names, rotation=45, ha='right')
ax1.set_ylim(0, 1)
ax1.grid(True, alpha=0.3, axis='y')

# Value label slightly above each bar.
for bar, score in zip(bars, f1_scores):
    ax1.text(
        bar.get_x() + bar.get_width()/2.,
        bar.get_height() + 0.01,
        f'{score:.3f}',
        ha='center',
        va='bottom',
    )

# --- All-metrics panel: grouped bars, one group per model ---
metrics = ['accuracy', 'precision', 'recall', 'f1']
x = np.arange(n_models)
width = 0.2

for i, metric in enumerate(metrics):
    values = [results[m][metric] for m in models]
    ax2.bar(x + i*width - 1.5*width, values, width, label=metric.capitalize(), alpha=0.8)

ax2.set_title('All Metrics Comparison')
ax2.set_xlabel('Models')
ax2.set_ylabel('Score')
ax2.set_xticks(x)
ax2.set_xticklabels(tick_names, rotation=45, ha='right')
ax2.set_ylim(0, 1)
ax2.grid(True, alpha=0.3, axis='y')
ax2.legend()

plt.tight_layout()
plt.show()

Best Model Analysis

In [None]:
# Select the classical model with the top macro F1 and unpack its stored results.
best_model_name, best_entry = max(results.items(), key=lambda item: item[1]['f1'])
best_model = best_entry['model']
best_predictions = best_entry['predictions']

pretty_name = best_model_name.replace('_', ' ').title()
axis_labels = ['1-Nav', '2-Daļēja', '3-Ir']

print(f"Best Classical Model: {pretty_name}")
print("="*60)

# Text report with per-class metrics.
print("\nClassification Report:")
report = classification_report(y_test, best_predictions, target_names=axis_labels)
print(report)

# Heatmap of true vs. predicted labels.
cm = confusion_matrix(y_test, best_predictions)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=axis_labels,
    yticklabels=axis_labels,
)
plt.title(f'Confusion Matrix - {pretty_name}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

# Overall number and share of wrong predictions.
print("\nError Analysis:")
errors = y_test != best_predictions
error_count = errors.sum()
error_pct = error_count/len(y_test)*100
print(f"Total errors: {error_count} out of {len(y_test)} ({error_pct:.1f}%)")

LLM Evaluation

In [None]:
# The OpenAI key is read from the environment; without it the GPT run is skipped.
import openai

api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    print("Warning: OPENAI_API_KEY not set. GPT models will be skipped.")
    print("To set: export OPENAI_API_KEY='your-key-here'")
else:
    print("OpenAI API key found.")

# In demo mode only the first 50 test headlines are sent to the LLM.
DEMO_MODE = True
if not DEMO_MODE:
    test_headlines = X_test.tolist()
    y_test_llm = y_test
else:
    print("\n*** DEMO MODE: Using only 50 samples for LLM evaluation ***")
    test_headlines = X_test.iloc[:50].tolist()
    y_test_llm = y_test.iloc[:50]

print(f"\nEvaluating LLM Models on {len(test_headlines)} headlines...")
print("="*60)

llm_results = {}

# GPT-3.5 is evaluated only when a key is available and demo mode is on.
if api_key and DEMO_MODE:
    try:
        print("Testing GPT-3.5 Turbo...")
        preds_gpt35 = classify_zero_shot_batch(test_headlines, model="gpt-3.5-turbo")

        # Keep only the positions where a prediction came back (None marks a failure).
        valid_indices = [pos for pos, pred in enumerate(preds_gpt35) if pred is not None]
        n_failed = len(preds_gpt35) - len(valid_indices)
        if n_failed > 0:
            print(f"Warning: {n_failed} failed classifications")

        preds_valid = [preds_gpt35[pos] for pos in valid_indices]
        y_test_valid = y_test_llm.iloc[valid_indices].values

        # Macro-averaged metrics on the valid subset.
        acc = accuracy_score(y_test_valid, preds_valid)
        prec = precision_score(y_test_valid, preds_valid, average="macro")
        rec = recall_score(y_test_valid, preds_valid, average="macro")
        f1 = f1_score(y_test_valid, preds_valid, average="macro")

        llm_results['GPT-3.5'] = dict(accuracy=acc, precision=prec, recall=rec, f1=f1)

        print(f"GPT-3.5: Acc={acc:.3f}, Prec={prec:.3f}, Rec={rec:.3f}, F1={f1:.3f}")
    except Exception as e:
        # Best effort: report the failure and continue with the notebook.
        print(f"GPT-3.5 error: {e}")

if not DEMO_MODE:
    print("\nFor full evaluation with all LLM models, run train_classical_models.py and evaluate_models.py")

Final Results


In [None]:
# Combine the macro F1 of every classical model with the LLM scores.
all_f1_scores = {name: res['f1'] for name, res in results.items()}
# Number of classical entries; they come first in the dict, LLM entries follow.
n_classical = len(results)

# Add LLM results
if not DEMO_MODE:
    # Full evaluation results (from paper)
    all_f1_scores.update({
        'GPT-3.5': 0.761,
        'GPT-4 Turbo': 0.817,
        'Mistral 7B': 0.773
    })
else:
    # Demo mode results
    all_f1_scores.update({name: res['f1'] for name, res in llm_results.items()})

# Create final comparison chart
plt.figure(figsize=(12, 8))

models = list(all_f1_scores.keys())
scores = list(all_f1_scores.values())
# Colour split derived from the actual number of classical models rather than
# a hard-coded 5, so adding or removing a classifier keeps the colours right.
colors = ['skyblue'] * n_classical + ['lightcoral'] * (len(models) - n_classical)

bars = plt.bar(range(len(models)), scores, color=colors, alpha=0.8, edgecolor='black')

# Highlight best model
best_idx = scores.index(max(scores))
bars[best_idx].set_color('gold')
bars[best_idx].set_edgecolor('darkgoldenrod')
bars[best_idx].set_linewidth(3)

# Only the snake_case classical names are prettified; applying .title() to the
# LLM names would mangle them ("GPT-3.5" -> "Gpt-3.5").
tick_labels = [m.replace('_', ' ').title() if m in results else m for m in models]
plt.xticks(range(len(models)), tick_labels, rotation=45, ha='right')
plt.ylabel('F1 Score')
plt.title('All Models Comparison: Classical ML vs LLM', fontsize=14)
plt.ylim(0, 1)
plt.grid(True, alpha=0.3, axis='y')

# Add value labels
for bar, score in zip(bars, scores):
    plt.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 0.01,
             f'{score:.3f}', ha='center', va='bottom')

# Add legend
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor='skyblue', alpha=0.8, label='Classical Models'),
    Patch(facecolor='lightcoral', alpha=0.8, label='LLM Models'),
    Patch(facecolor='gold', alpha=0.8, label='Best Model')
]
plt.legend(handles=legend_elements, loc='upper left')

plt.tight_layout()
plt.show()

# Print summary
print("\n" + "="*60)
print("FINAL RESULTS SUMMARY")
print("="*60)
# NOTE: from here on `best_model` is a (name, f1) tuple, no longer the fitted
# estimator assigned in the Best Model Analysis cell. The Save Results cell
# reads best_model[0] and best_model[1], so the name is kept.
best_model = max(all_f1_scores.items(), key=lambda x: x[1])
print(f"Best model overall: {best_model[0]} with F1-score: {best_model[1]:.3f}")
print(f"Best classical model: {best_model_name} with F1-score: {results[best_model_name]['f1']:.3f}")

Testing

In [None]:
def predict_headline(headline, model_name="svm"):
    """Classify a single headline and print the result.

    The headline is preprocessed and vectorized with the notebook's fitted
    ``vectorizer``, then classified by the requested model.

    Args:
        headline: Raw headline text.
        model_name: Key into ``results``; if it is not found there, the model
            is loaded from ``<MODELS_DIR>/<model_name>.joblib``.

    Returns:
        The predicted class label.
    """
    # Same preprocessing settings as used for the training data.
    clean = preprocess_text(headline, casefold=False, morpho_method="lemmatize")
    vec = vectorizer.transform([clean])

    # Prefer the in-memory model; fall back to the copy saved on disk.
    if model_name in results:
        model = results[model_name]['model']
    else:
        model_path = os.path.join(MODELS_DIR, f"{model_name}.joblib")
        model = joblib.load(model_path)

    # Make prediction
    pred = model.predict(vec)[0]

    # Class probabilities exist only for models that implement predict_proba.
    proba = None
    if hasattr(model, 'predict_proba'):
        proba = model.predict_proba(vec)[0]

    # Display results
    class_names = {1: 'Nav klikšķēsma', 2: 'Daļēja klikšķēsma', 3: 'Ir klikšķēsma'}

    print(f"\nHeadline: '{headline}'")
    print(f"Model: {model_name}")
    print(f"Prediction: Class {pred} - {class_names[pred]}")
    if proba is not None:
        # Label each probability with the class it belongs to (model.classes_
        # gives the column order) instead of assuming exactly three columns
        # named 1, 2, 3. For the notebook's models the output is unchanged.
        class_ids = getattr(model, 'classes_', range(1, len(proba) + 1))
        confidence = ", ".join(
            f"Class {class_id}: {p:.2%}" for class_id, p in zip(class_ids, proba)
        )
        print(f"Confidence: [{confidence}]")

    return pred

# Three example headlines that are run through the best classical model.
test_examples = [
    "Prezidents paziņo par jauniem ierobežojumiem",
    "6 padomi, kā ietaupīt naudu",
    "Šī viena metode mainīs tavu rītu!",
]

print("\nInteractive Prediction Demo:")
print("=" * 80)
for example in test_examples:
    predict_headline(example, model_name=best_model_name)
    print("-" * 60)

Save Results


In [None]:
from datetime import datetime
import json

# Relies on df, X_train, X_test, results, llm_results and best_model from earlier
# cells. best_model must be the (name, f1) tuple set by the Final Results cell.

# preprocessing_config / vectorizer_config were referenced here without being
# defined anywhere, which raised NameError on a fresh run. If no earlier cell
# provides them, fall back to the values passed to preprocess_text() and
# get_vectorizer() in the Data Preprocessing cell.
if "preprocessing_config" not in globals():
    preprocessing_config = {"casefold": False, "morpho_method": "lemmatize"}
if "vectorizer_config" not in globals():
    vectorizer_config = {
        "max_features": 1000,
        "ngram_range": (1, 2),
        "min_df": 2,
        "sublinear_tf": True,
    }

# Build dynamic summary
summary = {
    'experiment_date': datetime.now().isoformat(),
    'dataset_info': {
        'total_size': len(df),
        'train_size': len(X_train),
        'test_size': len(X_test),
        'class_distribution': df['label'].value_counts().to_dict()
    },
    'preprocessing_config': preprocessing_config,
    'vectorizer_config': vectorizer_config,
    # Only the scalar metrics are stored; predictions and fitted models are left out.
    'classical_models': {
        name: {
            'accuracy': res['accuracy'],
            'precision': res['precision'],
            'recall': res['recall'],
            'f1': res['f1'],
            'roc_auc': res['roc_auc']
        }
        for name, res in results.items()
    },
    'llm_models': llm_results,  # always use dynamic LLM results
    'best_model': {
        'name': best_model[0],
        'f1_score': best_model[1]
    }
}

# Save to JSON (ensure_ascii=False keeps any non-ASCII text readable in the file)
results_file = 'complete_experiment_results.json'
with open(results_file, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2, ensure_ascii=False)

print(f"Complete results saved to '{results_file}'")