# Training Accuracy Visualization for Sentiment Analysis Model

This notebook demonstrates multiple ways to visualize training accuracy from your sentiment analysis model training.

## 1. Import Required Libraries
Import matplotlib, seaborn, numpy, and other necessary libraries for data visualization and handling training metrics.

In [None]:
%pip install plotly

# Import Required Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import os
import glob
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# Set style for better looking plots.
# The 'seaborn-v0_8' name only exists in matplotlib >= 3.6; older releases ship
# the same style sheet as 'seaborn'. Use whichever name this installation
# provides, and keep matplotlib's default style if neither is available.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")

print("Libraries imported successfully!")

## 2. Load Training Data from TensorBoard Logs
Load training history data from TensorBoard event files or create sample training accuracy data for demonstration purposes.

In [None]:
def load_tensorboard_data(log_dir):
    """Load scalar training metrics from the newest TensorBoard event file.

    Event files are looked up in ``log_dir`` and in all of its subdirectories,
    so logs written into per-run subfolders are found as well. Only the most
    recently created event file is read. Progress messages are printed.

    Args:
        log_dir: Directory that contains TensorBoard event files.

    Returns:
        A dict mapping every scalar tag in the event file to
        ``{'steps': [...], 'values': [...]}``, or ``None`` when no event file
        exists under ``log_dir``.
    """
    # "**" with recursive=True also matches zero directories, so event files
    # stored directly in log_dir are still picked up.
    pattern = os.path.join(log_dir, "**", "events.out.tfevents.*")
    event_files = glob.glob(pattern, recursive=True)

    if not event_files:
        print(f"No TensorBoard event files found in {log_dir}")
        return None

    # Use the most recent event file
    latest_file = max(event_files, key=os.path.getctime)
    print(f"Loading data from: {latest_file}")

    # Load the event accumulator
    ea = EventAccumulator(latest_file)
    ea.Reload()

    # Get available scalar tags
    available_tags = ea.Tags()['scalars']
    print(f"Available metrics: {available_tags}")

    # Extract the step/value series of every scalar tag
    metrics_data = {}
    for tag in available_tags:
        scalar_events = ea.Scalars(tag)
        metrics_data[tag] = {
            'steps': [event.step for event in scalar_events],
            'values': [event.value for event in scalar_events],
        }

    return metrics_data

# Try to load actual training data
log_dir = "./logs"
training_data = load_tensorboard_data(log_dir)

# If no TensorBoard data is available (no event file, or an event file without
# any scalar metrics), create sample data
if not training_data:
    print("\nCreating sample training data for demonstration...")

    # Sample training data that mimics real training
    steps_per_epoch = 1000
    epochs = np.arange(1, 4)  # 3 epochs as in your config
    steps = np.arange(0, len(epochs) * steps_per_epoch, 100)  # Steps every 100 iterations
    sample_eval_steps = steps[::5]  # Eval less frequent

    # Simulate training accuracy that improves over time
    train_acc = 0.65 + 0.25 * (1 - np.exp(-steps / 800)) + np.random.normal(0, 0.02, len(steps))
    train_acc = np.clip(train_acc, 0.5, 0.95)

    # Simulate validation accuracy (slightly lower and more volatile)
    val_acc = train_acc - 0.05 + np.random.normal(0, 0.03, len(steps))
    val_acc = np.clip(val_acc, 0.45, 0.90)

    # Simulate loss (decreasing)
    train_loss = 1.2 * np.exp(-steps / 1000) + 0.1 + np.random.normal(0, 0.05, len(steps))
    train_loss = np.clip(train_loss, 0.1, 1.5)

    # Simulate validation loss (decreasing, slightly above the training loss floor)
    val_loss = (1.0 * np.exp(-sample_eval_steps / 1000) + 0.15
                + np.random.normal(0, 0.03, len(sample_eval_steps)))
    val_loss = np.clip(val_loss, 0.1, 1.5)

    # 'eval/loss' and 'train/epoch' are included because the evaluation plots
    # below read them. The epoch is recorded at the evaluation steps so it lines
    # up one-to-one with the eval metrics.
    training_data = {
        'train/accuracy': {'steps': steps.tolist(), 'values': train_acc.tolist()},
        'eval/accuracy': {'steps': sample_eval_steps.tolist(), 'values': val_acc[::5].tolist()},
        'eval/loss': {'steps': sample_eval_steps.tolist(), 'values': val_loss.tolist()},
        'train/epoch': {'steps': sample_eval_steps.tolist(),
                        'values': (sample_eval_steps / steps_per_epoch).tolist()},
        'train/loss': {'steps': steps.tolist(), 'values': train_loss.tolist()}
    }

print("\nData loaded successfully!")
print(f"Available metrics: {list(training_data.keys())}")

## 3. Plot Evaluation Accuracy Over Epochs
Create matplotlib plots showing how evaluation accuracy and loss change over training epochs, together with the accuracy distribution and a smoothed accuracy curve.

In [None]:
# Use epochs instead of steps for x-axis
required_tags = ['eval/accuracy', 'eval/loss', 'train/epoch']

# Fail with an explicit message instead of a bare KeyError on the first lookup
missing_tags = [tag for tag in required_tags if tag not in training_data]
if missing_tags:
    raise KeyError(
        f"training_data is missing {missing_tags}; "
        f"available metrics: {list(training_data.keys())}"
    )


def _series_by_step(tag):
    """Return the values of metric ``tag`` as a float Series indexed by step."""
    entry = training_data[tag]
    series = pd.Series(entry['values'], index=entry['steps'], dtype=float)
    # If a step was logged more than once, keep its latest value
    return series[~series.index.duplicated(keep='last')]


# Construct df_export by aligning the metrics on their training step rather
# than on list position: the tags can be logged at different frequencies
# (e.g. 'train/epoch' may be written more often than the eval metrics), in
# which case position i of one list does not belong to position i of another.
df_export = pd.concat(
    {tag: _series_by_step(tag) for tag in required_tags},
    axis=1,
    join='inner',
).sort_index()
df_export.index.name = 'step'

if df_export.empty:
    # The tags share no step at all: fall back to aligning by position
    min_len = min(len(training_data[tag]['values']) for tag in required_tags)
    df_export = pd.DataFrame({
        tag: training_data[tag]['values'][:min_len] for tag in required_tags
    })

eval_df = df_export.dropna(subset=required_tags)

epochs = eval_df['train/epoch'].values
eval_acc = eval_df['eval/accuracy'].values
eval_loss = eval_df['eval/loss'].values

# 2x2 overview of the evaluation metrics prepared above
# (reads `epochs`, `eval_acc` and `eval_loss`).
if len(eval_acc) == 0:
    # The panels below index eval_acc[-1] / eval_loss[-1]; skip plotting
    # instead of failing with an IndexError when there is nothing to show.
    print("No evaluation metrics available to plot.")
else:
    fig, axes = plt.subplots(2, 2, figsize=(8,5))
    fig.suptitle('Sentiment Evaluation Progress Visualization', fontsize=16, fontweight='bold')
    (ax_acc, ax_loss), (ax_hist, ax_smooth) = axes

    # Plot 1: Eval Accuracy over Epochs
    ax_acc.plot(epochs, eval_acc, 'b-', linewidth=2, label='Eval Accuracy')
    ax_acc.set_title('Evaluation Accuracy Over Epochs')
    ax_acc.set_xlabel('Epoch')
    ax_acc.set_ylabel('Accuracy')
    ax_acc.grid(True, alpha=0.3)
    ax_acc.legend()
    ax_acc.text(0.7, 0.1, f'Final Accuracy: {eval_acc[-1]:.3f}',
                transform=ax_acc.transAxes,
                bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8))

    # Plot 2: Eval Loss over Epochs
    ax_loss.plot(epochs, eval_loss, 'r-', linewidth=2, label='Eval Loss')
    ax_loss.set_title('Evaluation Loss Over Epochs')
    ax_loss.set_xlabel('Epoch')
    ax_loss.set_ylabel('Loss')
    ax_loss.grid(True, alpha=0.3)
    ax_loss.legend()
    ax_loss.text(0.7, 0.8, f'Final Loss: {eval_loss[-1]:.3f}',
                 transform=ax_loss.transAxes,
                 bbox=dict(boxstyle='round', facecolor='lightcoral', alpha=0.8))

    # Plot 3: Accuracy Histogram
    mean_acc = np.mean(eval_acc)
    ax_hist.hist(eval_acc, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
    ax_hist.set_title('Evaluation Accuracy Distribution')
    ax_hist.set_xlabel('Accuracy')
    ax_hist.set_ylabel('Frequency')
    ax_hist.axvline(mean_acc, color='red', linestyle='--',
                    label=f'Mean: {mean_acc:.3f}')
    ax_hist.legend()
    ax_hist.grid(True, alpha=0.3)

    # Plot 4: Smoothed Accuracy (rolling average)
    window_size = max(1, len(eval_acc) // 10)
    acc_df = pd.DataFrame({'accuracy': eval_acc})
    # min_periods=1 lets the centred window shrink at both ends, so the
    # smoothed curve covers the whole range instead of starting/ending in NaN.
    smoothed_acc = acc_df['accuracy'].rolling(
        window=window_size, center=True, min_periods=1
    ).mean()

    ax_smooth.plot(epochs, eval_acc, 'lightblue', alpha=0.5, label='Raw Accuracy')
    ax_smooth.plot(epochs, smoothed_acc, 'darkblue', linewidth=3, label='Smoothed Accuracy')
    ax_smooth.set_title('Raw vs Smoothed Evaluation Accuracy')
    ax_smooth.set_xlabel('Epoch')
    ax_smooth.set_ylabel('Accuracy')
    ax_smooth.grid(True, alpha=0.3)
    ax_smooth.legend()

    plt.tight_layout()
    plt.show()


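## 4. Compare Training Loss vs Validation Accuracy
Create side-by-side plots of training loss, validation accuracy, the learning-rate schedule, and the epoch-to-epoch change in validation accuracy (read from the Hugging Face `trainer_state.json`) to help identify overfitting.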

In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
import os
import glob

# === Find and Load Hugging Face trainer_state.json ===
# Well-known locations are tried first, in this order.
possible_paths = [
    "./trainer_state.json",
    "./results/trainer_state.json",
    "./fine_tuned_sentiment_model/trainer_state.json",
]

# Then any trainer_state.json below the working directory, skipping paths that
# contain 'multilabel'.
search_pattern = "**/trainer_state.json"
found_files = glob.glob(search_pattern, recursive=True)
filtered_files = [f for f in found_files if 'multilabel' not in f.lower()]
# glob returns matches in arbitrary order; try the most recently modified file
# first so the choice between several candidates is deterministic.
filtered_files.sort(key=os.path.getmtime, reverse=True)
possible_paths.extend(filtered_files)

trainer_state_path = next((path for path in possible_paths if os.path.exists(path)), None)

logs = []
if trainer_state_path:
    print(f"Found trainer_state.json at: {trainer_state_path}")
    try:
        with open(trainer_state_path, "r", encoding="utf-8") as f:
            state = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers malformed JSON and undecodable bytes
        print(f"Error reading {trainer_state_path}: {e}")
    else:
        # Tolerate an unexpected top-level type or a null "log_history"
        logs = (state.get("log_history") or []) if isinstance(state, dict) else []
        print(f"Successfully loaded {len(logs)} log entries")
else:
    print("trainer_state.json not found in any of these locations:")
    for path in possible_paths[:3]:
        print(f"  - {path}")
    print("\nCreating sample data for demonstration...")

if not logs:
    print("Using sample training data...")
    sample_logs = []
    # Log every 100 steps up to and including the final step (1500), so the
    # third epoch also gets its end-of-epoch evaluation entry.
    for step in range(100, 1501, 100):
        epoch = step / 500
        sample_logs.append({
            "step": step,
            "loss": 1.2 * np.exp(-step / 800) + 0.1 + np.random.normal(0, 0.05),
            "learning_rate": 2e-5 * (1 - step / 1500),
            "epoch": epoch
        })
        if step % 500 == 0:
            sample_logs.append({
                "step": step,
                "eval_loss": 1.0 * np.exp(-step / 800) + 0.15 + np.random.normal(0, 0.03),
                "eval_accuracy": 0.65 + 0.25 * (1 - np.exp(-step / 800)) + np.random.normal(0, 0.02),
                "eval_runtime": 10.5,
                "eval_samples_per_second": 50.0,
                "epoch": epoch
            })
    logs = sample_logs

# Extract metrics (using epochs)
# Training entries carry "loss"; evaluation entries carry "eval_accuracy".
train_epochs, train_loss, learning_rates = [], [], []
eval_epochs, eval_acc, eval_loss = [], [], []

for entry in logs:
    if "loss" in entry and "eval_loss" not in entry:
        train_epochs.append(entry["epoch"])
        train_loss.append(entry["loss"])
        # NaN placeholder keeps learning_rates index-aligned with train_epochs
        # even when an entry has no learning rate
        learning_rates.append(entry.get("learning_rate", np.nan))
    if "eval_accuracy" in entry:
        eval_epochs.append(entry["epoch"])
        eval_acc.append(entry["eval_accuracy"])
        # Same alignment guarantee for eval_loss vs. eval_epochs
        eval_loss.append(entry.get("eval_loss", np.nan))

train_epochs = np.array(train_epochs)
train_loss = np.array(train_loss)
learning_rates = np.array(learning_rates, dtype=float)
if np.isnan(learning_rates).all():
    # No entry had a learning rate: expose "no data" as an empty array
    learning_rates = np.array([])
eval_epochs = np.array(eval_epochs)
eval_acc = np.array(eval_acc)
eval_loss = np.array(eval_loss, dtype=float)

print(f"\nExtracted metrics:")
print(f"  Training epochs: {len(train_epochs)}")
print(f"  Evaluation epochs: {len(eval_epochs)}")
print(f"  Learning rate entries: {np.count_nonzero(~np.isnan(learning_rates))}")

def _show_placeholder(ax, message, title):
    """Fill a panel that has no data with a centred message and its title."""
    ax.text(0.5, 0.5, message,
            transform=ax.transAxes, ha='center', va='center',
            fontsize=12, bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.8))
    ax.set_title(title, fontsize=12, fontweight='bold')


# 2x2 summary figure; every panel shows a placeholder when its data is missing.
fig, axes = plt.subplots(2, 2, figsize=(8, 5))
fig.suptitle('Sentiment Model Training Progress Analysis', fontsize=16, fontweight='bold')
(ax_loss, ax_acc), (ax_lr, ax_delta) = axes

# Plot 1: Training Loss over epochs
if len(train_epochs) > 0:
    ax_loss.plot(train_epochs, train_loss, 'b-', linewidth=2, label='Training Loss', alpha=0.8)
    ax_loss.set_title('Training Loss Over Epochs', fontsize=12, fontweight='bold')
    ax_loss.set_xlabel('Epochs')
    ax_loss.set_ylabel('Loss')
    ax_loss.legend()
    ax_loss.grid(True, alpha=0.3)
    final_loss = train_loss[-1]
    ax_loss.text(0.02, 0.98, f"Final Loss: {final_loss:.3f}",
                 transform=ax_loss.transAxes, verticalalignment='top',
                 bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8))
else:
    _show_placeholder(ax_loss, 'No Training Loss Data Available', 'Training Loss Over Epochs')

# Plot 2: Validation Accuracy over epochs
if len(eval_epochs) > 0:
    ax_acc.plot(eval_epochs, eval_acc, 'r-', linewidth=2, label='Validation Accuracy', alpha=0.8)
    ax_acc.set_title('Validation Accuracy Over Epochs', fontsize=12, fontweight='bold')
    ax_acc.set_xlabel('Epochs')
    ax_acc.set_ylabel('Accuracy')
    ax_acc.legend()
    ax_acc.grid(True, alpha=0.3)
    final_acc = eval_acc[-1]
    ax_acc.text(0.02, 0.98, f"Final Accuracy: {final_acc:.3f}",
                transform=ax_acc.transAxes, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='lightcoral', alpha=0.8))
else:
    _show_placeholder(ax_acc, 'No Evaluation Data Available', 'Validation Accuracy Over Epochs')

# Plot 3: Learning Rate Schedule (if available)
if len(learning_rates) > 0:
    ax_lr.plot(train_epochs[:len(learning_rates)], learning_rates, 'g-', linewidth=2, alpha=0.8)
    ax_lr.set_title('Learning Rate Schedule', fontsize=12, fontweight='bold')
    ax_lr.set_xlabel('Epochs')
    ax_lr.set_ylabel('Learning Rate')
    ax_lr.grid(True, alpha=0.3)
    ax_lr.ticklabel_format(style='scientific', axis='y', scilimits=(0,0))
else:
    _show_placeholder(ax_lr, 'No Learning Rate Data Available', 'Learning Rate Schedule')

# Plot 4: Validation Accuracy Improvement (change between consecutive evaluations)
if len(eval_acc) > 1:
    improvement = np.diff(eval_acc)
    change_epochs = eval_epochs[1:]  # each difference is drawn at the later evaluation
    ax_delta.plot(change_epochs, improvement, 'purple', linewidth=2, alpha=0.7)
    ax_delta.axhline(y=0, color='black', linestyle='--', alpha=0.5)
    ax_delta.fill_between(change_epochs, improvement, 0,
                          where=(improvement > 0),
                          color='green', alpha=0.3, label='Improvement')
    ax_delta.fill_between(change_epochs, improvement, 0,
                          where=(improvement <= 0),
                          color='red', alpha=0.3, label='Degradation')
    ax_delta.set_title('Validation Accuracy Change per Epoch', fontsize=12, fontweight='bold')
    ax_delta.set_xlabel('Epochs')
    ax_delta.set_ylabel('Accuracy Change')
    ax_delta.legend()
    ax_delta.grid(True, alpha=0.3)
else:
    _show_placeholder(ax_delta, 'Insufficient Evaluation Data', 'Validation Accuracy Change per Epoch')

plt.tight_layout()
plt.savefig('sentiment_training_summary.png', dpi=300, bbox_inches='tight')
plt.savefig('sentiment_training_summary.pdf', bbox_inches='tight')
print("Summary plot saved as 'sentiment_training_summary.png' and 'sentiment_training_summary.pdf'")
plt.show()

# Text summary of the sentiment run: loss statistics, accuracy statistics and
# a coarse health verdict based on the first and last logged values.
banner = "=" * 60
print("\n" + banner)
print("SENTIMENT MODEL TRAINING SUMMARY")
print(banner)

has_train_loss = len(train_loss) > 0
has_eval_acc = len(eval_acc) > 0

if has_train_loss:
    first_loss, last_loss = train_loss[0], train_loss[-1]
    print("Training Loss:")
    print(f"  Initial Loss: {first_loss:.4f}")
    print(f"  Final Loss: {last_loss:.4f}")
    print(f"  Loss Reduction: {first_loss - last_loss:.4f}")
    print(f"  Best (Lowest) Loss: {np.min(train_loss):.4f}")

if has_eval_acc:
    first_acc, last_acc = eval_acc[0], eval_acc[-1]
    print("\nValidation Accuracy:")
    print(f"  Initial Accuracy: {first_acc:.4f}")
    print(f"  Final Accuracy: {last_acc:.4f}")
    print(f"  Total Improvement: {last_acc - first_acc:.4f}")
    print(f"  Average Accuracy: {np.mean(eval_acc):.4f}")
    print(f"  Best Accuracy: {np.max(eval_acc):.4f}")
    print(f"  Accuracy Std Dev: {np.std(eval_acc):.4f}")

if has_train_loss and has_eval_acc:
    print("\nOverall Training Health:")
    # A trend is judged only from the first and last value; ties count as
    # "increasing" loss / "declining" accuracy.
    if train_loss[-1] < train_loss[0]:
        loss_trend = "decreasing"
    else:
        loss_trend = "increasing"
    if eval_acc[-1] > eval_acc[0]:
        acc_trend = "improving"
    else:
        acc_trend = "declining"
    print(f"  Loss trend: {loss_trend}")
    print(f"  Accuracy trend: {acc_trend}")
    if (loss_trend, acc_trend) == ("decreasing", "improving"):
        print(" Training appears healthy - loss decreasing, accuracy improving")
    else:
        print("  Check training - unusual trends detected")

print("\n" + banner)


## 5. Create Interactive Accuracy Plots
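The cell below summarizes the multilabel model's training run (training loss, validation accuracy, learning-rate schedule, and macro-F1) with static matplotlib plots. Plotly, installed and imported in Section 1, can be used to build interactive versions that allow zooming and hovering over data points for detailed information.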

In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
import os

trainer_state_path = "./multilabel_results/checkpoint-810/trainer_state.json"

# Load Trainer State JSON; `logs` stays empty when the file is missing or unreadable
logs = []
if os.path.exists(trainer_state_path):
    try:
        with open(trainer_state_path, "r", encoding="utf-8") as f:
            state = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers malformed JSON and undecodable bytes
        print(f"Error reading {trainer_state_path}: {e}")
    else:
        # Tolerate an unexpected top-level type or a null "log_history"
        logs = (state.get("log_history") or []) if isinstance(state, dict) else []
        print(f"Successfully loaded {len(logs)} log entries from {trainer_state_path}")
else:
    print(f"⚠️ {trainer_state_path} not found. Using synthetic demo data...")

# If no logs, create sample data
if not logs:
    sample_logs = []
    # Log every 100 steps up to and including the final step (1500), so the
    # third epoch also gets its end-of-epoch evaluation entry.
    for step in range(100, 1501, 100):
        sample_logs.append({
            "step": step,
            "loss": 1.2 * np.exp(-step / 800) + 0.1 + np.random.normal(0, 0.05),
            "learning_rate": 2e-5 * (1 - step / 1500),
            "epoch": step / 500
        })
        if step % 500 == 0:
            sample_logs.append({
                "step": step,
                "eval_loss": 1.0 * np.exp(-step / 800) + 0.15 + np.random.normal(0, 0.03),
                "eval_accuracy": 0.65 + 0.25 * (1 - np.exp(-step / 800)) + np.random.normal(0, 0.02),
                "eval_f1_macro": 0.55 + 0.35 * (1 - np.exp(-step / 800)) + np.random.normal(0, 0.02),
                "eval_runtime": 10.5,
                "eval_samples_per_second": 50.0,
                "epoch": step / 500
            })
    logs = sample_logs

# Extract metrics
# Training entries carry "loss"; evaluation entries carry "eval_accuracy"
# and/or "eval_f1_macro". Missing values are stored as NaN so every metric
# list stays index-aligned with its step list.
train_steps, train_loss, learning_rates = [], [], []
eval_steps, eval_acc, eval_loss, eval_f1 = [], [], [], []

for entry in logs:
    if "loss" in entry and "eval_loss" not in entry:
        train_steps.append(entry.get("step", len(train_steps)))
        train_loss.append(entry["loss"])
        # NaN placeholder keeps learning_rates aligned with train_steps
        learning_rates.append(entry.get("learning_rate", np.nan))
    if "eval_accuracy" in entry or "eval_f1_macro" in entry:
        eval_steps.append(entry.get("step", len(eval_steps)))
        eval_loss.append(entry.get("eval_loss", np.nan))
        eval_acc.append(entry.get("eval_accuracy", np.nan))
        eval_f1.append(entry.get("eval_f1_macro", np.nan))

train_steps = np.array(train_steps)
train_loss = np.array(train_loss)
learning_rates = np.array(learning_rates, dtype=float)
if np.isnan(learning_rates).all():
    # No entry had a learning rate: expose "no data" as an empty array
    learning_rates = np.array([])
eval_steps = np.array(eval_steps)
eval_acc = np.array(eval_acc, dtype=float)
eval_loss = np.array(eval_loss, dtype=float)
eval_f1 = np.array(eval_f1, dtype=float)

print(f"\n📊 Extracted metrics:")
print(f"  Training steps: {len(train_steps)}")
print(f"  Evaluation steps: {len(eval_steps)}")
print(f"  Learning rate entries: {np.count_nonzero(~np.isnan(learning_rates))}")
print(f"  Eval Accuracy entries: {np.count_nonzero(~np.isnan(eval_acc))}")
print(f"  Eval F1 entries: {np.count_nonzero(~np.isnan(eval_f1))}")

# Plot 2×2 summary
def _valid_points(steps, values):
    """Return the (steps, values) pairs whose value is not NaN."""
    values = np.asarray(values, dtype=float)
    keep = ~np.isnan(values)
    return np.asarray(steps)[keep], values[keep]


def _show_placeholder(ax, message, title):
    """Fill a panel that has no data with a centred message and its title."""
    ax.text(0.5, 0.5, message,
            transform=ax.transAxes, ha='center', va='center',
            fontsize=12, bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.8))
    ax.set_title(title, fontsize=12, fontweight='bold')


fig, axes = plt.subplots(2, 2, figsize=(10, 6))
fig.suptitle('Multilabel Model Training Progress Analysis', fontsize=16, fontweight='bold')
(ax_loss, ax_acc), (ax_lr, ax_f1) = axes

# Panel 1: training loss
if len(train_steps) > 0:
    ax_loss.plot(train_steps, train_loss, 'b-', linewidth=2, label='Training Loss', alpha=0.8)
    ax_loss.set_title('Training Loss', fontsize=12, fontweight='bold')
    ax_loss.set_xlabel('Steps')
    ax_loss.set_ylabel('Loss')
    ax_loss.legend()
    ax_loss.grid(True, alpha=0.3)
    ax_loss.text(0.02, 0.98, f"Final: {train_loss[-1]:.3f}",
                 transform=ax_loss.transAxes, verticalalignment='top',
                 bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8))
else:
    _show_placeholder(ax_loss, 'No Training Loss Data', 'Training Loss')

# Panel 2: validation accuracy. The eval arrays are NaN-padded (an evaluation
# entry may report only one of accuracy / F1), so only the valid points are
# drawn and "Final" is the last value that was actually reported.
acc_steps, acc_values = _valid_points(eval_steps, eval_acc)
if len(acc_values) > 0:
    ax_acc.plot(acc_steps, acc_values, 'r-', linewidth=2, label='Val Accuracy', alpha=0.8)
    ax_acc.set_title('Validation Accuracy', fontsize=12, fontweight='bold')
    ax_acc.set_xlabel('Steps')
    ax_acc.set_ylabel('Accuracy')
    ax_acc.legend()
    ax_acc.grid(True, alpha=0.3)
    ax_acc.text(0.02, 0.98, f"Final: {acc_values[-1]:.3f}",
                transform=ax_acc.transAxes, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='lightcoral', alpha=0.8))
else:
    _show_placeholder(ax_acc, 'No Accuracy Data', 'Validation Accuracy')

# Panel 3: learning-rate schedule
if len(learning_rates) > 0:
    ax_lr.plot(train_steps[:len(learning_rates)], learning_rates, 'g-', linewidth=2, alpha=0.8)
    ax_lr.set_title('Learning Rate Schedule', fontsize=12, fontweight='bold')
    ax_lr.set_xlabel('Steps')
    ax_lr.set_ylabel('LR')
    ax_lr.grid(True, alpha=0.3)
    ax_lr.ticklabel_format(style='scientific', axis='y', scilimits=(0,0))
else:
    _show_placeholder(ax_lr, 'No LR Data', 'Learning Rate Schedule')

# Panel 4: validation macro-F1 (same NaN handling as the accuracy panel)
f1_steps, f1_values = _valid_points(eval_steps, eval_f1)
if len(f1_values) > 0:
    ax_f1.plot(f1_steps, f1_values, 'm-', linewidth=2, label='Val F1 (macro)', alpha=0.8)
    ax_f1.set_title('Validation F1 (macro)', fontsize=12, fontweight='bold')
    ax_f1.set_xlabel('Steps')
    ax_f1.set_ylabel('F1 Score')
    ax_f1.legend()
    ax_f1.grid(True, alpha=0.3)
    ax_f1.text(0.02, 0.98, f"Final: {f1_values[-1]:.3f}",
               transform=ax_f1.transAxes, verticalalignment='top',
               bbox=dict(boxstyle='round', facecolor='violet', alpha=0.8))
else:
    _show_placeholder(ax_f1, 'No F1 Data', 'Validation F1 (macro)')

plt.tight_layout()
plt.savefig('multilabel_training_summary.png', dpi=300, bbox_inches='tight')
plt.show()

# Print Training Summary
print("\n" + "="*60)
print("📋 MULTILABEL MODEL TRAINING SUMMARY")
print("="*60)

if len(train_loss) > 0:
    print(f"Training Loss:")
    print(f"  Initial: {train_loss[0]:.4f}")
    print(f"  Final: {train_loss[-1]:.4f}")
    print(f"  Reduction: {train_loss[0] - train_loss[-1]:.4f}")
    print(f"  Best (min): {np.min(train_loss):.4f}")

# The eval arrays are NaN-padded, so statistics are taken over the values that
# were actually reported; a metric with no reported value is skipped instead of
# printing "nan" (and triggering numpy's all-NaN warning).
for metric_title, metric_values in (
    ("Validation Accuracy", eval_acc),
    ("Validation F1 (macro)", eval_f1),
):
    metric_values = np.asarray(metric_values, dtype=float)
    reported = metric_values[~np.isnan(metric_values)]
    if len(reported) > 0:
        print(f"\n{metric_title}:")
        print(f"  Initial: {reported[0]:.4f}")
        print(f"  Final: {reported[-1]:.4f}")
        print(f"  Best: {np.max(reported):.4f}")

print("="*60)
