# Llama Model Training

The Llama model's performance across the different datasets reveals interesting patterns in how it handles narrative classification tasks. When trained on the Climate Change (CC) dataset alone, the model achieved its strongest results with a final accuracy of 0.625 and an F1 score of 0.665. This suggests that the model was able to effectively learn and distinguish between different climate change-related narratives.

However, when trained on the full combined dataset, which included both climate change and Ukraine-related narratives, the model's performance decreased notably, achieving an accuracy of 0.316 and an F1 score of 0.431. This decline suggests that Llama copes poorly with a larger and more varied set of narrative labels. The lower metrics on the combined dataset likely indicate that the model struggled to maintain clear boundaries between similar narrative types when dealing with a broader context spanning multiple domains.

The difference in performance between the focused CC dataset and the combined dataset highlights a fundamental challenge in narrative classification: as the number and variety of possible narratives increase, the task of distinguishing between them becomes markedly harder. This is particularly relevant when narratives from different domains share similar linguistic patterns or rhetorical structures, making it harder for the model to make clean distinctions.

In [1]:
import os
import pandas as pd
import wandb
import torch
import logging
from datetime import datetime
from huggingface_hub import login

from model import initialize_model, setup_peft
from data_utils import prepare_data, get_predictions_batch, prepare_data_for_model, ensure_model_on_device
from trainer import train_model
from debug_utils import debug_misclassifications

# Logging setup: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Module-level logger for the cells below.
logger = logging.getLogger(__name__)


In [4]:
def setup_training():
    """Authenticate with Hugging Face, resolve paths, and load the datasets.

    The Hugging Face token is read from the ``HF_TOKEN`` environment variable
    (falling back to ``HUGGING_FACE_HUB_TOKEN``) rather than being embedded in
    the notebook. When neither variable is set, ``login`` is called without a
    token so that huggingface_hub asks for one.

    Returns:
        dict with the keys ``'output_dir'``, ``'current_date'``,
        ``'model_name'``, ``'df_normalized'``, ``'df_normalized_ua'`` and
        ``'df_normalized_cc'``.

    Raises:
        Exception: any failure is printed together with its traceback,
            ``wandb.finish()`` is called, and the error is re-raised.
    """
    try:
        # SECURITY: a literal access token used to be hard-coded here. It has
        # been committed in plain text and should be revoked; credentials are
        # now taken from the environment only.
        hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
        login(token=hf_token or None)

        # Check CUDA availability
        print(f"CUDA Available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"GPU Device: {torch.cuda.get_device_name(0)}")
            print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

        def find_repo_root():
            """Walk up from the working directory to the first folder holding ``.git``."""
            current = os.getcwd()
            # os.path.dirname() returns its argument unchanged at the
            # filesystem root, which is what terminates the walk.
            while current != os.path.dirname(current):
                if os.path.exists(os.path.join(current, '.git')):
                    return current
                current = os.path.dirname(current)
            raise Exception("No .git directory found - repository root could not be determined")

        # Set paths using repository root
        repo_root = find_repo_root()
        code_path = os.path.join(repo_root, "code")
        current_date = datetime.now().strftime("%Y%m%d")
        output_dir = os.path.join(code_path, "models", f"llama_{current_date}")
        os.makedirs(output_dir, exist_ok=True)

        # Load data from code directory
        print("\nLoading datasets...")
        print(f"Repository root: {repo_root}")
        print(f"Looking for data files in: {code_path}")
        input_file_full = os.path.join(code_path, "df_normalized.csv")
        input_file_ua = os.path.join(code_path, "df_normalized_ua.csv")
        input_file_cc = os.path.join(code_path, "df_normalized_cc.csv")

        df_normalized = pd.read_csv(input_file_full)
        df_normalized_ua = pd.read_csv(input_file_ua)
        df_normalized_cc = pd.read_csv(input_file_cc)

        # Model configuration
        model_name = "openlm-research/open_llama_3b"

        return {
            'output_dir': output_dir,
            'current_date': current_date,
            'model_name': model_name,
            'df_normalized': df_normalized,
            'df_normalized_ua': df_normalized_ua,
            'df_normalized_cc': df_normalized_cc
        }

    except Exception as e:
        print(f"Error in setup: {str(e)}")
        import traceback
        traceback.print_exc()
        wandb.finish()
        raise



In [None]:
def debug_model(model, dataset, tokenizer, label_mapping, dataset_type="Training"):
    """Run the model over ``dataset`` and report its misclassifications.

    Args:
        model: model handed to ``ensure_model_on_device`` and
            ``get_predictions_batch``.
        dataset: table with a ``'tokens_normalized'`` column (token lists or
            plain strings) and a ``'narrative_subnarrative_pairs'`` column
            (sequences, or their string representation).
        tokenizer: tokenizer forwarded to ``get_predictions_batch``.
        label_mapping: mapping from the value returned by
            ``get_narrative_key`` to an integer label id.
        dataset_type: label used in the printed report and stored on every
            returned row.

    Returns:
        pandas.DataFrame with one row per misclassified sample (``text``
        truncated to 200 characters, ``predicted``, ``actual``,
        ``confidence``, ``dataset_type``). The frame is empty when nothing
        was misclassified or when the dataset has no rows.

    Raises:
        Exception: any failure is printed with its traceback and re-raised.

    NOTE(review): ``get_narrative_key`` is not imported in the visible part of
    this notebook; presumably it lives in ``data_utils`` and must be brought
    into scope before this function is called. Confirm and import it.
    """
    import ast

    def first_pair(raw):
        # String cells are parsed with ast.literal_eval instead of eval():
        # the values come from a CSV file, and eval() would execute arbitrary
        # expressions stored there. literal_eval accepts Python literals only.
        return ast.literal_eval(raw)[0] if isinstance(raw, str) else raw[0]

    try:
        # Set up model and device
        model, device = ensure_model_on_device(model)
        print(f"\nAnalyzing {dataset_type} dataset...")

        # Prepare texts: token lists are joined into one whitespace-separated string
        texts = dataset['tokens_normalized'].apply(
            lambda x: ' '.join(x) if isinstance(x, list) else x
        ).tolist()

        print(f"Total samples: {len(texts)}")

        # An empty dataset would otherwise fail in torch.cat([]) and in the
        # accuracy division below, so report it and stop early.
        if not texts:
            print("\nTotal misclassifications: 0")
            return pd.DataFrame([])

        true_labels = torch.tensor([
            label_mapping[get_narrative_key(first_pair(n))]
            for n in dataset['narrative_subnarrative_pairs']
        ]).to(device)

        # Get predictions in batches
        batch_size = 8
        predictions = []
        confidences = []

        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            batch_preds, batch_confs = get_predictions_batch(model, batch_texts, tokenizer, device)
            predictions.append(batch_preds)
            confidences.append(batch_confs)

        # Concatenate and move to CPU
        predictions = torch.cat(predictions).cpu().numpy()
        confidences = torch.cat(confidences).cpu().numpy()
        true_labels = true_labels.cpu().numpy()

        # Track misclassifications
        misclassifications = [
            {
                'text': texts[idx][:200],
                'predicted': pred,
                'actual': true,
                'confidence': conf,
                'dataset_type': dataset_type
            }
            for idx, (pred, true, conf) in enumerate(zip(predictions, true_labels, confidences))
            if pred != true
        ]

        # Create DataFrame and display results
        misclass_df = pd.DataFrame(misclassifications)
        print(f"\nTotal misclassifications: {len(misclass_df)}")
        print(f"Accuracy: {1 - len(misclass_df)/len(texts):.4f}")

        if len(misclass_df) > 0:
            print("\nMisclassification distribution:")
            print(misclass_df.groupby(['actual', 'predicted']).size().unstack(fill_value=0))

            print("\nSample misclassifications:")
            for i, row in misclass_df.head().iterrows():
                print(f"\nExample {i+1}:")
                print(f"Text: {row['text']}")
                print(f"Predicted: {row['predicted']}, Actual: {row['actual']}")
                print(f"Confidence: {row['confidence']:.4f}")

        return misclass_df

    except Exception as e:
        print(f"Error in debug analysis: {str(e)}")
        import traceback
        traceback.print_exc()
        raise

In [None]:
def train_single_dataset(df, model_name, output_dir, current_date, dataset_name):
    """Train, evaluate and save a model for one dataset inside its own wandb run.

    Args:
        df: dataset passed to ``prepare_data``.
        model_name: model identifier passed to ``prepare_data`` and
            ``initialize_model``.
        output_dir: parent directory; artefacts are written to the
            ``{dataset_name}_{current_date}`` sub-directory, created if missing.
        current_date: date string used in the directory and wandb run names.
        dataset_name: short dataset label (e.g. ``'ua'``) used in names and
            printed messages.

    Returns:
        dict with the keys ``'results'`` (output of ``trainer.evaluate()``),
        ``'model'``, ``'tokenizer'``, ``'label_mapping'`` and ``'trainer'``.

    Raises:
        Exception: any failure is printed with its traceback, the wandb run is
            finished, and the error is re-raised.
    """
    try:
        # Create dataset-specific output directory
        dataset_output_dir = os.path.join(output_dir, f"{dataset_name}_{current_date}")
        os.makedirs(dataset_output_dir, exist_ok=True)

        print(f"\nTraining on {dataset_name} dataset...")

        # Initialize wandb run for this dataset
        wandb.init(project="llama-classification",
                  name=f"llama-classification-{dataset_name}-{current_date}",
                  reinit=True)

        # Prepare data
        train_dataset, val_dataset, tokenizer, label_mapping, num_labels = prepare_data(
            df, model_name, dataset_output_dir
        )

        # Initialize and setup model
        print("\nInitializing model...")
        model = initialize_model(model_name, num_labels)
        model = setup_peft(model)

        # Train model
        trainer = train_model(model, train_dataset, val_dataset, dataset_output_dir,
                            current_date, dataset_name)

        # Evaluate model
        print("\nEvaluating model...")
        results = trainer.evaluate()

        print(f"\nEvaluation results for {dataset_name} dataset:")
        for metric, value in results.items():
            if isinstance(value, float):
                print(f"{metric}: {value:.4f}")
            else:
                print(f"{metric}: {value}")

        # Save model and tokenizer
        print(f"\nSaving {dataset_name} model...")
        trainer.save_model(dataset_output_dir)
        tokenizer.save_pretrained(dataset_output_dir)

        # End wandb run
        wandb.finish()

        return {
            'results': results,
            'model': model,
            'tokenizer': tokenizer,
            'label_mapping': label_mapping,
            'trainer': trainer
        }

    except Exception as e:
        print(f"Error in training {dataset_name} dataset: {str(e)}")
        # Imported here because the notebook has no module-level
        # `import traceback`; without it this handler raised NameError, hid the
        # real error and never reached wandb.finish().
        import traceback
        traceback.print_exc()
        wandb.finish()
        raise

def debug_dataset(df, model, tokenizer, label_mapping, dataset_name):
    """Collect misclassifications for one dataset and save them as CSV.

    Args:
        df: dataset forwarded to ``debug_misclassifications``.
        model: trained model to analyse.
        tokenizer: tokenizer matching ``model``.
        label_mapping: label mapping forwarded to ``debug_misclassifications``.
        dataset_name: label passed on as ``dataset_type`` and used in the
            output file name ``misclassifications_{dataset_name}.csv``, which
            is written relative to the current working directory.

    Returns:
        The object returned by ``debug_misclassifications``.

    Raises:
        Exception: any failure is printed with its traceback and re-raised.
    """
    try:
        print(f"\nDebugging {dataset_name} dataset...")
        misclass_df = debug_misclassifications(
            df, model, tokenizer, label_mapping, dataset_type=dataset_name
        )

        # Save misclassifications to CSV
        output_file = f"misclassifications_{dataset_name}.csv"
        misclass_df.to_csv(output_file, index=False)
        print(f"\nMisclassification results saved to {output_file}")

        return misclass_df

    except Exception as e:
        print(f"Error in debugging {dataset_name} dataset: {str(e)}")
        # Imported here because the notebook has no module-level
        # `import traceback`; without it this handler raised NameError and
        # masked the original error.
        import traceback
        traceback.print_exc()
        raise

def train_and_debug_all_datasets(setup_dict):
    """Run training followed by misclassification debugging for every dataset.

    The full, UA and CC datasets are processed in that order. The returned
    dict maps 'full', 'ua' and 'cc' to
    ``{'training': <train_single_dataset output>, 'debugging': <debug_dataset output>}``.
    """
    # (dataset name / result key, setup_dict key of the frame, banner label)
    runs = (
        ('full', 'df_normalized', 'Full'),
        ('ua', 'df_normalized_ua', 'UA'),
        ('cc', 'df_normalized_cc', 'CC'),
    )

    results = {}
    for name, frame_key, banner in runs:
        print(f"\n=== Processing {banner} Dataset ===")
        frame = setup_dict[frame_key]

        trained = train_single_dataset(
            frame,
            setup_dict['model_name'],
            setup_dict['output_dir'],
            setup_dict['current_date'],
            name,
        )
        misclassified = debug_dataset(
            frame,
            trained['model'],
            trained['tokenizer'],
            trained['label_mapping'],
            name,
        )
        results[name] = {'training': trained, 'debugging': misclassified}

    return results

In [None]:
# Log in, resolve output paths and load the three dataframes; the returned
# dict is the shared input of the training and debugging cells below.
setup_dict = setup_training()

In [None]:
# Train on the UA dataset only; the returned dict carries the model,
# tokenizer, label mapping, trainer and evaluation results.
ua_results = train_single_dataset(
    df=setup_dict['df_normalized_ua'],
    model_name=setup_dict['model_name'],
    output_dir=setup_dict['output_dir'],
    current_date=setup_dict['current_date'],
    dataset_name='ua',
)

In [None]:
# Inspect the UA model's misclassifications on the UA dataframe.
ua_misclass = debug_dataset(
    df=setup_dict['df_normalized_ua'],
    model=ua_results['model'],
    tokenizer=ua_results['tokenizer'],
    label_mapping=ua_results['label_mapping'],
    dataset_name='ua',
)

In [None]:
# `train_cc` is neither defined nor imported in this notebook, so the CC run
# goes through train_single_dataset and is unpacked into the same four names.
cc_run = train_single_dataset(
    setup_dict['df_normalized_cc'],
    setup_dict['model_name'],
    setup_dict['output_dir'],
    setup_dict['current_date'],
    'cc'
)
cc_results = cc_run['results']
cc_model = cc_run['model']
cc_tokenizer = cc_run['tokenizer']
cc_label_mapping = cc_run['label_mapping']

In [None]:
# Train full dataset
# `train_full` is neither defined nor imported in this notebook, so the run
# goes through train_single_dataset and is unpacked into the same four names.
full_run = train_single_dataset(
    setup_dict['df_normalized'],
    setup_dict['model_name'],
    setup_dict['output_dir'],
    setup_dict['current_date'],
    'full'
)
results = full_run['results']
model = full_run['model']
tokenizer = full_run['tokenizer']
label_mapping = full_run['label_mapping']