# Llama Model Training

The Llama model's performance across the different datasets reveals interesting patterns in how it handles narrative classification tasks. When trained on the Climate Change (CC) dataset alone, the model achieved its strongest results, reaching a validation accuracy of 0.625 and an F1 score of 0.665 after the third fine-tuning epoch. This suggests that the model learned to distinguish between the different climate change-related narratives reasonably well, although the rarest narratives were still frequently missed.

However, when trained on the full combined dataset, which included both climate change and Ukraine-related narratives, the model's performance decreased notably, achieving an accuracy of 0.316 and an F1 score of 0.431. This decline in performance tells us something important about how Llama handles increasing narrative complexity. The lower metrics on the combined dataset likely indicate that the model struggled to maintain clear boundaries between similar narrative types when dealing with a broader context spanning multiple domains.

The difference in performance between the focused CC dataset and the combined dataset highlights a fundamental challenge in narrative classification: as the number and variety of possible narratives increase, the task of distinguishing between them becomes considerably harder. This is particularly relevant when narratives from different domains share similar linguistic patterns or rhetorical structures, making it harder for the model to draw clean distinctions.

In [1]:
import os
import pandas as pd
import wandb
import torch
import logging
from datetime import datetime
from huggingface_hub import login

from model import initialize_model, setup_peft
from data_utils import prepare_data, get_predictions_batch, prepare_data_for_model, ensure_model_on_device
from trainer import train_model
from debug_utils import debug_misclassifications

# Configure logging for the notebook: INFO level and above, each record
# prefixed with a timestamp and its severity.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger named after the current module.
logger = logging.getLogger(__name__)


In [2]:
def train_single_dataset(df, model_name, output_dir, current_date, dataset_name):
    """
    Train, evaluate and save a model for a single dataset.

    Creates a dataset-specific output directory, opens a wandb run, prepares
    the data with ``prepare_data``, builds the model with ``initialize_model``
    and ``setup_peft``, trains it through ``train_model``, evaluates it and
    finally saves the model and tokenizer to the dataset directory.

    Args:
        df: DataFrame containing the dataset
        model_name: Name of the model to use
        output_dir: Directory to save outputs
        current_date: Current date string for naming
        dataset_name: Name of the dataset for logging

    Returns:
        tuple: (results, model, tokenizer, label_mapping, df) where ``results``
        is whatever ``trainer.evaluate()`` returned and ``df`` is the same
        object that was passed in.

    Raises:
        Exception: any error raised while preparing, training, evaluating or
        saving is printed and re-raised after the wandb run has been closed
        as failed.
    """
    try:
        # Create dataset-specific output directory
        dataset_output_dir = os.path.join(output_dir, f"{dataset_name}_{current_date}")
        os.makedirs(dataset_output_dir, exist_ok=True)

        print(f"\nTraining on {dataset_name} dataset...")

        # Initialize wandb run for this dataset
        wandb.init(
            project="llama-classification",
            name=f"llama-classification-{dataset_name}-{current_date}",
            reinit=True
        )

        # Prepare data
        train_dataset, val_dataset, tokenizer, label_mapping, num_labels = prepare_data(
            df,
            model_name,
            dataset_output_dir
        )

        # Initialize and setup model
        print("\nInitializing model...")
        model = initialize_model(model_name, num_labels)
        model = setup_peft(model)

        # Move model to GPU if available
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        # Create data collator that pads each batch
        from transformers import DataCollatorWithPadding

        data_collator = DataCollatorWithPadding(
            tokenizer=tokenizer,
            padding=True,
            max_length=512,
            return_tensors="pt"
        )

        def collate_fn(batch):
            # Collate the batch using the data collator, then move every
            # tensor to the same device as the model.
            batch = data_collator(batch)
            return {k: v.to(device) if isinstance(v, torch.Tensor) else v
                    for k, v in batch.items()}

        # Train model with custom collate_fn
        trainer = train_model(
            model,
            train_dataset,
            val_dataset,
            dataset_output_dir,
            current_date,
            dataset_name,
            collate_fn=collate_fn  # Pass the custom collate function
        )

        # Evaluate model
        print("\nEvaluating model...")
        results = trainer.evaluate()

        print(f"\nEvaluation results for {dataset_name} dataset:")
        for metric, value in results.items():
            if isinstance(value, float):
                print(f"{metric}: {value:.4f}")
            else:
                print(f"{metric}: {value}")

        # Save model and tokenizer
        print(f"\nSaving {dataset_name} model...")
        trainer.save_model(dataset_output_dir)
        tokenizer.save_pretrained(dataset_output_dir)

    except Exception as e:
        print(f"Error in training {dataset_name} dataset: {str(e)}")
        # Close the run with a non-zero exit code so that wandb records it as
        # failed instead of listing a crashed run as finished successfully.
        wandb.finish(exit_code=1)
        raise

    # End wandb run (success path)
    wandb.finish()

    # Return df along with other outputs
    return results, model, tokenizer, label_mapping, df

In [4]:
def setup_training():
    """
    Prepare the shared configuration used by the training entry points.

    Logs in to Hugging Face when a token is available in the environment,
    reports CUDA availability, locates the repository root (the closest
    ancestor of the working directory that contains ``.git``), creates the
    dated output directory under ``<repo>/code/models`` and loads the three
    normalized CSV datasets from ``<repo>/code``.

    Returns:
        dict: keys ``output_dir``, ``current_date``, ``model_name``,
        ``df_normalized``, ``df_normalized_ua`` and ``df_normalized_cc``.

    Raises:
        FileNotFoundError: if no ``.git`` directory is found in the working
        directory or any of its ancestors.
        Exception: any other error is printed with its traceback and re-raised.
    """
    try:
        # SECURITY: the access token must never be committed with the
        # notebook. It is read from the environment instead; the token that
        # was previously hard-coded here should be revoked.
        hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
        if hf_token:
            login(token=hf_token)
        else:
            print("HF_TOKEN is not set - skipping Hugging Face login "
                  "and relying on cached credentials.")

        # Check CUDA availability
        print(f"CUDA Available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"GPU Device: {torch.cuda.get_device_name(0)}")
            print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

        def find_repo_root():
            # Walk upwards from the working directory; the filesystem root
            # itself is checked too before giving up.
            current = os.getcwd()
            while True:
                if os.path.exists(os.path.join(current, '.git')):
                    return current
                parent = os.path.dirname(current)
                if parent == current:
                    raise FileNotFoundError(
                        "No .git directory found - repository root could not be determined"
                    )
                current = parent

        # Set paths using repository root
        repo_root = find_repo_root()
        code_path = os.path.join(repo_root, "code")
        current_date = datetime.now().strftime("%Y%m%d")
        output_dir = os.path.join(code_path, "models", f"llama_{current_date}")
        os.makedirs(output_dir, exist_ok=True)

        # Load data from code directory
        print("\nLoading datasets...")
        print(f"Repository root: {repo_root}")
        print(f"Looking for data files in: {code_path}")
        input_file_full = os.path.join(code_path, "df_normalized.csv")
        input_file_ua = os.path.join(code_path, "df_normalized_ua.csv")
        input_file_cc = os.path.join(code_path, "df_normalized_cc.csv")

        df_normalized = pd.read_csv(input_file_full)
        df_normalized_ua = pd.read_csv(input_file_ua)
        df_normalized_cc = pd.read_csv(input_file_cc)

        # Model configuration
        model_name = "openlm-research/open_llama_3b"

        return {
            'output_dir': output_dir,
            'current_date': current_date,
            'model_name': model_name,
            'df_normalized': df_normalized,
            'df_normalized_ua': df_normalized_ua,
            'df_normalized_cc': df_normalized_cc
        }

    except Exception as e:
        print(f"Error in setup: {str(e)}")
        import traceback
        traceback.print_exc()
        wandb.finish()
        raise

def train_ua():
    """
    Train the model on the UA dataset.

    Builds a fresh configuration with ``setup_training`` and hands the UA
    DataFrame to ``train_single_dataset`` under the dataset name "ua".

    Returns:
        tuple: (results, model, tokenizer, label_mapping); the DataFrame that
        ``train_single_dataset`` also returns is dropped.
    """
    import traceback

    try:
        settings = setup_training()

        print("\nStarting UA dataset training...")
        results, model, tokenizer, label_mapping, _ = train_single_dataset(
            df=settings['df_normalized_ua'],
            model_name=settings['model_name'],
            output_dir=settings['output_dir'],
            current_date=settings['current_date'],
            dataset_name="ua",
        )
    except Exception as err:
        print(f"Error in UA training: {err!s}")
        traceback.print_exc()
        wandb.finish()
        raise

    return results, model, tokenizer, label_mapping

def train_cc():
    """
    Train the model on the CC dataset.

    Builds a fresh configuration with ``setup_training`` and hands the CC
    DataFrame to ``train_single_dataset`` under the dataset name "cc".

    Returns:
        tuple: (results, model, tokenizer, label_mapping); the DataFrame that
        ``train_single_dataset`` also returns is dropped.
    """
    import traceback

    try:
        settings = setup_training()

        print("\nStarting CC dataset training...")
        results, model, tokenizer, label_mapping, _ = train_single_dataset(
            df=settings['df_normalized_cc'],
            model_name=settings['model_name'],
            output_dir=settings['output_dir'],
            current_date=settings['current_date'],
            dataset_name="cc",
        )
    except Exception as err:
        print(f"Error in CC training: {err!s}")
        traceback.print_exc()
        wandb.finish()
        raise

    return results, model, tokenizer, label_mapping

def train_full():
    """
    Train the model on the full (combined) dataset.

    Builds a fresh configuration with ``setup_training`` and hands the full
    DataFrame to ``train_single_dataset`` under the dataset name "full".

    Returns:
        tuple: (results, model, tokenizer, label_mapping); the DataFrame that
        ``train_single_dataset`` also returns is dropped.
    """
    import traceback

    try:
        settings = setup_training()

        print("\nStarting full dataset training...")
        results, model, tokenizer, label_mapping, _ = train_single_dataset(
            df=settings['df_normalized'],
            model_name=settings['model_name'],
            output_dir=settings['output_dir'],
            current_date=settings['current_date'],
            dataset_name="full",
        )
    except Exception as err:
        print(f"Error in full dataset training: {err!s}")
        traceback.print_exc()
        wandb.finish()
        raise

    return results, model, tokenizer, label_mapping

In [None]:
def debug_model(model, dataset, tokenizer, label_mapping, dataset_type="Training"):
    """
    Run debug analysis on model predictions.

    Predicts every row of ``dataset`` in batches of 8, compares the
    predictions with the label derived from the first entry of each row's
    ``narrative_subnarrative_pairs``, and prints the accuracy, the
    actual-vs-predicted distribution and a few sample misclassifications.

    Args:
        model: Model to analyze; it is passed through ``ensure_model_on_device``.
        dataset: DataFrame with the columns ``tokens_normalized`` and
            ``narrative_subnarrative_pairs``.
        tokenizer: Tokenizer forwarded to ``get_predictions_batch``.
        label_mapping: Mapping from narrative key to integer label.
        dataset_type: Name used in the printed output and stored in each
            misclassification record.

    Returns:
        pandas.DataFrame: one row per misclassified sample (text truncated to
        200 characters, predicted label, actual label, confidence, dataset
        type). Empty when there are no misclassifications or no samples.

    Raises:
        Exception: any error is printed with its traceback and re-raised.
    """
    import ast  # local import: used only to parse serialized pair lists
    import traceback

    try:
        # An empty dataset would otherwise fail in torch.cat([]) and in the
        # accuracy division below.
        if len(dataset) == 0:
            print(f"\nNo samples in {dataset_type} dataset - nothing to analyze.")
            return pd.DataFrame([])

        # Set up model and device
        model, device = ensure_model_on_device(model)
        print(f"\nAnalyzing {dataset_type} dataset...")

        # Prepare texts: token lists are joined into one string
        texts = dataset['tokens_normalized'].apply(
            lambda x: ' '.join(x) if isinstance(x, list) else x
        ).tolist()

        def first_pair(raw):
            # Values read back from CSV are strings; parse them as Python
            # literals only. ast.literal_eval replaces eval() so that file
            # content can never execute arbitrary code.
            pairs = ast.literal_eval(raw) if isinstance(raw, str) else raw
            return pairs[0]

        # NOTE(review): get_narrative_key is not among the names imported at
        # the top of this notebook - confirm it is imported or defined before
        # this function is called.
        true_labels = torch.tensor([
            label_mapping[get_narrative_key(first_pair(n))]
            for n in dataset['narrative_subnarrative_pairs']
        ]).to(device)

        print(f"Total samples: {len(texts)}")

        # Get predictions in batches
        batch_size = 8
        predictions = []
        confidences = []

        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            batch_preds, batch_confs = get_predictions_batch(model, batch_texts, tokenizer, device)
            predictions.append(batch_preds)
            confidences.append(batch_confs)

        # Concatenate and move to CPU
        predictions = torch.cat(predictions).cpu().numpy()
        confidences = torch.cat(confidences).cpu().numpy()
        true_labels = true_labels.cpu().numpy()

        # Track misclassifications
        misclassifications = [
            {
                'text': texts[idx][:200],
                'predicted': pred,
                'actual': true,
                'confidence': conf,
                'dataset_type': dataset_type
            }
            for idx, (pred, true, conf) in enumerate(zip(predictions, true_labels, confidences))
            if pred != true
        ]

        # Create DataFrame and display results
        misclass_df = pd.DataFrame(misclassifications)
        print(f"\nTotal misclassifications: {len(misclass_df)}")
        print(f"Accuracy: {1 - len(misclass_df)/len(texts):.4f}")

        if len(misclass_df) > 0:
            print("\nMisclassification distribution:")
            print(misclass_df.groupby(['actual', 'predicted']).size().unstack(fill_value=0))

            print("\nSample misclassifications:")
            for i, row in misclass_df.head().iterrows():
                print(f"\nExample {i+1}:")
                print(f"Text: {row['text']}")
                print(f"Predicted: {row['predicted']}, Actual: {row['actual']}")
                print(f"Confidence: {row['confidence']:.4f}")

        return misclass_df

    except Exception as e:
        print(f"Error in debug analysis: {str(e)}")
        traceback.print_exc()
        raise

In [5]:

# Build the shared configuration dict (paths, model name, loaded DataFrames).
config = setup_training()


CUDA Available: True
GPU Device: NVIDIA GeForce RTX 2080 Ti
GPU Memory: 11.81 GB

Loading datasets...
Repository root: c:\Users\krona\OneDrive - TU Wien\TU Wien\1. Semester\NLP\nlp_Backpropagandists_2024
Looking for data files in: c:\Users\krona\OneDrive - TU Wien\TU Wien\1. Semester\NLP\nlp_Backpropagandists_2024\code


In [6]:
# Train UA dataset
# train_ua() returns four values (results, model, tokenizer, label mapping);
# unpacking five raised a ValueError. The UA DataFrame used later for the
# debug analysis is taken from the configuration loaded in the previous cell.
ua_results, ua_model, ua_tokenizer, ua_label_mapping = train_ua()
df_normalized_ua = config['df_normalized_ua']

CUDA Available: True
GPU Device: NVIDIA GeForce RTX 2080 Ti
GPU Memory: 11.81 GB

Loading datasets...
Repository root: c:\Users\krona\OneDrive - TU Wien\TU Wien\1. Semester\NLP\nlp_Backpropagandists_2024
Looking for data files in: c:\Users\krona\OneDrive - TU Wien\TU Wien\1. Semester\NLP\nlp_Backpropagandists_2024\code

Starting UA dataset training...

Training on ua dataset...


[34m[1mwandb[0m: Currently logged in as: [33mjonaskruse[0m ([33mbackpropagandists[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.



Creating narrative mapping...
Number of unique narratives: 12

Sample narrative mappings:
0: Amplifying war-related fears
1: Blaming the war on others rather than the invader
2: Discrediting Ukraine
3: Discrediting the West, Diplomacy
4: Distrust towards Media

Training set size: 940
Validation set size: 235

Initializing tokenizer...


You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message



Tokenizing texts...

Initializing model...


2025-01-24 22:13:14,924 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at openlm-research/open_llama_3b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 10,688,000 || all params: 3,334,800,000 || trainable%: 0.3205

Starting classification head pre-training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Confusion Matrix
1,2.1328,2.16467,0.242553,0.14129,0.115708,0.242553,"{'Class_0': [[216, 0], [19, 0]], 'Class_1': [[219, 0], [16, 0]], 'Class_2': [[54, 140], [4, 37]], 'Class_3': [[206, 0], [29, 0]], 'Class_4': [[233, 0], [2, 0]], 'Class_5': [[233, 0], [2, 0]], 'Class_6': [[226, 0], [9, 0]], 'Class_7': [[143, 38], [34, 20]], 'Class_8': [[233, 0], [2, 0]], 'Class_9': [[199, 0], [36, 0]], 'Class_10': [[221, 0], [14, 0]], 'Class_11': [[224, 0], [11, 0]]}"



Metrics for Class 0:
Confusion Matrix:
[[216   0]
 [ 19   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 1:
Confusion Matrix:
[[219   0]
 [ 16   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 2:
Confusion Matrix:
[[ 54 140]
 [  4  37]]
Precision: 0.2090
Recall: 0.9024
F1 Score: 0.3394

Metrics for Class 3:
Confusion Matrix:
[[206   0]
 [ 29   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 4:
Confusion Matrix:
[[233   0]
 [  2   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 5:
Confusion Matrix:
[[233   0]
 [  2   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 6:
Confusion Matrix:
[[226   0]
 [  9   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 7:
Confusion Matrix:
[[143  38]
 [ 34  20]]
Precision: 0.3448
Recall: 0.3704
F1 Score: 0.3571

Metrics for Class 8:
Confusion Matrix:
[[233   0]
 [  2   0]]
Precision: 0.0000
Recall: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize


Starting full model fine-tuning...




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Confusion Matrix
1,2.144,2.120298,0.246809,0.145393,0.11967,0.246809,"{'Class_0': [[216, 0], [19, 0]], 'Class_1': [[219, 0], [16, 0]], 'Class_2': [[54, 140], [4, 37]], 'Class_3': [[206, 0], [29, 0]], 'Class_4': [[233, 0], [2, 0]], 'Class_5': [[233, 0], [2, 0]], 'Class_6': [[226, 0], [9, 0]], 'Class_7': [[144, 37], [33, 21]], 'Class_8': [[233, 0], [2, 0]], 'Class_9': [[199, 0], [36, 0]], 'Class_10': [[221, 0], [14, 0]], 'Class_11': [[224, 0], [11, 0]]}"
2,2.1799,2.122152,0.246809,0.145393,0.11967,0.246809,"{'Class_0': [[216, 0], [19, 0]], 'Class_1': [[219, 0], [16, 0]], 'Class_2': [[54, 140], [4, 37]], 'Class_3': [[206, 0], [29, 0]], 'Class_4': [[233, 0], [2, 0]], 'Class_5': [[233, 0], [2, 0]], 'Class_6': [[226, 0], [9, 0]], 'Class_7': [[144, 37], [33, 21]], 'Class_8': [[233, 0], [2, 0]], 'Class_9': [[199, 0], [36, 0]], 'Class_10': [[221, 0], [14, 0]], 'Class_11': [[224, 0], [11, 0]]}"



Metrics for Class 0:
Confusion Matrix:
[[216   0]
 [ 19   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 1:
Confusion Matrix:
[[219   0]
 [ 16   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 2:
Confusion Matrix:
[[ 54 140]
 [  4  37]]
Precision: 0.2090
Recall: 0.9024
F1 Score: 0.3394

Metrics for Class 3:
Confusion Matrix:
[[206   0]
 [ 29   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 4:
Confusion Matrix:
[[233   0]
 [  2   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 5:
Confusion Matrix:
[[233   0]
 [  2   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 6:
Confusion Matrix:
[[226   0]
 [  9   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 7:
Confusion Matrix:
[[144  37]
 [ 33  21]]
Precision: 0.3621
Recall: 0.3889
F1 Score: 0.3750

Metrics for Class 8:
Confusion Matrix:
[[233   0]
 [  2   0]]
Precision: 0.0000
Recall: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize


Metrics for Class 0:
Confusion Matrix:
[[216   0]
 [ 19   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 1:
Confusion Matrix:
[[219   0]
 [ 16   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 2:
Confusion Matrix:
[[ 54 140]
 [  4  37]]
Precision: 0.2090
Recall: 0.9024
F1 Score: 0.3394

Metrics for Class 3:
Confusion Matrix:
[[206   0]
 [ 29   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 4:
Confusion Matrix:
[[233   0]
 [  2   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 5:
Confusion Matrix:
[[233   0]
 [  2   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 6:
Confusion Matrix:
[[226   0]
 [  9   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 7:
Confusion Matrix:
[[144  37]
 [ 33  21]]
Precision: 0.3621
Recall: 0.3889
F1 Score: 0.3750

Metrics for Class 8:
Confusion Matrix:
[[233   0]
 [  2   0]]
Precision: 0.0000
Recall: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize


Metrics for Class 0:
Confusion Matrix:
[[216   0]
 [ 19   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 1:
Confusion Matrix:
[[219   0]
 [ 16   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 2:
Confusion Matrix:
[[ 54 140]
 [  4  37]]
Precision: 0.2090
Recall: 0.9024
F1 Score: 0.3394

Metrics for Class 3:
Confusion Matrix:
[[206   0]
 [ 29   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 4:
Confusion Matrix:
[[233   0]
 [  2   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 5:
Confusion Matrix:
[[233   0]
 [  2   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 6:
Confusion Matrix:
[[226   0]
 [  9   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 7:
Confusion Matrix:
[[144  37]
 [ 33  21]]
Precision: 0.3621
Recall: 0.3889
F1 Score: 0.3750

Metrics for Class 8:
Confusion Matrix:
[[233   0]
 [  2   0]]
Precision: 0.0000
Recall: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize


Evaluating model...





Metrics for Class 0:
Confusion Matrix:
[[216   0]
 [ 19   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 1:
Confusion Matrix:
[[219   0]
 [ 16   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 2:
Confusion Matrix:
[[ 54 140]
 [  4  37]]
Precision: 0.2090
Recall: 0.9024
F1 Score: 0.3394

Metrics for Class 3:
Confusion Matrix:
[[206   0]
 [ 29   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 4:
Confusion Matrix:
[[233   0]
 [  2   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 5:
Confusion Matrix:
[[233   0]
 [  2   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 6:
Confusion Matrix:
[[226   0]
 [  9   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 7:
Confusion Matrix:
[[144  37]
 [ 33  21]]
Precision: 0.3621
Recall: 0.3889
F1 Score: 0.3750

Metrics for Class 8:
Confusion Matrix:
[[233   0]
 [  2   0]]
Precision: 0.0000
Recall: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize


Analyzing misclassifications in Training dataset...
Model is on device: cuda:0

Total samples to analyze: 1175


KeyboardInterrupt: 

In [None]:
# Collect the misclassified UA samples of the trained UA model for inspection.
ua_debug_df = debug_model(ua_model, df_normalized_ua, ua_tokenizer, ua_label_mapping, "UA")

In [8]:
# Train CC dataset; train_cc() returns (results, model, tokenizer, label mapping).
cc_results, cc_model, cc_tokenizer, cc_label_mapping = train_cc()

CUDA Available: True
GPU Device: NVIDIA L40S
GPU Memory: 47.81 GB

Loading datasets...
Repository root: /teamspace/studios/this_studio/nlp_Backpropagandists_2024
Looking for data files in: /teamspace/studios/this_studio/nlp_Backpropagandists_2024/code

Starting CC dataset training...

Training on cc dataset...



Creating narrative mapping...
Number of unique narratives: 11

Sample narrative mappings:
0: Amplifying Climate Fears
1: Climate change is beneficial
2: Controversy about green technologies
3: Criticism of climate movement
4: Criticism of climate policies

Training set size: 415
Validation set size: 104

Initializing tokenizer...

Tokenizing texts...

Initializing model...


2025-01-21 13:33:51,645 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at openlm-research/open_llama_7b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 16,822,272 || all params: 6,624,210,944 || trainable%: 0.2540

Starting classification head pre-training...




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Confusion Matrix
1,1.6289,1.775053,0.490385,0.526456,0.488726,0.490385,"{'Class_0': [[40, 24], [3, 37]], 'Class_1': [[103, 0], [1, 0]], 'Class_3': [[94, 2], [6, 2]], 'Class_4': [[98, 2], [4, 0]], 'Class_5': [[82, 10], [10, 2]], 'Class_6': [[95, 2], [7, 0]], 'Class_7': [[103, 0], [1, 0]], 'Class_8': [[96, 0], [8, 0]], 'Class_9': [[69, 13], [12, 10]], 'Class_10': [[103, 0], [1, 0]]}"



Metrics for Class 0:
Confusion Matrix:
[[40 24]
 [ 3 37]]
Precision: 0.6066
Recall: 0.9250
F1 Score: 0.7327

Metrics for Class 1:
Confusion Matrix:
[[103   0]
 [  1   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 3:
Confusion Matrix:
[[94  2]
 [ 6  2]]
Precision: 0.5000
Recall: 0.2500
F1 Score: 0.3333

Metrics for Class 4:
Confusion Matrix:
[[98  2]
 [ 4  0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 5:
Confusion Matrix:
[[82 10]
 [10  2]]
Precision: 0.1667
Recall: 0.1667
F1 Score: 0.1667

Metrics for Class 6:
Confusion Matrix:
[[95  2]
 [ 7  0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 7:
Confusion Matrix:
[[103   0]
 [  1   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 8:
Confusion Matrix:
[[96  0]
 [ 8  0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 9:
Confusion Matrix:
[[69 13]
 [12 10]]
Precision: 0.4348
Recall: 0.4545
F1 Score: 0.4444

Metrics fo

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Unfreezing LoRA adapters for full training...

Starting full model fine-tuning...




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Confusion Matrix
1,2.8536,1.626752,0.548077,0.562827,0.544105,0.548077,"{'Class_0': [[40, 24], [1, 39]], 'Class_1': [[103, 0], [1, 0]], 'Class_3': [[95, 1], [7, 1]], 'Class_4': [[99, 1], [4, 0]], 'Class_5': [[90, 2], [9, 3]], 'Class_6': [[96, 1], [7, 0]], 'Class_7': [[103, 0], [1, 0]], 'Class_8': [[96, 0], [8, 0]], 'Class_9': [[64, 18], [8, 14]], 'Class_10': [[103, 0], [1, 0]]}"
2,1.1771,1.612782,0.548077,0.593278,0.522068,0.548077,"{'Class_0': [[50, 14], [5, 35]], 'Class_1': [[103, 0], [1, 0]], 'Class_3': [[94, 2], [5, 3]], 'Class_4': [[93, 7], [2, 2]], 'Class_5': [[85, 7], [9, 3]], 'Class_6': [[95, 2], [7, 0]], 'Class_7': [[103, 0], [1, 0]], 'Class_8': [[93, 3], [7, 1]], 'Class_9': [[72, 10], [9, 13]], 'Class_10': [[102, 1], [1, 0]]}"
3,0.1479,1.666802,0.625,0.664835,0.632051,0.625,"{'Class_0': [[51, 13], [3, 37]], 'Class_1': [[103, 0], [1, 0]], 'Class_3': [[93, 3], [3, 5]], 'Class_4': [[98, 2], [3, 1]], 'Class_5': [[83, 9], [6, 6]], 'Class_6': [[95, 2], [7, 0]], 'Class_7': [[103, 0], [1, 0]], 'Class_8': [[96, 0], [7, 1]], 'Class_9': [[75, 7], [7, 15]], 'Class_10': [[102, 1], [1, 0]]}"



Metrics for Class 0:
Confusion Matrix:
[[40 24]
 [ 1 39]]
Precision: 0.6190
Recall: 0.9750
F1 Score: 0.7573

Metrics for Class 1:
Confusion Matrix:
[[103   0]
 [  1   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 3:
Confusion Matrix:
[[95  1]
 [ 7  1]]
Precision: 0.5000
Recall: 0.1250
F1 Score: 0.2000

Metrics for Class 4:
Confusion Matrix:
[[99  1]
 [ 4  0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 5:
Confusion Matrix:
[[90  2]
 [ 9  3]]
Precision: 0.6000
Recall: 0.2500
F1 Score: 0.3529

Metrics for Class 6:
Confusion Matrix:
[[96  1]
 [ 7  0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 7:
Confusion Matrix:
[[103   0]
 [  1   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 8:
Confusion Matrix:
[[96  0]
 [ 8  0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 9:
Confusion Matrix:
[[64 18]
 [ 8 14]]
Precision: 0.4375
Recall: 0.6364
F1 Score: 0.5185

Metrics fo

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Metrics for Class 0:
Confusion Matrix:
[[50 14]
 [ 5 35]]
Precision: 0.7143
Recall: 0.8750
F1 Score: 0.7865

Metrics for Class 1:
Confusion Matrix:
[[103   0]
 [  1   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 3:
Confusion Matrix:
[[94  2]
 [ 5  3]]
Precision: 0.6000
Recall: 0.3750
F1 Score: 0.4615

Metrics for Class 4:
Confusion Matrix:
[[93  7]
 [ 2  2]]
Precision: 0.2222
Recall: 0.5000
F1 Score: 0.3077

Metrics for Class 5:
Confusion Matrix:
[[85  7]
 [ 9  3]]
Precision: 0.3000
Recall: 0.2500
F1 Score: 0.2727

Metrics for Class 6:
Confusion Matrix:
[[95  2]
 [ 7  0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 7:
Confusion Matrix:
[[103   0]
 [  1   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 8:
Confusion Matrix:
[[93  3]
 [ 7  1]]
Precision: 0.2500
Recall: 0.1250
F1 Score: 0.1667

Metrics for Class 9:
Confusion Matrix:
[[72 10]
 [ 9 13]]
Precision: 0.5652
Recall: 0.5909
F1 Score: 0.5778

Metrics fo

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Metrics for Class 0:
Confusion Matrix:
[[51 13]
 [ 3 37]]
Precision: 0.7400
Recall: 0.9250
F1 Score: 0.8222

Metrics for Class 1:
Confusion Matrix:
[[103   0]
 [  1   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 3:
Confusion Matrix:
[[93  3]
 [ 3  5]]
Precision: 0.6250
Recall: 0.6250
F1 Score: 0.6250

Metrics for Class 4:
Confusion Matrix:
[[98  2]
 [ 3  1]]
Precision: 0.3333
Recall: 0.2500
F1 Score: 0.2857

Metrics for Class 5:
Confusion Matrix:
[[83  9]
 [ 6  6]]
Precision: 0.4000
Recall: 0.5000
F1 Score: 0.4444

Metrics for Class 6:
Confusion Matrix:
[[95  2]
 [ 7  0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 7:
Confusion Matrix:
[[103   0]
 [  1   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 8:
Confusion Matrix:
[[96  0]
 [ 7  1]]
Precision: 1.0000
Recall: 0.1250
F1 Score: 0.2222

Metrics for Class 9:
Confusion Matrix:
[[75  7]
 [ 7 15]]
Precision: 0.6818
Recall: 0.6818
F1 Score: 0.6818

Metrics fo

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Evaluating model...



Metrics for Class 0:
Confusion Matrix:
[[50 14]
 [ 5 35]]
Precision: 0.7143
Recall: 0.8750
F1 Score: 0.7865

Metrics for Class 1:
Confusion Matrix:
[[103   0]
 [  1   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 3:
Confusion Matrix:
[[94  2]
 [ 5  3]]
Precision: 0.6000
Recall: 0.3750
F1 Score: 0.4615

Metrics for Class 4:
Confusion Matrix:
[[93  7]
 [ 2  2]]
Precision: 0.2222
Recall: 0.5000
F1 Score: 0.3077

Metrics for Class 5:
Confusion Matrix:
[[85  7]
 [ 9  3]]
Precision: 0.3000
Recall: 0.2500
F1 Score: 0.2727

Metrics for Class 6:
Confusion Matrix:
[[95  2]
 [ 7  0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 7:
Confusion Matrix:
[[103   0]
 [  1   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 8:
Confusion Matrix:
[[93  3]
 [ 7  1]]
Precision: 0.2500
Recall: 0.1250
F1 Score: 0.1667

Metrics for Class 9:
Confusion Matrix:
[[72 10]
 [ 9 13]]
Precision: 0.5652
Recall: 0.5909
F1 Score: 0.5778

Metrics fo

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
2025-01-21 13:56:17,667 - ERROR - Error in debugging misclassifications: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)


Error in CC training: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)


Traceback (most recent call last):
  File "/tmp/ipykernel_1941/2645010921.py", line 91, in train_cc
    cc_results, cc_model, cc_tokenizer, cc_label_mapping = train_single_dataset(
  File "/tmp/ipykernel_1941/2084970581.py", line 56, in train_single_dataset
    misclass_df = debug_misclassifications(df, model, tokenizer, label_mapping)
  File "/teamspace/studios/this_studio/nlp_Backpropagandists_2024/code/modules/utils.py", line 126, in debug_misclassifications
    outputs = model(input_ids, attention_mask=attention_mask)
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/peft/peft_model.py", line 1521, in forward
   

0,1
eval/accuracy,▁▄▄█▄
eval/f1,▁▃▄█▄
eval/loss,█▂▁▃▁
eval/precision,▁▄▃█▃
eval/recall,▁▄▄█▄
eval/runtime,▁▇▆▇█
eval/samples_per_second,█▂▃▂▁
eval/steps_per_second,▁████
train/epoch,▁▁▂▂▃▃▃▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇████
train/global_step,▁▁▂▂▃▃▃▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇████

0,1
eval/accuracy,0.54808
eval/f1,0.59328
eval/loss,1.61278
eval/precision,0.52207
eval/recall,0.54808
eval/runtime,27.0088
eval/samples_per_second,3.851
eval/steps_per_second,0.963
total_flos,2.483391893078016e+16
train/epoch,3.0


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [5]:
# Train full dataset
results, model, tokenizer, label_mapping = train_full()

CUDA Available: True
GPU Device: NVIDIA L40S
GPU Memory: 47.81 GB

Loading datasets...
Repository root: /teamspace/studios/this_studio/nlp_Backpropagandists_2024
Looking for data files in: /teamspace/studios/this_studio/nlp_Backpropagandists_2024/code

Starting full dataset training...

Training on full dataset...


[34m[1mwandb[0m: Currently logged in as: [33mjonaskruse[0m ([33mbackpropagandists[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message



Creating narrative mapping...
Number of unique narratives: 21

Sample narrative mappings:
0: Amplifying Climate Fears
1: Amplifying war-related fears
2: Blaming the war on others rather than the invader
3: Climate change is beneficial
4: Controversy about green technologies

Training set size: 1355
Validation set size: 339

Initializing tokenizer...

Tokenizing texts...

Initializing model...


2025-01-21 13:59:01,940 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at openlm-research/open_llama_7b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 16,863,232 || all params: 6,624,292,864 || trainable%: 0.2546

Starting classification head pre-training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Confusion Matrix
1,2.3422,2.502463,0.247788,0.441706,0.247929,0.247788,"{'Class_0': [[266, 37], [14, 22]], 'Class_1': [[307, 9], [22, 1]], 'Class_2': [[312, 10], [17, 0]], 'Class_4': [[338, 0], [1, 0]], 'Class_5': [[334, 1], [4, 0]], 'Class_6': [[327, 9], [3, 0]], 'Class_7': [[308, 10], [20, 1]], 'Class_8': [[226, 63], [27, 23]], 'Class_9': [[288, 15], [36, 0]], 'Class_10': [[334, 0], [5, 0]], 'Class_11': [[333, 2], [4, 0]], 'Class_13': [[330, 2], [6, 1]], 'Class_14': [[333, 0], [6, 0]], 'Class_15': [[218, 63], [28, 30]], 'Class_16': [[335, 1], [3, 0]], 'Class_17': [[275, 29], [29, 6]], 'Class_18': [[336, 2], [1, 0]], 'Class_19': [[325, 0], [14, 0]], 'Class_20': [[322, 2], [15, 0]]}"



Metrics for Class 0:
Confusion Matrix:
[[266  37]
 [ 14  22]]
Precision: 0.3729
Recall: 0.6111
F1 Score: 0.4632

Metrics for Class 1:
Confusion Matrix:
[[307   9]
 [ 22   1]]
Precision: 0.1000
Recall: 0.0435
F1 Score: 0.0606

Metrics for Class 2:
Confusion Matrix:
[[312  10]
 [ 17   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 4:
Confusion Matrix:
[[338   0]
 [  1   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 5:
Confusion Matrix:
[[334   1]
 [  4   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 6:
Confusion Matrix:
[[327   9]
 [  3   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 7:
Confusion Matrix:
[[308  10]
 [ 20   1]]
Precision: 0.0909
Recall: 0.0476
F1 Score: 0.0625

Metrics for Class 8:
Confusion Matrix:
[[226  63]
 [ 27  23]]
Precision: 0.2674
Recall: 0.4600
F1 Score: 0.3382

Metrics for Class 9:
Confusion Matrix:
[[288  15]
 [ 36   0]]
Precision: 0.0000
Recall: 0.0000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Unfreezing LoRA adapters for full training...

Starting full model fine-tuning...




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Confusion Matrix
1,3.8722,2.294061,0.315634,0.430827,0.306476,0.315634,"{'Class_0': [[281, 22], [6, 30]], 'Class_1': [[287, 29], [15, 8]], 'Class_2': [[318, 4], [17, 0]], 'Class_4': [[338, 0], [1, 0]], 'Class_5': [[324, 11], [4, 0]], 'Class_6': [[336, 0], [3, 0]], 'Class_7': [[315, 3], [19, 2]], 'Class_8': [[224, 65], [24, 26]], 'Class_9': [[291, 12], [32, 4]], 'Class_10': [[334, 0], [5, 0]], 'Class_11': [[334, 1], [4, 0]], 'Class_13': [[329, 3], [6, 1]], 'Class_14': [[333, 0], [5, 1]], 'Class_15': [[232, 49], [27, 31]], 'Class_16': [[336, 0], [3, 0]], 'Class_17': [[275, 29], [31, 4]], 'Class_18': [[336, 2], [1, 0]], 'Class_19': [[324, 1], [14, 0]], 'Class_20': [[323, 1], [15, 0]]}"
2,0.6345,2.776076,0.324484,0.365485,0.318175,0.324484,"{'Class_0': [[291, 12], [6, 30]], 'Class_1': [[298, 18], [18, 5]], 'Class_2': [[313, 9], [16, 1]], 'Class_4': [[338, 0], [1, 0]], 'Class_5': [[332, 3], [4, 0]], 'Class_6': [[330, 6], [3, 0]], 'Class_7': [[313, 5], [19, 2]], 'Class_8': [[256, 33], [33, 17]], 'Class_9': [[260, 43], [26, 10]], 'Class_10': [[334, 0], [5, 0]], 'Class_11': [[333, 2], [4, 0]], 'Class_13': [[330, 2], [5, 2]], 'Class_14': [[329, 4], [6, 0]], 'Class_15': [[228, 53], [26, 32]], 'Class_16': [[335, 1], [3, 0]], 'Class_17': [[282, 22], [27, 8]], 'Class_18': [[336, 2], [1, 0]], 'Class_19': [[318, 7], [12, 2]], 'Class_20': [[317, 7], [14, 1]]}"



Metrics for Class 0:
Confusion Matrix:
[[281  22]
 [  6  30]]
Precision: 0.5769
Recall: 0.8333
F1 Score: 0.6818

Metrics for Class 1:
Confusion Matrix:
[[287  29]
 [ 15   8]]
Precision: 0.2162
Recall: 0.3478
F1 Score: 0.2667

Metrics for Class 2:
Confusion Matrix:
[[318   4]
 [ 17   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 4:
Confusion Matrix:
[[338   0]
 [  1   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 5:
Confusion Matrix:
[[324  11]
 [  4   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 6:
Confusion Matrix:
[[336   0]
 [  3   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 7:
Confusion Matrix:
[[315   3]
 [ 19   2]]
Precision: 0.4000
Recall: 0.0952
F1 Score: 0.1538

Metrics for Class 8:
Confusion Matrix:
[[224  65]
 [ 24  26]]
Precision: 0.2857
Recall: 0.5200
F1 Score: 0.3688

Metrics for Class 9:
Confusion Matrix:
[[291  12]
 [ 32   4]]
Precision: 0.2500
Recall: 0.1111


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Metrics for Class 0:
Confusion Matrix:
[[286  17]
 [  7  29]]
Precision: 0.6304
Recall: 0.8056
F1 Score: 0.7073

Metrics for Class 1:
Confusion Matrix:
[[303  13]
 [ 19   4]]
Precision: 0.2353
Recall: 0.1739
F1 Score: 0.2000

Metrics for Class 2:
Confusion Matrix:
[[315   7]
 [ 16   1]]
Precision: 0.1250
Recall: 0.0588
F1 Score: 0.0800

Metrics for Class 4:
Confusion Matrix:
[[338   0]
 [  1   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 5:
Confusion Matrix:
[[331   4]
 [  4   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 6:
Confusion Matrix:
[[331   5]
 [  3   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 7:
Confusion Matrix:
[[311   7]
 [ 14   7]]
Precision: 0.5000
Recall: 0.3333
F1 Score: 0.4000

Metrics for Class 8:
Confusion Matrix:
[[231  58]
 [ 25  25]]
Precision: 0.3012
Recall: 0.5000
F1 Score: 0.3759

Metrics for Class 9:
Confusion Matrix:
[[285  18]
 [ 29   7]]
Precision: 0.2800
Recall: 0.1944


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Metrics for Class 0:
Confusion Matrix:
[[291  12]
 [  6  30]]
Precision: 0.7143
Recall: 0.8333
F1 Score: 0.7692

Metrics for Class 1:
Confusion Matrix:
[[298  18]
 [ 18   5]]
Precision: 0.2174
Recall: 0.2174
F1 Score: 0.2174

Metrics for Class 2:
Confusion Matrix:
[[313   9]
 [ 16   1]]
Precision: 0.1000
Recall: 0.0588
F1 Score: 0.0741

Metrics for Class 4:
Confusion Matrix:
[[338   0]
 [  1   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 5:
Confusion Matrix:
[[332   3]
 [  4   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 6:
Confusion Matrix:
[[330   6]
 [  3   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 7:
Confusion Matrix:
[[313   5]
 [ 19   2]]
Precision: 0.2857
Recall: 0.0952
F1 Score: 0.1429

Metrics for Class 8:
Confusion Matrix:
[[256  33]
 [ 33  17]]
Precision: 0.3400
Recall: 0.3400
F1 Score: 0.3400

Metrics for Class 9:
Confusion Matrix:
[[260  43]
 [ 26  10]]
Precision: 0.1887
Recall: 0.2778


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Evaluating model...



Metrics for Class 0:
Confusion Matrix:
[[281  22]
 [  6  30]]
Precision: 0.5769
Recall: 0.8333
F1 Score: 0.6818

Metrics for Class 1:
Confusion Matrix:
[[287  29]
 [ 15   8]]
Precision: 0.2162
Recall: 0.3478
F1 Score: 0.2667

Metrics for Class 2:
Confusion Matrix:
[[318   4]
 [ 17   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 4:
Confusion Matrix:
[[338   0]
 [  1   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 5:
Confusion Matrix:
[[324  11]
 [  4   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 6:
Confusion Matrix:
[[336   0]
 [  3   0]]
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Metrics for Class 7:
Confusion Matrix:
[[315   3]
 [ 19   2]]
Precision: 0.4000
Recall: 0.0952
F1 Score: 0.1538

Metrics for Class 8:
Confusion Matrix:
[[224  65]
 [ 24  26]]
Precision: 0.2857
Recall: 0.5200
F1 Score: 0.3688

Metrics for Class 9:
Confusion Matrix:
[[291  12]
 [ 32   4]]
Precision: 0.2500
Recall: 0.1111


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
2025-01-21 15:12:29,904 - ERROR - Error in debugging misclassifications: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)


Error in full dataset training: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)


Traceback (most recent call last):
  File "/tmp/ipykernel_82240/2645010921.py", line 115, in train_full
    results, model, tokenizer, label_mapping = train_single_dataset(
  File "/tmp/ipykernel_82240/2084970581.py", line 56, in train_single_dataset
    misclass_df = debug_misclassifications(df, model, tokenizer, label_mapping)
  File "/teamspace/studios/this_studio/nlp_Backpropagandists_2024/code/modules/utils.py", line 126, in debug_misclassifications
    outputs = model(input_ids, attention_mask=attention_mask)
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/peft/peft_model.py", line 1521, in forward
    return

0,1
eval/accuracy,▁▇▇█▇
eval/f1,█▇▆▁▇
eval/loss,▄▁▅█▁
eval/precision,▁▇▇█▇
eval/recall,▁▇▇█▇
eval/runtime,▅█▃▁▅
eval/samples_per_second,▄▁▆█▄
eval/steps_per_second,▁▇███
train/epoch,▁▁▁▁▂▂▂▂▃▃▁▁▁▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇██
train/global_step,▁▁▁▂▂▃▃▃▃▃▁▁▁▁▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▇▇▇▇█████

0,1
eval/accuracy,0.31563
eval/f1,0.43083
eval/loss,2.29406
eval/precision,0.30648
eval/recall,0.31563
eval/runtime,86.7013
eval/samples_per_second,3.91
eval/steps_per_second,0.98
total_flos,8.070626800356557e+16
train/epoch,2.98525


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)