In [64]:
import os
import pandas as pd

import torch
print(torch.__version__)
import wandb
from transformers import LlamaTokenizer, LlamaForSequenceClassification, TrainingArguments, Trainer, BitsAndBytesConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from datetime import datetime
import logging
import json
import numpy as np
import sentencepiece
from tqdm.notebook import tqdm
from huggingface_hub import login
login('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')
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training

from datasets import Dataset
import bitsandbytes as bnb

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Check CUDA availability
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")


2.2.1+cu121
CUDA Available: True
GPU Device: NVIDIA L40S
GPU Memory: 47.81 GB


In [65]:
base_path = os.path.dirname(os.getcwd())

# Load preprocessed data
input_file_full = os.path.join(base_path, "df_normalized.csv")
df_normalized = pd.read_csv(input_file_full)
df = pd.read_csv(input_file_full)

input_file_ua = os.path.join(base_path, "df_normalized_ua.csv")
df_normalized_ua = pd.read_csv(input_file_ua)

input_file_cc = os.path.join(base_path, "df_normalized_cc.csv")
df_normalized_cc = pd.read_csv(input_file_cc)

# Display dataset information
print("\nFull Dataset Info:")
print(df_normalized.info())
print(f"\nNumber of records: {len(df_normalized)}")

print("\nUA Dataset Info:")
print(df_normalized_ua.info())
print(f"\nNumber of UA records: {len(df_normalized_ua)}")

print("\nCC Dataset Info:")
print(df_normalized_cc.info())
print(f"\nNumber of CC records: {len(df_normalized_cc)}")

# Display sample rows
print("\nSample row from full dataset:")
print(df_normalized.iloc[0])


Full Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1694 entries, 0 to 1693
Data columns (total 8 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   filename                      1694 non-null   object
 1   language                      1694 non-null   object
 2   content                       1694 non-null   object
 3   topic                         1694 non-null   object
 4   narrative_subnarrative_pairs  1694 non-null   object
 5   target_indices                1694 non-null   object
 6   tokens                        1694 non-null   object
 7   tokens_normalized             1694 non-null   object
dtypes: object(8)
memory usage: 106.0+ KB
None

Number of records: 1694

UA Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1175 entries, 0 to 1174
Data columns (total 8 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        -----

In [66]:
class CustomDataset(torch.utils.data.Dataset):
    """Custom Dataset for loading Llama input data"""
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
        # Debug info
        print(f"Dataset created with {len(self.labels)} samples")
        print(f"Label distribution: {pd.Series(self.labels).value_counts()}")

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [67]:
def compute_metrics(pred):
    """Compute evaluation metrics"""
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="micro")
    acc = accuracy_score(labels, preds)

    # Computing confusion matrix per class
    unique_classes = np.unique(labels)
    cm_per_class = {}
    
    for class_idx in unique_classes:
        binary_labels = (labels == class_idx).astype(int)
        binary_preds = (preds == class_idx).astype(int)
        cm = confusion_matrix(binary_labels, binary_preds)
        cm_per_class[f"Class_{class_idx}"] = cm.tolist()
        
        # Print per-class metrics for debugging
        print(f"\nMetrics for Class {class_idx}:")
        print(f"Confusion Matrix:\n{cm}")
        class_precision = precision_recall_fscore_support(binary_labels, binary_preds, average='binary')[0]
        class_recall = precision_recall_fscore_support(binary_labels, binary_preds, average='binary')[1]
        print(f"Precision: {class_precision:.4f}")
        print(f"Recall: {class_recall:.4f}")

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'confusion_matrix': cm_per_class
    }

In [68]:
def get_narrative_key(narrative_dict):
    """Extract key from narrative dictionary for classification"""
    if isinstance(narrative_dict, str):
        narrative_dict = eval(narrative_dict)
    return narrative_dict['narrative']  # or you could use narrative_dict['subnarrative']

In [69]:
def initialize_classification_head(model, train_dataset, val_dataset, tokenizer, output_dir):
    """Pre-train the classification head before fine-tuning the full model"""
    try:
        print("\nPre-training classification head...")
        
        # Freeze all layers except classification head
        for name, param in model.named_parameters():
            if "score" not in name:  # Freeze everything except score layer
                param.requires_grad = False
            else:
                param.requires_grad = True
        
        # Print trainable parameters
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        all_params = sum(p.numel() for p in model.parameters())
        print(f"Trainable parameters for head pre-training: {trainable_params:,} ({trainable_params/all_params:.2%} of total)")
        
        # Training arguments for head pre-training
        head_training_args = TrainingArguments(
            output_dir=os.path.join(output_dir, "head_pretraining"),
            run_name=f"llama-classification-run-{current_date}",
            num_train_epochs=1,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            learning_rate=1e-3,
            warmup_ratio=0.1,
            eval_strategy="epoch",
            save_strategy="no",
            logging_dir=os.path.join(output_dir, "head_logs"),
            logging_steps=10,
            remove_unused_columns=False,
            report_to="wandb"
        )
        
        # Initialize trainer with validation dataset
        head_trainer = Trainer(
            model=model,
            args=head_training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,  # Added validation dataset
            compute_metrics=compute_metrics
        )
        
        # Train only the head
        head_trainer.train()
        
        print("\nClassification head pre-training completed")
        return model
        
    except Exception as e:
        print(f"Error in classification head pre-training: {str(e)}")
        raise

openlm-research/open_llama_7b

In [70]:
def train_llama(df, base_path, model_name="openlm-research/open_llama_7b"):
    """Train Llama model with classification head pre-training and LoRA fine-tuning"""
    try:
        current_date = datetime.now().strftime("%Y%m%d")
        
        # Create output directories
        output_dir = os.path.join(base_path, f"models/llama_{current_date}")
        log_dir = os.path.join(base_path, f"logs/llama_{current_date}")
        os.makedirs(output_dir, exist_ok=True)
        os.makedirs(log_dir, exist_ok=True)

        # Initialize wandb
        wandb.init(project="llama-classification", name=f"llama-classification-{current_date}")

        # Create narrative mapping
        print("\nCreating narrative mapping...")
        narratives = df['narrative_subnarrative_pairs'].apply(
            lambda x: eval(x)[0] if isinstance(x, str) else x[0]
        ).tolist()
        
        unique_narratives = set(get_narrative_key(n) for n in narratives)
        label_mapping = {narrative: idx for idx, narrative in enumerate(sorted(unique_narratives))}
        
        print(f"Number of unique narratives: {len(unique_narratives)}")
        print("\nSample narrative mappings:")
        for i, (narrative, idx) in enumerate(list(label_mapping.items())[:5]):
            print(f"{idx}: {narrative}")

        # Save label mapping
        with open(os.path.join(output_dir, "label_mapping.json"), 'w') as f:
            json.dump(label_mapping, f, indent=2)

        # Prepare data
        print("\nPreparing data splits...")
        df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)
        
        print(f"Training set size: {len(df_train)}")
        print(f"Validation set size: {len(df_val)}")

        # Process texts and labels
        train_texts = df_train['tokens_normalized'].apply(
            lambda x: ' '.join(x) if isinstance(x, list) else x
        ).tolist()
        val_texts = df_val['tokens_normalized'].apply(
            lambda x: ' '.join(x) if isinstance(x, list) else x
        ).tolist()
        
        train_labels = [
            label_mapping[get_narrative_key(eval(n)[0] if isinstance(n, str) else n[0])]
            for n in df_train['narrative_subnarrative_pairs']
        ]
        val_labels = [
            label_mapping[get_narrative_key(eval(n)[0] if isinstance(n, str) else n[0])]
            for n in df_val['narrative_subnarrative_pairs']
        ]

        # Initialize tokenizer
        print("\nInitializing tokenizer...")
        tokenizer = LlamaTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            tokenizer.pad_token_id = tokenizer.eos_token_id

        # Tokenize texts
        print("\nTokenizing texts...")
        train_encodings = tokenizer(
            train_texts, 
            truncation=True, 
            padding=True, 
            max_length=512
        )
        val_encodings = tokenizer(
            val_texts, 
            truncation=True, 
            padding=True, 
            max_length=512
        )

        # Create datasets
        train_dataset = Dataset.from_dict({
            'input_ids': train_encodings['input_ids'],
            'attention_mask': train_encodings['attention_mask'],
            'labels': train_labels
        })
        val_dataset = Dataset.from_dict({
            'input_ids': val_encodings['input_ids'],
            'attention_mask': val_encodings['attention_mask'],
            'labels': val_labels
        })

        # Setup quantization configuration
        print("\nConfiguring quantization...")
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )

        # Initialize model
        print("\nInitializing model...")
        model = LlamaForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(label_mapping),
            torch_dtype=torch.float16,
            quantization_config=quantization_config,
            device_map='auto'
        )

        # Prepare model for k-bit training
        model = prepare_model_for_kbit_training(model)

        # Configure and apply LoRA
        print("\nApplying LoRA adapters...")
        peft_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            inference_mode=False,
            r=16,
            lora_alpha=32,
            lora_dropout=0.1,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
            bias="none",
        )
        
        model = get_peft_model(model, peft_config)
        model.print_trainable_parameters()

        # Pre-train classification head
        print("\nStarting classification head pre-training...")
        
        # Freeze LoRA adapters
        for name, param in model.named_parameters():
            if 'lora' in name:
                param.requires_grad = False

        # Training arguments for head pre-training
        head_training_args = TrainingArguments(
            output_dir=os.path.join(output_dir, "head_pretraining"),
            num_train_epochs=1,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            learning_rate=1e-3,
            warmup_ratio=0.1,
            evaluation_strategy="epoch",
            save_strategy="no",
            logging_dir=os.path.join(output_dir, "head_logs"),
            logging_steps=10,
            remove_unused_columns=False,
            report_to="wandb"
        )
        
        # Initialize trainer for head pre-training
        head_trainer = Trainer(
            model=model,
            args=head_training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics
        )
        
        # Train classification head
        head_trainer.train()

        # Unfreeze LoRA adapters for full training
        print("\nUnfreezing LoRA adapters for full training...")
        for name, param in model.named_parameters():
            if 'lora' in name:
                param.requires_grad = True

        # Training arguments for full model fine-tuning
        print("\nStarting full model fine-tuning...")
        training_args = TrainingArguments(
            output_dir=output_dir,
            run_name=f"llama-classification-run-{current_date}",
            num_train_epochs=3,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            learning_rate=2e-4,
            warmup_ratio=0.03,
            eval_strategy="epoch",
            save_strategy="epoch",
            logging_dir=log_dir,
            load_best_model_at_end=True,
            metric_for_best_model='eval_loss',
            greater_is_better=False,
            logging_steps=10,
            gradient_accumulation_steps=2,
            gradient_checkpointing=True,
            optim="paged_adamw_8bit",
            remove_unused_columns=False,
            report_to="wandb"
        )

        # Initialize trainer for full model fine-tuning
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics
        )

        # Train full model
        trainer.train()

        # Evaluate model
        print("\nEvaluating model...")
        results = trainer.evaluate()
        
        print("\nEvaluation results:")
        for metric, value in results.items():
            if isinstance(value, float):
                print(f"{metric}: {value:.4f}")
            else:
                print(f"{metric}: {value}")

        # Save model and tokenizer
        print("\nSaving model...")
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)

        # End wandb run
        wandb.finish()

        return results, model, tokenizer, label_mapping

    except Exception as e:
        print(f"Error in Llama training: {str(e)}")
        import traceback
        traceback.print_exc()
        wandb.finish()
        raise

In [71]:
def debug_misclassifications(dataset, model, tokenizer, label_mapping, dataset_type="Training"):
    """Debug misclassified examples with detailed output and proper device handling"""
    try:
        print(f"\nAnalyzing misclassifications in {dataset_type} dataset...")
        
        # Determine device
        device = model.device
        print(f"Model is on device: {device}")
        
        # Prepare data
        texts = dataset['tokens_normalized'].apply(
            lambda x: ' '.join(x) if isinstance(x, list) else x
        ).tolist()
        
        true_labels = [
            label_mapping[get_narrative_key(eval(n)[0] if isinstance(n, str) else n[0])]
            for n in dataset['narrative_subnarrative_pairs']
        ]
        
        print(f"\nTotal samples to analyze: {len(texts)}")

        # Get predictions in batches to manage memory
        batch_size = 8
        predictions = []
        confidences = []
        
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            
            # Tokenize batch
            encodings = tokenizer(
                batch_texts, 
                truncation=True, 
                padding=True, 
                max_length=512, 
                return_tensors="pt"
            )
            
            # Move encodings to same device as model
            encodings = {k: v.to(device) for k, v in encodings.items()}
            
            with torch.no_grad():
                outputs = model(**encodings)
                batch_preds = outputs.logits.argmax(-1)
                batch_confs = torch.softmax(outputs.logits, dim=-1).max(dim=-1)[0]
                
                # Move predictions back to CPU
                predictions.extend(batch_preds.cpu().numpy())
                confidences.extend(batch_confs.cpu().numpy())

        # Track misclassifications
        misclassifications = []
        for idx, (pred, true, conf) in enumerate(zip(predictions, true_labels, confidences)):
            if pred != true:
                misclassifications.append({
                    'text': texts[idx][:200],  # First 200 chars for brevity
                    'predicted': pred,
                    'actual': true,
                    'confidence': conf,
                    'dataset_type': dataset_type
                })

        # Create DataFrame and display results
        misclass_df = pd.DataFrame(misclassifications)
        
        print(f"\nTotal misclassifications: {len(misclass_df)}")
        print(f"Accuracy: {1 - len(misclass_df)/len(texts):.4f}")
        
        if len(misclass_df) > 0:
            print("\nMisclassification distribution:")
            print(misclass_df.groupby(['actual', 'predicted']).size().unstack(fill_value=0))
            
            print("\nSample misclassifications:")
            for i, row in misclass_df.head().iterrows():
                print(f"\nExample {i+1}:")
                print(f"Text: {row['text']}")
                print(f"Predicted: {row['predicted']}, Actual: {row['actual']}")
                print(f"Confidence: {row['confidence']:.4f}")
        
        return misclass_df

    except Exception as e:
        print(f"Error in debugging misclassifications: {str(e)}")
        import traceback
        traceback.print_exc()
        raise

In [72]:
# Choose dataset to train on
print("Select dataset for training:")
print("1. Full dataset")
print("2. UA dataset")
print("3. CC dataset")
choice = input("Enter your choice (1-3): ")

if choice == "1":
    print("\nTraining on full dataset...")
    results, model, tokenizer, label_mapping = train_llama(df_normalized, base_path)
elif choice == "2":
    print("\nTraining on UA dataset...")
    results, model, tokenizer, label_mapping = train_llama(df_normalized_ua, base_path)
else:
    print("\nTraining on CC dataset...")
    results, model, tokenizer, label_mapping = train_llama(df_normalized_cc, base_path)


Select dataset for training:
1. Full dataset
2. UA dataset
3. CC dataset

Training on full dataset...



Creating narrative mapping...
Number of unique narratives: 21

Sample narrative mappings:
0: Amplifying Climate Fears
1: Amplifying war-related fears
2: Blaming the war on others rather than the invader
3: Climate change is beneficial
4: Controversy about green technologies

Preparing data splits...
Training set size: 1355
Validation set size: 339

Initializing tokenizer...

Tokenizing texts...

Configuring quantization...

Initializing model...


2025-01-09 23:03:09,944 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at openlm-research/open_llama_7b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Applying LoRA adapters...
trainable params: 16,863,232 || all params: 6,624,292,864 || trainable%: 0.2546

Starting classification head pre-training...




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Confusion Matrix
1,2.3762,2.484182,0.247788,0.247788,0.247788,0.247788,"{'Class_0': [[254, 49], [10, 26]], 'Class_1': [[309, 7], [22, 1]], 'Class_2': [[316, 6], [17, 0]], 'Class_4': [[338, 0], [1, 0]], 'Class_5': [[335, 0], [4, 0]], 'Class_6': [[332, 4], [3, 0]], 'Class_7': [[311, 7], [18, 3]], 'Class_8': [[223, 66], [27, 23]], 'Class_9': [[292, 11], [36, 0]], 'Class_10': [[334, 0], [5, 0]], 'Class_11': [[334, 1], [4, 0]], 'Class_13': [[330, 2], [7, 0]], 'Class_14': [[333, 0], [6, 0]], 'Class_15': [[222, 59], [31, 27]], 'Class_16': [[336, 0], [3, 0]], 'Class_17': [[267, 37], [31, 4]], 'Class_18': [[337, 1], [1, 0]], 'Class_19': [[323, 2], [14, 0]], 'Class_20': [[321, 3], [15, 0]]}"



Metrics for Class 0:
Confusion Matrix:
[[254  49]
 [ 10  26]]
Precision: 0.3467
Recall: 0.7222

Metrics for Class 1:
Confusion Matrix:
[[309   7]
 [ 22   1]]
Precision: 0.1250
Recall: 0.0435

Metrics for Class 2:
Confusion Matrix:
[[316   6]
 [ 17   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 4:
Confusion Matrix:
[[338   0]
 [  1   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 5:
Confusion Matrix:
[[335   0]
 [  4   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 6:
Confusion Matrix:
[[332   4]
 [  3   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 7:
Confusion Matrix:
[[311   7]
 [ 18   3]]
Precision: 0.3000
Recall: 0.1429

Metrics for Class 8:
Confusion Matrix:
[[223  66]
 [ 27  23]]
Precision: 0.2584
Recall: 0.4600

Metrics for Class 9:
Confusion Matrix:
[[292  11]
 [ 36   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 10:
Confusion Matrix:
[[334   0]
 [  5   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 11:
Confusion Matrix

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Unfreezing LoRA adapters for full training...

Starting full model fine-tuning...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Confusion Matrix
1,4.0385,2.290623,0.321534,0.321534,0.321534,0.321534,"{'Class_0': [[270, 33], [4, 32]], 'Class_1': [[290, 26], [17, 6]], 'Class_2': [[320, 2], [17, 0]], 'Class_4': [[338, 0], [1, 0]], 'Class_5': [[327, 8], [4, 0]], 'Class_6': [[336, 0], [3, 0]], 'Class_7': [[316, 2], [18, 3]], 'Class_8': [[218, 71], [19, 31]], 'Class_9': [[292, 11], [33, 3]], 'Class_10': [[334, 0], [5, 0]], 'Class_11': [[335, 0], [4, 0]], 'Class_13': [[330, 2], [7, 0]], 'Class_14': [[332, 1], [6, 0]], 'Class_15': [[234, 47], [30, 28]], 'Class_16': [[335, 1], [3, 0]], 'Class_17': [[281, 23], [29, 6]], 'Class_18': [[338, 0], [1, 0]], 'Class_19': [[324, 1], [14, 0]], 'Class_20': [[322, 2], [15, 0]]}"
2,0.6004,2.697214,0.348083,0.348083,0.348083,0.348083,"{'Class_0': [[282, 21], [8, 28]], 'Class_1': [[303, 13], [12, 11]], 'Class_2': [[312, 10], [16, 1]], 'Class_4': [[338, 0], [1, 0]], 'Class_5': [[330, 5], [4, 0]], 'Class_6': [[335, 1], [3, 0]], 'Class_7': [[312, 6], [15, 6]], 'Class_8': [[252, 37], [28, 22]], 'Class_9': [[275, 28], [26, 10]], 'Class_10': [[334, 0], [5, 0]], 'Class_11': [[333, 2], [4, 0]], 'Class_13': [[331, 1], [7, 0]], 'Class_14': [[333, 0], [6, 0]], 'Class_15': [[228, 53], [24, 34]], 'Class_16': [[333, 3], [3, 0]], 'Class_17': [[282, 22], [31, 4]], 'Class_18': [[336, 2], [1, 0]], 'Class_19': [[318, 7], [13, 1]], 'Class_20': [[314, 10], [14, 1]]}"



Metrics for Class 0:
Confusion Matrix:
[[270  33]
 [  4  32]]
Precision: 0.4923
Recall: 0.8889

Metrics for Class 1:
Confusion Matrix:
[[290  26]
 [ 17   6]]
Precision: 0.1875
Recall: 0.2609

Metrics for Class 2:
Confusion Matrix:
[[320   2]
 [ 17   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 4:
Confusion Matrix:
[[338   0]
 [  1   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 5:
Confusion Matrix:
[[327   8]
 [  4   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 6:
Confusion Matrix:
[[336   0]
 [  3   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 7:
Confusion Matrix:
[[316   2]
 [ 18   3]]
Precision: 0.6000
Recall: 0.1429

Metrics for Class 8:
Confusion Matrix:
[[218  71]
 [ 19  31]]
Precision: 0.3039
Recall: 0.6200

Metrics for Class 9:
Confusion Matrix:
[[292  11]
 [ 33   3]]
Precision: 0.2143
Recall: 0.0833

Metrics for Class 10:
Confusion Matrix:
[[334   0]
 [  5   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 11:
Confusion Matrix

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Metrics for Class 0:
Confusion Matrix:
[[273  30]
 [  5  31]]
Precision: 0.5082
Recall: 0.8611

Metrics for Class 1:
Confusion Matrix:
[[299  17]
 [ 13  10]]
Precision: 0.3704
Recall: 0.4348

Metrics for Class 2:
Confusion Matrix:
[[311  11]
 [ 16   1]]
Precision: 0.0833
Recall: 0.0588

Metrics for Class 4:
Confusion Matrix:
[[338   0]
 [  1   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 5:
Confusion Matrix:
[[332   3]
 [  4   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 6:
Confusion Matrix:
[[334   2]
 [  3   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 7:
Confusion Matrix:
[[311   7]
 [ 15   6]]
Precision: 0.4615
Recall: 0.2857

Metrics for Class 8:
Confusion Matrix:
[[236  53]
 [ 19  31]]
Precision: 0.3690
Recall: 0.6200

Metrics for Class 9:
Confusion Matrix:
[[289  14]
 [ 29   7]]
Precision: 0.3333
Recall: 0.1944

Metrics for Class 10:
Confusion Matrix:
[[331   3]
 [  5   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 11:
Confusion Matrix

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Metrics for Class 0:
Confusion Matrix:
[[282  21]
 [  8  28]]
Precision: 0.5714
Recall: 0.7778

Metrics for Class 1:
Confusion Matrix:
[[303  13]
 [ 12  11]]
Precision: 0.4583
Recall: 0.4783

Metrics for Class 2:
Confusion Matrix:
[[312  10]
 [ 16   1]]
Precision: 0.0909
Recall: 0.0588

Metrics for Class 4:
Confusion Matrix:
[[338   0]
 [  1   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 5:
Confusion Matrix:
[[330   5]
 [  4   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 6:
Confusion Matrix:
[[335   1]
 [  3   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 7:
Confusion Matrix:
[[312   6]
 [ 15   6]]
Precision: 0.5000
Recall: 0.2857

Metrics for Class 8:
Confusion Matrix:
[[252  37]
 [ 28  22]]
Precision: 0.3729
Recall: 0.4400

Metrics for Class 9:
Confusion Matrix:
[[275  28]
 [ 26  10]]
Precision: 0.2632
Recall: 0.2778

Metrics for Class 10:
Confusion Matrix:
[[334   0]
 [  5   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 11:
Confusion Matrix

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Evaluating model...



Metrics for Class 0:
Confusion Matrix:
[[270  33]
 [  4  32]]
Precision: 0.4923
Recall: 0.8889

Metrics for Class 1:
Confusion Matrix:
[[290  26]
 [ 17   6]]
Precision: 0.1875
Recall: 0.2609

Metrics for Class 2:
Confusion Matrix:
[[320   2]
 [ 17   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 4:
Confusion Matrix:
[[338   0]
 [  1   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 5:
Confusion Matrix:
[[327   8]
 [  4   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 6:
Confusion Matrix:
[[336   0]
 [  3   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 7:
Confusion Matrix:
[[316   2]
 [ 18   3]]
Precision: 0.6000
Recall: 0.1429

Metrics for Class 8:
Confusion Matrix:
[[218  71]
 [ 19  31]]
Precision: 0.3039
Recall: 0.6200

Metrics for Class 9:
Confusion Matrix:
[[292  11]
 [ 33   3]]
Precision: 0.2143
Recall: 0.0833

Metrics for Class 10:
Confusion Matrix:
[[334   0]
 [  5   0]]
Precision: 0.0000
Recall: 0.0000

Metrics for Class 11:
Confusion Matrix

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0,1
eval/accuracy,▁▆██▆
eval/f1,▁▆██▆
eval/loss,▄▁▅█▁
eval/precision,▁▆██▆
eval/recall,▁▆██▆
eval/runtime,▇▁██▇
eval/samples_per_second,▂█▁▁▂
eval/steps_per_second,▁█▇▇█
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▃▁▁▁▂▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▆▆▇▇▇███
train/global_step,▁▁▂▂▂▂▂▂▃▃▁▁▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███

0,1
eval/accuracy,0.32153
eval/f1,0.32153
eval/loss,2.29062
eval/precision,0.32153
eval/recall,0.32153
eval/runtime,90.9329
eval/samples_per_second,3.728
eval/steps_per_second,0.935
total_flos,8.070626800356557e+16
train/epoch,2.98525


In [73]:
# Debug misclassifications
if choice == "1":
    misclass_df = debug_misclassifications(df_normalized, model, tokenizer, label_mapping)
elif choice == "2":
    misclass_df = debug_misclassifications(df_normalized_ua, model, tokenizer, label_mapping)
else:
    misclass_df = debug_misclassifications(df_normalized_cc,model, tokenizer, label_mapping)


Analyzing misclassifications in Training dataset...
Model is on device: cuda:0

Total samples to analyze: 1694

Total misclassifications: 706
Accuracy: 0.5832

Misclassification distribution:
predicted  0   1   2   5   7   8   9   13  14  15  16  17  18  19  20
actual                                                               
0           0   0   0   0   0   0   0   0   0  12   0   1   0   0   0
1           6   0   0   0   0  16   1   0   0  17   0   7   0   0   0
2           1   9   0   0   0  21   1   0   0  14   1   3   0   0   0
3           1   0   0   0   1   0   0   0   0   0   0   0   0   0   0
4           1   0   0   0   1   0   0   0   0   0   0   1   0   0   0
5           5   0   0   0   1   0   0   1   0   3   0   0   0   0   0
6          16   0   0   1   1   1   0   1   0   1   0   0   0   0   0
7          22   0   1   8   0   1   1   3   0   5   0   0   1   0   0
8           4   6   0   0   0   0   2   0   0  19   0  12   0   1   1
9           1  20   1   0   0  51   0