In [8]:
import os
import pandas as pd

import torch
print(torch.__version__)
import wandb
from transformers import LlamaTokenizer, LlamaForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from datetime import datetime
import logging
import json
import numpy as np
import sentencepiece
from tqdm.notebook import tqdm

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Check CUDA availability
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")


2.5.1+cu121
CUDA Available: True
GPU Device: NVIDIA GeForce RTX 2080 Ti
GPU Memory: 11.81 GB


In [9]:
base_path = os.path.dirname(os.getcwd())

# Load preprocessed data
input_file_full = os.path.join(base_path, "df_normalized.csv")
df_normalized = pd.read_csv(input_file_full)
df = pd.read_csv(input_file_full)

input_file_ua = os.path.join(base_path, "df_normalized_ua.csv")
df_normalized_ua = pd.read_csv(input_file_ua)

input_file_cc = os.path.join(base_path, "df_normalized_cc.csv")
df_normalized_cc = pd.read_csv(input_file_cc)

# Display dataset information
print("\nFull Dataset Info:")
print(df_normalized.info())
print(f"\nNumber of records: {len(df_normalized)}")

print("\nUA Dataset Info:")
print(df_normalized_ua.info())
print(f"\nNumber of UA records: {len(df_normalized_ua)}")

print("\nCC Dataset Info:")
print(df_normalized_cc.info())
print(f"\nNumber of CC records: {len(df_normalized_cc)}")

# Display sample rows
print("\nSample row from full dataset:")
print(df_normalized.iloc[0])


Full Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1694 entries, 0 to 1693
Data columns (total 8 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   filename                      1694 non-null   object
 1   language                      1694 non-null   object
 2   content                       1694 non-null   object
 3   topic                         1694 non-null   object
 4   narrative_subnarrative_pairs  1694 non-null   object
 5   target_indices                1694 non-null   object
 6   tokens                        1694 non-null   object
 7   tokens_normalized             1694 non-null   object
dtypes: object(8)
memory usage: 106.0+ KB
None

Number of records: 1694

UA Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1175 entries, 0 to 1174
Data columns (total 8 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        -----

In [10]:
class CustomDataset(torch.utils.data.Dataset):
    """Custom Dataset for loading Llama input data"""
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
        # Debug info
        print(f"Dataset created with {len(self.labels)} samples")
        print(f"Label distribution: {pd.Series(self.labels).value_counts()}")

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [11]:
def compute_metrics(pred):
    """Compute evaluation metrics"""
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="micro")
    acc = accuracy_score(labels, preds)

    # Computing confusion matrix per class
    unique_classes = np.unique(labels)
    cm_per_class = {}
    
    for class_idx in unique_classes:
        binary_labels = (labels == class_idx).astype(int)
        binary_preds = (preds == class_idx).astype(int)
        cm = confusion_matrix(binary_labels, binary_preds)
        cm_per_class[f"Class_{class_idx}"] = cm.tolist()
        
        # Print per-class metrics for debugging
        print(f"\nMetrics for Class {class_idx}:")
        print(f"Confusion Matrix:\n{cm}")
        class_precision = precision_recall_fscore_support(binary_labels, binary_preds, average='binary')[0]
        class_recall = precision_recall_fscore_support(binary_labels, binary_preds, average='binary')[1]
        print(f"Precision: {class_precision:.4f}")
        print(f"Recall: {class_recall:.4f}")

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'confusion_matrix': cm_per_class
    }

In [12]:
def get_narrative_key(narrative_dict):
    """Extract key from narrative dictionary for classification"""
    if isinstance(narrative_dict, str):
        narrative_dict = eval(narrative_dict)
    return narrative_dict['narrative']  # or you could use narrative_dict['subnarrative']

def train_llama(df, base_path, model_name="openlm-research/open_llama_7b"):
    """Train Llama model with detailed debugging outputs"""
    try:
        current_date = datetime.now().strftime("%Y%m%d")
        
        # Create output directories
        output_dir = os.path.join(base_path, f"models/llama_{current_date}")
        log_dir = os.path.join(base_path, f"logs/llama_{current_date}")
        os.makedirs(output_dir, exist_ok=True)
        os.makedirs(log_dir, exist_ok=True)
        
        print(f"\nOutput directory: {output_dir}")
        print(f"Log directory: {log_dir}")

        # Initialize wandb
        wandb.init(project="llama-classification", name=f"llama-classification-{current_date}")

        # Create narrative mapping
        print("\nCreating narrative mapping...")
        narratives = df['narrative_subnarrative_pairs'].apply(
            lambda x: eval(x)[0] if isinstance(x, str) else x[0]
        ).tolist()
        
        # Extract unique narratives (using main narrative or subnarrative)
        unique_narratives = set(get_narrative_key(n) for n in narratives)
        label_mapping = {narrative: idx for idx, narrative in enumerate(sorted(unique_narratives))}
        
        print(f"Number of unique narratives: {len(unique_narratives)}")
        print("\nSample narrative mappings:")
        for i, (narrative, idx) in enumerate(list(label_mapping.items())[:5]):
            print(f"{idx}: {narrative}")

        # Save label mapping
        with open(os.path.join(output_dir, "label_mapping.json"), 'w') as f:
            json.dump(label_mapping, f, indent=2)

        # Prepare data
        print("\nPreparing data for training...")
        df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)
        
        print(f"Training set size: {len(df_train)}")
        print(f"Validation set size: {len(df_val)}")

        # Process texts and labels
        train_texts = df_train['tokens_normalized'].apply(
            lambda x: ' '.join(x) if isinstance(x, list) else x
        ).tolist()
        val_texts = df_val['tokens_normalized'].apply(
            lambda x: ' '.join(x) if isinstance(x, list) else x
        ).tolist()
        
        # Convert narratives to labels using the main narrative
        train_labels = [
            label_mapping[get_narrative_key(eval(n)[0] if isinstance(n, str) else n[0])]
            for n in df_train['narrative_subnarrative_pairs']
        ]
        val_labels = [
            label_mapping[get_narrative_key(eval(n)[0] if isinstance(n, str) else n[0])]
            for n in df_val['narrative_subnarrative_pairs']
        ]

        print("\nSample processed text:")
        print(train_texts[0][:200])
        
        print("\nLabel distribution in training set:")
        print(pd.Series(train_labels).value_counts())

        # Initialize tokenizer
        print("\nInitializing tokenizer...")
        tokenizer = LlamaTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            tokenizer.pad_token_id = tokenizer.eos_token_id
        
        print(f"Vocabulary size: {len(tokenizer)}")
        print(f"Padding token: {tokenizer.pad_token}")
        print(f"EOS token: {tokenizer.eos_token}")

        # Tokenize texts
        print("\nTokenizing texts...")
        train_encodings = tokenizer(
            train_texts, 
            truncation=True, 
            padding=True, 
            max_length=512,
            return_tensors="pt"
        )
        val_encodings = tokenizer(
            val_texts, 
            truncation=True, 
            padding=True, 
            max_length=512,
            return_tensors="pt"
        )
        
        print("\nEncoding shapes:")
        for key, val in train_encodings.items():
            print(f"Training {key}: {val.shape}")

        # Create datasets
        train_dataset = CustomDataset(train_encodings, train_labels)
        val_dataset = CustomDataset(val_encodings, val_labels)

        # Initialize model
        print("\nInitializing model...")
        model = LlamaForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(label_mapping),
            torch_dtype=torch.float16
        )
        
        if model.config.pad_token_id is None:
            model.config.pad_token_id = tokenizer.pad_token_id
        
        print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

        # Training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            run_name=f"llama-classification-run-{current_date}",
            num_train_epochs=3,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            learning_rate=2e-5,
            warmup_ratio=0.1,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            logging_dir=log_dir,
            load_best_model_at_end=True,
            metric_for_best_model='eval_loss',
            greater_is_better=False,
            logging_steps=10,
            fp16=True,
            gradient_accumulation_steps=4,
            gradient_checkpointing=True
        )

        # Initialize trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics
        )

        # Train model
        print("\nStarting training...")
        trainer.train()

        # Evaluate model
        print("\nEvaluating model...")
        results = trainer.evaluate()
        
        print("\nEvaluation results:")
        for metric, value in results.items():
            if isinstance(value, float):
                print(f"{metric}: {value:.4f}")
            else:
                print(f"{metric}: {value}")

        # Save model and tokenizer
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)
        print(f"\nModel and tokenizer saved to {output_dir}")

        # End wandb run
        wandb.finish()

        return results, model, tokenizer, label_mapping

    except Exception as e:
        print(f"Error in Llama training: {str(e)}")
        wandb.finish()
        raise

In [13]:
def debug_misclassifications(dataset, model, tokenizer, label_mapping, dataset_type="Training"):
    """Debug misclassified examples with detailed output"""
    try:
        print(f"\nAnalyzing misclassifications in {dataset_type} dataset...")
        
        # Prepare data
        texts = dataset['tokens_normalized'].apply(
            lambda x: ' '.join(x) if isinstance(x, list) else x
        ).tolist()
        
        true_labels = [
            label_mapping[get_narrative_key(eval(n)[0] if isinstance(n, str) else n[0])]
            for n in dataset['narrative_subnarrative_pairs']
        ]
        
        print(f"\nTotal samples to analyze: {len(texts)}")

        # Get predictions
        print("\nGenerating predictions...")
        encodings = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
        
        with torch.no_grad():
            outputs = model(**encodings)
            predictions = outputs.logits.argmax(-1)

        # Track misclassifications
        misclassifications = []
        for idx, (pred, true) in enumerate(zip(predictions, true_labels)):
            if pred != true:
                misclassifications.append({
                    'text': texts[idx][:200],
                    'predicted': pred.item(),
                    'actual': true,
                    'confidence': torch.softmax(outputs.logits[idx], dim=0)[pred].item(),
                    'dataset_type': dataset_type
                })

        # Create DataFrame and display results
        misclass_df = pd.DataFrame(misclassifications)
        
        print(f"\nTotal misclassifications: {len(misclass_df)}")
        print(f"Accuracy: {1 - len(misclass_df)/len(texts):.4f}")
        
        print("\nMisclassification distribution:")
        print(misclass_df.groupby(['actual', 'predicted']).size().unstack(fill_value=0))
        
        return misclass_df

    except Exception as e:
        print(f"Error in debugging misclassifications: {str(e)}")
        raise

In [14]:
# Choose dataset to train on
print("Select dataset for training:")
print("1. Full dataset")
print("2. UA dataset")
print("3. CC dataset")
choice = input("Enter your choice (1-3): ")

if choice == "1":
    print("\nTraining on full dataset...")
    results, model, tokenizer, label_mapping = train_llama(df_normalized, base_path)
elif choice == "2":
    print("\nTraining on UA dataset...")
    results, model, tokenizer, label_mapping = train_llama(df_normalized_ua, base_path)
else:
    print("\nTraining on CC dataset...")
    results, model, tokenizer, label_mapping = train_llama(df_normalized_cc, base_path)


Select dataset for training:
1. Full dataset
2. UA dataset
3. CC dataset

Training on CC dataset...

Output directory: c:\Users\krona\OneDrive - TU Wien\TU Wien\1. Semester\NLP\nlp_Backpropagandists_2024\code\models/llama_20250109
Log directory: c:\Users\krona\OneDrive - TU Wien\TU Wien\1. Semester\NLP\nlp_Backpropagandists_2024\code\logs/llama_20250109



Creating narrative mapping...
Number of unique narratives: 11

Sample narrative mappings:
0: Amplifying Climate Fears
1: Climate change is beneficial
2: Controversy about green technologies
3: Criticism of climate movement
4: Criticism of climate policies

Preparing data for training...
Training set size: 415
Validation set size: 104

Sample processed text:
['eleio', 'autrquica', 'poder', 'ignorar', 'agenda', 'climtica', 'efeito', 'altera', 'climtica', 'j', 'fazer', 'parte', 'dia', 'dia', 'brasileiro', 'h', 'algum', 'tempo', 'maio', 'po', 'comoveuse', 'd

Label distribution in training set:
0     176
9      82
5      60
4      30
3      21
6      21
8      10
10      7
2       6
1       1
7       1
Name: count, dtype: int64

Initializing tokenizer...


You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


Vocabulary size: 32000
Padding token: </s>
EOS token: </s>

Tokenizing texts...

Encoding shapes:
Training input_ids: torch.Size([415, 512])
Training attention_mask: torch.Size([415, 512])
Dataset created with 415 samples
Label distribution: 0     176
9      82
5      60
4      30
3      21
6      21
8      10
10      7
2       6
1       1
7       1
Name: count, dtype: int64
Dataset created with 104 samples
Label distribution: 0     40
9     22
5     12
3      8
8      8
6      7
4      4
1      1
10     1
7      1
Name: count, dtype: int64

Initializing model...


Downloading shards: 100%|██████████| 2/2 [03:54<00:00, 117.41s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.25it/s]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at openlm-research/open_llama_7b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model parameters: 6,607,388,672





Starting training...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Error in Llama training: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacity of 11.00 GiB of which 0 bytes is free. Of the allocated memory 24.43 GiB is allocated by PyTorch, and 1023.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacity of 11.00 GiB of which 0 bytes is free. Of the allocated memory 24.43 GiB is allocated by PyTorch, and 1023.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Debug misclassifications
if choice == "1":
    misclass_df = debug_misclassifications(df_normalized, model, tokenizer, label_mapping)
elif choice == "2":
    misclass_df = debug_misclassifications(df_normalized_ua, model, tokenizer, label_mapping)
else:
    misclass_df = debug_misclassifications(df_normalized_cc,model, tokenizer, label_mapping)