In [None]:
%pip install -r "requirements_bert.txt"

In [1]:
# Test each import individually
try:
    import torch
    print("✓ PyTorch OK")
except ImportError as e:
    print(f"✗ PyTorch failed: {e}")

try:
    from datasets import load_dataset
    print("✓ Datasets OK")
except ImportError as e:
    print(f"✗ Datasets failed: {e}")

try:
    from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig, AutoModelForCausalLM
    print("✓ Transformers OK")
except ImportError as e:
    print(f"✗ Transformers failed: {e}")

try:
    import bitsandbytes as bnb
    print("✓ BnB OK")
except ImportError as e:
    print(f"✗ BNb failed: {e}")

try:
    from peft import LoraConfig, get_peft_model, TaskType
    print("✓ PEFT OK")
except ImportError as e:
    print(f"✗ PEFT failed: {e}")

try:
    import numpy as np
    print("✓ NumPy OK")
except ImportError as e:
    print(f"✗ NumPy failed: {e}")

✓ PyTorch OK


  from .autonotebook import tqdm as notebook_tqdm


✓ Datasets OK
✓ Transformers OK
✓ BnB OK
✓ PEFT OK
✓ NumPy OK


In [2]:
#import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version used by PyTorch: {torch.version.cuda}")
print(f"Number of GPUs: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    device = torch.cuda.get_device_name(0)
    print(f"GPU name: {device}")
    

PyTorch version: 2.8.0+cu129
CUDA available: True
CUDA version used by PyTorch: 12.9
Number of GPUs: 1
GPU name: NVIDIA GeForce RTX 4070 SUPER


In [51]:
torch.cuda.empty_cache()
import gc
gc.collect()

1036

In [59]:
def print_gpu_memory():
    if torch.cuda.is_available():
        print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        print(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

# Call this before and after model loading
print_gpu_memory()

GPU memory allocated: 0.87 GB
GPU memory reserved: 10.79 GB


In [53]:
model_name = "jhu-clsp/mmBERT-base"

quantization_config = BitsAndBytesConfig(
                                        load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.bfloat16,
                                         bnb_4bit_quant_type="nf4",
                                         bnb_4bit_use_double_quant=True,
                                         )

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map="auto",
    #dtype=torch.float16,
    quantization_config=quantization_config,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/mmBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
print(model)

In [None]:
from transformers import Conv1D

def get_specific_layer_names(model):
    # Create a list to store the layer names
    layer_names = []

    # Recursively visit all modules and submodules
    for name, module in model.named_modules():
        # Check if the module is an instance of the specified layers
        if isinstance(module, (torch.nn.Linear, torch.nn.Embedding, torch.nn.Conv2d, Conv1D)):
            # model name parsing

            layer_names.append('.'.join(name.split('.')[4:]).split('.')[0])

    return layer_names

list(set(get_specific_layer_names(model)))


In [54]:
lora_config = LoraConfig(
    r=8,  # Low-rank dimension
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["Wqkv"],  # Fine-tuning the attention layer specifically
)

lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()

trainable params: 540,672 || all params: 308,072,450 || trainable%: 0.1755


In [None]:
dataset = load_dataset("mlburnham/Pol_NLI")

def tokenize_function(example):
    return tokenizer(example["premise"], padding="max_length", truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("entailment", "labels") # Rename entailment column to labels (which is standard lookup for evaluation in the transmformers trainer)

In [None]:
dataset["train"]
# Premise is the context, hypothesis is the statement to verify

Dataset({
    features: ['premise', 'hypothesis', 'entailment', 'dataset', 'task', 'augmented_hypothesis'],
    num_rows: 171289
})

In [26]:
tokenized_dataset["train"] = tokenized_dataset["train"].shuffle(seed=42).select([i for i in list(range(100))])  # Take the first 100 samples
tokenized_dataset["validation"] = tokenized_dataset["validation"].shuffle(seed=42).select([i for i in list(range(20))])  # Take the first 20 samples

In [None]:
tokenized_dataset["train"]

Column(['This text describes a hostage taking (kidnapping)', 'This text is attacking people for their place of origin.', 'This text is about defense r&d.', 'This text is about voting rights.', 'This text is dehumanizing people for their race.'])

In [55]:
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Start small, increase gradually
    gradient_accumulation_steps=12,  # Simulate larger batch size

    logging_steps=1,
    eval_strategy="epoch",
    save_strategy="epoch",
    fp16=True,  # Enable mixed precision
    dataloader_pin_memory=False,
    remove_unused_columns=False,
    max_grad_norm=1.0,

    disable_tqdm=False,
)

In [57]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1': f1_score(labels, predictions, average='weighted')
    }

In [58]:
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)


trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,2.1091,0.985361,0.45,0.457333
2,5.083,0.974652,0.45,0.457333
3,4.6821,0.971707,0.45,0.457333


TrainOutput(global_step=27, training_loss=10.236043294270834, metrics={'train_runtime': 417.4004, 'train_samples_per_second': 0.719, 'train_steps_per_second': 0.065, 'total_flos': 1643610193920000.0, 'train_loss': 10.236043294270834, 'epoch': 3.0})

In [None]:
from datetime import datetime
lora_model.save_pretrained(f"output/mmBERT/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}/final")

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1': f1_score(labels, predictions, average='weighted')
    }

def main():
    model_name = "jhu-clsp/mmBERT-base"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3
    )

    dataset = load_dataset("xnli", "all_languages")

    def tokenize_function(examples):
        texts = [f"{p} {tokenizer.sep_token} {h}"
                for p, h in zip(examples["premise"], examples["hypothesis"])]

        return tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=512
        )

    train_dataset = dataset["train"].map(tokenize_function, batched=True)
    eval_dataset = dataset["validation"].map(tokenize_function, batched=True)

    training_args = TrainingArguments(
        output_dir="./mmbert-xnli",
        learning_rate=3e-5,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=32,
        num_train_epochs=3,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

if __name__ == "__main__":
    main()


In [None]:
from datasets import load_dataset, Features, Value, Sequence
from transformers import DataCollatorForLanguageModeling

# Load dataset with correct schema
features = Features({
    'sent_id': Value('string'),
    'doc_id': Value('string'),
    'text': Value('string'),
    'tokens': Sequence(Value('string')),
    'clusters': Sequence(Sequence(Value('int64')))
})

dataset = load_dataset("alexandrainst/dacoref", features=features)
train_dataset = dataset['train']

# Tokenize function
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"], 
        padding="max_length", 
        truncation=True, 
        max_length=512
    )
    # For causal LM, labels = input_ids
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Apply tokenization
tokenized_dataset = train_dataset.map(tokenize_function, batched=True)

# Remove string columns that can't be converted to tensors
columns_to_remove = ['sent_id', 'doc_id', 'text', 'tokens', 'clusters']
final_dataset = tokenized_dataset.remove_columns(columns_to_remove)

# Verify the dataset structure
print("Final columns:", final_dataset.column_names)
print("Sample item keys:", final_dataset[0].keys())
print("Input IDs shape:", len(final_dataset[0]['input_ids']))

# Create data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Create trainer
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=final_dataset,
    data_collator=data_collator,
)

trainer.train()

In [10]:
ds = load_dataset("mlburnham/Pol_NLI")
test = ds['test'].to_pandas()
# we'll use a random sample of 1,000 documents for this example
test = test[['premise', 'hypothesis', 'entailment', 'task']].sample(1000, random_state = 1)
test.reset_index(drop = True, inplace = True)
test[['premise', 'entailment']].head()

Generating train split: 100%|██████████| 171289/171289 [00:00<00:00, 601641.98 examples/s]
Generating validation split: 100%|██████████| 15036/15036 [00:00<00:00, 541832.88 examples/s]
Generating test split: 100%|██████████| 15366/15366 [00:00<00:00, 727422.97 examples/s]


Unnamed: 0,premise,entailment
0,The soldiers storming the beaches on D-Day may...,0
1,Regime warplanes and helicopters targeted Al-L...,1
2,rt @scottwalker first up this morning at #ncsc...,1
3,"With protection from the Taliban, al Qaeda and...",0
4,@aiyegbayo @KadariaAhmed LOL what nigerian pro...,1


In [3]:
from transformers import pipeline

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [8]:
pipe = pipeline("zero-shot-classification", model='mlburnham/Political_DEBATE_large_v1.0', device = device, batch_size = 32)

Device set to use cuda


In [11]:
colname = 'debate_label' # the name of the column where we will assign out labels to
test[colname] = 0

for i in test.index:
    hypothesis = test.loc[i, 'hypothesis'] # get the right entailment hypothesis
    sample = test.loc[i, 'premise'] # get the document to be classified
    res = pipe(sample, hypothesis, hypothesis_template = '{}') # classify the document-hypothesis pair
    test.loc[i, colname] = round(res['scores'][0]) # here we extract the probability from the resulting dictionary, round the number to 0 or 1, and assign it to the dataframe
test[colname].replace({0:1, 1:0}, inplace = True) # in our data entailment is labeled as 0 and not entailment is 1, so we recode the 0 and 1 probabilities to match the entailment labels
test[colname] = test[colname].astype(int)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[colname].replace({0:1, 1:0}, inplace = True) # in our data entailment is labeled as 0 and not entailment is 1, so we recode the 0 and 1 probabilities to match the entailment labels


In [13]:
from sklearn.metrics import matthews_corrcoef
matthews_corrcoef(test['entailment'], test['debate_label'])

np.float64(0.9027057940562578)