- implement with all 4 models
- label all of covid data
- implement generic benchmarks

In [1]:
import pandas as pd
from transformers import pipeline
from sklearn.metrics import matthews_corrcoef, accuracy_score, f1_score, balanced_accuracy_score, precision_recall_fscore_support, classification_report
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np
import os
os.environ["WANDB_PROJECT"] = "offline"
import accelerate

  Referenced from: <EB3FF92A-5EB1-3EE8-AF8B-5923C1265422> /Users/mb7336/miniforge3/envs/sandbox/lib/python3.11/site-packages/torchvision/image.so
  warn(


In [11]:
seed = 1
modname = 'mlburnham/Political_DEBATE_ModernBERT_base_v1.0'

# Define label mapping
id2label = {0: "entailment", 1: "not_entailment"}
label2id = {'entailment':0, 'not_entialment':1}

def tokenize_function(docs):
    return tokenizer(docs['premise'], docs['hypothesis'], padding = False, truncation = False)

def model_init():
    return AutoModelForSequenceClassification.from_pretrained(modname, num_labels = 2, ignore_mismatched_sizes=True)

def compute_metrics_standard(eval_pred, label_text_alphabetical=list(id2label.values())):
    labels = eval_pred.label_ids
    pred_logits = eval_pred.predictions
    preds_max = np.argmax(pred_logits, axis=1)

    # metrics
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(labels, preds_max, average='macro') 
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(labels, preds_max, average='micro')
    acc_balanced = balanced_accuracy_score(labels, preds_max)
    acc_not_balanced = accuracy_score(labels, preds_max)
    mcc = matthews_corrcoef(labels, preds_max)

    metrics = {'MCC': mcc,
            'f1_macro': f1_macro,
            'f1_micro': f1_micro,
            'accuracy_balanced': acc_balanced,
            'accuracy': acc_not_balanced,
            'precision_macro': precision_macro,
            'recall_macro': recall_macro,
            'precision_micro': precision_micro,
            'recall_micro': recall_micro,
            }
    print("Aggregate metrics: ", {key: metrics[key] for key in metrics if key not in ["label_gold_raw", "label_predicted_raw"]} )
    print("Detailed metrics: ", classification_report(
        labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
        target_names=label_text_alphabetical, sample_weight=None,
        digits=2, output_dict=True, zero_division='warn'),
    "\n")

    return metrics

def metrics(df, preds, group_by=None):
    true_col = 'entailment'
    
    def get_metrics(y_true, y_pred):
        return {
            'MCC': matthews_corrcoef(y_true, y_pred),
            'Accuracy': accuracy_score(y_true, y_pred),
            'F1': f1_score(y_true, y_pred, average='weighted')
        }
    
    results = []
    
    if group_by not in ['dataset', 'task']:
        for col in preds:
            metrics = get_metrics(df[true_col], df[col])
            metrics['Column'] = col
            results.append(metrics)
    else:
        for col in preds:
            for group_name, group in df.groupby(group_by):
                metrics = get_metrics(group[true_col], group[col])
                metrics['Column'] = col
                metrics[group_by.capitalize()] = group_name
                results.append(metrics)
    
    results_df = pd.DataFrame(results)
    
    if group_by in ['dataset', 'task']:
        return results_df.set_index(['Column', group_by.capitalize()])
    else:
        return results_df.set_index('Column')
        
tokenizer = AutoTokenizer.from_pretrained(modname)

In [18]:
polnli = pd.read_csv('../data/polnli_test_results.csv')

In [12]:
polnli = load_dataset('mlburnham/Pol_NLI')

nlitok = polnli.map(tokenize_function, batched = True)
# Rename 'entailment' column to 'label'
nlitok = nlitok.rename_columns({'entailment':'label'})

Map:   0%|          | 0/171289 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (553 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/15036 [00:00<?, ? examples/s]

Map:   0%|          | 0/15366 [00:00<?, ? examples/s]

In [13]:
df = pd.read_csv('../data/covid_tweets_labeled.csv')
df = df[['text', 'non_comp']]
df['hypothesis'] = 'The author of this tweet does not believe COVID is dangerous.'
df.rename({'text':'premise', 'non_comp':'entailment'}, axis = 1, inplace = True)
df['entailment'].replace({0:1, 1:0}, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['entailment'].replace({0:1, 1:0}, inplace = True)


In [15]:
train = df.sample(25, random_state = seed)
# Create validation set with remaining instances
val = df[~df.index.isin(train.index)]

# Create a DatasetDict with train and validation splits
ds = DatasetDict({'train': Dataset.from_pandas(train, preserve_index=False), 'validation':Dataset.from_pandas(val, preserve_index=False)})
# Tokenize the dataset
dstok = ds.map(tokenize_function, batched = True)
# Rename 'entailment' column to 'label'
dstok = dstok.rename_columns({'entailment':'label'})

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Map:   0%|          | 0/1962 [00:00<?, ? examples/s]

In [52]:
training_args = TrainingArguments(output_dir='../few_shot/',
    logging_dir='../few_shot/',
    lr_scheduler_type= "linear",
    group_by_length=False,
    learning_rate = 5e-5,#9e-6, # base seems to benefit from higher learning rate. Unsure about large.
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 16,
    gradient_accumulation_steps = 1, 
    num_train_epochs=5,
    warmup_ratio=0.05,  
    weight_decay=0.01, 
    fp16=False,   
    fp16_full_eval=False,
    eval_strategy="no",
    seed=seed,
    save_strategy="no",
    dataloader_num_workers = 1,
)

tokenizer = AutoTokenizer.from_pretrained(modname)

# Initialize the Trainer
trainer = Trainer(
    model_init = model_init,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dstok['train'],
    eval_dataset=dstok['validation'],
    compute_metrics=lambda x: compute_metrics_standard(x, label_text_alphabetical=list(id2label.values()))
)

  trainer = Trainer(


In [53]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=65, training_loss=0.521470935528095, metrics={'train_runtime': 43.2781, 'train_samples_per_second': 2.888, 'train_steps_per_second': 1.502, 'total_flos': 8554885454472.0, 'train_loss': 0.521470935528095, 'epoch': 5.0})

In [54]:
res = trainer.predict(dstok['validation'])
preds = np.argmax(res.predictions, axis=-1)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Aggregate metrics:  {'MCC': 0.27983788896423273, 'f1_macro': 0.6397498026070545, 'f1_micro': 0.7130479102956168, 'accuracy_balanced': 0.6421175067104086, 'accuracy': 0.7130479102956168, 'precision_macro': 0.6377543940795559, 'recall_macro': 0.6421175067104086, 'precision_micro': 0.7130479102956168, 'recall_micro': 0.7130479102956168}
Detailed metrics:  {'entailment': {'precision': 0.46557971014492755, 'recall': 0.4895238095238095, 'f1-score': 0.4772516248839369, 'support': 525.0}, 'not_entailment': {'precision': 0.8099290780141843, 'recall': 0.7947112038970077, 'f1-score': 0.8022479803301721, 'support': 1437.0}, 'accuracy': 0.7130479102956168, 'macro avg': {'precision': 0.6377543940795559, 'recall': 0.6421175067104086, 'f1-score': 0.6397498026070545, 'support': 1962.0}, 'weighted avg': {'precision': 0.7177866630644597, 'recall': 0.7130479102956168, 'f1-score': 0.7152841237505221, 'support': 1962.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


In [30]:
res = trainer.predict(dstok['validation'])
preds = np.argmax(res.predictions, axis=-1)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Aggregate metrics:  {'MCC': 0.27983788896423273, 'f1_macro': 0.6397498026070545, 'f1_micro': 0.7130479102956168, 'accuracy_balanced': 0.6421175067104086, 'accuracy': 0.7130479102956168, 'precision_macro': 0.6377543940795559, 'recall_macro': 0.6421175067104086, 'precision_micro': 0.7130479102956168, 'recall_micro': 0.7130479102956168}
Detailed metrics:  {'entailment': {'precision': 0.46557971014492755, 'recall': 0.4895238095238095, 'f1-score': 0.4772516248839369, 'support': 525.0}, 'not_entailment': {'precision': 0.8099290780141843, 'recall': 0.7947112038970077, 'f1-score': 0.8022479803301721, 'support': 1437.0}, 'accuracy': 0.7130479102956168, 'macro avg': {'precision': 0.6377543940795559, 'recall': 0.6421175067104086, 'f1-score': 0.6397498026070545, 'support': 1962.0}, 'weighted avg': {'precision': 0.7177866630644597, 'recall': 0.7130479102956168, 'f1-score': 0.7152841237505221, 'support': 1962.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


In [11]:
val['preds'] = preds
metrics(val, ['preds'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val['preds'] = preds


Unnamed: 0_level_0,MCC,Accuracy,F1
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
preds,0.623038,0.858308,0.854046


In [10]:
nlires = trainer.predict(nlitok['test'])
nlipreds = np.argmax(nlires.predictions, axis=-1)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Aggregate metrics:  {'MCC': 0.8969777235203986, 'f1_macro': 0.9483258928158771, 'f1_micro': 0.9502798386047117, 'accuracy_balanced': 0.9462056265163759, 'accuracy': 0.9502798386047117, 'precision_macro': 0.9507837803241533, 'recall_macro': 0.9462056265163759, 'precision_micro': 0.9502798386047117, 'recall_micro': 0.9502798386047117}
Detailed metrics:  {'entailment': {'precision': 0.9532173342087984, 'recall': 0.9237989182309895, 'f1-score': 0.9382775892712877, 'support': 6286.0}, 'not_entailment': {'precision': 0.9483502264395083, 'recall': 0.9686123348017621, 'f1-score': 0.9583741963604664, 'support': 9080.0}, 'accuracy': 0.9502798386047117, 'macro avg': {'precision': 0.9507837803241533, 'recall': 0.9462056265163759, 'f1-score': 0.9483258928158771, 'support': 15366.0}, 'weighted avg': {'precision': 0.9503412871864664, 'recall': 0.9502798386047117, 'f1-score': 0.9501529759932545, 'support': 15366.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


In [23]:
nli = polnli['test'].to_pandas()
nli['fs'] = nlipreds

metrics(nli, ['fs'])

Unnamed: 0_level_0,MCC,Accuracy,F1
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fs,0.896976,0.95028,0.950154
