In [42]:
import pandas as pd
from transformers import pipeline
from sklearn.metrics import matthews_corrcoef, accuracy_score, f1_score, balanced_accuracy_score, precision_recall_fscore_support, classification_report
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np

import os
os.environ["WANDB_PROJECT"] = "offline"

training_directory ='fewshot'
modname = "mlburnham/Political_DEBATE_base_v1.0"
# instantiate model
model = AutoModelForSequenceClassification.from_pretrained(modname, num_labels = 2, ignore_mismatched_sizes=True)
tokenizer = AutoTokenizer.from_pretrained(modname)

def metrics(df, preds, group_by=None):
    true_col = 'entailment'
    
    def get_metrics(y_true, y_pred):
        return {
            'MCC': matthews_corrcoef(y_true, y_pred),
            'Accuracy': accuracy_score(y_true, y_pred),
            'F1': f1_score(y_true, y_pred, average='weighted')
        }
    
    results = []
    
    if group_by not in ['dataset', 'task']:
        for col in preds:
            metrics = get_metrics(df[true_col], df[col])
            metrics['Column'] = col
            results.append(metrics)
    else:
        for col in preds:
            for group_name, group in df.groupby(group_by):
                metrics = get_metrics(group[true_col], group[col])
                metrics['Column'] = col
                metrics[group_by.capitalize()] = group_name
                results.append(metrics)
    
    results_df = pd.DataFrame(results)
    
    if group_by in ['dataset', 'task']:
        return results_df.set_index(['Column', group_by.capitalize()])
    else:
        return results_df.set_index('Column')

def truncate(text):
    words = text.split()
    if len(words) > 450:
        return " ".join(words[:450])
    return text


def tokenize_function(docs):
    return tokenizer(docs['premise'], docs['hypothesis'], padding = 'max_length', truncation = True)


def compute_metrics_standard(eval_pred, label_text_alphabetical=list(model.config.id2label.values())):
    labels = eval_pred.label_ids
    pred_logits = eval_pred.predictions
    preds_max = np.argmax(pred_logits, axis=1)

    # metrics
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(labels, preds_max, average='macro') 
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(labels, preds_max, average='micro')
    acc_balanced = balanced_accuracy_score(labels, preds_max)
    acc_not_balanced = accuracy_score(labels, preds_max)
    mcc = matthews_corrcoef(labels, preds_max)

    metrics = {'MCC': mcc,
            'f1_macro': f1_macro,
            'f1_micro': f1_micro,
            'accuracy_balanced': acc_balanced,
            'accuracy': acc_not_balanced,
            'precision_macro': precision_macro,
            'recall_macro': recall_macro,
            'precision_micro': precision_micro,
            'recall_micro': recall_micro,
            }
    print("Aggregate metrics: ", {key: metrics[key] for key in metrics if key not in ["label_gold_raw", "label_predicted_raw"]} )
    print("Detailed metrics: ", classification_report(
        labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
        target_names=label_text_alphabetical, sample_weight=None,
        digits=2, output_dict=True, zero_division='warn'),
    "\n")

    return metrics

# Freedom and Rights

In [52]:
fr = pd.read_csv('freedom_test.csv')
fr = fr[~fr['text'].isna()]
fr['text'] = fr['text'].astype(str)
fr['hypothesis'] = 'This text is about freedom and rights.'
fr.rename({'text':'premise', 'freedom_and_rights':'entailment'}, inplace = True, axis = 1)
fr.drop_duplicates('premise', inplace = True)
fr['entailment'].replace({1:0, 0:1}, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fr['entailment'].replace({1:0, 0:1}, inplace = True)


### Few Shot

In [53]:
training_args = TrainingArguments(output_dir=training_directory,
    logging_dir=f'{training_directory}/logs',
    lr_scheduler_type= "linear",
    group_by_length=False,
    learning_rate = 2e-5,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 16,
    gradient_accumulation_steps = 1, 
    num_train_epochs=5,
    warmup_ratio=0.06,  
    weight_decay=0.01, 
    fp16=True,   
    fp16_full_eval=True,
    eval_strategy="no",
    seed=1,
    save_strategy="no",
    dataloader_num_workers = 12,
)

tokenizer = AutoTokenizer.from_pretrained(modname)

In [54]:
%%time
# Define a function to initialize the model in the trainer. This will make results reproducible
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(modname, num_labels = 2, ignore_mismatched_sizes=True)

# Define the number of samples (shots) and random seeds to use
shots = [10, 25, 50, 100]
seeds = range(1,11)

# Initialize lists to store results
mcc_list = []
acc_list = []
shots_list = []

# Iterate through different shot sizes
for shot in shots:
    # Iterate through different random seeds
    for seed in seeds:
        # Sample training data based on current shot size and seed
        train = fr.sample(shot, random_state = seed)
        # Create validation set with remaining instances
        val = fr[~fr.index.isin(train.index)]
        
        # Create a DatasetDict with train and validation splits
        ds = DatasetDict({'train': Dataset.from_pandas(train, preserve_index=False), 'validation':Dataset.from_pandas(val, preserve_index=False)})
        # Tokenize the dataset
        dstok = ds.map(tokenize_function, batched = True)
        # Rename 'entailment' column to 'label'
        dstok = dstok.rename_columns({'entailment':'label'})
        # Define label mapping
        id2label = {0: "entailment", 1: "not_entailment"}
        
        # Initialize the Trainer
        trainer = Trainer(
            model_init = model_init,
            tokenizer=tokenizer,
            args=training_args,
            train_dataset=dstok['train'],
            eval_dataset=dstok['validation'],
            compute_metrics=lambda x: compute_metrics_standard(x, label_text_alphabetical=list(model.config.id2label.values()))
        )
        
        # Train the model
        trainer.train()
        # Make predictions on validation set
        res = trainer.predict(dstok['validation'])
        preds = np.argmax(res.predictions, axis=-1)
        
        # Calculate Matthews Correlation Coefficient
        mcc_res = matthews_corrcoef(val['entailment'], preds)
        mcc_list.append(mcc_res)
        # Calculate Accuracy
        acc_res = accuracy_score(val['entailment'], preds)
        acc_list.append(acc_res)
        # Store the current shot size
        shots_list.append(shot)
    
    # Print progress
    print(shot)

Map: 100%|██████████| 10/10 [00:00<00:00, 1536.32 examples/s]
Map: 100%|██████████| 7620/7620 [00:01<00:00, 4660.67 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.701119785385075, 'f1_macro': 0.8504231878390929, 'f1_micro': 0.8682414698162729, 'accuracy_balanced': 0.8474726076273464, 'accuracy': 0.8682414698162729, 'precision_macro': 0.8536746082050919, 'recall_macro': 0.8474726076273464, 'precision_micro': 0.8682414698162729, 'recall_micro': 0.8682414698162729}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8128058727569332, 'recall': 0.7852639873916469, 'f1-score': 0.7987975951903807, 'support': 2538.0}, 'not_entailment': {'precision': 0.8945433436532507, 'recall': 0.9096812278630461, 'f1-score': 0.9020487804878049, 'support': 5082.0}, 'accuracy': 0.8682414698162729, 'macro avg': {'precision': 0.8536746082050919, 'recall': 0.8474726076273464, 'f1-score': 0.8504231878390929, 'support': 7620.0}, 'weighted avg': {'precision': 0.8673189734255796, 'recall': 0.8682414698162729, 'f1-score': 0.8676588187706313, 'support': 7620.0}} 



Map: 100%|██████████| 10/10 [00:00<00:00, 1517.26 examples/s]
Map: 100%|██████████| 7620/7620 [00:01<00:00, 4419.96 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7566455973726013, 'f1_macro': 0.8768041558527826, 'f1_micro': 0.8937007874015748, 'accuracy_balanced': 0.8667198218675234, 'accuracy': 0.8937007874015748, 'precision_macro': 0.8902928924783887, 'recall_macro': 0.8667198218675234, 'precision_micro': 0.8937007874015748, 'recall_micro': 0.8937007874015748}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8819106590004423, 'recall': 0.785967678360268, 'f1-score': 0.8311796581909129, 'support': 2537.0}, 'not_entailment': {'precision': 0.8986751259563351, 'recall': 0.9474719653747786, 'f1-score': 0.9224286535146524, 'support': 5083.0}, 'accuracy': 0.8937007874015748, 'macro avg': {'precision': 0.8902928924783887, 'recall': 0.8667198218675234, 'f1-score': 0.8768041558527826, 'support': 7620.0}, 'weighted avg': {'precision': 0.8930935704882117, 'recall': 0.8937007874015748, 'f1-score': 0.8920482465413812, 'support': 7620.0}} 



Map: 100%|██████████| 10/10 [00:00<00:00, 1855.72 examples/s]
Map: 100%|██████████| 7620/7620 [00:01<00:00, 4824.35 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.6789855337106463, 'f1_macro': 0.8221673710301524, 'f1_micro': 0.8581364829396325, 'accuracy_balanced': 0.7967279905474611, 'accuracy': 0.8581364829396325, 'precision_macro': 0.8884208514823205, 'recall_macro': 0.7967279905474611, 'precision_micro': 0.8581364829396325, 'recall_micro': 0.8581364829396325}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.9418886198547215, 'recall': 0.6123573396300669, 'f1-score': 0.7421893632244216, 'support': 2541.0}, 'not_entailment': {'precision': 0.8349530831099196, 'recall': 0.9810986414648553, 'f1-score': 0.9021453788358831, 'support': 5079.0}, 'accuracy': 0.8581364829396325, 'macro avg': {'precision': 0.8884208514823205, 'recall': 0.7967279905474611, 'f1-score': 0.8221673710301524, 'support': 7620.0}, 'weighted avg': {'precision': 0.8706122955598594, 'recall': 0.8581364829396325, 'f1-score': 0.8488057153622973, 'support': 7620.0}} 



Map: 100%|██████████| 10/10 [00:00<00:00, 1335.98 examples/s]
Map: 100%|██████████| 7620/7620 [00:01<00:00, 4191.46 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7603451695099551, 'f1_macro': 0.8799216411916391, 'f1_micro': 0.8944881889763779, 'accuracy_balanced': 0.8756889763779527, 'accuracy': 0.8944881889763779, 'precision_macro': 0.8847097021390335, 'recall_macro': 0.8756889763779527, 'precision_micro': 0.8944881889763779, 'recall_micro': 0.8944881889763779}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.857790601813685, 'recall': 0.8192913385826772, 'f1-score': 0.838099073701168, 'support': 2540.0}, 'not_entailment': {'precision': 0.9116288024643819, 'recall': 0.9320866141732284, 'f1-score': 0.9217442086821102, 'support': 5080.0}, 'accuracy': 0.8944881889763779, 'macro avg': {'precision': 0.8847097021390335, 'recall': 0.8756889763779527, 'f1-score': 0.8799216411916391, 'support': 7620.0}, 'weighted avg': {'precision': 0.8936827355808163, 'recall': 0.8944881889763779, 'f1-score': 0.893862497021796, 'support': 7620.0}} 



Map: 100%|██████████| 10/10 [00:00<00:00, 1641.86 examples/s]
Map: 100%|██████████| 7620/7620 [00:01<00:00, 4334.27 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.707420276699075, 'f1_macro': 0.8455690829119615, 'f1_micro': 0.8545931758530184, 'accuracy_balanced': 0.8710686017703287, 'accuracy': 0.8545931758530184, 'precision_macro': 0.8371636979640917, 'recall_macro': 0.8710686017703287, 'precision_micro': 0.8545931758530184, 'recall_micro': 0.8545931758530184}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.7204566491823511, 'recall': 0.920378399684667, 'f1-score': 0.8082381446867428, 'support': 2537.0}, 'not_entailment': {'precision': 0.9538707467458324, 'recall': 0.8217588038559905, 'f1-score': 0.8829000211371802, 'support': 5083.0}, 'accuracy': 0.8545931758530184, 'macro avg': {'precision': 0.8371636979640917, 'recall': 0.8710686017703287, 'f1-score': 0.8455690829119615, 'support': 7620.0}, 'weighted avg': {'precision': 0.8761579428720067, 'recall': 0.8545931758530184, 'f1-score': 0.8580421234265818, 'support': 7620.0}} 



Map: 100%|██████████| 10/10 [00:00<00:00, 1655.34 examples/s]
Map: 100%|██████████| 7620/7620 [00:01<00:00, 4853.91 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7602342095459054, 'f1_macro': 0.8757551299924181, 'f1_micro': 0.89501312335958, 'accuracy_balanced': 0.8593049393833292, 'accuracy': 0.89501312335958, 'precision_macro': 0.9021347816396754, 'recall_macro': 0.8593049393833292, 'precision_micro': 0.89501312335958, 'recall_micro': 0.89501312335958}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.9178279673234022, 'recall': 0.7522646711303663, 'f1-score': 0.8268398268398268, 'support': 2539.0}, 'not_entailment': {'precision': 0.8864415959559487, 'recall': 0.9663452076362921, 'f1-score': 0.9246704331450094, 'support': 5081.0}, 'accuracy': 0.89501312335958, 'macro avg': {'precision': 0.9021347816396754, 'recall': 0.8593049393833292, 'f1-score': 0.8757551299924181, 'support': 7620.0}, 'weighted avg': {'precision': 0.8968996007987262, 'recall': 0.89501312335958, 'f1-score': 0.8920730697055266, 'support': 7620.0}} 



Map: 100%|██████████| 10/10 [00:00<00:00, 1660.52 examples/s]
Map: 100%|██████████| 7620/7620 [00:01<00:00, 4846.80 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.733677646305904, 'f1_macro': 0.8665877223868812, 'f1_micro': 0.8828083989501312, 'accuracy_balanced': 0.8624684599445656, 'accuracy': 0.8828083989501312, 'precision_macro': 0.8712618808070184, 'recall_macro': 0.8624684599445656, 'precision_micro': 0.8828083989501312, 'recall_micro': 0.8828083989501312}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8395214521452146, 'recall': 0.8014966522252855, 'f1-score': 0.8200685069514406, 'support': 2539.0}, 'not_entailment': {'precision': 0.9030023094688222, 'recall': 0.9234402676638457, 'f1-score': 0.9131069378223217, 'support': 5081.0}, 'accuracy': 0.8828083989501312, 'macro avg': {'precision': 0.8712618808070184, 'recall': 0.8624684599445656, 'f1-score': 0.8665877223868812, 'support': 7620.0}, 'weighted avg': {'precision': 0.8818503545154575, 'recall': 0.8828083989501312, 'f1-score': 0.8821063372998589, 'support': 7620.0}} 



Map: 100%|██████████| 10/10 [00:00<00:00, 1685.20 examples/s]
Map: 100%|██████████| 7620/7620 [00:01<00:00, 4852.32 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.6883166430890707, 'f1_macro': 0.8312614296428736, 'f1_micro': 0.8632545931758531, 'accuracy_balanced': 0.8079045419307649, 'accuracy': 0.8632545931758531, 'precision_macro': 0.8846807505521799, 'recall_macro': 0.8079045419307649, 'precision_micro': 0.8632545931758531, 'recall_micro': 0.8632545931758531}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.9245604083947816, 'recall': 0.6419850334777472, 'f1-score': 0.7577870757787076, 'support': 2539.0}, 'not_entailment': {'precision': 0.8448010927095783, 'recall': 0.9738240503837827, 'f1-score': 0.9047357835070396, 'support': 5081.0}, 'accuracy': 0.8632545931758531, 'macro avg': {'precision': 0.8846807505521799, 'recall': 0.8079045419307649, 'f1-score': 0.8312614296428736, 'support': 7620.0}, 'weighted avg': {'precision': 0.8713770641695168, 'recall': 0.8632545931758531, 'f1-score': 0.8557721655382424, 'support': 7620.0}} 



Map: 100%|██████████| 10/10 [00:00<00:00, 1508.42 examples/s]
Map: 100%|██████████| 7620/7620 [00:01<00:00, 4324.15 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7556648320523389, 'f1_macro': 0.8739493134262551, 'f1_micro': 0.8931758530183727, 'accuracy_balanced': 0.8584198683183549, 'accuracy': 0.8931758530183727, 'precision_macro': 0.8982963758955818, 'recall_macro': 0.8584198683183549, 'precision_micro': 0.8931758530183727, 'recall_micro': 0.8931758530183727}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.9097387173396675, 'recall': 0.7542339503741631, 'f1-score': 0.8247200689061154, 'support': 2539.0}, 'not_entailment': {'precision': 0.8868540344514959, 'recall': 0.9626057862625468, 'f1-score': 0.9231785579463949, 'support': 5081.0}, 'accuracy': 0.8931758530183727, 'macro avg': {'precision': 0.8982963758955818, 'recall': 0.8584198683183549, 'f1-score': 0.8739493134262551, 'support': 7620.0}, 'weighted avg': {'precision': 0.8944792588416622, 'recall': 0.8931758530183727, 'f1-score': 0.8903719826611889, 'support': 7620.0}} 



Map: 100%|██████████| 10/10 [00:00<00:00, 1659.34 examples/s]
Map: 100%|██████████| 7620/7620 [00:01<00:00, 4836.15 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.6864185485672278, 'f1_macro': 0.8261822190297737, 'f1_micro': 0.8611548556430446, 'accuracy_balanced': 0.8005654693621187, 'accuracy': 0.8611548556430446, 'precision_macro': 0.8919033221090658, 'recall_macro': 0.8005654693621187, 'precision_micro': 0.8611548556430446, 'recall_micro': 0.8611548556430446}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.946417820590006, 'recall': 0.6186540731995277, 'f1-score': 0.7482151356496907, 'support': 2541.0}, 'not_entailment': {'precision': 0.8373888236281255, 'recall': 0.9824768655247096, 'f1-score': 0.9041493024098568, 'support': 5079.0}, 'accuracy': 0.8611548556430446, 'macro avg': {'precision': 0.8919033221090658, 'recall': 0.8005654693621187, 'f1-score': 0.8261822190297737, 'support': 7620.0}, 'weighted avg': {'precision': 0.8737461308827369, 'recall': 0.8611548556430446, 'f1-score': 0.8521507830217226, 'support': 7620.0}} 

10


Map: 100%|██████████| 25/25 [00:00<00:00, 2931.03 examples/s]
Map: 100%|██████████| 7605/7605 [00:01<00:00, 4866.10 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7645377075959617, 'f1_macro': 0.8793808935660148, 'f1_micro': 0.8970414201183432, 'accuracy_balanced': 0.865680473372781, 'accuracy': 0.8970414201183432, 'precision_macro': 0.8996097337006428, 'recall_macro': 0.865680473372781, 'precision_micro': 0.8970414201183432, 'recall_micro': 0.8970414201183432}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.9055555555555556, 'recall': 0.7715976331360946, 'f1-score': 0.8332268370607029, 'support': 2535.0}, 'not_entailment': {'precision': 0.89366391184573, 'recall': 0.9597633136094674, 'f1-score': 0.9255349500713267, 'support': 5070.0}, 'accuracy': 0.8970414201183432, 'macro avg': {'precision': 0.8996097337006428, 'recall': 0.865680473372781, 'f1-score': 0.8793808935660148, 'support': 7605.0}, 'weighted avg': {'precision': 0.8976277930823386, 'recall': 0.8970414201183432, 'f1-score': 0.8947655790677855, 'support': 7605.0}} 



Map: 100%|██████████| 25/25 [00:00<00:00, 2109.98 examples/s]
Map: 100%|██████████| 7605/7605 [00:01<00:00, 4642.69 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7520554596414432, 'f1_macro': 0.8731291643930856, 'f1_micro': 0.8917817225509533, 'accuracy_balanced': 0.8595799251146694, 'accuracy': 0.8917817225509533, 'precision_macro': 0.893227885425012, 'recall_macro': 0.8595799251146694, 'precision_micro': 0.8917817225509533, 'recall_micro': 0.8917817225509533}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8965677179962894, 'recall': 0.7631267272009475, 'f1-score': 0.8244828321603753, 'support': 2533.0}, 'not_entailment': {'precision': 0.8898880528537346, 'recall': 0.9560331230283912, 'f1-score': 0.921775496625796, 'support': 5072.0}, 'accuracy': 0.8917817225509533, 'macro avg': {'precision': 0.893227885425012, 'recall': 0.8595799251146694, 'f1-score': 0.8731291643930856, 'support': 7605.0}, 'weighted avg': {'precision': 0.8921128512503278, 'recall': 0.8917817225509533, 'f1-score': 0.8893701949701864, 'support': 7605.0}} 



Map: 100%|██████████| 25/25 [00:00<00:00, 2672.14 examples/s]
Map: 100%|██████████| 7605/7605 [00:01<00:00, 4158.52 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7444261428744835, 'f1_macro': 0.8700818561825343, 'f1_micro': 0.8884944115713347, 'accuracy_balanced': 0.8584406479296282, 'accuracy': 0.8884944115713347, 'precision_macro': 0.8865146750207442, 'recall_macro': 0.8584406479296282, 'precision_micro': 0.8884944115713347, 'recall_micro': 0.8884944115713347}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8817934782608695, 'recall': 0.7683504340962904, 'f1-score': 0.8211725010544074, 'support': 2534.0}, 'not_entailment': {'precision': 0.8912358717806189, 'recall': 0.9485308617629659, 'f1-score': 0.9189912113106611, 'support': 5071.0}, 'accuracy': 0.8884944115713347, 'macro avg': {'precision': 0.8865146750207442, 'recall': 0.8584406479296282, 'f1-score': 0.8700818561825343, 'support': 7605.0}, 'weighted avg': {'precision': 0.8880896488773915, 'recall': 0.8884944115713347, 'f1-score': 0.886397836979386, 'support': 7605.0}} 



Map: 100%|██████████| 25/25 [00:00<00:00, 2755.93 examples/s]
Map: 100%|██████████| 7605/7605 [00:01<00:00, 4842.14 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7129444951398461, 'f1_macro': 0.8503391693846716, 'f1_micro': 0.8749506903353057, 'accuracy_balanced': 0.8323371636355937, 'accuracy': 0.8749506903353057, 'precision_macro': 0.8823600764279463, 'recall_macro': 0.8323371636355937, 'precision_micro': 0.8749506903353057, 'recall_micro': 0.8749506903353057}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.897887323943662, 'recall': 0.7046979865771812, 'f1-score': 0.7896483078964831, 'support': 2533.0}, 'not_entailment': {'precision': 0.8668328289122307, 'recall': 0.9599763406940063, 'f1-score': 0.9110300308728599, 'support': 5072.0}, 'accuracy': 0.8749506903353057, 'macro avg': {'precision': 0.8823600764279463, 'recall': 0.8323371636355937, 'f1-score': 0.8503391693846716, 'support': 7605.0}, 'weighted avg': {'precision': 0.8771761603934425, 'recall': 0.8749506903353057, 'f1-score': 0.8706013781050542, 'support': 7605.0}} 



Map: 100%|██████████| 25/25 [00:00<00:00, 2807.81 examples/s]
Map: 100%|██████████| 7605/7605 [00:01<00:00, 4824.11 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7457101635351137, 'f1_macro': 0.8727976281966974, 'f1_micro': 0.8875739644970414, 'accuracy_balanced': 0.8707810418030812, 'accuracy': 0.8875739644970414, 'precision_macro': 0.8749407233008539, 'recall_macro': 0.8707810418030812, 'precision_micro': 0.8875739644970414, 'recall_micro': 0.8875739644970414}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8386446147640177, 'recall': 0.8204419889502762, 'f1-score': 0.829443447037702, 'support': 2534.0}, 'not_entailment': {'precision': 0.9112368318376902, 'recall': 0.9211200946558864, 'f1-score': 0.9161518093556928, 'support': 5071.0}, 'accuracy': 0.8875739644970414, 'macro avg': {'precision': 0.8749407233008539, 'recall': 0.8707810418030812, 'f1-score': 0.8727976281966974, 'support': 7605.0}, 'weighted avg': {'precision': 0.8870489714741548, 'recall': 0.8875739644970414, 'f1-score': 0.8872604234104216, 'support': 7605.0}} 



Map: 100%|██████████| 25/25 [00:00<00:00, 2734.16 examples/s]
Map: 100%|██████████| 7605/7605 [00:01<00:00, 4822.46 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7665340672700671, 'f1_macro': 0.8824255989474702, 'f1_micro': 0.897698882314267, 'accuracy_balanced': 0.8747534516765285, 'accuracy': 0.897698882314267, 'precision_macro': 0.8919740256273618, 'recall_macro': 0.8747534516765285, 'precision_micro': 0.897698882314267, 'recall_micro': 0.897698882314267}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8772005152425933, 'recall': 0.8059171597633136, 'f1-score': 0.8400493421052632, 'support': 2535.0}, 'not_entailment': {'precision': 0.9067475360121304, 'recall': 0.9435897435897436, 'f1-score': 0.9248018557896772, 'support': 5070.0}, 'accuracy': 0.897698882314267, 'macro avg': {'precision': 0.8919740256273618, 'recall': 0.8747534516765285, 'f1-score': 0.8824255989474702, 'support': 7605.0}, 'weighted avg': {'precision': 0.8968985290889514, 'recall': 0.897698882314267, 'f1-score': 0.8965510178948725, 'support': 7605.0}} 



Map: 100%|██████████| 25/25 [00:00<00:00, 2163.08 examples/s]
Map: 100%|██████████| 7605/7605 [00:01<00:00, 4219.75 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.6556463669900423, 'f1_macro': 0.8060496472432714, 'f1_micro': 0.8476002629848783, 'accuracy_balanced': 0.7799467253239272, 'accuracy': 0.8476002629848783, 'precision_macro': 0.8838874682761825, 'recall_macro': 0.7799467253239272, 'precision_micro': 0.8476002629848783, 'recall_micro': 0.8476002629848783}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.9450904392764858, 'recall': 0.5766653527788727, 'f1-score': 0.7162790697674418, 'support': 2537.0}, 'not_entailment': {'precision': 0.8226844972758791, 'recall': 0.9832280978689818, 'f1-score': 0.8958202247191012, 'support': 5068.0}, 'accuracy': 0.8476002629848783, 'macro avg': {'precision': 0.8838874682761825, 'recall': 0.7799467253239272, 'f1-score': 0.8060496472432714, 'support': 7605.0}, 'weighted avg': {'precision': 0.8635186688545168, 'recall': 0.8476002629848783, 'f1-score': 0.8359259564597508, 'support': 7605.0}} 



Map: 100%|██████████| 25/25 [00:00<00:00, 2497.20 examples/s]
Map: 100%|██████████| 7605/7605 [00:01<00:00, 4818.93 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7523952172020095, 'f1_macro': 0.8727666912131289, 'f1_micro': 0.8917817225509533, 'accuracy_balanced': 0.8581745105353165, 'accuracy': 0.8917817225509533, 'precision_macro': 0.8951276167184439, 'recall_macro': 0.8581745105353165, 'precision_micro': 0.8917817225509533, 'recall_micro': 0.8917817225509533}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.9027255639097744, 'recall': 0.7571935356720536, 'f1-score': 0.8235798499464094, 'support': 2537.0}, 'not_entailment': {'precision': 0.8875296695271134, 'recall': 0.9591554853985793, 'f1-score': 0.9219535324798482, 'support': 5068.0}, 'accuracy': 0.8917817225509533, 'macro avg': {'precision': 0.8951276167184439, 'recall': 0.8581745105353165, 'f1-score': 0.8727666912131289, 'support': 7605.0}, 'weighted avg': {'precision': 0.892598963945103, 'recall': 0.8917817225509533, 'f1-score': 0.8891364341777661, 'support': 7605.0}} 



Map: 100%|██████████| 25/25 [00:00<00:00, 2763.26 examples/s]
Map: 100%|██████████| 7605/7605 [00:01<00:00, 4764.37 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7671027305618837, 'f1_macro': 0.8835240262000228, 'f1_micro': 0.8961209730440499, 'accuracy_balanced': 0.8849674050171801, 'accuracy': 0.8961209730440499, 'precision_macro': 0.8821405342156416, 'recall_macro': 0.8849674050171801, 'precision_micro': 0.8961209730440499, 'recall_micro': 0.8961209730440499}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.838973162193699, 'recall': 0.8515594157125937, 'f1-score': 0.8452194357366771, 'support': 2533.0}, 'not_entailment': {'precision': 0.9253079062375844, 'recall': 0.9183753943217665, 'f1-score': 0.9218286166633683, 'support': 5072.0}, 'accuracy': 0.8961209730440499, 'macro avg': {'precision': 0.8821405342156416, 'recall': 0.8849674050171801, 'f1-score': 0.8835240262000228, 'support': 7605.0}, 'weighted avg': {'precision': 0.8965523629551174, 'recall': 0.8961209730440499, 'f1-score': 0.8963123700772659, 'support': 7605.0}} 



Map: 100%|██████████| 25/25 [00:00<00:00, 2526.93 examples/s]
Map: 100%|██████████| 7605/7605 [00:01<00:00, 4485.20 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7174905071995364, 'f1_macro': 0.8530280535528545, 'f1_micro': 0.8767915844838922, 'accuracy_balanced': 0.8355101794156818, 'accuracy': 0.8767915844838922, 'precision_macro': 0.8835894255265229, 'recall_macro': 0.8355101794156818, 'precision_micro': 0.8767915844838922, 'recall_micro': 0.8767915844838922}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8980099502487562, 'recall': 0.7114702404414663, 'f1-score': 0.7939300637783153, 'support': 2537.0}, 'not_entailment': {'precision': 0.8691689008042895, 'recall': 0.9595501183898973, 'f1-score': 0.9121260433273938, 'support': 5068.0}, 'accuracy': 0.8767915844838922, 'macro avg': {'precision': 0.8835894255265229, 'recall': 0.8355101794156818, 'f1-score': 0.8530280535528545, 'support': 7605.0}, 'weighted avg': {'precision': 0.8787901687123253, 'recall': 0.8767915844838922, 'f1-score': 0.8726962997223956, 'support': 7605.0}} 

25


Map: 100%|██████████| 50/50 [00:00<00:00, 3443.15 examples/s]
Map: 100%|██████████| 7580/7580 [00:01<00:00, 4844.49 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7736634717425136, 'f1_macro': 0.885729168571127, 'f1_micro': 0.9007915567282322, 'accuracy_balanced': 0.8769590216588613, 'accuracy': 0.9007915567282322, 'precision_macro': 0.8969630205921673, 'recall_macro': 0.8769590216588613, 'precision_micro': 0.9007915567282322, 'recall_micro': 0.9007915567282322}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8872442316064432, 'recall': 0.8052153299091268, 'f1-score': 0.844241922120961, 'support': 2531.0}, 'not_entailment': {'precision': 0.9066818095778914, 'recall': 0.9487027134085958, 'f1-score': 0.927216415021293, 'support': 5049.0}, 'accuracy': 0.9007915567282322, 'macro avg': {'precision': 0.8969630205921673, 'recall': 0.8769590216588613, 'f1-score': 0.885729168571127, 'support': 7580.0}, 'weighted avg': {'precision': 0.9001915048489025, 'recall': 0.9007915567282322, 'f1-score': 0.8995108158747573, 'support': 7580.0}} 



Map: 100%|██████████| 50/50 [00:00<00:00, 3429.02 examples/s]
Map: 100%|██████████| 7580/7580 [00:01<00:00, 4323.32 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7568132136234244, 'f1_macro': 0.8782795450266075, 'f1_micro': 0.8927440633245383, 'accuracy_balanced': 0.8752554589861381, 'accuracy': 0.8927440633245383, 'precision_macro': 0.8815842158982241, 'recall_macro': 0.8752554589861381, 'precision_micro': 0.8927440633245383, 'recall_micro': 0.8927440633245383}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8501841997544003, 'recall': 0.8229001584786054, 'f1-score': 0.8363197100865714, 'support': 2524.0}, 'not_entailment': {'precision': 0.9129842320420479, 'recall': 0.9276107594936709, 'f1-score': 0.9202393799666437, 'support': 5056.0}, 'accuracy': 0.8927440633245383, 'macro avg': {'precision': 0.8815842158982241, 'recall': 0.8752554589861381, 'f1-score': 0.8782795450266075, 'support': 7580.0}, 'weighted avg': {'precision': 0.8920729811853167, 'recall': 0.8927440633245383, 'f1-score': 0.8922956798640971, 'support': 7580.0}} 



Map: 100%|██████████| 50/50 [00:00<00:00, 2528.46 examples/s]
Map: 100%|██████████| 7580/7580 [00:01<00:00, 4617.13 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7438624935637211, 'f1_macro': 0.8718713119590873, 'f1_micro': 0.8868073878627968, 'accuracy_balanced': 0.8698132422415696, 'accuracy': 0.8868073878627968, 'precision_macro': 0.8740613816158249, 'recall_macro': 0.8698132422415696, 'precision_micro': 0.8868073878627968, 'recall_micro': 0.8868073878627968}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8375202593192869, 'recall': 0.8189381933438986, 'f1-score': 0.828125, 'support': 2524.0}, 'not_entailment': {'precision': 0.9106025039123631, 'recall': 0.9206882911392406, 'f1-score': 0.9156176239181747, 'support': 5056.0}, 'accuracy': 0.8868073878627968, 'macro avg': {'precision': 0.8740613816158249, 'recall': 0.8698132422415696, 'f1-score': 0.8718713119590873, 'support': 7580.0}, 'weighted avg': {'precision': 0.8862674662668585, 'recall': 0.8868073878627968, 'f1-score': 0.8864841961121757, 'support': 7580.0}} 



Map: 100%|██████████| 50/50 [00:00<00:00, 2892.70 examples/s]
Map: 100%|██████████| 7580/7580 [00:01<00:00, 4493.74 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7825754693825839, 'f1_macro': 0.8907685171903045, 'f1_micro': 0.9046174142480211, 'accuracy_balanced': 0.8845640874413377, 'accuracy': 0.9046174142480211, 'precision_macro': 0.8981289369439573, 'recall_macro': 0.8845640874413377, 'precision_micro': 0.9046174142480211, 'recall_micro': 0.9046174142480211}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8809322033898305, 'recall': 0.824672748909163, 'f1-score': 0.8518746158574063, 'support': 2521.0}, 'not_entailment': {'precision': 0.9153256704980843, 'recall': 0.9444554259735125, 'f1-score': 0.9296624185232026, 'support': 5059.0}, 'accuracy': 0.9046174142480211, 'macro avg': {'precision': 0.8981289369439573, 'recall': 0.8845640874413377, 'f1-score': 0.8907685171903045, 'support': 7580.0}, 'weighted avg': {'precision': 0.9038868933767245, 'recall': 0.9046174142480211, 'f1-score': 0.9037913036788131, 'support': 7580.0}} 



Map: 100%|██████████| 50/50 [00:00<00:00, 3610.61 examples/s]
Map: 100%|██████████| 7580/7580 [00:01<00:00, 4851.59 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7678395639627681, 'f1_macro': 0.8838071023434142, 'f1_micro': 0.8976253298153034, 'accuracy_balanced': 0.8809179018011895, 'accuracy': 0.8976253298153034, 'precision_macro': 0.8869453189248173, 'recall_macro': 0.8809179018011895, 'precision_micro': 0.8976253298153034, 'recall_micro': 0.8976253298153034}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8568507157464212, 'recall': 0.8310194367314557, 'f1-score': 0.8437374144180427, 'support': 2521.0}, 'not_entailment': {'precision': 0.9170399221032133, 'recall': 0.9308163668709231, 'f1-score': 0.9238767902687856, 'support': 5059.0}, 'accuracy': 0.8976253298153034, 'macro avg': {'precision': 0.8869453189248173, 'recall': 0.8809179018011895, 'f1-score': 0.8838071023434142, 'support': 7580.0}, 'weighted avg': {'precision': 0.8970218496460269, 'recall': 0.8976253298153034, 'f1-score': 0.8972235756883473, 'support': 7580.0}} 



Map: 100%|██████████| 50/50 [00:00<00:00, 3543.74 examples/s]
Map: 100%|██████████| 7580/7580 [00:01<00:00, 4260.91 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7561888271703352, 'f1_macro': 0.8739608903492715, 'f1_micro': 0.8932717678100264, 'accuracy_balanced': 0.8580217252588249, 'accuracy': 0.8932717678100264, 'precision_macro': 0.8992924884124416, 'recall_macro': 0.8580217252588249, 'precision_micro': 0.8932717678100264, 'recall_micro': 0.8932717678100264}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.9126679462571977, 'recall': 0.7520759193357058, 'f1-score': 0.8246260567960113, 'support': 2529.0}, 'not_entailment': {'precision': 0.8859170305676856, 'recall': 0.9639675311819441, 'f1-score': 0.9232957239025316, 'support': 5051.0}, 'accuracy': 0.8932717678100264, 'macro avg': {'precision': 0.8992924884124416, 'recall': 0.8580217252588249, 'f1-score': 0.8739608903492715, 'support': 7580.0}, 'weighted avg': {'precision': 0.8948422371348065, 'recall': 0.8932717678100264, 'f1-score': 0.8903754616185752, 'support': 7580.0}} 



Map: 100%|██████████| 50/50 [00:00<00:00, 3639.94 examples/s]
Map: 100%|██████████| 7580/7580 [00:01<00:00, 4871.73 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7046465018639259, 'f1_macro': 0.8420087352853978, 'f1_micro': 0.8704485488126649, 'accuracy_balanced': 0.820067263301435, 'accuracy': 0.8704485488126649, 'precision_macro': 0.8878299575747659, 'recall_macro': 0.820067263301435, 'precision_micro': 0.8704485488126649, 'recall_micro': 0.8704485488126649}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.9215258855585831, 'recall': 0.6686437327006722, 'f1-score': 0.7749770852428964, 'support': 2529.0}, 'not_entailment': {'precision': 0.8541340295909486, 'recall': 0.9714907939021976, 'f1-score': 0.9090403853278992, 'support': 5051.0}, 'accuracy': 0.8704485488126649, 'macro avg': {'precision': 0.8878299575747659, 'recall': 0.820067263301435, 'f1-score': 0.8420087352853978, 'support': 7580.0}, 'weighted avg': {'precision': 0.8766187266545565, 'recall': 0.8704485488126649, 'f1-score': 0.8643113502467684, 'support': 7580.0}} 



Map: 100%|██████████| 50/50 [00:00<00:00, 3132.04 examples/s]
Map: 100%|██████████| 7580/7580 [00:01<00:00, 4748.08 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7678954341339881, 'f1_macro': 0.8818281947245236, 'f1_micro': 0.8984168865435356, 'accuracy_balanced': 0.869954884643336, 'accuracy': 0.8984168865435356, 'precision_macro': 0.8984698014815409, 'recall_macro': 0.869954884643336, 'precision_micro': 0.8984168865435356, 'recall_micro': 0.8984168865435356}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.898596650067904, 'recall': 0.7842749901224813, 'f1-score': 0.8375527426160337, 'support': 2531.0}, 'not_entailment': {'precision': 0.8983429528951778, 'recall': 0.9556347791641909, 'f1-score': 0.9261036468330134, 'support': 5049.0}, 'accuracy': 0.8984168865435356, 'macro avg': {'precision': 0.8984698014815409, 'recall': 0.869954884643336, 'f1-score': 0.8818281947245236, 'support': 7580.0}, 'weighted avg': {'precision': 0.898427663652984, 'recall': 0.8984168865435356, 'f1-score': 0.8965360559922252, 'support': 7580.0}} 



Map: 100%|██████████| 50/50 [00:00<00:00, 3704.56 examples/s]
Map: 100%|██████████| 7580/7580 [00:01<00:00, 4286.93 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7954366489038037, 'f1_macro': 0.8976036903055538, 'f1_micro': 0.9097625329815303, 'accuracy_balanced': 0.8946115621831952, 'accuracy': 0.9097625329815303, 'precision_macro': 0.900849546145317, 'recall_macro': 0.8946115621831952, 'precision_micro': 0.9097625329815303, 'recall_micro': 0.9097625329815303}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8757154538021259, 'recall': 0.8493259318001586, 'f1-score': 0.8623188405797102, 'support': 2522.0}, 'not_entailment': {'precision': 0.925983638488508, 'recall': 0.9398971925662317, 'f1-score': 0.9328885400313972, 'support': 5058.0}, 'accuracy': 0.9097625329815303, 'macro avg': {'precision': 0.900849546145317, 'recall': 0.8946115621831952, 'f1-score': 0.8976036903055538, 'support': 7580.0}, 'weighted avg': {'precision': 0.9092585247973397, 'recall': 0.9097625329815303, 'f1-score': 0.9094087534855984, 'support': 7580.0}} 



Map: 100%|██████████| 50/50 [00:00<00:00, 2746.33 examples/s]
Map: 100%|██████████| 7580/7580 [00:01<00:00, 4694.37 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7532613806989781, 'f1_macro': 0.8756440502690481, 'f1_micro': 0.8919525065963061, 'accuracy_balanced': 0.8674957505410021, 'accuracy': 0.8919525065963061, 'precision_macro': 0.8859926997912488, 'recall_macro': 0.8674957505410021, 'precision_micro': 0.8919525065963061, 'recall_micro': 0.8919525065963061}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8707718993928881, 'recall': 0.7939897192566232, 'f1-score': 0.8306101344364012, 'support': 2529.0}, 'not_entailment': {'precision': 0.9012135001896094, 'recall': 0.9410017818253811, 'f1-score': 0.920677966101695, 'support': 5051.0}, 'accuracy': 0.8919525065963061, 'macro avg': {'precision': 0.8859926997912488, 'recall': 0.8674957505410021, 'f1-score': 0.8756440502690481, 'support': 7580.0}, 'weighted avg': {'precision': 0.891056929158619, 'recall': 0.8919525065963061, 'f1-score': 0.890627630180649, 'support': 7580.0}} 

50


Map: 100%|██████████| 100/100 [00:00<00:00, 4101.17 examples/s]
Map: 100%|██████████| 7530/7530 [00:01<00:00, 4853.93 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7725980111817501, 'f1_macro': 0.8860077583659225, 'f1_micro': 0.899734395750332, 'accuracy_balanced': 0.8814303252970499, 'accuracy': 0.899734395750332, 'precision_macro': 0.8912298310426264, 'recall_macro': 0.8814303252970499, 'precision_micro': 0.899734395750332, 'recall_micro': 0.899734395750332}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8678065054211843, 'recall': 0.8261214767764986, 'f1-score': 0.8464510880618263, 'support': 2519.0}, 'not_entailment': {'precision': 0.9146531566640685, 'recall': 0.9367391738176013, 'f1-score': 0.9255644286700188, 'support': 5011.0}, 'accuracy': 0.899734395750332, 'macro avg': {'precision': 0.8912298310426264, 'recall': 0.8814303252970499, 'f1-score': 0.8860077583659225, 'support': 7530.0}, 'weighted avg': {'precision': 0.8989816142363362, 'recall': 0.899734395750332, 'f1-score': 0.8990987573563353, 'support': 7530.0}} 



Map: 100%|██████████| 100/100 [00:00<00:00, 3378.23 examples/s]
Map: 100%|██████████| 7530/7530 [00:01<00:00, 4494.25 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7982156471603467, 'f1_macro': 0.8988341604597325, 'f1_micro': 0.9112881806108898, 'accuracy_balanced': 0.8942163691481182, 'accuracy': 0.9112881806108898, 'precision_macro': 0.9040599713987374, 'recall_macro': 0.8942163691481182, 'precision_micro': 0.9112881806108898, 'recall_micro': 0.9112881806108898}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8843252305113161, 'recall': 0.8433253397282174, 'f1-score': 0.8633387888707038, 'support': 2502.0}, 'not_entailment': {'precision': 0.9237947122861586, 'recall': 0.9451073985680191, 'f1-score': 0.9343295320487613, 'support': 5028.0}, 'accuracy': 0.9112881806108898, 'macro avg': {'precision': 0.9040599713987374, 'recall': 0.8942163691481182, 'f1-score': 0.8988341604597325, 'support': 7530.0}, 'weighted avg': {'precision': 0.9106801514095774, 'recall': 0.9112881806108898, 'f1-score': 0.9107413727617096, 'support': 7530.0}} 



Map: 100%|██████████| 100/100 [00:00<00:00, 4180.80 examples/s]
Map: 100%|██████████| 7530/7530 [00:01<00:00, 4847.45 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7741580111182207, 'f1_macro': 0.8867130395377818, 'f1_micro': 0.900796812749004, 'accuracy_balanced': 0.8815282867101617, 'accuracy': 0.900796812749004, 'precision_macro': 0.8927104798351486, 'recall_macro': 0.8815282867101617, 'precision_micro': 0.900796812749004, 'recall_micro': 0.900796812749004}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8708860759493671, 'recall': 0.8239520958083832, 'f1-score': 0.8467692307692307, 'support': 2505.0}, 'not_entailment': {'precision': 0.9145348837209303, 'recall': 0.9391044776119403, 'f1-score': 0.9266568483063329, 'support': 5025.0}, 'accuracy': 0.900796812749004, 'macro avg': {'precision': 0.8927104798351486, 'recall': 0.8815282867101617, 'f1-score': 0.8867130395377818, 'support': 7530.0}, 'weighted avg': {'precision': 0.9000142644025019, 'recall': 0.900796812749004, 'f1-score': 0.9000806886874164, 'support': 7530.0}} 



Map: 100%|██████████| 100/100 [00:00<00:00, 3998.00 examples/s]
Map: 100%|██████████| 7530/7530 [00:01<00:00, 4828.55 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7909092600215714, 'f1_macro': 0.8952643510262872, 'f1_micro': 0.9061088977423639, 'accuracy_balanced': 0.899205600525828, 'accuracy': 0.9061088977423639, 'precision_macro': 0.8917389039406765, 'recall_macro': 0.899205600525828, 'precision_micro': 0.9061088977423639, 'recall_micro': 0.9061088977423639}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8451786400307337, 'recall': 0.8785942492012779, 'f1-score': 0.8615625611905228, 'support': 2504.0}, 'not_entailment': {'precision': 0.9382991678506191, 'recall': 0.919816951850378, 'f1-score': 0.9289661408620516, 'support': 5026.0}, 'accuracy': 0.9061088977423639, 'macro avg': {'precision': 0.8917389039406765, 'recall': 0.899205600525828, 'f1-score': 0.8952643510262872, 'support': 7530.0}, 'weighted avg': {'precision': 0.9073331915344182, 'recall': 0.9061088977423639, 'f1-score': 0.9065519890031528, 'support': 7530.0}} 



Map: 100%|██████████| 100/100 [00:00<00:00, 4096.48 examples/s]
Map: 100%|██████████| 7530/7530 [00:01<00:00, 4846.94 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7922774854372554, 'f1_macro': 0.8961073740174859, 'f1_micro': 0.9073041168658699, 'accuracy_balanced': 0.8976903620973656, 'accuracy': 0.9073041168658699, 'precision_macro': 0.8945931771016249, 'recall_macro': 0.8976903620973656, 'precision_micro': 0.9073041168658699, 'recall_micro': 0.9073041168658699}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8552373479795998, 'recall': 0.8688720605819051, 'f1-score': 0.8620007908264136, 'support': 2509.0}, 'not_entailment': {'precision': 0.9339490062236498, 'recall': 0.9265086636128261, 'f1-score': 0.9302139572085583, 'support': 5021.0}, 'accuracy': 0.9073041168658699, 'macro avg': {'precision': 0.8945931771016249, 'recall': 0.8976903620973656, 'f1-score': 0.8961073740174859, 'support': 7530.0}, 'weighted avg': {'precision': 0.9077222398844306, 'recall': 0.9073041168658699, 'f1-score': 0.907485293934614, 'support': 7530.0}} 



Map: 100%|██████████| 100/100 [00:00<00:00, 4333.14 examples/s]
Map: 100%|██████████| 7530/7530 [00:01<00:00, 4810.31 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7675786488505046, 'f1_macro': 0.883788080800723, 'f1_micro': 0.8966799468791501, 'accuracy_balanced': 0.8834839418127328, 'accuracy': 0.8966799468791501, 'precision_macro': 0.8840949502253733, 'recall_macro': 0.8834839418127328, 'precision_micro': 0.8966799468791501, 'recall_micro': 0.8966799468791501}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8464299960111688, 'recall': 0.8437375745526839, 'f1-score': 0.8450816407805655, 'support': 2515.0}, 'not_entailment': {'precision': 0.9217599044395779, 'recall': 0.9232303090727817, 'f1-score': 0.9224945208208807, 'support': 5015.0}, 'accuracy': 0.8966799468791501, 'macro avg': {'precision': 0.8840949502253733, 'recall': 0.8834839418127328, 'f1-score': 0.883788080800723, 'support': 7530.0}, 'weighted avg': {'precision': 0.8965999151039274, 'recall': 0.8966799468791501, 'f1-score': 0.8966388244993145, 'support': 7530.0}} 



Map: 100%|██████████| 100/100 [00:00<00:00, 3834.93 examples/s]
Map: 100%|██████████| 7530/7530 [00:01<00:00, 4795.71 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7343134604983083, 'f1_macro': 0.8584691400693474, 'f1_micro': 0.8830013280212483, 'accuracy_balanced': 0.8371245286915852, 'accuracy': 0.8830013280212483, 'precision_macro': 0.8998643026374811, 'recall_macro': 0.8371245286915852, 'precision_micro': 0.8830013280212483, 'recall_micro': 0.8830013280212483}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.9335812964930924, 'recall': 0.6991643454038997, 'f1-score': 0.7995449374288964, 'support': 2513.0}, 'not_entailment': {'precision': 0.8661473087818697, 'recall': 0.9750847119792705, 'f1-score': 0.9173933427097984, 'support': 5017.0}, 'accuracy': 0.8830013280212483, 'macro avg': {'precision': 0.8998643026374811, 'recall': 0.8371245286915852, 'f1-score': 0.8584691400693474, 'support': 7530.0}, 'weighted avg': {'precision': 0.8886521708161729, 'recall': 0.8830013280212483, 'f1-score': 0.878063589393609, 'support': 7530.0}} 



Map: 100%|██████████| 100/100 [00:00<00:00, 4199.55 examples/s]
Map: 100%|██████████| 7530/7530 [00:01<00:00, 4803.35 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7896036244953403, 'f1_macro': 0.894491925850263, 'f1_micro': 0.9073041168658699, 'accuracy_balanced': 0.8896761009218864, 'accuracy': 0.9073041168658699, 'precision_macro': 0.8999949460213101, 'recall_macro': 0.8896761009218864, 'precision_micro': 0.9073041168658699, 'recall_micro': 0.9073041168658699}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8799665411961523, 'recall': 0.8365805168986084, 'f1-score': 0.8577252344068488, 'support': 2515.0}, 'not_entailment': {'precision': 0.9200233508464681, 'recall': 0.9427716849451645, 'f1-score': 0.9312586172936773, 'support': 5015.0}, 'accuracy': 0.9073041168658699, 'macro avg': {'precision': 0.8999949460213101, 'recall': 0.8896761009218864, 'f1-score': 0.894491925850263, 'support': 7530.0}, 'weighted avg': {'precision': 0.9066444828158514, 'recall': 0.9073041168658699, 'f1-score': 0.9066986627172665, 'support': 7530.0}} 



Map: 100%|██████████| 100/100 [00:00<00:00, 3263.80 examples/s]
Map: 100%|██████████| 7530/7530 [00:01<00:00, 4393.44 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7971722522220549, 'f1_macro': 0.8985803727085407, 'f1_micro': 0.9100929614873838, 'accuracy_balanced': 0.8979027020585695, 'accuracy': 0.9100929614873838, 'precision_macro': 0.89927072398923, 'recall_macro': 0.8979027020585695, 'precision_micro': 0.9100929614873838, 'recall_micro': 0.9100929614873838}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.867363344051447, 'recall': 0.8614770459081836, 'f1-score': 0.8644101742439415, 'support': 2505.0}, 'not_entailment': {'precision': 0.9311781039270131, 'recall': 0.9343283582089552, 'f1-score': 0.93275057117314, 'support': 5025.0}, 'accuracy': 0.9100929614873838, 'macro avg': {'precision': 0.89927072398923, 'recall': 0.8979027020585695, 'f1-score': 0.8985803727085407, 'support': 7530.0}, 'weighted avg': {'precision': 0.9099488909803606, 'recall': 0.9100929614873838, 'f1-score': 0.9100158176130283, 'support': 7530.0}} 



Map: 100%|██████████| 100/100 [00:00<00:00, 4068.11 examples/s]
Map: 100%|██████████| 7530/7530 [00:01<00:00, 4828.91 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.7889229456296178, 'f1_macro': 0.8923245351693465, 'f1_micro': 0.9074369189907039, 'accuracy_balanced': 0.8801260786485633, 'accuracy': 0.9074369189907039, 'precision_macro': 0.9093374863635296, 'recall_macro': 0.8801260786485633, 'precision_micro': 0.9074369189907039, 'recall_micro': 0.9074369189907039}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.9138952164009112, 'recall': 0.7979315831344471, 'f1-score': 0.851985559566787, 'support': 2514.0}, 'not_entailment': {'precision': 0.904779756326148, 'recall': 0.9623205741626795, 'f1-score': 0.9326635107719061, 'support': 5016.0}, 'accuracy': 0.9074369189907039, 'macro avg': {'precision': 0.9093374863635296, 'recall': 0.8801260786485633, 'f1-score': 0.8923245351693465, 'support': 7530.0}, 'weighted avg': {'precision': 0.9078230852276027, 'recall': 0.9074369189907039, 'f1-score': 0.9057280035568106, 'support': 7530.0}} 

100
CPU times: total: 39min 45s
Wall time: 3h 44min 16s


In [59]:
res_fr = pd.DataFrame({'n':shots_list, 'mcc':mcc_list})
#res_fr['mcc'] = acc_list
res_fr.groupby('n').mean()

Unnamed: 0_level_0,mcc
n,Unnamed: 1_level_1
10,0.722883
25,0.737884
50,0.760218
100,0.780575


### Zero Shot

In [61]:
pipe = pipeline("zero-shot-classification", model = modname, device = 0, batch_size = 32)
res = pipe(list(fr['premise'].str.lower()), ['freedom and rights except voting.'], hypothesis_template = 'This text is about {}', multi_label = False)
labels = [round(label['scores'][0], 0) for label in res]
fr['0_shot'] = labels
fr['0_shot'].replace({0:1, 1:0}, inplace = True)

zs_fr = pd.DataFrame({'n':0, 'mcc':matthews_corrcoef(fr['entailment'], fr['0_shot']), 'accuracy':accuracy_score(fr['entailment'], fr['0_shot'])}, index = [0])
res_fr = pd.concat([res_fr, zs_fr], axis = 0)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fr['0_shot'].replace({0:1, 1:0}, inplace = True)


In [62]:
res_fr.to_csv('motn_fewshot_base2.csv', index = False)

### Supervised

In [53]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.metrics import balanced_accuracy_score, precision_recall_fscore_support, accuracy_score, classification_report, matthews_corrcoef

In [63]:
fr = pd.read_csv('freedom_test.csv')
fr = fr[~fr['text'].isna()]
fr['text'] = fr['text'].astype(str)
fr.drop_duplicates('text', inplace = True)
fr.rename({'freedom_and_rights': 'label'}, axis = 1, inplace = True)

In [64]:
val = fr.sample(frac = .2, random_state = 1)
train = fr[~fr.index.isin(val.index)]

train = fr.sample(100, random_state = 1)
val = fr[~fr.index.isin(train.index)]

In [94]:
#val = df.sample(frac = .2, random_state = 1)
#train = df[~df.index.isin(val.index)]

train = df.sample(50, random_state = 1)
val = df[~df.index.isin(train.index)]

In [95]:
modname = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(modname)

In [96]:
ds = DatasetDict({'train': Dataset.from_pandas(train, preserve_index=False), 'validation':Dataset.from_pandas(val, preserve_index=False)})

In [97]:
ds = ds.rename_columns({'premise':'text', 'entailment':'label'})

In [98]:
def tokenize_function(docs):
    return tokenizer(docs['text'], padding = True, truncation = True)

dstok = ds.map(tokenize_function, batched = True)
model = AutoModelForSequenceClassification.from_pretrained("FacebookAI/roberta-base", num_labels = 2, ignore_mismatched_sizes=True)

Map: 100%|██████████| 50/50 [00:00<00:00, 3274.04 examples/s]
Map: 100%|██████████| 1937/1937 [00:00<00:00, 5374.18 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [99]:
training_args = TrainingArguments(output_dir=training_directory,
    logging_dir=f'{training_directory}/logs',
    #deepspeed="ds_config_zero3.json",  # if using deepspeed
    lr_scheduler_type= "linear",
    group_by_length=True,  # can increase speed with dynamic padding, by grouping similar length texts https://huggingface.co/transformers/main_classes/trainer.html
    learning_rate=9e-6 if "large" in modname else 2e-5,
    per_device_train_batch_size=4 if "large" in modname else 2,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4 if "large" in modname else 1,  # (!adapt/halve batch size accordingly). accumulates gradients over X steps, only then backward/update. decreases memory usage, but also slightly speed
    #eval_accumulation_steps=2,
    num_train_epochs=5,
    #max_steps=400,
    #warmup_steps=0,  # 1000,
    warmup_ratio=0.06,  #0.1, 0.06
    weight_decay=0.01,  #0.1,
    fp16=True,   # ! only makes sense at batch-size > 8. loads two copies of model weights, which creates overhead. https://huggingface.co/transformers/performance.html?#fp16
    fp16_full_eval=True,
    eval_strategy="epoch",
    seed=1,
    #load_best_model_at_end=True,
    #metric_for_best_model="accuracy",
    #eval_steps=50,  # evaluate after n steps if evaluation_strategy!='steps'. defaults to logging_steps
    save_strategy="no",  # options: "no"/"steps"/"epoch"
    #save_steps=100,  # Number of updates steps before two checkpoint saves.
    dataloader_num_workers = 12,
    #save_total_limit=1,  # If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in output_dir
    #logging_strategy="epoch",
    #report_to="all",  # "all"
    #run_name=run_name,
    #push_to_hub=True,  # does not seem to work if save_strategy="no"
    #hub_model_id=hub_model_id,
    #hub_token=config.HF_ACCESS_TOKEN,
    #hub_strategy="end",
    #hub_private_repo=True,
)

In [100]:
def compute_metrics_standard(eval_pred, label_text_alphabetical=list(model.config.id2label.values())):
    labels = eval_pred.label_ids
    pred_logits = eval_pred.predictions
    preds_max = np.argmax(pred_logits, axis=1)  # argmax on each row (axis=1) in the tensor

    # metrics
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(labels, preds_max, average='macro')  # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(labels, preds_max, average='micro')  # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
    acc_balanced = balanced_accuracy_score(labels, preds_max)
    acc_not_balanced = accuracy_score(labels, preds_max)
    mcc = matthews_corrcoef(labels, preds_max)

    metrics = {'MCC': mcc,
            'f1_macro': f1_macro,
            'f1_micro': f1_micro,
            'accuracy_balanced': acc_balanced,
            'accuracy': acc_not_balanced,
            'precision_macro': precision_macro,
            'recall_macro': recall_macro,
            'precision_micro': precision_micro,
            'recall_micro': recall_micro,
            #'label_gold_raw': labels,
            #'label_predicted_raw': preds_max
            }
    print("Aggregate metrics: ", {key: metrics[key] for key in metrics if key not in ["label_gold_raw", "label_predicted_raw"]} )  # print metrics but without label lists
    print("Detailed metrics: ", classification_report(
        labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
        target_names=label_text_alphabetical, sample_weight=None,
        digits=2, output_dict=True, zero_division='warn'),
    "\n")

    return metrics

In [101]:
trainer = Trainer(
    model=model,
    #model_init=model_init,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dstok['train'],
    eval_dataset=dstok['validation'],
    compute_metrics=lambda x: compute_metrics_standard(x, label_text_alphabetical=list(model.config.id2label.values()))  #compute_metrics,
    #data_collator=data_collator,  # for weighted sampling per dataset; for dynamic padding probably not necessary because done by default  https://huggingface.co/course/chapter3/3?fw=pt
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [102]:
trainer.train()
#trainer.train(resume_from_checkpoint = 'training_base/checkpoint-157664')

Epoch,Training Loss,Validation Loss,Mcc,F1 Macro,F1 Micro,Accuracy Balanced,Accuracy,Precision Macro,Recall Macro,Precision Micro,Recall Micro
1,No log,0.565873,0.0,0.422997,0.733092,0.5,0.733092,0.366546,0.5,0.733092,0.733092
2,No log,0.647523,0.0,0.422997,0.733092,0.5,0.733092,0.366546,0.5,0.733092,0.733092
3,No log,0.894894,0.0,0.422997,0.733092,0.5,0.733092,0.366546,0.5,0.733092,0.733092
4,No log,0.813769,0.0,0.422997,0.733092,0.5,0.733092,0.366546,0.5,0.733092,0.733092
5,No log,0.87292,0.0,0.422997,0.733092,0.5,0.733092,0.366546,0.5,0.733092,0.733092


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Aggregate metrics:  {'MCC': 0.0, 'f1_macro': 0.4229967232648198, 'f1_micro': 0.73309241094476, 'accuracy_balanced': 0.5, 'accuracy': 0.73309241094476, 'precision_macro': 0.36654620547238, 'recall_macro': 0.5, 'precision_micro': 0.73309241094476, 'recall_micro': 0.73309241094476}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Detailed metrics:  {'LABEL_0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 517.0}, 'LABEL_1': {'precision': 0.73309241094476, 'recall': 1.0, 'f1-score': 0.8459934465296396, 'support': 1420.0}, 'accuracy': 0.73309241094476, 'macro avg': {'precision': 0.36654620547238, 'recall': 0.5, 'f1-score': 0.4229967232648198, 'support': 1937.0}, 'weighted avg': {'precision': 0.5374244829848007, 'recall': 0.73309241094476, 'f1-score': 0.6201913753598803, 'support': 1937.0}} 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Aggregate metrics:  {'MCC': 0.0, 'f1_macro': 0.4229967232648198, 'f1_micro': 0.73309241094476, 'accuracy_balanced': 0.5, 'accuracy': 0.73309241094476, 'precision_macro': 0.36654620547238, 'recall_macro': 0.5, 'precision_micro': 0.73309241094476, 'recall_micro': 0.73309241094476}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Detailed metrics:  {'LABEL_0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 517.0}, 'LABEL_1': {'precision': 0.73309241094476, 'recall': 1.0, 'f1-score': 0.8459934465296396, 'support': 1420.0}, 'accuracy': 0.73309241094476, 'macro avg': {'precision': 0.36654620547238, 'recall': 0.5, 'f1-score': 0.4229967232648198, 'support': 1937.0}, 'weighted avg': {'precision': 0.5374244829848007, 'recall': 0.73309241094476, 'f1-score': 0.6201913753598803, 'support': 1937.0}} 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Aggregate metrics:  {'MCC': 0.0, 'f1_macro': 0.4229967232648198, 'f1_micro': 0.73309241094476, 'accuracy_balanced': 0.5, 'accuracy': 0.73309241094476, 'precision_macro': 0.36654620547238, 'recall_macro': 0.5, 'precision_micro': 0.73309241094476, 'recall_micro': 0.73309241094476}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Detailed metrics:  {'LABEL_0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 517.0}, 'LABEL_1': {'precision': 0.73309241094476, 'recall': 1.0, 'f1-score': 0.8459934465296396, 'support': 1420.0}, 'accuracy': 0.73309241094476, 'macro avg': {'precision': 0.36654620547238, 'recall': 0.5, 'f1-score': 0.4229967232648198, 'support': 1937.0}, 'weighted avg': {'precision': 0.5374244829848007, 'recall': 0.73309241094476, 'f1-score': 0.6201913753598803, 'support': 1937.0}} 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Aggregate metrics:  {'MCC': 0.0, 'f1_macro': 0.4229967232648198, 'f1_micro': 0.73309241094476, 'accuracy_balanced': 0.5, 'accuracy': 0.73309241094476, 'precision_macro': 0.36654620547238, 'recall_macro': 0.5, 'precision_micro': 0.73309241094476, 'recall_micro': 0.73309241094476}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Detailed metrics:  {'LABEL_0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 517.0}, 'LABEL_1': {'precision': 0.73309241094476, 'recall': 1.0, 'f1-score': 0.8459934465296396, 'support': 1420.0}, 'accuracy': 0.73309241094476, 'macro avg': {'precision': 0.36654620547238, 'recall': 0.5, 'f1-score': 0.4229967232648198, 'support': 1937.0}, 'weighted avg': {'precision': 0.5374244829848007, 'recall': 0.73309241094476, 'f1-score': 0.6201913753598803, 'support': 1937.0}} 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Aggregate metrics:  {'MCC': 0.0, 'f1_macro': 0.4229967232648198, 'f1_micro': 0.73309241094476, 'accuracy_balanced': 0.5, 'accuracy': 0.73309241094476, 'precision_macro': 0.36654620547238, 'recall_macro': 0.5, 'precision_micro': 0.73309241094476, 'recall_micro': 0.73309241094476}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Detailed metrics:  {'LABEL_0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 517.0}, 'LABEL_1': {'precision': 0.73309241094476, 'recall': 1.0, 'f1-score': 0.8459934465296396, 'support': 1420.0}, 'accuracy': 0.73309241094476, 'macro avg': {'precision': 0.36654620547238, 'recall': 0.5, 'f1-score': 0.4229967232648198, 'support': 1937.0}, 'weighted avg': {'precision': 0.5374244829848007, 'recall': 0.73309241094476, 'f1-score': 0.6201913753598803, 'support': 1937.0}} 



TrainOutput(global_step=125, training_loss=0.5637968139648437, metrics={'train_runtime': 489.4979, 'train_samples_per_second': 0.511, 'train_steps_per_second': 0.255, 'total_flos': 44065962885000.0, 'train_loss': 0.5637968139648437, 'epoch': 5.0})

# Covid

In [43]:
df = pd.read_csv('covid_tweets_labeled.csv')
df = df[['text', 'non_comp']]
df['hypothesis'] = 'The author of this tweet does not believe COVID is dangerous.'
df.rename({'text':'premise', 'non_comp':'entailment'}, axis = 1, inplace = True)
df['entailment'].replace({0:1, 1:0}, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['entailment'].replace({0:1, 1:0}, inplace = True)


### Few Shot

In [47]:
training_args = TrainingArguments(output_dir=training_directory,
    logging_dir=f'{training_directory}/logs',
    lr_scheduler_type= "linear",
    group_by_length=False,
    learning_rate = 2e-5,
    #learning_rate = 9e-6,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 16,
    gradient_accumulation_steps = 1, 
    num_train_epochs=5,
    warmup_ratio=0.06,  
    weight_decay=0.01, 
    fp16=True,   
    fp16_full_eval=True,
    eval_strategy="no",
    seed=1,
    save_strategy="no",
    dataloader_num_workers = 12,
)

tokenizer = AutoTokenizer.from_pretrained(modname)

In [48]:
%%time
# Define a function to initialize the modelin the trainer. This will make results reproducible
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(modname, num_labels = 2, ignore_mismatched_sizes=True)

# Define the number of samples (shots) and random seeds to use
shots = [10, 25, 50, 100]
seeds = [1,2,3,4,5,6,7,8,9,10]

# Initialize lists to store results
mcc_list = []
acc_list = []
shots_list = []

# Iterate through different shot sizes
for shot in shots:
    # Iterate through different random seeds
    for seed in seeds:
        # Sample training data based on current shot size and seed
        train = df.sample(shot, random_state = seed)
        # Create validation set with remaining instances
        val = df[~df.index.isin(train.index)]
        
        # Create a DatasetDict with train and validation splits
        ds = DatasetDict({'train': Dataset.from_pandas(train, preserve_index=False), 'validation':Dataset.from_pandas(val, preserve_index=False)})
        # Tokenize the dataset
        dstok = ds.map(tokenize_function, batched = True)
        # Rename 'entailment' column to 'label'
        dstok = dstok.rename_columns({'entailment':'label'})
        # Define label mapping
        id2label = {0: "entailment", 1: "not_entailment"}
        
        # Initialize the Trainer
        trainer = Trainer(
            model_init = model_init,
            tokenizer=tokenizer,
            args=training_args,
            train_dataset=dstok['train'],
            eval_dataset=dstok['validation'],
            compute_metrics=lambda x: compute_metrics_standard(x, label_text_alphabetical=list(model.config.id2label.values()))
        )
        
        # Train the model
        trainer.train()
        # Make predictions on validation set
        res = trainer.predict(dstok['validation'])
        preds = np.argmax(res.predictions, axis=-1)
        
        # Calculate Matthews Correlation Coefficient
        mcc_res = matthews_corrcoef(val['entailment'], preds)
        mcc_list.append(mcc_res)
        # Calculate Accuracy
        acc_res = accuracy_score(val['entailment'], preds)
        acc_list.append(acc_res)
        # Store the current shot size
        shots_list.append(shot)
    
    # Print progress
    print(shot)

Map: 100%|██████████| 10/10 [00:00<00:00, 1614.68 examples/s]
Map: 100%|██████████| 1977/1977 [00:00<00:00, 5577.52 examples/s]


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5158637973867064, 'f1_macro': 0.7491694092490311, 'f1_micro': 0.8234699038947901, 'accuracy_balanced': 0.727720393946673, 'accuracy': 0.8234699038947901, 'precision_macro': 0.7921515425585368, 'recall_macro': 0.727720393946673, 'precision_micro': 0.8234699038947901, 'recall_micro': 0.8234699038947901}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.7419354838709677, 'recall': 0.5217391304347826, 'f1-score': 0.6126526082130965, 'support': 529.0}, 'not_entailment': {'precision': 0.8423676012461059, 'recall': 0.9337016574585635, 'f1-score': 0.8856862102849656, 'support': 1448.0}, 'accuracy': 0.8234699038947901, 'macro avg': {'precision': 0.7921515425585368, 'recall': 0.727720393946673, 'f1-score': 0.7491694092490311, 'support': 1977.0}, 'weighted avg': {'precision': 0.8154942628083476, 'recall': 0.8234699038947901, 'f1-score': 0.8126286607169237, 'support': 1977.0}} 



Map: 100%|██████████| 10/10 [00:00<00:00, 1424.12 examples/s]
Map: 100%|██████████| 1977/1977 [00:00<00:00, 5085.62 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5223192565201425, 'f1_macro': 0.7569981119071005, 'f1_micro': 0.8234699038947901, 'accuracy_balanced': 0.7407491984558006, 'accuracy': 0.8234699038947901, 'precision_macro': 0.7833004299512147, 'recall_macro': 0.7407491984558006, 'precision_micro': 0.8234699038947901, 'recall_micro': 0.8234699038947901}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.7139423076923077, 'recall': 0.5635673624288425, 'f1-score': 0.6299045599151644, 'support': 527.0}, 'not_entailment': {'precision': 0.8526585522101218, 'recall': 0.9179310344827586, 'f1-score': 0.8840916638990368, 'support': 1450.0}, 'accuracy': 0.8234699038947901, 'macro avg': {'precision': 0.7833004299512147, 'recall': 0.7407491984558006, 'f1-score': 0.7569981119071005, 'support': 1977.0}, 'weighted avg': {'precision': 0.8156815866760357, 'recall': 0.8234699038947901, 'f1-score': 0.816334150596305, 'support': 1977.0}} 



Map: 100%|██████████| 10/10 [00:00<00:00, 1560.21 examples/s]
Map: 100%|██████████| 1977/1977 [00:00<00:00, 5500.05 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.3774911846143342, 'f1_macro': 0.6181362627317838, 'f1_micro': 0.7845220030349014, 'accuracy_balanced': 0.6095951426495065, 'accuracy': 0.7845220030349014, 'precision_macro': 0.8250591016548463, 'recall_macro': 0.6095951426495065, 'precision_micro': 0.7845220030349014, 'recall_micro': 0.7845220030349014}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8723404255319149, 'recall': 0.23163841807909605, 'f1-score': 0.36607142857142855, 'support': 531.0}, 'not_entailment': {'precision': 0.7777777777777778, 'recall': 0.9875518672199171, 'f1-score': 0.870201096892139, 'support': 1446.0}, 'accuracy': 0.7845220030349014, 'macro avg': {'precision': 0.8250591016548463, 'recall': 0.6095951426495065, 'f1-score': 0.6181362627317838, 'support': 1977.0}, 'weighted avg': {'precision': 0.8031762431077965, 'recall': 0.7845220030349014, 'f1-score': 0.734797528921326, 'support': 1977.0}} 



Map: 100%|██████████| 10/10 [00:00<00:00, 1414.51 examples/s]
Map: 100%|██████████| 1977/1977 [00:00<00:00, 4604.22 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.35786061461940194, 'f1_macro': 0.598634755075433, 'f1_micro': 0.7794638340920587, 'accuracy_balanced': 0.5964513437039548, 'accuracy': 0.7794638340920587, 'precision_macro': 0.8319399569197631, 'recall_macro': 0.5964513437039548, 'precision_micro': 0.7794638340920587, 'recall_micro': 0.7794638340920587}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8916666666666667, 'recall': 0.2018867924528302, 'f1-score': 0.3292307692307692, 'support': 530.0}, 'not_entailment': {'precision': 0.7722132471728594, 'recall': 0.9910158949550795, 'f1-score': 0.8680387409200968, 'support': 1447.0}, 'accuracy': 0.7794638340920587, 'macro avg': {'precision': 0.8319399569197631, 'recall': 0.5964513437039548, 'f1-score': 0.598634755075433, 'support': 1977.0}, 'weighted avg': {'precision': 0.804236672732656, 'recall': 0.7794638340920587, 'f1-score': 0.7235935082466808, 'support': 1977.0}} 



Map: 100%|██████████| 10/10 [00:00<00:00, 1992.54 examples/s]
Map: 100%|██████████| 1977/1977 [00:00<00:00, 5676.99 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.4645404902317606, 'f1_macro': 0.7124690376093625, 'f1_micro': 0.7410217501264542, 'accuracy_balanced': 0.7589226896292114, 'accuracy': 0.7410217501264542, 'precision_macro': 0.7083612944213159, 'recall_macro': 0.7589226896292114, 'precision_micro': 0.7410217501264542, 'recall_micro': 0.7410217501264542}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.5096852300242131, 'recall': 0.7973484848484849, 'f1-score': 0.621861152141802, 'support': 528.0}, 'not_entailment': {'precision': 0.9070373588184187, 'recall': 0.7204968944099379, 'f1-score': 0.803076923076923, 'support': 1449.0}, 'accuracy': 0.7410217501264542, 'macro avg': {'precision': 0.7083612944213159, 'recall': 0.7589226896292114, 'f1-score': 0.7124690376093625, 'support': 1977.0}, 'weighted avg': {'precision': 0.8009160012041847, 'recall': 0.7410217501264542, 'f1-score': 0.7546793878954643, 'support': 1977.0}} 



Map: 100%|██████████| 10/10 [00:00<00:00, 1992.07 examples/s]
Map: 100%|██████████| 1977/1977 [00:00<00:00, 5661.31 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.43325421186522367, 'f1_macro': 0.6722968085505724, 'f1_micro': 0.8012139605462822, 'accuracy_balanced': 0.6501399492422897, 'accuracy': 0.8012139605462822, 'precision_macro': 0.8125570726616516, 'recall_macro': 0.6501399492422897, 'precision_micro': 0.8012139605462822, 'recall_micro': 0.8012139605462822}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8269230769230769, 'recall': 0.32514177693761814, 'f1-score': 0.46675712347354137, 'support': 529.0}, 'not_entailment': {'precision': 0.7981910684002261, 'recall': 0.9751381215469613, 'f1-score': 0.8778364936276034, 'support': 1448.0}, 'accuracy': 0.8012139605462822, 'macro avg': {'precision': 0.8125570726616516, 'recall': 0.6501399492422897, 'f1-score': 0.6722968085505724, 'support': 1977.0}, 'weighted avg': {'precision': 0.8058790969832247, 'recall': 0.8012139605462822, 'f1-score': 0.7678410526506187, 'support': 1977.0}} 



Map: 100%|██████████| 10/10 [00:00<00:00, 1868.87 examples/s]
Map: 100%|██████████| 1977/1977 [00:00<00:00, 5727.40 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5173309589721384, 'f1_macro': 0.7585325528397158, 'f1_micro': 0.8133535660091047, 'accuracy_balanced': 0.7552995012234143, 'accuracy': 0.8133535660091047, 'precision_macro': 0.7620758362516604, 'recall_macro': 0.7552995012234143, 'precision_micro': 0.8133535660091047, 'recall_micro': 0.8133535660091047}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.6568047337278107, 'recall': 0.6306818181818182, 'f1-score': 0.6434782608695652, 'support': 528.0}, 'not_entailment': {'precision': 0.8673469387755102, 'recall': 0.8799171842650103, 'f1-score': 0.8735868448098664, 'support': 1449.0}, 'accuracy': 0.8133535660091047, 'macro avg': {'precision': 0.7620758362516604, 'recall': 0.7552995012234143, 'f1-score': 0.7585325528397158, 'support': 1977.0}, 'weighted avg': {'precision': 0.8111171541193719, 'recall': 0.8133535660091047, 'f1-score': 0.8121314415116979, 'support': 1977.0}} 



Map: 100%|██████████| 10/10 [00:00<00:00, 1983.97 examples/s]
Map: 100%|██████████| 1977/1977 [00:00<00:00, 5730.53 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.3252623334870806, 'f1_macro': 0.662222010795281, 'f1_micro': 0.7420333839150227, 'accuracy_balanced': 0.6584929261559697, 'accuracy': 0.7420333839150227, 'precision_macro': 0.666877456539211, 'recall_macro': 0.6584929261559697, 'precision_micro': 0.7420333839150227, 'recall_micro': 0.7420333839150227}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.5184426229508197, 'recall': 0.4791666666666667, 'f1-score': 0.49803149606299213, 'support': 528.0}, 'not_entailment': {'precision': 0.8153122901276024, 'recall': 0.8378191856452726, 'f1-score': 0.8264125255275698, 'support': 1449.0}, 'accuracy': 0.7420333839150227, 'macro avg': {'precision': 0.666877456539211, 'recall': 0.6584929261559697, 'f1-score': 0.662222010795281, 'support': 1977.0}, 'weighted avg': {'precision': 0.7360269161926801, 'recall': 0.7420333839150227, 'f1-score': 0.7387113704657099, 'support': 1977.0}} 



Map: 100%|██████████| 10/10 [00:00<00:00, 1423.78 examples/s]
Map: 100%|██████████| 1977/1977 [00:00<00:00, 5090.53 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.16601183608012687, 'f1_macro': 0.5770661427635888, 'f1_micro': 0.7025796661608498, 'accuracy_balanced': 0.5732076836311606, 'accuracy': 0.7025796661608498, 'precision_macro': 0.5941155639398079, 'recall_macro': 0.5732076836311606, 'precision_micro': 0.7025796661608498, 'recall_micro': 0.7025796661608498}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.42048517520215634, 'recall': 0.2948960302457467, 'f1-score': 0.3466666666666667, 'support': 529.0}, 'not_entailment': {'precision': 0.7677459526774595, 'recall': 0.8515193370165746, 'f1-score': 0.8074656188605108, 'support': 1448.0}, 'accuracy': 0.7025796661608498, 'macro avg': {'precision': 0.5941155639398079, 'recall': 0.5732076836311606, 'f1-score': 0.5770661427635888, 'support': 1977.0}, 'weighted avg': {'precision': 0.6748269080217006, 'recall': 0.7025796661608498, 'f1-score': 0.6841663544646871, 'support': 1977.0}} 



Map: 100%|██████████| 10/10 [00:00<00:00, 1794.12 examples/s]
Map: 100%|██████████| 1977/1977 [00:00<00:00, 5677.13 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.20927651410377224, 'f1_macro': 0.49573806702389345, 'f1_micro': 0.7486090035407182, 'accuracy_balanced': 0.5355900946308148, 'accuracy': 0.7486090035407182, 'precision_macro': 0.8076464098349583, 'recall_macro': 0.5355900946308148, 'precision_micro': 0.7486090035407182, 'recall_micro': 0.7486090035407182}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8695652173913043, 'recall': 0.07532956685499058, 'f1-score': 0.1386481802426343, 'support': 531.0}, 'not_entailment': {'precision': 0.7457276022786121, 'recall': 0.995850622406639, 'f1-score': 0.8528279538051525, 'support': 1446.0}, 'accuracy': 0.7486090035407182, 'macro avg': {'precision': 0.8076464098349583, 'recall': 0.5355900946308148, 'f1-score': 0.49573806702389345, 'support': 1977.0}, 'weighted avg': {'precision': 0.7789889951085764, 'recall': 0.7486090035407182, 'f1-score': 0.6610072862473897, 'support': 1977.0}} 

10


Map: 100%|██████████| 25/25 [00:00<00:00, 2752.38 examples/s]
Map: 100%|██████████| 1962/1962 [00:00<00:00, 5694.12 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.49588813717170444, 'f1_macro': 0.7470373296464546, 'f1_micro': 0.8083588175331294, 'accuracy_balanced': 0.739218610199821, 'accuracy': 0.8083588175331294, 'precision_macro': 0.7569877865921644, 'recall_macro': 0.739218610199821, 'precision_micro': 0.8083588175331294, 'recall_micro': 0.8083588175331294}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.6581740976645435, 'recall': 0.5904761904761905, 'f1-score': 0.6224899598393574, 'support': 525.0}, 'not_entailment': {'precision': 0.8558014755197854, 'recall': 0.8879610299234516, 'f1-score': 0.8715846994535519, 'support': 1437.0}, 'accuracy': 0.8083588175331294, 'macro avg': {'precision': 0.7569877865921644, 'recall': 0.739218610199821, 'f1-score': 0.7470373296464546, 'support': 1962.0}, 'weighted avg': {'precision': 0.8029195319040862, 'recall': 0.8083588175331294, 'f1-score': 0.8049309082723836, 'support': 1962.0}} 



Map: 100%|██████████| 25/25 [00:00<00:00, 3035.39 examples/s]
Map: 100%|██████████| 1962/1962 [00:00<00:00, 5695.23 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5355540823892682, 'f1_macro': 0.7670531390991028, 'f1_micro': 0.8124362895005097, 'accuracy_balanced': 0.7753724769033095, 'accuracy': 0.8124362895005097, 'precision_macro': 0.7603911058843041, 'recall_macro': 0.7753724769033095, 'precision_micro': 0.8124362895005097, 'recall_micro': 0.8124362895005097}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.6352530541012217, 'recall': 0.6959847036328872, 'f1-score': 0.6642335766423357, 'support': 523.0}, 'not_entailment': {'precision': 0.8855291576673866, 'recall': 0.8547602501737318, 'f1-score': 0.8698727015558698, 'support': 1439.0}, 'accuracy': 0.8124362895005097, 'macro avg': {'precision': 0.7603911058843041, 'recall': 0.7753724769033095, 'f1-score': 0.7670531390991028, 'support': 1962.0}, 'weighted avg': {'precision': 0.8188143757279859, 'recall': 0.8124362895005097, 'f1-score': 0.8150565637731082, 'support': 1962.0}} 



Map: 100%|██████████| 25/25 [00:00<00:00, 3107.63 examples/s]
Map: 100%|██████████| 1962/1962 [00:00<00:00, 5604.08 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.510996848830496, 'f1_macro': 0.7242454524677933, 'f1_micro': 0.8241590214067278, 'accuracy_balanced': 0.6943970573615668, 'accuracy': 0.8241590214067278, 'precision_macro': 0.835804696658851, 'recall_macro': 0.6943970573615668, 'precision_micro': 0.8241590214067278, 'recall_micro': 0.8241590214067278}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8515625, 'recall': 0.41523809523809524, 'f1-score': 0.558258642765685, 'support': 525.0}, 'not_entailment': {'precision': 0.8200468933177022, 'recall': 0.9735560194850382, 'f1-score': 0.8902322621699014, 'support': 1437.0}, 'accuracy': 0.8241590214067278, 'macro avg': {'precision': 0.835804696658851, 'recall': 0.6943970573615668, 'f1-score': 0.7242454524677933, 'support': 1962.0}, 'weighted avg': {'precision': 0.8284799685002743, 'recall': 0.8241590214067278, 'f1-score': 0.8014014007085284, 'support': 1962.0}} 



Map: 100%|██████████| 25/25 [00:00<00:00, 2459.08 examples/s]
Map: 100%|██████████| 1962/1962 [00:00<00:00, 4136.59 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.51056240563606, 'f1_macro': 0.731354553167779, 'f1_micro': 0.8236493374108053, 'accuracy_balanced': 0.7029441516968707, 'accuracy': 0.8236493374108053, 'precision_macro': 0.8211154003075667, 'recall_macro': 0.7029441516968707, 'precision_micro': 0.8236493374108053, 'recall_micro': 0.8236493374108053}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8175438596491228, 'recall': 0.44212523719165087, 'f1-score': 0.5738916256157636, 'support': 527.0}, 'not_entailment': {'precision': 0.8246869409660107, 'recall': 0.9637630662020906, 'f1-score': 0.8888174807197944, 'support': 1435.0}, 'accuracy': 0.8236493374108053, 'macro avg': {'precision': 0.8211154003075667, 'recall': 0.7029441516968707, 'f1-score': 0.731354553167779, 'support': 1962.0}, 'weighted avg': {'precision': 0.8227682845674378, 'recall': 0.8236493374108053, 'f1-score': 0.8042273045527076, 'support': 1962.0}} 



Map: 100%|██████████| 25/25 [00:00<00:00, 2758.76 examples/s]
Map: 100%|██████████| 1962/1962 [00:00<00:00, 5683.48 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5851076881552162, 'f1_macro': 0.7891527554239512, 'f1_micro': 0.8251783893985729, 'accuracy_balanced': 0.8092981654614451, 'accuracy': 0.8251783893985729, 'precision_macro': 0.7767160017160017, 'recall_macro': 0.8092981654614451, 'precision_micro': 0.8251783893985729, 'recall_micro': 0.8251783893985729}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.6412698412698413, 'recall': 0.7754318618042226, 'f1-score': 0.7019982623805386, 'support': 521.0}, 'not_entailment': {'precision': 0.9121621621621622, 'recall': 0.8431644691186676, 'f1-score': 0.8763072484673639, 'support': 1441.0}, 'accuracy': 0.8251783893985729, 'macro avg': {'precision': 0.7767160017160017, 'recall': 0.8092981654614451, 'f1-score': 0.7891527554239512, 'support': 1962.0}, 'weighted avg': {'precision': 0.8402279627814796, 'recall': 0.8251783893985729, 'f1-score': 0.8300203056787624, 'support': 1962.0}} 



Map: 100%|██████████| 25/25 [00:00<00:00, 2961.41 examples/s]
Map: 100%|██████████| 1962/1962 [00:00<00:00, 5831.30 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5245374632036893, 'f1_macro': 0.752818486686472, 'f1_micro': 0.8261977573904179, 'accuracy_balanced': 0.7305006975252728, 'accuracy': 0.8261977573904179, 'precision_macro': 0.7984150951148363, 'recall_macro': 0.7305006975252728, 'precision_micro': 0.8261977573904179, 'recall_micro': 0.8261977573904179}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.7540983606557377, 'recall': 0.523719165085389, 'f1-score': 0.6181410974244121, 'support': 527.0}, 'not_entailment': {'precision': 0.8427318295739349, 'recall': 0.9372822299651568, 'f1-score': 0.8874958759485319, 'support': 1435.0}, 'accuracy': 0.8261977573904179, 'macro avg': {'precision': 0.7984150951148363, 'recall': 0.7305006975252728, 'f1-score': 0.752818486686472, 'support': 1962.0}, 'weighted avg': {'precision': 0.818924572632095, 'recall': 0.8261977573904179, 'f1-score': 0.8151462488933784, 'support': 1962.0}} 



Map: 100%|██████████| 25/25 [00:00<00:00, 2494.77 examples/s]
Map: 100%|██████████| 1962/1962 [00:00<00:00, 4694.08 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.55051088520169, 'f1_macro': 0.7707697425235429, 'f1_micro': 0.8333333333333334, 'accuracy_balanced': 0.7532458494880208, 'accuracy': 0.8333333333333334, 'precision_macro': 0.7991778891324772, 'recall_macro': 0.7532458494880208, 'precision_micro': 0.8333333333333334, 'recall_micro': 0.8333333333333334}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.7402912621359223, 'recall': 0.580952380952381, 'f1-score': 0.6510138740661686, 'support': 525.0}, 'not_entailment': {'precision': 0.8580645161290322, 'recall': 0.9255393180236604, 'f1-score': 0.8905256109809173, 'support': 1437.0}, 'accuracy': 0.8333333333333334, 'macro avg': {'precision': 0.7991778891324772, 'recall': 0.7532458494880208, 'f1-score': 0.7707697425235429, 'support': 1962.0}, 'weighted avg': {'precision': 0.8265502662073285, 'recall': 0.8333333333333334, 'f1-score': 0.8264360789318638, 'support': 1962.0}} 



Map: 100%|██████████| 25/25 [00:00<00:00, 2761.95 examples/s]
Map: 100%|██████████| 1962/1962 [00:00<00:00, 5726.22 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.46203121399252073, 'f1_macro': 0.7309926836802625, 'f1_micro': 0.7884811416921509, 'accuracy_balanced': 0.7322650767940877, 'accuracy': 0.7884811416921509, 'precision_macro': 0.7297728587202271, 'recall_macro': 0.7322650767940877, 'precision_micro': 0.7884811416921509, 'recall_micro': 0.7884811416921509}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.6015037593984962, 'recall': 0.6118546845124283, 'f1-score': 0.6066350710900474, 'support': 523.0}, 'not_entailment': {'precision': 0.858041958041958, 'recall': 0.8526754690757471, 'f1-score': 0.8553502962704775, 'support': 1439.0}, 'accuracy': 0.7884811416921509, 'macro avg': {'precision': 0.7297728587202271, 'recall': 0.7322650767940877, 'f1-score': 0.7309926836802625, 'support': 1962.0}, 'weighted avg': {'precision': 0.7896579224198733, 'recall': 0.7884811416921509, 'f1-score': 0.7890515894563263, 'support': 1962.0}} 



Map: 100%|██████████| 25/25 [00:00<00:00, 3089.86 examples/s]
Map: 100%|██████████| 1962/1962 [00:00<00:00, 5686.35 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.34959358636563664, 'f1_macro': 0.6139896279103503, 'f1_micro': 0.7787971457696228, 'accuracy_balanced': 0.6061262802249595, 'accuracy': 0.7787971457696228, 'precision_macro': 0.7879015343064015, 'recall_macro': 0.6061262802249595, 'precision_micro': 0.7787971457696228, 'recall_micro': 0.7787971457696228}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.7987012987012987, 'recall': 0.2338403041825095, 'f1-score': 0.36176470588235293, 'support': 526.0}, 'not_entailment': {'precision': 0.7771017699115044, 'recall': 0.9784122562674095, 'f1-score': 0.8662145499383477, 'support': 1436.0}, 'accuracy': 0.7787971457696228, 'macro avg': {'precision': 0.7879015343064015, 'recall': 0.6061262802249595, 'f1-score': 0.6139896279103503, 'support': 1962.0}, 'weighted avg': {'precision': 0.7828924692710516, 'recall': 0.7787971457696228, 'f1-score': 0.7309746834890851, 'support': 1962.0}} 



Map: 100%|██████████| 25/25 [00:00<00:00, 2760.65 examples/s]
Map: 100%|██████████| 1962/1962 [00:00<00:00, 5679.20 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5012352982401268, 'f1_macro': 0.7416283191716573, 'f1_micro': 0.8180428134556575, 'accuracy_balanced': 0.7207234427996219, 'accuracy': 0.8180428134556575, 'precision_macro': 0.7845606486280072, 'recall_macro': 0.7207234427996219, 'precision_micro': 0.8180428134556575, 'recall_micro': 0.8180428134556575}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.7309782608695652, 'recall': 0.5104364326375711, 'f1-score': 0.6011173184357542, 'support': 527.0}, 'not_entailment': {'precision': 0.8381430363864492, 'recall': 0.9310104529616725, 'f1-score': 0.8821393199075602, 'support': 1435.0}, 'accuracy': 0.8180428134556575, 'macro avg': {'precision': 0.7845606486280072, 'recall': 0.7207234427996219, 'f1-score': 0.7416283191716573, 'support': 1962.0}, 'weighted avg': {'precision': 0.8093582062654514, 'recall': 0.8180428134556575, 'f1-score': 0.8066558363317999, 'support': 1962.0}} 

25


Map: 100%|██████████| 50/50 [00:00<00:00, 3833.64 examples/s]
Map: 100%|██████████| 1937/1937 [00:00<00:00, 5742.02 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5822118255829892, 'f1_macro': 0.7878961261222063, 'f1_micro': 0.8440887971089314, 'accuracy_balanced': 0.7718909199880132, 'accuracy': 0.8440887971089314, 'precision_macro': 0.8116788617505333, 'recall_macro': 0.7718909199880132, 'precision_micro': 0.8440887971089314, 'recall_micro': 0.8440887971089314}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.7541371158392435, 'recall': 0.6170212765957447, 'f1-score': 0.6787234042553192, 'support': 517.0}, 'not_entailment': {'precision': 0.869220607661823, 'recall': 0.9267605633802817, 'f1-score': 0.8970688479890934, 'support': 1420.0}, 'accuracy': 0.8440887971089314, 'macro avg': {'precision': 0.8116788617505333, 'recall': 0.7718909199880132, 'f1-score': 0.7878961261222063, 'support': 1937.0}, 'weighted avg': {'precision': 0.8385039503194, 'recall': 0.8440887971089314, 'f1-score': 0.8387907920209152, 'support': 1937.0}} 



Map: 100%|██████████| 50/50 [00:00<00:00, 4098.80 examples/s]
Map: 100%|██████████| 1937/1937 [00:00<00:00, 5745.02 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.4633226140313118, 'f1_macro': 0.7074258319788537, 'f1_micro': 0.8100154878678368, 'accuracy_balanced': 0.6822309096357643, 'accuracy': 0.8100154878678368, 'precision_macro': 0.7944997710622711, 'recall_macro': 0.6822309096357643, 'precision_micro': 0.8100154878678368, 'recall_micro': 0.8100154878678368}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.7728937728937729, 'recall': 0.4081237911025145, 'f1-score': 0.5341772151898734, 'support': 517.0}, 'not_entailment': {'precision': 0.8161057692307693, 'recall': 0.956338028169014, 'f1-score': 0.880674448767834, 'support': 1420.0}, 'accuracy': 0.8100154878678368, 'macro avg': {'precision': 0.7944997710622711, 'recall': 0.6822309096357643, 'f1-score': 0.7074258319788537, 'support': 1937.0}, 'weighted avg': {'precision': 0.8045721594701978, 'recall': 0.8100154878678368, 'f1-score': 0.7881917075392302, 'support': 1937.0}} 



Map: 100%|██████████| 50/50 [00:00<00:00, 4078.32 examples/s]
Map: 100%|██████████| 1937/1937 [00:00<00:00, 5713.80 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5898827801508547, 'f1_macro': 0.7775006047233894, 'f1_micro': 0.8487351574599896, 'accuracy_balanced': 0.7465510472090221, 'accuracy': 0.8487351574599896, 'precision_macro': 0.8528292601648384, 'recall_macro': 0.7465510472090221, 'precision_micro': 0.8487351574599896, 'recall_micro': 0.8487351574599896}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8589341692789969, 'recall': 0.524904214559387, 'f1-score': 0.6516052318668252, 'support': 522.0}, 'not_entailment': {'precision': 0.8467243510506799, 'recall': 0.9681978798586572, 'f1-score': 0.9033959775799538, 'support': 1415.0}, 'accuracy': 0.8487351574599896, 'macro avg': {'precision': 0.8528292601648384, 'recall': 0.7465510472090221, 'f1-score': 0.7775006047233894, 'support': 1937.0}, 'weighted avg': {'precision': 0.8500147615386414, 'recall': 0.8487351574599896, 'f1-score': 0.8355411663965501, 'support': 1937.0}} 



Map: 100%|██████████| 50/50 [00:00<00:00, 3594.15 examples/s]
Map: 100%|██████████| 1937/1937 [00:00<00:00, 4298.54 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.564726613101568, 'f1_macro': 0.7707951266492938, 'f1_micro': 0.8404749612803304, 'accuracy_balanced': 0.7448835568101624, 'accuracy': 0.8404749612803304, 'precision_macro': 0.8255793811754346, 'recall_macro': 0.7448835568101624, 'precision_micro': 0.8404749612803304, 'recall_micro': 0.8404749612803304}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.8022922636103151, 'recall': 0.5384615384615384, 'f1-score': 0.6444188722669736, 'support': 520.0}, 'not_entailment': {'precision': 0.8488664987405542, 'recall': 0.9513055751587862, 'f1-score': 0.897171381031614, 'support': 1417.0}, 'accuracy': 0.8404749612803304, 'macro avg': {'precision': 0.8255793811754346, 'recall': 0.7448835568101624, 'f1-score': 0.7707951266492938, 'support': 1937.0}, 'weighted avg': {'precision': 0.8363633483700202, 'recall': 0.8404749612803304, 'f1-score': 0.8293183585444623, 'support': 1937.0}} 



Map: 100%|██████████| 50/50 [00:00<00:00, 3861.52 examples/s]
Map: 100%|██████████| 1937/1937 [00:00<00:00, 5724.94 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5825939754685414, 'f1_macro': 0.7887737536706383, 'f1_micro': 0.8265358802271554, 'accuracy_balanced': 0.805951168495521, 'accuracy': 0.8265358802271554, 'precision_macro': 0.7773447000719729, 'recall_macro': 0.805951168495521, 'precision_micro': 0.8265358802271554, 'recall_micro': 0.8265358802271554}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.6462809917355372, 'recall': 0.7621832358674464, 'f1-score': 0.6994633273703041, 'support': 513.0}, 'not_entailment': {'precision': 0.9084084084084084, 'recall': 0.8497191011235955, 'f1-score': 0.8780841799709724, 'support': 1424.0}, 'accuracy': 0.8265358802271554, 'macro avg': {'precision': 0.7773447000719729, 'recall': 0.805951168495521, 'f1-score': 0.7887737536706383, 'support': 1937.0}, 'weighted avg': {'precision': 0.8389859175704203, 'recall': 0.8265358802271554, 'f1-score': 0.8307777796694016, 'support': 1937.0}} 



Map: 100%|██████████| 50/50 [00:00<00:00, 3775.18 examples/s]
Map: 100%|██████████| 1937/1937 [00:00<00:00, 5586.63 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5832432706655862, 'f1_macro': 0.7851119681951332, 'f1_micro': 0.8456375838926175, 'accuracy_balanced': 0.7636291460832745, 'accuracy': 0.8456375838926175, 'precision_macro': 0.8225864038846045, 'recall_macro': 0.7636291460832745, 'precision_micro': 0.8456375838926175, 'recall_micro': 0.8456375838926175}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.7840616966580977, 'recall': 0.5865384615384616, 'f1-score': 0.671067106710671, 'support': 520.0}, 'not_entailment': {'precision': 0.8611111111111112, 'recall': 0.9407198306280875, 'f1-score': 0.8991568296795953, 'support': 1417.0}, 'accuracy': 0.8456375838926175, 'macro avg': {'precision': 0.8225864038846045, 'recall': 0.7636291460832745, 'f1-score': 0.7851119681951332, 'support': 1937.0}, 'weighted avg': {'precision': 0.8404267045465438, 'recall': 0.8456375838926175, 'f1-score': 0.8379246892852532, 'support': 1937.0}} 



Map: 100%|██████████| 50/50 [00:00<00:00, 2879.12 examples/s]
Map: 100%|██████████| 1937/1937 [00:00<00:00, 4227.92 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5844893144317603, 'f1_macro': 0.7912692655975246, 'f1_micro': 0.8425400103252452, 'accuracy_balanced': 0.7819097947646553, 'accuracy': 0.8425400103252452, 'precision_macro': 0.8029583975346688, 'recall_macro': 0.7819097947646553, 'precision_micro': 0.8425400103252452, 'recall_micro': 0.8425400103252452}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.7272727272727273, 'recall': 0.6524271844660194, 'f1-score': 0.6878198567041965, 'support': 515.0}, 'not_entailment': {'precision': 0.8786440677966102, 'recall': 0.9113924050632911, 'f1-score': 0.8947186744908526, 'support': 1422.0}, 'accuracy': 0.8425400103252452, 'macro avg': {'precision': 0.8029583975346688, 'recall': 0.7819097947646553, 'f1-score': 0.7912692655975246, 'support': 1937.0}, 'weighted avg': {'precision': 0.838398202866409, 'recall': 0.8425400103252452, 'f1-score': 0.839709437960069, 'support': 1937.0}} 



Map: 100%|██████████| 50/50 [00:00<00:00, 3723.77 examples/s]
Map: 100%|██████████| 1937/1937 [00:00<00:00, 5658.61 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.524132166600318, 'f1_macro': 0.7605647650582346, 'f1_micro': 0.8208569953536397, 'accuracy_balanced': 0.7501575209073204, 'accuracy': 0.8208569953536397, 'precision_macro': 0.7745415439327538, 'recall_macro': 0.7501575209073204, 'precision_micro': 0.8208569953536397, 'recall_micro': 0.8208569953536397}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.688195991091314, 'recall': 0.5988372093023255, 'f1-score': 0.6404145077720207, 'support': 516.0}, 'not_entailment': {'precision': 0.8608870967741935, 'recall': 0.9014778325123153, 'f1-score': 0.8807150223444483, 'support': 1421.0}, 'accuracy': 0.8208569953536397, 'macro avg': {'precision': 0.7745415439327538, 'recall': 0.7501575209073204, 'f1-score': 0.7605647650582346, 'support': 1937.0}, 'weighted avg': {'precision': 0.8148836840058065, 'recall': 0.8208569953536397, 'f1-score': 0.8167010494382156, 'support': 1937.0}} 



Map: 100%|██████████| 50/50 [00:00<00:00, 3728.54 examples/s]
Map: 100%|██████████| 1937/1937 [00:00<00:00, 5645.45 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5447111833314443, 'f1_macro': 0.7649862450549209, 'f1_micro': 0.8322147651006712, 'accuracy_balanced': 0.7439056789962806, 'accuracy': 0.8322147651006712, 'precision_macro': 0.8041239901294663, 'recall_macro': 0.7439056789962806, 'precision_micro': 0.8322147651006712, 'recall_micro': 0.8322147651006712}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.7578947368421053, 'recall': 0.5527831094049904, 'f1-score': 0.6392896781354052, 'support': 521.0}, 'not_entailment': {'precision': 0.8503532434168273, 'recall': 0.9350282485875706, 'f1-score': 0.8906828119744366, 'support': 1416.0}, 'accuracy': 0.8322147651006712, 'macro avg': {'precision': 0.8041239901294663, 'recall': 0.7439056789962806, 'f1-score': 0.7649862450549209, 'support': 1937.0}, 'weighted avg': {'precision': 0.8254844349886237, 'recall': 0.8322147651006712, 'f1-score': 0.8230649375654869, 'support': 1937.0}} 



Map: 100%|██████████| 50/50 [00:00<00:00, 3396.20 examples/s]
Map: 100%|██████████| 1937/1937 [00:00<00:00, 5581.74 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5698033066567421, 'f1_macro': 0.7828835016335016, 'f1_micro': 0.8378936499741869, 'accuracy_balanced': 0.7701782205662946, 'accuracy': 0.8378936499741869, 'precision_macro': 0.8004274434079433, 'recall_macro': 0.7701782205662946, 'precision_micro': 0.8378936499741869, 'recall_micro': 0.8378936499741869}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.7313769751693002, 'recall': 0.6242774566473989, 'f1-score': 0.6735966735966736, 'support': 519.0}, 'not_entailment': {'precision': 0.8694779116465864, 'recall': 0.9160789844851904, 'f1-score': 0.8921703296703297, 'support': 1418.0}, 'accuracy': 0.8378936499741869, 'macro avg': {'precision': 0.8004274434079433, 'recall': 0.7701782205662946, 'f1-score': 0.7828835016335016, 'support': 1937.0}, 'weighted avg': {'precision': 0.8324751310416758, 'recall': 0.8378936499741869, 'f1-score': 0.8336056794368617, 'support': 1937.0}} 

50


Map: 100%|██████████| 100/100 [00:00<00:00, 4755.39 examples/s]
Map: 100%|██████████| 1887/1887 [00:00<00:00, 5742.69 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5935519952153985, 'f1_macro': 0.7935893160349043, 'f1_micro': 0.8479067302596714, 'accuracy_balanced': 0.7774154260578012, 'accuracy': 0.8479067302596714, 'precision_macro': 0.8174877259265818, 'recall_macro': 0.7774154260578012, 'precision_micro': 0.8479067302596714, 'recall_micro': 0.8479067302596714}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.7632850241545893, 'recall': 0.6257425742574257, 'f1-score': 0.6877040261153428, 'support': 505.0}, 'not_entailment': {'precision': 0.8716904276985743, 'recall': 0.9290882778581766, 'f1-score': 0.8994746059544658, 'support': 1382.0}, 'accuracy': 0.8479067302596714, 'macro avg': {'precision': 0.8174877259265818, 'recall': 0.7774154260578012, 'f1-score': 0.7935893160349043, 'support': 1887.0}, 'weighted avg': {'precision': 0.8426789127066758, 'recall': 0.8479067302596714, 'f1-score': 0.8428004444182936, 'support': 1887.0}} 



Map: 100%|██████████| 100/100 [00:00<00:00, 3498.08 examples/s]
Map: 100%|██████████| 1887/1887 [00:00<00:00, 4296.77 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5506235836232969, 'f1_macro': 0.7632847942193534, 'f1_micro': 0.8351881293057764, 'accuracy_balanced': 0.7378621674874037, 'accuracy': 0.8351881293057764, 'precision_macro': 0.8186575801910758, 'recall_macro': 0.7378621674874037, 'precision_micro': 0.8351881293057764, 'recall_micro': 0.8351881293057764}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.7928994082840237, 'recall': 0.5265225933202358, 'f1-score': 0.6328217237308147, 'support': 509.0}, 'not_entailment': {'precision': 0.8444157520981278, 'recall': 0.9492017416545718, 'f1-score': 0.893747864707892, 'support': 1378.0}, 'accuracy': 0.8351881293057764, 'macro avg': {'precision': 0.8186575801910758, 'recall': 0.7378621674874037, 'f1-score': 0.7632847942193534, 'support': 1887.0}, 'weighted avg': {'precision': 0.8305197165913026, 'recall': 0.8351881293057764, 'f1-score': 0.8233655617098357, 'support': 1887.0}} 



Map: 100%|██████████| 100/100 [00:00<00:00, 3571.23 examples/s]
Map: 100%|██████████| 1887/1887 [00:00<00:00, 4410.06 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5897435256644638, 'f1_macro': 0.7943578468802934, 'f1_micro': 0.8420773714891362, 'accuracy_balanced': 0.7875100637800986, 'accuracy': 0.8420773714891362, 'precision_macro': 0.8024219582876619, 'recall_macro': 0.7875100637800986, 'precision_micro': 0.8420773714891362, 'recall_micro': 0.8420773714891362}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.723404255319149, 'recall': 0.6692913385826772, 'f1-score': 0.6952965235173824, 'support': 508.0}, 'not_entailment': {'precision': 0.881439661256175, 'recall': 0.90572878897752, 'f1-score': 0.8934191702432046, 'support': 1379.0}, 'accuracy': 0.8420773714891362, 'macro avg': {'precision': 0.8024219582876619, 'recall': 0.7875100637800986, 'f1-score': 0.7943578468802934, 'support': 1887.0}, 'weighted avg': {'precision': 0.8388948884866947, 'recall': 0.8420773714891362, 'f1-score': 0.840082495872925, 'support': 1887.0}} 



Map: 100%|██████████| 100/100 [00:00<00:00, 4380.11 examples/s]
Map: 100%|██████████| 1887/1887 [00:00<00:00, 5761.23 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5888951214779488, 'f1_macro': 0.7932093335321306, 'f1_micro': 0.8441971383147854, 'accuracy_balanced': 0.7827359432565506, 'accuracy': 0.8441971383147854, 'precision_macro': 0.8066443021942287, 'recall_macro': 0.7827359432565506, 'precision_micro': 0.8441971383147854, 'recall_micro': 0.8441971383147854}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.7354260089686099, 'recall': 0.6507936507936508, 'f1-score': 0.6905263157894737, 'support': 504.0}, 'not_entailment': {'precision': 0.8778625954198473, 'recall': 0.9146782357194505, 'f1-score': 0.8958923512747875, 'support': 1383.0}, 'accuracy': 0.8441971383147854, 'macro avg': {'precision': 0.8066443021942287, 'recall': 0.7827359432565506, 'f1-score': 0.7932093335321306, 'support': 1887.0}, 'weighted avg': {'precision': 0.8398191192293737, 'recall': 0.8441971383147854, 'f1-score': 0.841041009523543, 'support': 1887.0}} 



Map: 100%|██████████| 100/100 [00:00<00:00, 4279.81 examples/s]
Map: 100%|██████████| 1887/1887 [00:00<00:00, 4241.60 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5657992284112296, 'f1_macro': 0.7759870951136809, 'f1_micro': 0.8108108108108109, 'accuracy_balanced': 0.8053748693834901, 'accuracy': 0.8108108108108109, 'precision_macro': 0.7620785131378351, 'recall_macro': 0.8053748693834901, 'precision_micro': 0.8108108108108109, 'recall_micro': 0.8108108108108109}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.6064814814814815, 'recall': 0.793939393939394, 'f1-score': 0.6876640419947506, 'support': 495.0}, 'not_entailment': {'precision': 0.9176755447941889, 'recall': 0.8168103448275862, 'f1-score': 0.8643101482326112, 'support': 1392.0}, 'accuracy': 0.8108108108108109, 'macro avg': {'precision': 0.7620785131378351, 'recall': 0.8053748693834901, 'f1-score': 0.7759870951136809, 'support': 1887.0}, 'weighted avg': {'precision': 0.8360427618902195, 'recall': 0.8108108108108109, 'f1-score': 0.8179721394420755, 'support': 1887.0}} 



Map: 100%|██████████| 100/100 [00:00<00:00, 3311.75 examples/s]
Map: 100%|██████████| 1887/1887 [00:00<00:00, 4298.71 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5922780772222426, 'f1_macro': 0.7960814146452634, 'f1_micro': 0.8394276629570747, 'accuracy_balanced': 0.7985379780516921, 'accuracy': 0.8394276629570747, 'precision_macro': 0.7937593761499051, 'recall_macro': 0.7985379780516921, 'precision_micro': 0.8394276629570747, 'recall_micro': 0.8394276629570747}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.6932038834951456, 'recall': 0.7111553784860558, 'f1-score': 0.7020648967551623, 'support': 502.0}, 'not_entailment': {'precision': 0.8943148688046647, 'recall': 0.8859205776173286, 'f1-score': 0.8900979325353645, 'support': 1385.0}, 'accuracy': 0.8394276629570747, 'macro avg': {'precision': 0.7937593761499051, 'recall': 0.7985379780516921, 'f1-score': 0.7960814146452634, 'support': 1887.0}, 'weighted avg': {'precision': 0.8408131652406061, 'recall': 0.8394276629570747, 'f1-score': 0.8400753655180558, 'support': 1887.0}} 



Map: 100%|██████████| 100/100 [00:00<00:00, 4784.69 examples/s]
Map: 100%|██████████| 1887/1887 [00:00<00:00, 5602.31 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5382423309481594, 'f1_macro': 0.7636427431124997, 'f1_micro': 0.8293587705352411, 'accuracy_balanced': 0.7450528487977721, 'accuracy': 0.8293587705352411, 'precision_macro': 0.7955533961814749, 'recall_macro': 0.7450528487977721, 'precision_micro': 0.8293587705352411, 'recall_micro': 0.8293587705352411}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.7383419689119171, 'recall': 0.5632411067193676, 'f1-score': 0.6390134529147982, 'support': 506.0}, 'not_entailment': {'precision': 0.8527648234510327, 'recall': 0.9268645908761767, 'f1-score': 0.8882720333102012, 'support': 1381.0}, 'accuracy': 0.8293587705352411, 'macro avg': {'precision': 0.7955533961814749, 'recall': 0.7450528487977721, 'f1-score': 0.7636427431124997, 'support': 1887.0}, 'weighted avg': {'precision': 0.8220822774007982, 'recall': 0.8293587705352411, 'f1-score': 0.8214332194892824, 'support': 1887.0}} 



Map: 100%|██████████| 100/100 [00:00<00:00, 3627.69 examples/s]
Map: 100%|██████████| 1887/1887 [00:00<00:00, 4310.23 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.567250135546, 'f1_macro': 0.7820728451484782, 'f1_micro': 0.8373078961314255, 'accuracy_balanced': 0.7707204638342362, 'accuracy': 0.8373078961314255, 'precision_macro': 0.7971448036469628, 'recall_macro': 0.7707204638342362, 'precision_micro': 0.8373078961314255, 'recall_micro': 0.8373078961314255}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.7224770642201835, 'recall': 0.6287425149700598, 'f1-score': 0.6723585912486659, 'support': 501.0}, 'not_entailment': {'precision': 0.8718125430737422, 'recall': 0.9126984126984127, 'f1-score': 0.8917870990482905, 'support': 1386.0}, 'accuracy': 0.8373078961314255, 'macro avg': {'precision': 0.7971448036469628, 'recall': 0.7707204638342362, 'f1-score': 0.7820728451484782, 'support': 1887.0}, 'weighted avg': {'precision': 0.8321638547294746, 'recall': 0.8373078961314255, 'f1-score': 0.8335286558010133, 'support': 1887.0}} 



Map: 100%|██████████| 100/100 [00:00<00:00, 4640.69 examples/s]
Map: 100%|██████████| 1887/1887 [00:00<00:00, 5750.88 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5883789262103876, 'f1_macro': 0.7920422649249703, 'f1_micro': 0.8441971383147854, 'accuracy_balanced': 0.7787210472738886, 'accuracy': 0.8441971383147854, 'precision_macro': 0.8105163425892099, 'recall_macro': 0.7787210472738886, 'precision_micro': 0.8441971383147854, 'recall_micro': 0.8441971383147854}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.7482678983833718, 'recall': 0.6365422396856582, 'f1-score': 0.6878980891719745, 'support': 509.0}, 'not_entailment': {'precision': 0.8727647867950481, 'recall': 0.920899854862119, 'f1-score': 0.8961864406779662, 'support': 1378.0}, 'accuracy': 0.8441971383147854, 'macro avg': {'precision': 0.8105163425892099, 'recall': 0.7787210472738886, 'f1-score': 0.7920422649249703, 'support': 1887.0}, 'weighted avg': {'precision': 0.8391829552097045, 'recall': 0.8441971383147854, 'f1-score': 0.8400026723067157, 'support': 1887.0}} 



Map: 100%|██████████| 100/100 [00:00<00:00, 4328.17 examples/s]
Map: 100%|██████████| 1887/1887 [00:00<00:00, 5010.54 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss


Aggregate metrics:  {'MCC': 0.5477404233811167, 'f1_macro': 0.7702354583378024, 'f1_micro': 0.830948595654478, 'accuracy_balanced': 0.754407364688551, 'accuracy': 0.830948595654478, 'precision_macro': 0.7948220187857111, 'recall_macro': 0.754407364688551, 'precision_micro': 0.830948595654478, 'recall_micro': 0.830948595654478}


  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


Detailed metrics:  {'entailment': {'precision': 0.7310513447432763, 'recall': 0.5885826771653543, 'f1-score': 0.6521264994547438, 'support': 508.0}, 'not_entailment': {'precision': 0.8585926928281461, 'recall': 0.9202320522117476, 'f1-score': 0.888344417220861, 'support': 1379.0}, 'accuracy': 0.830948595654478, 'macro avg': {'precision': 0.7948220187857111, 'recall': 0.754407364688551, 'f1-score': 0.7702354583378024, 'support': 1887.0}, 'weighted avg': {'precision': 0.8242572371698981, 'recall': 0.830948595654478, 'f1-score': 0.8247521001963843, 'support': 1887.0}} 

100
CPU times: total: 17min 53s
Wall time: 3h 18min 51s


In [49]:
res_df = pd.DataFrame({'n':shots_list, 'mcc':mcc_list})
res_df['accuracy'] = acc_list
res_df.groupby('n').mean()

Unnamed: 0_level_0,mcc,accuracy
n,Unnamed: 1_level_1,Unnamed: 2_level_1
10,0.388921,0.775974
25,0.502602,0.813863
50,0.558912,0.834899
100,0.57225,0.836142
