In [85]:
import pandas as pd
from sklearn.metrics import matthews_corrcoef,precision_score,recall_score,f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from abc import ABC, abstractmethod
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
)
from datasets import Dataset
from torch.utils.data import DataLoader
from sklearn.metrics import matthews_corrcoef
from tqdm import tqdm
import pandas as pd

In [126]:
# Load the BABE test split and keep the text, the reference label, the three
# LLM annotator labels, their majority vote and the classifier predictions.
annotator_columns = ['zephyr_label', 'openchat_label', 'llama_13b_label']

df_babe_test_raw = pd.read_csv('babe_test_annomatic.csv')
df_babe_test_raw = df_babe_test_raw[
    ['text', 'label', *annotator_columns, 'majority', 'magpie', 'synth', 'babe']
]

# "?" cannot be cast to int (presumably it marks a sample an annotator could
# not label -- TODO confirm). Drop such rows for *every* annotator column, not
# only zephyr, so the casts below cannot fail on a stray "?".
has_valid_annotations = (df_babe_test_raw[annotator_columns] != "?").all(axis=1)

# .copy() makes the filtered frame independent of the raw one, so assigning
# new columns later does not raise SettingWithCopyWarning.
df_babe_test = (
    df_babe_test_raw.loc[has_valid_annotations]
    .copy()
    .astype({column: int for column in annotator_columns})
)

In [125]:
# Load the BASIL predictions and discard rows without a synth prediction.
df_basil = pd.read_csv('basil_synth_babe_preds.csv').dropna(subset=['synth_pred'])

In [127]:
def get_scores(data,source:str,target:str):
    """
    Print precision, recall, F1 and MCC of one label column against another.

    Args:
        data: Column-indexable table (e.g. a pandas DataFrame) holding both columns.
        source: Name of the column passed to the metrics as ``y_true``.
        target: Name of the column passed to the metrics as ``y_pred``.

    Returns:
        None. The four scores are printed to stdout.
    """
    y_true = data[source]
    y_pred = data[target]

    precision = precision_score(y_true=y_true, y_pred=y_pred, average='binary')
    recall = recall_score(y_true=y_true, y_pred=y_pred, average='binary')

    # F1 is the harmonic mean of precision and recall. With no true positives
    # both are 0 and the ratio is undefined; report 0.0 in that case (the same
    # convention sklearn's f1_score uses) instead of dividing by zero.
    if precision + recall > 0:
        f1 = 2 * ((precision * recall) / (precision + recall))
    else:
        f1 = 0.0

    print(f"P: {precision}")
    print(f"R: {recall}")
    print(f"F1: {f1}")
    print(f"MCC: {matthews_corrcoef(y_true=y_true, y_pred=y_pred)}")

# Evaluation on the BABE test set

In [128]:
# Annotator #1 (Zephyr): scored against the reference `label` column
get_scores(df_babe_test, source='label', target='zephyr_label')

P: 0.8307692307692308
R: 0.7728085867620751
F1: 0.8007414272474515
MCC: 0.5697130131956829


In [129]:
# Annotator #2 (OpenChat): scored against the reference `label` column
get_scores(df_babe_test, source='label', target='openchat_label')

P: 0.8144876325088339
R: 0.8246869409660107
F1: 0.8195555555555555
MCC: 0.587635513179489


In [130]:
# Annotator #3 (Llama 13B): scored against the reference `label` column
get_scores(df_babe_test, source='label', target='llama_13b_label')

P: 0.827708703374778
R: 0.8336314847942755
F1: 0.8306595365418895
MCC: 0.614285145672567


In [131]:
# `majority` column scored against the reference `label` column
get_scores(df_babe_test, source='label', target='majority')

P: 0.8518518518518519
R: 0.8228980322003577
F1: 0.8371246587807097
MCC: 0.6390586829640236


In [132]:
# `magpie` predictions scored against the reference `label` column
get_scores(df_babe_test, source='label', target='magpie')

P: 0.8973305954825462
R: 0.7817531305903399
F1: 0.8355640535372849
MCC: 0.6639293573226216


In [133]:
# `synth` predictions scored against the reference `label` column
get_scores(df_babe_test, source='label', target='synth')

P: 0.875
R: 0.813953488372093
F1: 0.8433734939759036
MCC: 0.6624344914259455


In [134]:
# `babe` predictions scored against the reference `label` column
get_scores(df_babe_test, source='label', target='babe')

P: 0.9152542372881356
R: 0.7728085867620751
F1: 0.8380213385063046
MCC: 0.6784047466439473


# Evaluation on BASIL

In [136]:
# `babe_pred` on BASIL, scored against the `lex_label` column
get_scores(df_basil, source='lex_label', target='babe_pred')

P: 0.15841584158415842
R: 0.3747072599531616
F1: 0.22268615170494085
MCC: 0.1637080886035334


In [138]:
# `synth_pred` on BASIL, scored against the `lex_label` column
get_scores(df_basil, source='lex_label', target='synth_pred')

P: 0.15993907083015993
R: 0.4918032786885246
F1: 0.2413793103448276
MCC: 0.1948816352518775


# Generating model predictions (synth, magpie, babe)

In [41]:
def make_predictions(data:pd.DataFrame,checkpoint:str,batch_size:int=8,max_length:int=128):
    """
    Classify every row of ``data["text"]`` with a sequence-classification checkpoint.

    Args:
        data: DataFrame with a ``text`` column holding the inputs to classify.
        checkpoint: Identifier handed to ``from_pretrained`` for both the
            tokenizer and the model.
        batch_size: Number of texts per forward pass. Defaults to 8.
        max_length: Token limit per text; longer inputs are truncated.
            Defaults to 128.

    Returns:
        list: One predicted class index (argmax over the logits) per row of
        ``data``, in row order.
    """
    texts = list(data["text"])
    if not texts:
        # Nothing to classify: skip loading the model altogether.
        return []

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

    # Prefer the first GPU when one is available, otherwise run on CPU.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Inference only: make sure dropout and similar layers are disabled.
    model.eval()

    encoded = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt",
    )
    collator = DataCollatorWithPadding(tokenizer=tokenizer)
    # The DataLoader is not shuffled, so predictions stay aligned with the rows.
    loader = DataLoader(
        Dataset.from_dict(encoded), batch_size=batch_size, collate_fn=collator
    )

    predictions = []
    for batch in tqdm(loader):
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        with torch.no_grad():
            logits = model(**batch).logits
        predictions.extend(torch.argmax(logits, dim=-1).tolist())

    return predictions



In [38]:
# Store the predictions of the roberta-anno-lexical-ft checkpoint in `synth`.
df_babe_test['synth'] = make_predictions(
    df_babe_test,
    checkpoint='anonymous/roberta-anno-lexical-ft',
)

100%|██████████| 63/63 [05:20<00:00,  5.09s/it]


In [42]:
# Store the predictions of the magpie-annomatic checkpoint in `magpie`.
df_babe_test['magpie'] = make_predictions(
    df_babe_test,
    checkpoint='anonymous/magpie-annomatic',
)

100%|██████████| 125/125 [06:03<00:00,  2.91s/it]


In [44]:
# Store the predictions of the babe-base-annomatic checkpoint in `babe`.
df_babe_test['babe'] = make_predictions(
    df_babe_test,
    checkpoint='anonymous/babe-base-annomatic',
)

100%|██████████| 125/125 [04:54<00:00,  2.35s/it]
