In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import pipeline
from sklearn.metrics import matthews_corrcoef, accuracy_score, f1_score
from sklearn.utils import resample
from datasets import load_dataset, DatasetDict, Dataset
from tqdm import tqdm
import random

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
def metrics(df, preds, group_by=None):
    """
    Calculate MCC, Accuracy, F1 for predictions.

    Args:
        df (pd.DataFrame): The input DataFrame containing true and predicted labels.
        preds (list): List of column names containing model predictions.
        group_by (str, optional): Column name to group by ('dataset' or 'task'). Defaults to None.

    Returns:
        pd.DataFrame: DataFrame with calculated metrics, optionally grouped by `group_by`.
    """
    true_col = 'entailment'
    
    def get_metrics(y_true, y_pred):
        return {
            'MCC': matthews_corrcoef(y_true, y_pred),
            'Accuracy': accuracy_score(y_true, y_pred),
            'F1': f1_score(y_true, y_pred, average='macro')
        }
    
    results = []
    
    if group_by not in ['dataset', 'task']:
        for col in preds:
            metrics = get_metrics(df[true_col], df[col])
            metrics['Column'] = col
            results.append(metrics)
    else:
        for col in preds:
            for group_name, group in df.groupby(group_by):
                metrics = get_metrics(group[true_col], group[col])
                metrics['Column'] = col
                metrics[group_by.capitalize()] = group_name
                results.append(metrics)
    
    results_df = pd.DataFrame(results)
    
    if group_by in ['dataset', 'task']:
        return results_df.set_index(['Column', group_by.capitalize()])
    else:
        return results_df.set_index('Column')

def bootstrapped_errors(y_true, y_pred, n_bootstrap=1000):
    """
    Calculate bootstrapped standard errors for MCC, Accuracy, and F1.

    Args:
        y_true (array-like): True labels.
        y_pred (array-like): Predicted labels.
        n_bootstrap (int, optional): Number of bootstrap samples. Defaults to 1000.

    Returns:
        dict: Standard errors for MCC, Accuracy, and F1.
    """
    mcc_scores = []
    accuracy_scores = []
    f1_scores = []
    
    for _ in range(n_bootstrap):
        # Resample with replacement
        y_true_resampled, y_pred_resampled = resample(y_true, y_pred)
        
        # Calculate metrics for the resampled data
        mcc_scores.append(matthews_corrcoef(y_true_resampled, y_pred_resampled))
        accuracy_scores.append(accuracy_score(y_true_resampled, y_pred_resampled))
        f1_scores.append(f1_score(y_true_resampled, y_pred_resampled, average='weighted'))
    
    # Calculate standard errors
    return {
        'MCC_SE': np.std(mcc_scores),
        'Accuracy_SE': np.std(accuracy_scores),
        'F1_SE': np.std(f1_scores)
    }

def metrics_with_errors(df, preds, n_bootstrap=1000, group_by=None):
    """
    Calculate metrics and bootstrapped standard errors for predictions, optionally grouped.

    Args:
        df (pd.DataFrame): The input DataFrame containing true and predicted labels.
        preds (list): List of column names containing model predictions.
        n_bootstrap (int, optional): Number of bootstrap samples. Defaults to 1000.
        group_by (str, optional): Column name to group by ('dataset' or 'task'). Defaults to None.

    Returns:
        pd.DataFrame: Combined DataFrame of metrics, standard errors, and confidence intervals.
    """
    # Step 1: Calculate metrics for each model
    metrics_df = metrics(df, preds, group_by=group_by)

    # Step 2: Calculate bootstrapped errors for each model or group
    errors = []
    if group_by not in ['dataset', 'task']:
        for col in preds:
            y_true = df['entailment']
            y_pred = df[col]
            errors_dict = bootstrapped_errors(y_true, y_pred, n_bootstrap=n_bootstrap)
            errors_dict['Column'] = col
            errors.append(errors_dict)
    else:
        for col in preds:
            for group_name, group in df.groupby(group_by):
                y_true = group['entailment']
                y_pred = group[col]
                errors_dict = bootstrapped_errors(y_true, y_pred, n_bootstrap=n_bootstrap)
                errors_dict['Column'] = col
                errors_dict[group_by.capitalize()] = group_name
                errors.append(errors_dict)

    errors_df = pd.DataFrame(errors)

    if group_by in ['dataset', 'task']:
        errors_df = errors_df.set_index(['Column', group_by.capitalize()])
    else:
        errors_df = errors_df.set_index('Column')

    # Step 3: Merge metrics and errors DataFrames
    combined_df = metrics_df.merge(errors_df, left_index=True, right_index=True)

    # Step 4: Calculate confidence intervals (upper and lower bounds)
    combined_df['MCC_Lower'] = combined_df['MCC'] - combined_df['MCC_SE']
    combined_df['MCC_Upper'] = combined_df['MCC'] + combined_df['MCC_SE']

    combined_df['Accuracy_Lower'] = combined_df['Accuracy'] - combined_df['Accuracy_SE']
    combined_df['Accuracy_Upper'] = combined_df['Accuracy'] + combined_df['Accuracy_SE']

    combined_df['F1_Lower'] = combined_df['F1'] - combined_df['F1_SE']
    combined_df['F1_Upper'] = combined_df['F1'] + combined_df['F1_SE']

    return combined_df

def label_docs(model, docs_dict, batch_size = 8, device = 'cuda'):
    """
    Passes documents through the pipeline. Returns a list of entail, not_entail labels
    """
    pipe = pipeline(task = 'text-classification', model = model, 
                    batch_size = batch_size, device = device, 
                    max_length = 512, truncation = True, 
                    torch_dtype = torch.bfloat16)
    res = pipe(docs_dict)
    res = [result['label'] for result in res]
    return res

In [4]:
llama_key = open('../llama_key.txt', 'r').read()

In [7]:
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

pipe = pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="cuda",
    token = llama_key
)

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 4/4 [00:19<00:00,  4.88s/it]
Device set to use cuda


# UKP Stance

In [22]:
user_message = """You are a classifier that can only respond with 0 or 1. I'm going to show you a short text sample and I want you to determine if {hypothesis}. Here is the text:
{doc}

If it is true that {hypothesis}, return 0. If it is not true that {hypothesis}, return 1.
Do not explain your answer, and only return 0 or 1.
"""

In [11]:
ukp = pd.read_csv('ukp_stance.csv')
ukp['hypothesis'] = [hypoth.capitalize() for hypoth in ukp['hypothesis']]
ukp.rename({'sentence':'text', 'topic': 'task'}, axis = 1, inplace = True)
ukp = ukp[['text', 'entailment', 'hypothesis']]
ukp['task'] = 'stance'

In [None]:
ukp['hypothesis'] = ukp['hypothesis'].str.lower()

ukp['hypothesis'] = [hyp[0:-1] for hyp in ukp['hypothesis']]

In [25]:
%%time
data = ukp
res = []
for i in data.index:
    doc = data.loc[i, 'text']
    hypothesis = data.loc[i, 'hypothesis']
    messages = [
        {"role": "user", "content": user_message.format(doc = doc, hypothesis = hypothesis)},
    ]
    prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    outputs = pipe(prompt, max_new_tokens=2, do_sample=False, return_full_text = False, pad_token_id=pipe.tokenizer.eos_token_id, temperature = 0)
    res.extend(outputs)

res = [text['generated_text'] for text in res]
# return a list of unique responses from the model
print(set(res))

  attn_output = torch.nn.functional.scaled_dot_product_attention(
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


{'1', '0'}
CPU times: total: 5min 14s
Wall time: 7min 19s


In [None]:
labs = [num[0] for num in res]
ukp['llama'] = [1 if '1' in text else 0 for text in labs]

overall = metrics_with_errors(ukp, ['llama'], group_by = None)

In [32]:
overall

Unnamed: 0_level_0,MCC,Accuracy,F1,MCC_SE,Accuracy_SE,F1_SE,MCC_Lower,MCC_Upper,Accuracy_Lower,Accuracy_Upper,F1_Lower,F1_Upper
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
llama,0.764781,0.897154,0.882336,0.009774,0.004273,0.00425,0.755006,0.774555,0.892882,0.901427,0.878086,0.886586


In [81]:
ukp.to_csv('../data/ukp_stance.csv', index = False)

# UKP Topic

In [22]:
user_message = """You are a classifier that can only respond with 0 or 1. I'm going to show you a short text sample and I want you to determine if {hypothesis}. Here is the text:
{doc}

If it is true that {hypothesis}, return 0. If it is not true that {hypothesis}, return 1.
Do not explain your answer, and only return 0 or 1.
"""

In [None]:
topic = pd.read_csv('../data/ukp_topic.csv')
# format hypotheses
topic['hypothesis'] = topic['hypothesis'].str.lower()
topic['hypothesis'] = [hyp[0:-1] for hyp in topic['hypothesis']]

In [37]:
%%time
data = topic
res = []
for i in data.index:
    doc = data.loc[i, 'text']
    hypothesis = data.loc[i, 'hypothesis']
    messages = [
        {"role": "user", "content": user_message.format(doc = doc, hypothesis = hypothesis)},
    ]
    prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    outputs = pipe(prompt, max_new_tokens=2, do_sample=False, return_full_text = False, pad_token_id=pipe.tokenizer.eos_token_id, temperature = 0)
    res.extend(outputs)

res = [text['generated_text'] for text in res]
# return a list of unique responses from the model
print(set(res))



{'1', '0'}
CPU times: total: 10min 9s
Wall time: 14min 47s


In [40]:
labs = [num[0] for num in res]
topic['llama'] = [1 if '1' in text else 0 for text in labs]

In [41]:
overall = metrics_with_errors(topic, ['llama'], group_by = None)

In [42]:
overall

Unnamed: 0_level_0,MCC,Accuracy,F1,MCC_SE,Accuracy_SE,F1_SE,MCC_Lower,MCC_Upper,Accuracy_Lower,Accuracy_Upper,F1_Lower,F1_Upper
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
llama,0.891902,0.945935,0.945934,0.004567,0.002285,0.002285,0.887336,0.896469,0.94365,0.948219,0.943649,0.948219


In [79]:
topic.to_csv('../data/ukp_topic.csv', index = False)

# RAND Terrorism

In [60]:
user_message = """You are a classifier that can only respond with 1, 2, 3, or 4. I'm going to show you a short text sample about a terrorist attack and I want you to determine which of the following labels best describes the text:

1: Explosives attack.
2: Firearms attack.
3: Arson attack.
4: Knife or sharp object attack.

Return the number of the statement the best describes the text.Here is the text:

{doc}

Only return the number of the statement that best describes the text.
"""

In [43]:
rand = pd.read_csv('../data/rand_terror.csv')

In [61]:
%%time
data = rand
res = []
for i in data.index:
    doc = data.loc[i, 'premise']
    messages = [
        {"role": "user", "content": user_message.format(doc = doc)},
    ]
    prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    outputs = pipe(prompt, max_new_tokens=2, do_sample=False, return_full_text = False, pad_token_id=pipe.tokenizer.eos_token_id, temperature = 0)
    res.extend(outputs)

res = [text['generated_text'] for text in res]
# return a list of unique responses from the model
print(set(res))



{'3', '1', '2', '4'}
CPU times: total: 2min 12s
Wall time: 3min 7s


In [62]:
labs = [num[0] for num in res]
rand['llama'] = [int(text) for text in labs]

In [53]:
rand['entailment'] = rand['hypothesis']

In [55]:
rand['entailment'].replace({'This text describes an explosives attack.': 1,
       'This text describes a firearms attack.': 2,
       'This text describes an arson attack.': 3,
       'This text describes a knife or sharp object attack.': 4}, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  rand['entailment'].replace({'This text describes an explosives attack.': 1,
  rand['entailment'].replace({'This text describes an explosives attack.': 1,


In [63]:
overall = metrics_with_errors(rand, ['llama'], group_by = None)

In [77]:
rand.to_csv('../data/rand_terror.csv', index = False)