In [23]:
import pandas as pd
import numpy as np
from datasets import load_dataset, DatasetDict, Dataset
import random
import json

# Initial Data Compile

In [180]:
zsnli = load_dataset("MoritzLaurer/zeroshot_test_downsampled")

drop = ['mnli_mm', 'fevernli', 'anli_r2', 'anli_r3', 'lingnli', 'imdb', 'yelpreviews', 'hatexplain',
 'emocontext', 'empathetic', 'biasframes_sex', 'biasframes_offensive', 'biasframes_intent', 'financialphrasebank', 
 'appreviews', 'hateoffensive', 'trueteacher', 'spam', 'wikitoxic_toxicaggregated', 'wikitoxic_obscene', 
 'wikitoxic_identityhate', 'wikitoxic_threat', 'wikitoxic_insult', 'wellformedquery', 'massive', 'banking77', 
 'manifesto', 'capsotu']
for data in drop:
    zsnli.pop(data)

# download additional datasets
sst2 = load_dataset("SetFit/sst2")
tweet_topic = load_dataset("cardiffnlp/tweet_topic_single")
go_emotions = load_dataset("SetFit/go_emotions")

# Add to the dataset dict
zsnli['sst2'] = sst2['test']
zsnli['tweet_topic'] = tweet_topic['test_2021']
zsnli['go_emotions'] = go_emotions['validation']

# create a dict with only the the nli datasets
nli_datasets = ['mnli_m', 'anli_r1', 'wanli']
nli = DatasetDict({key: zsnli[key] for key in nli_datasets})
for data in nli_datasets:
    zsnli.pop(data)
    
# Create a new DatasetDict with the random samples
# 1000 for the nli datasets since those wont be validated
sample_size = 1000
nli = DatasetDict({
    key: dataset.shuffle(seed=1).select(range(min(sample_size, len(dataset))))
    for key, dataset in nli.items()
})

# 1500 for the other datasets to account for possible loss during validation
sample_size = 1500
zsnli = DatasetDict({
    key: dataset.shuffle(seed=1).select(range(min(sample_size, len(dataset))))
    for key, dataset in zsnli.items()
})

Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.


# SST2 Clean

In [181]:
# clean sst2
sst2 = zsnli['sst2'].to_pandas()

# add entailment hypotheses
sst2['hypothesis'] = "The sentiment of this text is positive"
sst2.loc[750:,'hypothesis'] = "The sentiment of this text is negative"

# add entailment labels
sst2['labels'] = 0

pos_index = sst2[sst2['label_text'] == 'positive'].index
pos_labels = [1 if 'negative' in text else 0 for text in sst2.loc[pos_index, 'hypothesis']]
sst2.loc[pos_index, 'labels'] = pos_labels

neg_index = sst2[sst2['label_text'] == 'negative'].index
neg_labels = [1 if 'positive' in text else 0 for text in sst2.loc[neg_index, 'hypothesis']]
sst2.loc[neg_index, 'labels'] = neg_labels

# add task name
sst2['task_name'] = 'sst2'

# add label text
sst2['label_text'] = ['entailment' if label == 0 else 'not_entailment' for label in sst2['labels']]

# drop original label column
sst2.drop('label', axis = 1, inplace = True)

# re-order
sst2 = sst2[['text', 'hypothesis', 'labels', 'task_name', 'label_text']]

zsnli['sst2'] = Dataset.from_pandas(sst2)

# Tweets Clean

In [182]:
# Clean the tweets topic data
tweets = zsnli['tweet_topic'].to_pandas()

hypoths = {'sports_&_gaming': 'This text is about sports and gaming', 
           'daily_life': 'This text is about daily life',
           'pop_culture': 'This text is about pop culture', 
           'science_&_technology': 'This text is about science and technology',
           'business_&_entreprenuers': 'This text is about business and entreprenuers', 
           'arts_&_culture': 'This text is about arts and culture'}

tweets['label_name'].replace(hypoths, inplace = True)
tweets['labels'] = 0
tweets.drop(['date', 'label', 'id'], axis = 1, inplace = True)
tweets['task_name'] = 'tweet_topic'
tweets['label_text'] = 'entailment'
tweets.rename({'label_name':'hypothesis'}, axis = 1, inplace = True)
zsnli['tweet_topic'] = Dataset.from_pandas(tweets)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  tweets['label_name'].replace(hypoths, inplace = True)


# Emotion Clean

In [183]:
emo = zsnli['go_emotions'].to_pandas()

emotion_columns = [col for col in emo.columns if col != 'text']

# Initialize lists to store the results
texts = []
hypotheses = []
labels = []
label_text = []

# Split the DataFrame into two halves
midpoint = len(emo) // 2
true_hypothesis_df = emo.iloc[:midpoint]
false_hypothesis_df = emo.iloc[midpoint:]

# Create true hypotheses
for _, row in true_hypothesis_df.iterrows():
    # Get the expressed emotions for this text
    expressed_emotions = [emotion for emotion in emotion_columns if row[emotion] == 1]
    if expressed_emotions:
        # Randomly select one expressed emotion
        selected_emotion = random.choice(expressed_emotions)
        texts.append(row['text'])
        hypotheses.append(f"This text expresses {selected_emotion}.")
        labels.append(0)
        label_text.append("entailment")

# Create false hypotheses
for _, row in false_hypothesis_df.iterrows():
    # Get the emotions not expressed in this text
    non_expressed_emotions = [emotion for emotion in emotion_columns if row[emotion] == 0]
    if non_expressed_emotions:
        # Randomly select one non-expressed emotion
        selected_emotion = random.choice(non_expressed_emotions)
        texts.append(row['text'])
        hypotheses.append(f"This text expresses {selected_emotion}.")
        labels.append(1)
        label_text.append("not_entailment")

# Create a new DataFrame with the entailment pairs and labels
emo = pd.DataFrame({
    "text": texts,
    "hypothesis": hypotheses,
    "labels": labels,
    'task_name': 'go_emotions',
    'label_text': label_text
})

emo['hypothesis'].replace({'This text expresses neutral.': 'This text does not express emotion.'}, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  emo['hypothesis'].replace({'This text expresses neutral.': 'This text does not express emotion.'}, inplace = True)


# Combine and Export

In [188]:
combined_df = pd.concat(
    [dataset.to_pandas() for dataset in zsnli.values()],
    ignore_index=True
)

In [197]:
new_hypoths = {
'The sentiment in this example rotten tomatoes movie review is positive': 'The sentiment of this review is positive.',
'The sentiment in this example rotten tomatoes movie review is negative': 'The sentiment of this review is negative.',
'The sentiment in this example amazon product review is positive': 'The sentiment of this review is positive.',
'The sentiment in this example amazon product review is negative': 'The sentiment of this review is negative.',
'This example question from the Yahoo Q&A forum is categorized in the topic: Computers & Internet': 'This text is about computers & internet.',
'This example question from the Yahoo Q&A forum is categorized in the topic: Sports': 'This text is about sports.',
'This example question from the Yahoo Q&A forum is categorized in the topic: Family & Relationships': 'This text is about family & relationships.',
'This example question from the Yahoo Q&A forum is categorized in the topic: Education & Reference': 'This text is about education and reference.',
'This example question from the Yahoo Q&A forum is categorized in the topic: Entertainment & Music': 'This text is about entertainment and music.',
'This example question from the Yahoo Q&A forum is categorized in the topic: Health': 'This text is about health.',
'This example question from the Yahoo Q&A forum is categorized in the topic: Society & Culture': 'This text is about society & culture.',
'This example question from the Yahoo Q&A forum is categorized in the topic: Science & Mathematics': 'This text is about science & mathematics.',
'This example question from the Yahoo Q&A forum is categorized in the topic: Politics & Government': 'This text is about politics & government.',
'This example question from the Yahoo Q&A forum is categorized in the topic: Business & Finance': 'This text is about business & finance.',
}

In [199]:
combined_df['hypothesis'].replace(new_hypoths, inplace = True)

In [203]:
combined_df.to_csv('../data/out_domain_bench.csv', index = False)

In [205]:
combined_nli = pd.concat(
    [dataset.to_pandas() for dataset in nli.values()],
    ignore_index=True
)

In [207]:
combined_nli.to_csv('../data/nli_bench.csv', index = False)

# Validation

In [19]:
# import and format hypotheses for prompt
zsnli = pd.read_csv('../data/out_domain_bench.csv')
zsnli['hypothesis'] = zsnli['hypothesis'].str.lower()
zsnli['hypothesis'] = zsnli['hypothesis'].str.replace('.', '')

In [25]:
api_key = open('../openAI_key.txt', 'r').read()
from openai import OpenAI
client = OpenAI(api_key = api_key)

In [20]:
system_message = "You are a text classifier and are only allowed to respond with 0 or 1."

user_message = """You are a classifier that determines if {hypothesis}. If it is true that {hypothesis}, return 0. If it is not true that {hypothesis} or it cannot be determined, return 1. Do not explain your reasoning. Only respond with 0 or 1.

Here is the document:\n{text}
"""

In [21]:
# create dictionary of requests
data = zsnli
messages_batch = [
    {
        'custom_id': str(i),
        'method': 'POST',
        'url': '/v1/chat/completions',
        'body':{
        'model': 'gpt-4o-2024-11-20',
        'messages': [
            {'role': 'system', 'content': system_message},
            {'role': 'user', 'content': user_message.format(text=data.loc[i, 'text'], hypothesis = data.loc[i, 'hypothesis'])}
        ],
        'max_tokens': 1,
        'temperature': 0,
        'logit_bias': {16: 100, 15: 100}
    }
    }
    for i in data.index
]

In [24]:
# export as .jsonl file
with open('../data/out_domain_bench.jsonl', 'w') as outfile:
    for entry in messages_batch:
        json.dump(entry, outfile)
        outfile.write('\n')

In [26]:
client.files.create(
  file=open('../data/out_domain_bench.jsonl', "rb"),
  purpose="batch"
)

FileObject(id='file-PhDbCmoUi99YZcR1MENAQy', bytes=10747287, created_at=1735597417, filename='out_domain_bench.jsonl', object='file', purpose='batch', status='processed', status_details=None)

In [27]:
client.batches.create(
  input_file_id="file-PhDbCmoUi99YZcR1MENAQy",
  endpoint="/v1/chat/completions",
  completion_window="24h"
)

Batch(id='batch_67731d923cb8819091c0af22a16f994e', completion_window='24h', created_at=1735597458, endpoint='/v1/chat/completions', input_file_id='file-PhDbCmoUi99YZcR1MENAQy', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1735683858, failed_at=None, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [None]:
# read in the completed batch
res = pd.read_json(path_or_buf='../data/OOS_batch_output.jsonl', lines=True)
labels = [res['response'][i]['body']['choices'][0]['message']['content'] for i in res.index]

zsnli['validated_labels'] = labels

zsnli['validated_labels'] = [int(label) for label in zsnli['validated_labels']]

validated = zsnli[zsnli['labels'] == zsnli['validated_labels']]

validated['validation_source'] = 'gpt-4o-2024-11-20'

In [60]:
validated.to_csv('../data/out_domain_bench.csv', index = False)