# Setup

## Load Packages

In [1]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import openai
import re
from helper_funcs import prepare_examples, create_query
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment;
# the OpenAI key is read from there later via os.getenv("OPENAI_API_KEY").
load_dotenv()

True

## Prepare Data

In [2]:
# The e-SNLI training split comes as two CSV parts; read both and stack them.
train1, train2 = (
    pd.read_csv(part_path)
    for part_path in (
        '../Input_Data/e-SNLI/dataset/esnli_train_1.csv',
        '../Input_Data/e-SNLI/dataset/esnli_train_2.csv',
    )
)
train = pd.concat([train1, train2])

# Development and test splits are one file each.
dev, test = (
    pd.read_csv(split_path)
    for split_path in (
        '../Input_Data/e-SNLI/dataset/esnli_dev.csv',
        '../Input_Data/e-SNLI/dataset/esnli_test.csv',
    )
)

In [6]:
# Discard training rows that lack either sentence or the first explanation,
# then renumber the index from 0 (the concatenated parts carried their own indices).
required_columns = ['Sentence1', 'Sentence2', 'Explanation_1']
train = train.dropna(subset=required_columns).reset_index(drop=True)

In [7]:
n_dev = 20 # number of dev data points
SEED = 12345 # one seed for every prompt variant so all variants see the same random draws


def _build_prompts(size_per_class, include_label):
    """Build one few-shot prompt per sampled dev example.

    Both the numpy and the stdlib RNG are re-seeded with SEED before anything is
    drawn, so every call samples the same dev rows and starts the example
    selection from the same RNG state (prepare_examples presumably draws its
    examples from one of these RNGs -- TODO confirm in helper_funcs).

    Parameters
    ----------
    size_per_class : int
        Forwarded to prepare_examples as ``size_per_class``.
    include_label : bool
        Forwarded to both prepare_examples and create_query.

    Returns
    -------
    (list, list of str)
        The sampled dev index labels and the prompts, in the same order.
    """
    np.random.seed(SEED) # seed for numpy package
    random.seed(SEED) # seed for random package
    sampled_indices = list(np.random.choice(dev.index.values, size=n_dev, replace=False))
    prompts = [
        prepare_examples(train, size_per_class=size_per_class, include_label=include_label)
        + '\n\n'
        + create_query(dev.loc[i], include_label=include_label)
        for i in sampled_indices
    ]
    return sampled_indices, prompts


# Each call re-seeds, so dev_indices is identical after every line below.
dev_indices, prompts_with_label_size_one = _build_prompts(size_per_class=1, include_label=True)
dev_indices, prompts_without_label_size_one = _build_prompts(size_per_class=1, include_label=False)
dev_indices, prompts_with_label_size_four = _build_prompts(size_per_class=4, include_label=True)
dev_indices, prompts_without_label_size_four = _build_prompts(size_per_class=4, include_label=False)

In [8]:
# Start from the gold explanation and gold label of the sampled dev rows
# (the frame keeps the dev index labels), then attach one column per prompt variant.
gold_columns = {
    'Explanation_1': 'gold_standard_explanation',
    'gold_label': 'gold_standard_label',
}
dev_prepared = dev.loc[dev_indices, list(gold_columns)].rename(columns=gold_columns)

# The prompt lists were built in dev_indices order, so they line up row by row.
for prompt_column, prompt_list in (
    ('prompts_with_label_size_one', prompts_with_label_size_one),
    ('prompts_without_label_size_one', prompts_without_label_size_one),
    ('prompts_with_label_size_four', prompts_with_label_size_four),
    ('prompts_without_label_size_four', prompts_without_label_size_four),
):
    dev_prepared[prompt_column] = prompt_list

In [9]:
# Preview the first rows: gold explanation/label plus the four prompt variants.
dev_prepared.head()

Unnamed: 0,gold_standard_explanation,gold_standard_label,prompts_with_label_size_one,prompts_without_label_size_one,prompts_with_label_size_four,prompts_without_label_size_four
9594,Just because a lady is opening a present doesn...,neutral,Statement: People walking under a covered arch...,Statement: People walking under a covered arch...,Statement: A boy on a scooter.\nStatement: The...,Statement: A boy on a scooter.\nStatement: The...
3300,Does skateboard tricks implies on a skateboard.,entailment,Statement: Little girl in pink coat on a swing...,Statement: Little girl in pink coat on a swing...,Statement: Two people are having a conversatio...,Statement: Two people are having a conversatio...
7520,A man observes must be examining.,entailment,Statement: An artist in a black sweater is scu...,Statement: An artist in a black sweater is scu...,Statement: A little blond-haired girls peers a...,Statement: A little blond-haired girls peers a...
840,people are men and are friends,neutral,Statement: A man with a dark green apron stand...,Statement: A man with a dark green apron stand...,Statement: A man in a white baseball cap is si...,Statement: A man in a white baseball cap is si...
5854,A group of construction workers bears no resem...,contradiction,Statement: A man in a white shirt and glasses ...,Statement: A man in a white shirt and glasses ...,"Statement: A man stands at a microphone, readi...","Statement: A man stands at a microphone, readi..."


In [14]:
# Print one full prompt (row position 10, four examples per class, with labels)
# so its multi-line layout can be inspected.
print(dev_prepared.prompts_with_label_size_four.iloc[10])

Statement: A man in a black t-shirt is standing next to a parking meter.
Statement: A man is outside standing next to a parking meter
Label: entailment
Explanation: The meter is a parking meter.

Statement: The motocross rider jumps high in the air on his bike.
Statement: The motorcross rider rides down the hill.
Label: contradiction
Explanation: If he is high in the air, then he is not riding down a hill.

Statement: two men waiting in line one wearing black suit.
Statement: a bird was on rocks
Label: entailment
Explanation: A bird is black and can also wait in line.

Statement: A worker stands near his truck, guarding the manhole being worked on by his assistant from passing cars.
Statement: The worker is employed by the water and sewer authority.
Label: neutral
Explanation: It is not apparent he's employed by the water and sewer authority.

Statement: Three girls and one man are loitering outside of a building next to a concrete structure.
Statement: Four people are are standing out

# Find Hyperparameters

In [237]:
# NOTE(review): `transformers` is not imported in the visible import cell; this cell
# presumably relied on an import from a cell that no longer exists -- confirm and
# add it to the import cell so the notebook survives Restart & Run All.
tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')

# Number of GPT-2 tokens in every training explanation. Iterating the column directly
# yields the same values, in the same order, as indexing with train.iloc[i], but
# avoids materialising a full row object for each training example.
explanation_token_length = [len(tokenizer.tokenize(explanation)) for explanation in train.Explanation_1]

In [238]:
# 99.9th percentile of the explanation lengths in tokens; the displayed result (58.0)
# matches the max_tokens=58 used for the completion requests further down.
np.quantile(explanation_token_length, 0.999)

58.0

In [239]:
# Take the API key from the environment instead of hard-coding it.
# os.getenv returns None when OPENAI_API_KEY is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [240]:
def _extract_explanation(completion_text):
    """Return the explanation contained in one completion text.

    If the text contains an 'Explanation:' line, that line is returned with the
    'Explanation: ' prefix removed; otherwise the whole text is returned stripped.
    The decision is made per completion, so one completion without the marker no
    longer pushes every other completion of the batch onto the raw-text fallback.
    """
    match = re.search('Explanation:.+', completion_text)
    if match is None:
        return completion_text.strip()
    return match.group(0).replace('Explanation: ', '')


# One column of predictions per (prompt variant, temperature) combination.
results = pd.DataFrame()
for col_name in dev_prepared.columns.values[2:]:  # skip the two gold-standard columns
    for temp in [0, 0.5, 1]:
        # All prompts of this variant are sent as one batched request.
        response = openai.Completion.create(
            engine='text-davinci-003',
            prompt=list(dev_prepared[col_name]),
            temperature=temp,
            max_tokens=58,  # 99.9th percentile of training explanation token lengths
            top_p=1,
        )
        # For batched prompts each choice carries the position of its prompt in
        # `index`; sort on it so predictions stay aligned with the dev rows.
        ordered_choices = sorted(response.choices, key=lambda choice: choice['index'])
        preds = [_extract_explanation(choice.text) for choice in ordered_choices]
        results['pred_temperature_' + str(temp) + '_' + col_name] = preds

In [267]:
# Preview the predictions: one column per temperature / prompt-variant combination.
results.head()

Unnamed: 0,pred_temperature_0_prompts_with_label_size_one,pred_temperature_0.5_prompts_with_label_size_one,pred_temperature_1_prompts_with_label_size_one,pred_temperature_0_prompts_without_label_size_one,pred_temperature_0.5_prompts_without_label_size_one,pred_temperature_1_prompts_without_label_size_one,pred_temperature_0_prompts_with_label_size_four,pred_temperature_0.5_prompts_with_label_size_four,pred_temperature_1_prompts_with_label_size_four,pred_temperature_0_prompts_without_label_size_four,pred_temperature_0.5_prompts_without_label_size_four,pred_temperature_1_prompts_without_label_size_four
0,The lady opening the present does not necessar...,There is no indication that it is the lady's b...,The statement does not necessarily have to be ...,Opening a present does not necessarily mean it...,Opening a present does not necessarily mean it...,Opening a present does not necessarily mean it...,We cannot infer that it is the lady's birthday...,We cannot infer that it is the lady's birthday...,The lady could be receiving a present for any ...,We cannot infer that it is the lady's birthday...,We cannot infer that it is the lady's birthday...,The lady opening the present does not necessar...
1,The man with dreadlocks is on a skateboard in ...,The man with dreadlocks is on a skateboard in ...,Skateboarding in the middle of the street impl...,The man with dreadlocks may be skateboarding i...,The man with dreadlocks may or may not be doin...,The statement does not specify that the man is...,Doing skateboard tricks implies that the man i...,Skateboard tricks implies that the man is doin...,Doing skateboard tricks implies that the man w...,"Not all skateboarders do tricks, so this canno...",The statement does not imply that the man is d...,Not all people skateboarding do skateboard tri...
2,Observing and examining are similar actions.,Observing and examining are both forms of look...,The statement is worded differently but convey...,Observing and examining are similar actions.,Observing and examining are both similar actions.,Observes and examines are similar and the stat...,Observing and examining are synonymous.,Observing and examining are synonymous.,Observing a wavelength given off by an electro...,Observing and examining are similar actions.,Observing and examining are synonymous in this...,Observing implies examining.
3,There is no evidence to suggest that the two p...,The statement does not provide any information...,The statement does not provide any information...,There is no evidence to suggest that the two p...,The statement does not provide any information...,There is no evidence to suggest that the two p...,Just because two people are working constructi...,Just because two people are working together i...,Just because two people work together on const...,Just because two people are working together d...,Just because two people are working together i...,Just because two people work together does not...
4,CONSTRUCTION WORKERS ARE NOT FLIES AND ARE NOT...,CONSTRUCTION WORKERS ARE NOT FLIES AND ARE NOT...,CONSTRUCTION WORKERS IS NOT FLIES OR SHARK.,CONSTRUCTION WORKERS ARE NOT FLIES AND THEY AR...,CONSTRUCTION WORKERS ARE NOT FLIES EATING SHARK,CONSTRUCTION WORKERS ARE NOT FLIES AND NOT EAT...,Construction workers are not flies and they ar...,Construction workers are not flies and flies a...,Construction workers are not flies and Sharks ...,Construction workers are not flies and they ar...,Construction workers are not flies and flies a...,Construction workers have nothing to do with f...


In [266]:
# Persist all predictions as a semicolon-separated CSV.
# NOTE(review): the file name is spelled 'predsictions.csv' (probably meant
# 'predictions.csv'); left unchanged because other code may read this exact path --
# confirm before renaming.
results.to_csv('output_data/predsictions.csv', sep=';')

In [260]:
# NOTE(review): `score` is not imported in the visible import cell; its call signature
# and the roberta-large messages in the output suggest bert_score.score -- confirm and
# import it together with the other packages.
gold_explanations = list(dev_prepared['gold_standard_explanation'])
n_gold = len(gold_explanations)
setting_names = list(results.columns.values)

# Score every setting in a single call. Calling score() once per column re-initialises
# the underlying model for each column (visible as repeated load warnings in the output);
# one call over the concatenated predictions yields the same pairwise F1 values
# (up to floating-point noise) with a single model load.
all_predictions = [pred for setting in setting_names for pred in results[setting]]
P, R, F1 = score(all_predictions, gold_explanations * len(setting_names), lang="en", verbose=False)

# Mean F1 per setting: every setting owns a consecutive slice of n_gold scores.
F1_means = {
    setting: F1[k * n_gold:(k + 1) * n_gold].mean().item()
    for k, setting in enumerate(setting_names)
}
print(F1_means)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaM

{'pred_temperature_0_prompts_with_label_size_one': 0.8974002599716187, 'pred_temperature_0.5_prompts_with_label_size_one': 0.8969234228134155, 'pred_temperature_1_prompts_with_label_size_one': 0.888841986656189, 'pred_temperature_0_prompts_without_label_size_one': 0.8909212946891785, 'pred_temperature_0.5_prompts_without_label_size_one': 0.8894920349121094, 'pred_temperature_1_prompts_without_label_size_one': 0.8887431025505066, 'pred_temperature_0_prompts_with_label_size_four': 0.9044081568717957, 'pred_temperature_0.5_prompts_with_label_size_four': 0.9054492712020874, 'pred_temperature_1_prompts_with_label_size_four': 0.8984190225601196, 'pred_temperature_0_prompts_without_label_size_four': 0.9031659960746765, 'pred_temperature_0.5_prompts_without_label_size_four': 0.8955985903739929, 'pred_temperature_1_prompts_without_label_size_four': 0.8973878622055054}


In [261]:
# Display the mean F1 per setting as a formatted dict for easier comparison.
F1_means

{'pred_temperature_0_prompts_with_label_size_one': 0.8974002599716187,
 'pred_temperature_0.5_prompts_with_label_size_one': 0.8969234228134155,
 'pred_temperature_1_prompts_with_label_size_one': 0.888841986656189,
 'pred_temperature_0_prompts_without_label_size_one': 0.8909212946891785,
 'pred_temperature_0.5_prompts_without_label_size_one': 0.8894920349121094,
 'pred_temperature_1_prompts_without_label_size_one': 0.8887431025505066,
 'pred_temperature_0_prompts_with_label_size_four': 0.9044081568717957,
 'pred_temperature_0.5_prompts_with_label_size_four': 0.9054492712020874,
 'pred_temperature_1_prompts_with_label_size_four': 0.8984190225601196,
 'pred_temperature_0_prompts_without_label_size_four': 0.9031659960746765,
 'pred_temperature_0.5_prompts_without_label_size_four': 0.8955985903739929,
 'pred_temperature_1_prompts_without_label_size_four': 0.8973878622055054}