In [1]:
import language_tool_python
import pandas as pd
from spellchecker import SpellChecker

# Initialize the language tool for English and spellchecker
tool = language_tool_python.LanguageTool('en-US')
spell = SpellChecker()

def correct_text(text):
    # Correct grammar using language_tool_python
    matches = tool.check(text)
    corrected_text = tool.correct(text)

    # Correct spelling using pyspellchecker
    words = corrected_text.split()
    misspelled = spell.unknown(words)
    for word in misspelled:
        corrected_word = spell.correction(word)
        # Only replace if a correction is found
        if corrected_word:
            corrected_text = corrected_text.replace(word, corrected_word, 1)

    return corrected_text

# Example usage with a DataFrame
'''
data = {
    'premise': ["This is an example premise."],
    'hypothesis': ["This is a hypothesis."],
    'machine_explanation': ["This is teh explanation for the premise and hypothesis."],
    'label': ['entailment']
}

df = pd.DataFrame(data)
df_spelling = df.copy()
df_spelling['corrected_explanation'] = df_spelling['machine_explanation'].apply(correct_text)
df_spelling.head()
'''

Downloading LanguageTool 6.4: 100%|██████████| 246M/246M [00:18<00:00, 13.3MB/s] 
Unzipping /var/folders/ry/pyw8c_113gv5wnpxnq6tw6380000gn/T/tmppp89879e.zip to /Users/javier/.cache/language_tool_python.
Downloaded https://www.languagetool.org/download/LanguageTool-6.4.zip to /Users/javier/.cache/language_tool_python.


'\ndata = {\n    \'premise\': ["This is an example premise."],\n    \'hypothesis\': ["This is a hypothesis."],\n    \'machine_explanation\': ["This is teh explanation for the premise and hypothesis."],\n    \'label\': [\'entailment\']\n}\n\ndf = pd.DataFrame(data)\ndf_spelling = df.copy()\ndf_spelling[\'corrected_explanation\'] = df_spelling[\'machine_explanation\'].apply(correct_text)\ndf_spelling.head()\n'

In [51]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

def check_semantic_similarity(text1, text2):
    """Calculate semantic similarity between two texts."""
    embeddings1 = sbert_model.encode(text1, convert_to_tensor=True)
    embeddings2 = sbert_model.encode(text2, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(embeddings1, embeddings2)
    return similarity.item()

def process_explanations(row):
    """Process each row of the DataFrame."""
    premise = row['premise']
    hypothesis = row['hypothesis']
    explanation = row['machine_explanation']

    # Combine premise and hypothesis for a full context comparison
    full_context = premise + " " + hypothesis

    # Calculate similarity
    similarity_score = check_semantic_similarity(explanation, full_context)

    return pd.Series({
        'similarity_score': similarity_score,
        'processed_explanation': explanation  # Placeholder for any additional processing
    })

In [7]:
from nltk import word_tokenize, sent_tokenize

# calculate edit distance between sentences
def edit_distance(s1, s2):
    s1 = [word.lower() for word in word_tokenize(s1)]
    s2 = [word.lower() for word in word_tokenize(s2)]
    m = len(s1)
    n = len(s2)
    dp = [[0 for x in range(n+1)] for x in range(m+1)]
    for i in range(m+1):
        for j in range(n+1):
            if i == 0:
                dp[i][j] = j
            elif j == 0:
                dp[i][j] = i
            elif s1[i-1] == s2[j-1]:
                dp[i][j] = dp[i-1][j-1]
            else:
                dp[i][j] = min(dp[i][j-1] + 1,        # Insert
                            dp[i-1][j] + 1,        # Remove
                            dp[i-1][j-1] + 2)      # Replace
    return dp[m][n]

count = 0
def remove_redundancy(text, max_edit_distance):
    sentences = sent_tokenize(text)
    n = len(sentences)
    new_sentence = []
    for i in range(n):
        if i == n - 1:
            new_sentence.append(sentences[i]) # add last sentence (already compared with previous sentence last iteration)
            break

        if edit_distance(sentences[i], sentences[i+1]) < max_edit_distance:
            global count
            count += 1
            # print(text)
            continue

        new_sentence.append(sentences[i])

    return " ".join(new_sentence)

# Test

In [8]:
import json

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from safetensors.torch import load_model, save_model

from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
from sklearn.metrics import f1_score, classification_report

import matplotlib.pyplot as plt

In [9]:
label_to_id = {"entailment": 0, "neutral": 1, "contradiction": 2}
id_to_label = {v: k for k, v in label_to_id.items()}

def renameColumns(df):
    return df.rename(columns={'Sentence1': 'premise', 'Sentence2': 'hypothesis', 'Explanation_1': 'explanation'})

def filterNan(df):
    return df.dropna()

def convert_to_tensors(df):
    return torch.tensor(df.values)

def encode_labels(df):
    return df.apply(lambda x: int(label_to_id[x]))

template = "Given that {}, it is hypothesized that {}. {}."

def tokenize(df, tokenizer):
    tokenized_batch = []
    for _, row in df.iterrows():
        premise = row['premise'].lower()
        if premise[-1] in ['.', '!', '?']:
            premise = premise[:-1]
        hypothesis = row['hypothesis'].lower()
        if hypothesis[-1] in ['.', '!', '?']:
            hypothesis = hypothesis[:-1]
        explanation = row['explanation'].lower()
        if explanation[-1] in ['.', '!', '?']:
            explanation = explanation[:-1]


        encoded_dict = tokenizer.encode_plus(
            text = template.format(premise, hypothesis, explanation),
            padding=True,
            return_tensors='pt',
        )
        tokenized_batch.append(encoded_dict)
    return tokenized_batch

def calc_f1_score(predicted_classes, actual_labels):
    return f1_score(predicted_classes, actual_labels, average='weighted'), f1_score(predicted_classes, actual_labels, average='micro'), f1_score(predicted_classes, actual_labels, average='macro')

In [11]:
premise_template = 'Given that {}, it is hypothesized that {}.'
explanation_template = 'This is {} because {}.'

class eSNLIDataset(Dataset):
    def __init__(self, df, tokenizer, train=True):
        self.df = df
        self.train = train
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        example = self.df.iloc[idx,:]
        premise = example["premise"]
        hypothesis = example["hypothesis"]
        explanation = example["explanation"]

        if premise[-1] in ['.', '!', '?']:
            premise = premise[:-1]
        if hypothesis[-1] in ['.', '!', '?']:
            hypothesis = hypothesis[:-1]
        if explanation[-1] in ['.', '!', '?']:
            explanation = explanation[:-1]

        premise = premise_template.format(premise, hypothesis)
        explanation = explanation_template.format(self.tokenizer.mask_token, explanation)

        if self.train:
            label = example["gold_label"]
            return premise, explanation, label

        return premise, explanation

In [109]:
df_test = pd.read_csv("../data/raw/gpt_output.csv")
df_test_renamed = renameColumns(df_test)
df_test_cleaned = filterNan(df_test_renamed)
df_test_cleaned.loc[:, "gold_label"] = encode_labels(df_test_cleaned["gold_label"])
df_test_final = df_test_cleaned.loc[:, ["gold_label", "premise", "hypothesis", "machine_explanation"]]

In [110]:
df_test_final

Unnamed: 0,gold_label,premise,hypothesis,machine_explanation
0,2,An older man sits with his orange juice at a s...,A boy flips a burger.,An older man in a coffee shop and a boy flippi...
1,1,An older man sits with his orange juice at a s...,An elderly man sits in a small shop.,The detailed setting of a coffee shop and othe...
2,1,Two blond women are hugging one another.,Some women are hugging on vacation.,The specifics of the women being blond and the...
3,2,Two blond women are hugging one another.,The women are sleeping.,Women hugging and women sleeping are mutually ...
4,0,Two blond women are hugging one another.,There are women showing affection.,Women showing affection directly supports the ...
...,...,...,...,...
435,2,Asian students wearing blazers sit at differen...,Libriarians are shelving books.,Asian students studying and librarians shelvin...
436,1,A group of people are standing on sidewalk beh...,A group of people are walking some place,Standing behind a barrier and walking are dist...
437,0,A group of people are standing on sidewalk beh...,A group of people are standing outside,Standing outside encompasses various scenarios...
438,2,A group of people are standing on sidewalk beh...,A group of dogs are running,A group of people and a group of dogs describe...


In [111]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

# change the model's classifier to a smaller one
dense = nn.Linear(768, 256)
out = nn.Linear(256, 3)
model.classifier.dense = dense
model.classifier.out_proj = out

# freeze all the parameters in the base model
for name, param in model.named_parameters():
    param.requires_grad = False

# only train the classification head
for param in model.classifier.parameters():
    param.requires_grad = True

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [112]:
load_model(model, "model.safetensors")

# predict after training
model.to(device)
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

## Test without PP

In [68]:
df_test_no_pp = df_test_final.copy()
df_test_no_pp.rename(columns={'machine_explanation': 'explanation'}, inplace=True)

In [69]:
test_dataset = eSNLIDataset(df_test_no_pp, tokenizer, train=False)

In [71]:
predictions = []
actual_labels = df_test_final['gold_label'].astype("int")

with torch.no_grad():
    for batch in test_dataset:
        premise, hypothesis = batch

        encoded_input = tokenizer(premise, hypothesis, return_tensors="pt",padding=True, truncation=True).to(device)

        outputs = model(**encoded_input)
        logits = outputs.logits.cpu()

        predicted_classes = torch.argmax(logits, dim=1)
        predicted_classes = [pred.item() for pred in predicted_classes]
        predictions.extend(predicted_classes)

print(classification_report(actual_labels, predictions, target_names=list(label_to_id.keys())))
print(calc_f1_score(predictions, actual_labels))

               precision    recall  f1-score   support

   entailment       0.50      0.71      0.59       174
      neutral       0.35      0.19      0.25       162
contradiction       0.56      0.57      0.56       164

     accuracy                           0.49       500
    macro avg       0.47      0.49      0.47       500
 weighted avg       0.47      0.49      0.47       500

(0.519018764804707, 0.494, 0.4656819021936371)


## Spell checking and grammar

In [88]:
df_test_spelling = df_test_final.copy()
df_test_spelling['explanation'] = df_test_spelling['machine_explanation'].apply(correct_text)

In [89]:
test_dataset = eSNLIDataset(df_test_spelling, tokenizer, train=False)

In [90]:
predictions = []
actual_labels = df_test_final['gold_label'].astype("int")

with torch.no_grad():
    for batch in test_dataset:
        premise, hypothesis = batch

        encoded_input = tokenizer(premise, hypothesis, return_tensors="pt",padding=True, truncation=True).to(device)

        outputs = model(**encoded_input)
        logits = outputs.logits.cpu()

        predicted_classes = torch.argmax(logits, dim=1)
        predicted_classes = [pred.item() for pred in predicted_classes]
        predictions.extend(predicted_classes)

print(classification_report(actual_labels, predictions, target_names=list(label_to_id.keys())))
print(calc_f1_score(predictions, actual_labels))

               precision    recall  f1-score   support

   entailment       0.42      0.07      0.13       175
      neutral       0.32      0.82      0.46       158
contradiction       0.41      0.16      0.23       163

     accuracy                           0.34       496
    macro avg       0.38      0.35      0.27       496
 weighted avg       0.38      0.34      0.27       496

(0.41058535763646825, 0.3387096774193548, 0.27227564787977093)


In [91]:
df_test_spelling.to_csv("../data/v2/PP/output_gen_test(0-500)_spelling.csv", index=False)

## Semantic similarity checking

In [113]:
df_test_semantics = df_test_final.copy()
df_test_semantics.loc[:, ['similarity_score', 'processed_explanation']] = df_test_semantics.apply(process_explanations, axis=1)
df_test_semantics.rename(columns={'processed_explanation': 'explanation'}, inplace=True)

In [114]:
test_dataset = eSNLIDataset(df_test_semantics, tokenizer, train=False)

In [116]:
df_test_semantics

Unnamed: 0,gold_label,premise,hypothesis,machine_explanation,similarity_score,explanation
0,2,An older man sits with his orange juice at a s...,A boy flips a burger.,An older man in a coffee shop and a boy flippi...,0.440353,An older man in a coffee shop and a boy flippi...
1,1,An older man sits with his orange juice at a s...,An elderly man sits in a small shop.,The detailed setting of a coffee shop and othe...,0.430189,The detailed setting of a coffee shop and othe...
2,1,Two blond women are hugging one another.,Some women are hugging on vacation.,The specifics of the women being blond and the...,0.694433,The specifics of the women being blond and the...
3,2,Two blond women are hugging one another.,The women are sleeping.,Women hugging and women sleeping are mutually ...,0.587220,Women hugging and women sleeping are mutually ...
4,0,Two blond women are hugging one another.,There are women showing affection.,Women showing affection directly supports the ...,0.748105,Women showing affection directly supports the ...
...,...,...,...,...,...,...
435,2,Asian students wearing blazers sit at differen...,Libriarians are shelving books.,Asian students studying and librarians shelvin...,0.630914,Asian students studying and librarians shelvin...
436,1,A group of people are standing on sidewalk beh...,A group of people are walking some place,Standing behind a barrier and walking are dist...,0.548094,Standing behind a barrier and walking are dist...
437,0,A group of people are standing on sidewalk beh...,A group of people are standing outside,Standing outside encompasses various scenarios...,0.529034,Standing outside encompasses various scenarios...
438,2,A group of people are standing on sidewalk beh...,A group of dogs are running,A group of people and a group of dogs describe...,0.315518,A group of people and a group of dogs describe...


In [115]:
predictions = []
actual_labels = df_test_final['gold_label'].astype("int")

with torch.no_grad():
    for batch in test_dataset:
        premise, hypothesis = batch

        encoded_input = tokenizer(premise, hypothesis, return_tensors="pt",padding=True, truncation=True).to(device)

        outputs = model(**encoded_input)
        logits = outputs.logits.cpu()

        predicted_classes = torch.argmax(logits, dim=1)
        predicted_classes = [pred.item() for pred in predicted_classes]
        predictions.extend(predicted_classes)

print(classification_report(actual_labels, predictions, target_names=list(label_to_id.keys())))
print(calc_f1_score(predictions, actual_labels))

               precision    recall  f1-score   support

   entailment       0.92      0.92      0.92       146
      neutral       0.78      0.96      0.86       148
contradiction       0.96      0.73      0.83       146

     accuracy                           0.87       440
    macro avg       0.88      0.87      0.87       440
 weighted avg       0.88      0.87      0.87       440

(0.8716580178102378, 0.8704545454545455, 0.8692905480417427)


In [117]:
df_test_semantics.to_csv("../data/gpt4/gpt_output_semantics.csv", index=False)

## Redundancy removal

In [96]:
df_test_redundant = df_test_final.copy()
df_test_redundant["explanation"] = df_test_redundant["machine_explanation"].apply(lambda x: remove_redundancy(x, 5))

In [97]:
test_dataset = eSNLIDataset(df_test_redundant, tokenizer, train=False)

In [98]:
predictions = []

actual_labels = df_test_final['gold_label'].astype("int")

with torch.no_grad():
    for batch in test_dataset:
        premise, hypothesis = batch

        encoded_input = tokenizer(premise, hypothesis, return_tensors="pt",padding=True, truncation=True).to(device)

        outputs = model(**encoded_input)
        logits = outputs.logits.cpu()

        predicted_classes = torch.argmax(logits, dim=1)
        predicted_classes = [pred.item() for pred in predicted_classes]
        predictions.extend(predicted_classes)

print(classification_report(actual_labels, predictions, target_names=list(label_to_id.keys())))
print(calc_f1_score(predictions, actual_labels))

               precision    recall  f1-score   support

   entailment       0.40      0.10      0.16       175
      neutral       0.32      0.68      0.43       158
contradiction       0.40      0.28      0.33       163

     accuracy                           0.34       496
    macro avg       0.37      0.35      0.31       496
 weighted avg       0.37      0.34      0.30       496

(0.3830362674581212, 0.34274193548387094, 0.30716419730504235)


In [99]:
df_test_redundant.to_csv("../data/v2/PP/output_gen_test(0-500)_redundant.csv", index=False)

# Test

In [119]:
df = pd.read_csv("../data/v3/PP/output_gen_test(0-500)_semantics.csv")
df

Unnamed: 0,gold_label,premise,hypothesis,machine_explanation,similarity_score,explanation
0,1,this church choir sings to the masses as they ...,the church has cracks in the ceiling,the roof is falling down.The church was built...,0.464577,the roof is falling down.The church was built...
1,0,this church choir sings to the masses as they ...,the church is filled with song,the choir is singing joyously.The church has ...,0.762740,the choir is singing joyously.The church has ...
2,2,this church choir sings to the masses as they ...,a choir singing at a baseball game,the church is singing to a different choir.Th...,0.563366,the church is singing to a different choir.Th...
3,1,a woman with a green headscarf blue shirt and ...,the woman is young,she is a girl.The woman in the picture is the...,0.314043,she is a girl.The woman in the picture is the...
4,0,a woman with a green headscarf blue shirt and ...,the woman is very happy,she is a Muslim.The woman's face is covered b...,0.389600,she is a Muslim.The woman's face is covered b...
...,...,...,...,...,...,...
495,2,many children play in the water,the children are playing mini golf,they are not playing in a pool.The children's...,0.436987,they are not playing in a pool.The children's...
496,2,a group of kids is splashing in deep water nea...,the kids are singing in a choir,they are not singing to the tune of the song....,0.457720,they are not singing to the tune of the song....
497,0,a group of kids is splashing in deep water nea...,the kids are in deep water,they are swimming in the water.The kids were ...,0.450511,they are swimming in the water.The kids were ...
498,1,a group of kids is splashing in deep water nea...,they are wearing lifejackets,they have been swimming in the water for a lo...,0.532402,they have been swimming in the water for a lo...


In [125]:
# get row with highest similarity score
test_df = df.sort_values(by="similarity_score", ascending=False).head()
test_df

Unnamed: 0,gold_label,premise,hypothesis,machine_explanation,similarity_score,explanation
113,0,a crowd of people looking up at 3 people on th...,the crowd on the ground is watching 3 people o...,the people in the building are looking down a...,0.876279,the people in the building are looking down a...
253,1,a dog standing near snow looking at water,the dog is thinking about going to for a swim,of the snow.The dog's behavior is related wit...,0.774791,of the snow.The dog's behavior is related wit...
1,0,this church choir sings to the masses as they ...,the church is filled with song,the choir is singing joyously.The church has ...,0.76274,the choir is singing joyously.The church has ...
151,1,a woman is painting a mural of a woman's face,there is a woman painting for fun,she is doing it for her own enjoyment.The wom...,0.761445,she is doing it for her own enjoyment.The wom...
190,1,a group of people dancing together,they are doing the tango,they have a group.The tangos are not a dance....,0.761111,they have a group.The tangos are not a dance....


In [131]:
test_dataset = eSNLIDataset(test_df, tokenizer, train=False)

In [130]:
predictions = []

actual_labels = test_df['gold_label'].astype("int")

with torch.no_grad():
    for batch in test_dataset:
        premise, hypothesis = batch

        encoded_input = tokenizer(premise, hypothesis, return_tensors="pt",padding=True, truncation=True).to(device)

        outputs = model(**encoded_input)
        logits = outputs.logits.cpu()

        predicted_classes = torch.argmax(logits, dim=1)
        predicted_classes = [pred.item() for pred in predicted_classes]
        predictions.extend(predicted_classes)

print(classification_report(actual_labels, predictions, target_names=list(label_to_id.keys())))
print(calc_f1_score(predictions, actual_labels))

ValueError: Found input variables with inconsistent numbers of samples: [5, 500]

In [None]:
test_df_with_pred = pd.concat((test_df, pd.Series(predictions, name="predicted_label")), axis=1)
test_df_with_pred.head()