# Setup

In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
import locale


def getpreferredencoding(do_setlocale=True):
    """Stand-in for ``locale.getpreferredencoding`` that always answers UTF-8.

    ``do_setlocale`` exists only so the signature matches the function being
    replaced; its value is ignored.
    """
    return "UTF-8"


# Monkey-patch the locale module so every later lookup reports UTF-8.
# NOTE(review): presumably a workaround for `!` shell commands failing in
# Colab when the preferred encoding is not UTF-8 — confirm.
locale.getpreferredencoding = getpreferredencoding

In [None]:
# Install the notebook's dependencies. transformers and tensorflow are pinned
# to specific releases; statsmodels and datasets are left unpinned.
!pip install transformers==4.22.2

!pip install statsmodels

!pip install datasets

# -U upgrades any TensorFlow already present in the runtime to the 2.10 release.
!pip install -U tensorflow==2.10 

# Print the GPU status of the current runtime.
!nvidia-smi

In [None]:
# main libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from collections import defaultdict
from tqdm.autonotebook import tqdm
import spacy
import re
import statsmodels
import statsmodels.api as sm
import scipy

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, mean_absolute_percentage_error, r2_score, jaccard_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# specific machine learning functionality
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras import backend as K
import datasets
from datasets import Dataset
from datasets import load_from_disk

# Transformers
import transformers
from transformers import (
    BertTokenizer, 
    TFBertForSequenceClassification, 
    TFBertForMaskedLM, 
    TFBertModel,
)

In [None]:
# Enable/Disable Eager Execution
# Reference: https://www.tensorflow.org/guide/eager
# TensorFlow's eager execution is an imperative programming environment that evaluates operations immediately, 
# without building graphs

# Both toggles are left commented out, so TensorFlow's default execution mode is used.
#tf.compat.v1.disable_eager_execution()
#tf.compat.v1.enable_eager_execution()

# Report library versions and the active execution mode.
print("tensorflow version", tf.__version__)
print("keras version", tf.keras.__version__)
print("Eager Execution Enabled:", tf.executing_eagerly())

# Get the number of replicas 
# NOTE(review): in the visible code `strategy` is only used for this printout.
strategy = tf.distribute.MirroredStrategy()
print("Number of replicas:", strategy.num_replicas_in_sync)

# List the devices TensorFlow can see (visible, logical GPU, physical GPU, all physical).
devices = tf.config.experimental.get_visible_devices()
print("Devices:", devices)
print(tf.config.experimental.list_logical_devices('GPU'))

print("GPU Available: ", tf.config.list_physical_devices('GPU'))
print("All Physical Devices", tf.config.list_physical_devices())

# Better performance with the tf.data API
# Reference: https://www.tensorflow.org/guide/data_performance
# AUTOTUNE is later passed as the prefetch buffer size when building tf.data pipelines.
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
# Root folder on the mounted Google Drive; the saved model path is built from it.
word_dir = "/content/drive/MyDrive/"

# Data

In [None]:
# Load the SQuAD dataset through Hugging Face `datasets`; its 'train' and
# 'validation' splits are used below.
qadataset = datasets.load_dataset("squad")

# Labeling the Data

In [None]:
# SQuAD's 'validation' split serves as this notebook's test set.
qadataset_train = qadataset['train']
qadataset_test = qadataset['validation']

In [None]:
def make_negative(example):
    """Turn an example into a negative one by deleting the answer's sentence.

    Sentences are the pieces obtained by splitting the context on ".". The
    piece to drop is found by counting the "." characters that precede the
    first answer's start offset. The remaining pieces are re-joined with "."
    and the example is labeled False.

    Args:
        example: mapping with a 'context' string and an 'answers' mapping
            holding an 'answer_start' list of character offsets.

    Returns:
        The same mapping with 'context' rewritten and 'label' set to False.
    """
    passage = example['context']
    first_answer_offset = example['answers']['answer_start'][0]
    # Index of the "."-delimited piece in which the answer begins.
    drop_index = passage[:first_answer_offset].count(".")
    kept_pieces = [
        piece
        for index, piece in enumerate(passage.split("."))
        if index != drop_index
    ]
    example['context'] = '.'.join(kept_pieces)
    example['label'] = False
    return example

In [None]:
def make_positive(example):
    """Label an example as positive; the context is left untouched.

    Returns:
        The same mapping with 'label' set to True.
    """
    example["label"] = True
    return example

In [None]:
# Split the training data 50/50 (seeded shuffle). One half keeps its context
# and is labeled True; the other half has the answer sentence removed and is
# labeled False.
qadataset_train_label_split = qadataset_train.train_test_split(test_size=0.5, shuffle=True, seed=109)

# The 'train'/'test' keys of the split are reused here as positive/negative halves.
qadataset_train_positive = qadataset_train_label_split['train']
qadataset_train_negative = qadataset_train_label_split['test']

qadataset_train_negative = qadataset_train_negative.map(make_negative)
qadataset_train_positive = qadataset_train_positive.map(make_positive)

In [None]:
# Same 50/50 positive/negative construction as for the training data, applied
# to the test set with the same seed.
qadataset_test_label_split = qadataset_test.train_test_split(test_size=0.5, shuffle=True, seed=109)

# The 'train'/'test' keys of the split are reused here as positive/negative halves.
qadataset_test_positive = qadataset_test_label_split['train']
qadataset_test_negative = qadataset_test_label_split['test']

qadataset_test_positive = qadataset_test_positive.map(make_positive)
qadataset_test_negative = qadataset_test_negative.map(make_negative)

In [None]:
# Recombine the labeled halves; positives come first, negatives second.
qadataset_train = datasets.concatenate_datasets([qadataset_train_positive, qadataset_train_negative])
qadataset_test = datasets.concatenate_datasets([qadataset_test_positive, qadataset_test_negative])

In [None]:
def combine_q_a(example):
    """Build the classifier input string from a question and its context.

    The result is stored under 'text' in the form
    "[CLS] <question> [SEP] <context>".

    Returns:
        The same mapping with the 'text' field added.
    """
    parts = ('[CLS]', example['question'], '[SEP]', example['context'])
    example['text'] = ' '.join(parts)
    return example

In [None]:
# Add the combined question/context 'text' field to every example.
qadataset_train = qadataset_train.map(combine_q_a)
qadataset_test = qadataset_test.map(combine_q_a)

# Tokenization

In [None]:
### Tokenization parameters
# Name of the pretrained checkpoint whose tokenizer is loaded below.
classifier_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(classifier_name, do_lower_case=True)
# Batch size applied to the tf.data pipelines built by tokenize_for_bert_classifier.
batch_size = 8 
# Same value as the AUTOTUNE assigned in the setup cell; repeated here.
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
### Tokenization function
def tokenize_for_bert_classifier(df, should_shuffle=False):
    """Tokenize the 'text' column and wrap it, with 'label', in a tf.data pipeline.

    Every text is padded or truncated to 256 tokens. Each dataset element is
    ``((input_ids, token_type_ids, attention_mask), label)``. The result is
    batched with the module-level ``batch_size`` and prefetched with
    ``AUTOTUNE``.

    Args:
        df: object supporting ``df["text"]`` and ``df["label"]`` column access.
        should_shuffle: when true, shuffle with a buffer as large as the
            number of texts before batching.

    Returns:
        A batched, prefetched ``tf.data.Dataset``.
    """
    texts = df["text"]
    encoded = bert_tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        padding='max_length',
        truncation='longest_first',
        max_length=256,
        return_token_type_ids=True,
        return_attention_mask=True,
        return_tensors='tf',
    )

    model_inputs = (
        encoded["input_ids"],
        encoded["token_type_ids"],
        encoded["attention_mask"],
    )
    pipeline = tf.data.Dataset.from_tensor_slices((model_inputs, df["label"]))

    if should_shuffle:
        # A buffer covering all examples shuffles over the whole set.
        pipeline = pipeline.shuffle(buffer_size=len(texts))

    return pipeline.batch(batch_size).prefetch(buffer_size=AUTOTUNE)

In [None]:
# Hold out 20% of the labeled training data (seeded shuffle).
qadataset_train = qadataset_train.train_test_split(test_size=0.2, shuffle=True, seed=109)

# Rename the held-out split from 'test' to 'validation'.
qadataset_train['validation'] = qadataset_train.pop('test')

# Only the training pipeline is shuffled.
classification_training_data = tokenize_for_bert_classifier(qadataset_train['train'], should_shuffle=True)
classification_validation_data = tokenize_for_bert_classifier(qadataset_train['validation'])

In [None]:
# Tokenize the labeled test set, unshuffled (the same call is repeated in the
# "Filter Data" cell).
classification_test_data = tokenize_for_bert_classifier(qadataset_test)

# Model

In [None]:
# Load a saved sequence-classification model from Google Drive (path built from word_dir).
classifier_model = TFBertForSequenceClassification.from_pretrained(word_dir + 'Senior Thesis models/model_classifier_bert_6/temp')

# QUACKIE

## Filter Data

In [None]:
# Re-tokenize the test set and run the classifier over it.
classification_test_data = tokenize_for_bert_classifier(qadataset_test)
Y_pred = classifier_model.predict(classification_test_data)
# A logit above 0 is treated as a positive prediction.
Y_pred = Y_pred['logits'] > 0
# Keep only the first entry of each prediction row.
# NOTE(review): assumes the classifier emits a single logit per example — confirm.
Y_pred_flat = [temp[0] for temp in Y_pred]
qadataset_test = qadataset_test.add_column("predicted label", Y_pred_flat)

# Keep examples the classifier got right ...
qadataset_test_TP = qadataset_test.filter(lambda example: example["label"] == example["predicted label"])

# ... and, of those, only the positives (i.e. true positives).
qadataset_test_TP = qadataset_test_TP.filter(lambda example: example["label"] == True)

# Keep only examples whose annotated answers all share one start offset.
qadataset_test_TP = qadataset_test_TP.filter(lambda example: len(np.unique(example["answers"]["answer_start"])) == 1)

In [None]:
# Display the filtered dataset as the cell output.
qadataset_test_TP

## Create Labels

In [None]:
# Regex alternative for sentence splitting; the function below counts the
# literal ". " separator and does not use it.
split_re = r'[.]'


def get_ground_truth_sentence_positions(example):
    """Record the index of the sentence in which the first answer starts.

    The index is the number of ". " separators that occur in the context
    before the first answer's start offset. It is stored under
    'answer_sentence_position'.

    Returns:
        The same mapping with 'answer_sentence_position' added.
    """
    passage = example['context']
    first_answer_offset = example['answers']['answer_start'][0]
    text_before_answer = passage[:first_answer_offset]
    example["answer_sentence_position"] = text_before_answer.count(". ")
    return example

# Attach the ground-truth sentence index to every retained test example.
qadataset_test_TP = qadataset_test_TP.map(get_ground_truth_sentence_positions)


In [None]:
# Display one example (row 100) as the cell output for a spot check.
qadataset_test_TP[100]

## Get Predictions

In [None]:
def get_predicted_tokens(example, classifier_model = classifier_model):
    """Score the example's text with the replacement and the masking interpreter.

    Both interpreters are asked for list output and told to ignore the leading
    question words. The leading ``question_length + 1`` entries of each
    returned list are dropped before storing.

    Args:
        example: mapping with 'text' and 'question' strings.
        classifier_model: classifier handed to both interpreters; defaults to
            the module-level model bound when this function was defined.

    Returns:
        The same mapping with 'predicted_full_sentence_scores_by_replacement'
        and 'predicted_full_sentence_scores_by_masking' added.
    """
    text = example["text"]
    # NOTE(review): the "+ 2" presumably accounts for the literal "[CLS]" and
    # "[SEP]" words placed around the question in `text` — confirm.
    question_length = len(example['question'].split()) + 2
    # NOTE(review): the extra "+ 1" depends on the layout of the lists the
    # interpreters return, which is not visible here — confirm.
    first_context_index = question_length + 1

    _unused, replacement_scores = show_multiple_masking_replacement_score(
        text,
        classifier=classifier_model,
        n_samples_per_word=1,
        return_type="list",
        ignore_first_x_words=question_length,
    )

    _unused, masking_scores = show_multiple_masking_score(
        text,
        classifier=classifier_model,
        n_samples_per_word=5,
        return_type="list",
        ignore_first_x_words=question_length,
    )

    example["predicted_full_sentence_scores_by_replacement"] = replacement_scores[first_context_index:]
    example["predicted_full_sentence_scores_by_masking"] = masking_scores[first_context_index:]
    return example

In [None]:
def aggregate_sentence_scores(text, score_list, top_words=4):
    """Collapse per-word scores into one score per sentence.

    ``text`` is split into sentences on ". " (a trailing empty piece is
    dropped) and each sentence into whitespace-separated words. The scores
    are consumed in order, one per word. A sentence's score is the mean of
    its ``top_words`` highest word scores, or the mean of all its scores when
    it has ``top_words`` or fewer.

    Args:
        text: the passage whose words the scores refer to.
        score_list: sequence of per-word scores, aligned with the words of
            ``text`` in order.
        top_words: how many of the highest word scores are averaged per
            sentence. Defaults to 4, the value previously hard-coded.

    Returns:
        A list with one aggregated score per sentence. If a sentence has no
        scores (no words, or ``score_list`` ran out), its entry is NaN, as
        produced by ``np.mean`` on an empty array.

    Raises:
        ValueError: if ``top_words`` is smaller than 1.
    """
    if top_words < 1:
        raise ValueError("top_words must be a positive integer")

    sentences = text.split(". ")
    if sentences[-1] == "":
        sentences = sentences[:-1]

    aggregated_scores = []
    offset = 0  # index of the first score not yet consumed
    for sentence in sentences:
        sentence_length = len(sentence.split())
        sentence_scores = np.asarray(score_list[offset:offset + sentence_length])
        offset += sentence_length

        if len(sentence_scores) <= top_words:
            current_score = np.mean(sentence_scores)
        else:
            # np.partition places the `top_words` largest values in the last slots.
            current_score = np.mean(np.partition(sentence_scores, -top_words)[-top_words:])
        aggregated_scores.append(current_score)

    return aggregated_scores

In [None]:
def get_predicted_sentence(example, classifier_model = classifier_model):
    """Aggregate word scores per sentence and pick the top sentence per method.

    For both the replacement and the masking scores, the per-word lists are
    reduced with ``aggregate_sentence_scores`` over the example's context.
    The index of the highest sentence score is stored as the predicted answer
    sentence, and the per-sentence scores are stored alongside it.

    Args:
        example: mapping with 'context' and both
            'predicted_full_sentence_scores_by_*' lists.
        classifier_model: not used in the body; kept for signature compatibility.

    Returns:
        The same mapping with the predicted positions and per-sentence scores added.
    """
    passage = example["context"]
    replacement_sentence_scores = aggregate_sentence_scores(
        passage, example["predicted_full_sentence_scores_by_replacement"]
    )
    masking_sentence_scores = aggregate_sentence_scores(
        passage, example["predicted_full_sentence_scores_by_masking"]
    )

    # Predicted answer sentence = index of the highest aggregated score.
    example["predicted_answer_sentence_position_by_replacement"] = np.argmax(replacement_sentence_scores)
    example["predicted_answer_sentence_position_by_masking"] = np.argmax(masking_sentence_scores)

    example["predicted_sentence_scores_by_replacement"] = replacement_sentence_scores
    example["predicted_sentence_scores_by_masking"] = masking_sentence_scores
    return example

## Evaluate on Sample

### Evaluation Functions

In [None]:
def evaluate_sentence_interpreters_by_accuracy(example):
    """Flag whether each interpreter picked the ground-truth answer sentence.

    Stores 'masking_accuracy' and 'replacement_accuracy': the result of
    comparing each predicted sentence index with 'answer_sentence_position'.

    NOTE(review): with exactly one predicted and one ground-truth sentence,
    this 0/1 match coincides with intersection-over-union — confirm this is
    the IoU metric intended here.
    """
    truth = example["answer_sentence_position"]
    masking_choice = example["predicted_answer_sentence_position_by_masking"]
    replacement_choice = example["predicted_answer_sentence_position_by_replacement"]

    example["masking_accuracy"] = masking_choice == truth
    example["replacement_accuracy"] = replacement_choice == truth
    return example

def evaluate_sentence_interpreters_by_snr(example):
    """Compute the signal-to-noise ratio of the ground-truth sentence's score.

    For each interpreter (masking, replacement) the SNR is

        (s_gt - mean(s_other)) ** 2 / var(s_other)

    where ``s_gt`` is the score of the ground-truth sentence and ``s_other``
    are the scores of all remaining sentences. Dividing the squared distance
    by the variance (std squared) keeps the ratio independent of the scale of
    the scores; the previous version divided by the standard deviation.

    The result is NaN or inf when there are fewer than two other sentences or
    when they all share the same score, because the variance is then
    undefined or zero.

    Args:
        example: mapping with 'answer_sentence_position' and both
            'predicted_sentence_scores_by_*' sequences.

    Returns:
        The same mapping with 'masking_snr' and 'replacement_snr' added.
    """
    def snr(scores, gt_position):
        scores = list(scores)
        gt_score = scores[gt_position]
        non_gt_scores = scores[:gt_position] + scores[gt_position + 1:]
        # np.mean/np.var return NumPy floats, so a zero variance yields
        # inf/NaN (with a warning) rather than raising ZeroDivisionError.
        return ((gt_score - np.mean(non_gt_scores)) ** 2) / np.var(non_gt_scores)

    answer_position = example["answer_sentence_position"]
    example["masking_snr"] = snr(
        example["predicted_sentence_scores_by_masking"], answer_position
    )
    example["replacement_snr"] = snr(
        example["predicted_sentence_scores_by_replacement"], answer_position
    )
    return example

def evaluate_sentence_interpreters_by_hpd(example):
    """Compute the HPD metric of the ground-truth sentence for both interpreters.

    The value is 1 divided by the number of sentences whose score is at least
    as high as the ground-truth sentence's score. It is 1.0 when the
    ground-truth sentence is the sole top-scoring sentence.

    Args:
        example: mapping with 'answer_sentence_position' and both
            'predicted_sentence_scores_by_*' sequences.

    Returns:
        The same mapping with 'masking_hpd' and 'replacement_hpd' added.
    """
    def hpd(scores, correct_position):
        threshold = scores[correct_position]
        at_or_above = np.asarray(scores) >= threshold
        # NumPy integer count: a zero count gives inf instead of raising.
        return 1 / at_or_above.sum()

    truth = example["answer_sentence_position"]
    masking_scores = example["predicted_sentence_scores_by_masking"]
    example["masking_hpd"] = hpd(masking_scores, truth)
    replacement_scores = example["predicted_sentence_scores_by_replacement"]
    example["replacement_hpd"] = hpd(replacement_scores, truth)
    return example

### Generate Sample

In [None]:
np.random.seed(109)
# Take the first 200 filtered examples. The random-choice alternative is
# commented out, so the seed above does not influence which rows are selected.
sample_indices = range(200) # np.random.choice(range(2784), 100, replace=False)
test_sample = qadataset_test_TP.select(sample_indices)

In [None]:
# Add both interpreters' score lists to every sampled example.
test_sample = test_sample.map(get_predicted_tokens)

In [None]:
# Aggregate the stored scores per sentence and record each interpreter's top sentence.
test_sample = test_sample.map(get_predicted_sentence)

### Evaluate

In [None]:
# Add the per-example accuracy, SNR and HPD columns for both interpreters.
test_sample = test_sample.map(evaluate_sentence_interpreters_by_accuracy)

test_sample = test_sample.map(evaluate_sentence_interpreters_by_snr)

test_sample = test_sample.map(evaluate_sentence_interpreters_by_hpd)

In [None]:
# Mean accuracy: fraction of sampled examples where the interpreter's
# top-scoring sentence is the ground-truth answer sentence.
masking_mean_accuracy = np.mean(test_sample["masking_accuracy"])
print(f"Masking Accuracy: {masking_mean_accuracy}")

replacement_mean_accuracy = np.mean(test_sample["replacement_accuracy"])
print(f"Replacement Accuracy: {replacement_mean_accuracy}")

print("")

# NaNs and infs occur if there is only one or two sentences in the context because we can't estimate the std of the incorrect sentences. 
# They are masked out so they do not contaminate the mean.
masking_mean_snr = np.mean(np.ma.masked_invalid(test_sample["masking_snr"]))
print(f"Masking SNR: {masking_mean_snr}")

replacement_mean_snr = np.mean(np.ma.masked_invalid(test_sample["replacement_snr"]))
print(f"Replacement SNR: {replacement_mean_snr}")

print("")

# The HPD means get their own names so they no longer overwrite the accuracy
# values computed above.
masking_mean_hpd = np.mean(test_sample["masking_hpd"])
print(f"Masking HPD: {masking_mean_hpd:0.4f}")

replacement_mean_hpd = np.mean(test_sample["replacement_hpd"])
print(f"Replacement HPD: {replacement_mean_hpd:0.4f}")
