# Setup

In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
import locale
def getpreferredencoding(do_setlocale=True):
    """Return ``"UTF-8"`` unconditionally; ``do_setlocale`` is accepted but ignored."""
    return "UTF-8"


# Monkey-patch the locale module so every caller asking for the preferred
# encoding is told UTF-8.
locale.getpreferredencoding = getpreferredencoding

In [None]:
# Install the notebook's dependencies; transformers is pinned to 4.22.2.
!pip install transformers==4.22.2

!pip install statsmodels

!pip install datasets

# Upgrade (-U) TensorFlow to the pinned 2.10 release.
!pip install -U tensorflow==2.10 

# Print the GPU status of the current runtime.
!nvidia-smi

In [None]:
# main libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from collections import defaultdict
from tqdm.autonotebook import tqdm
import spacy
import re
import statsmodels
import statsmodels.api as sm
import scipy

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, mean_absolute_percentage_error, r2_score, jaccard_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# specific machine learning functionality
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras import backend as K
import datasets
from datasets import Dataset
from datasets import load_from_disk

# Transformers
import transformers
from transformers import (
    BertTokenizer, 
    TFBertForSequenceClassification, 
    TFBertForMaskedLM, 
    TFBertModel,
    #create_optimizer,
    #DataCollatorForLanguageModeling,
    #PreTrainedTokenizerFast
)

In [None]:
# Enable/disable eager execution.
# Reference: https://www.tensorflow.org/guide/eager
# TensorFlow's eager execution is an imperative programming environment that
# evaluates operations immediately, without building graphs.
# Both toggles below are left commented out, so the runtime default is used.

#tf.compat.v1.disable_eager_execution()
#tf.compat.v1.enable_eager_execution()

# Report library versions and whether eager execution is active.
print("tensorflow version", tf.__version__)
print("keras version", tf.keras.__version__)
print("Eager Execution Enabled:", tf.executing_eagerly())

# Create a mirrored distribution strategy and report its replica count.
strategy = tf.distribute.MirroredStrategy()
print("Number of replicas:", strategy.num_replicas_in_sync)

# List the devices TensorFlow can see (visible, logical GPU, physical).
devices = tf.config.experimental.get_visible_devices()
print("Devices:", devices)
print(tf.config.experimental.list_logical_devices('GPU'))

print("GPU Available: ", tf.config.list_physical_devices('GPU'))
print("All Physical Devices", tf.config.list_physical_devices())

# Better performance with the tf.data API
# Reference: https://www.tensorflow.org/guide/data_performance
# AUTOTUNE is passed as the prefetch buffer size when datasets are built.
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
# Root folder of the mounted Google Drive.
# NOTE(review): the name looks like a typo for `work_dir`; confirm before renaming,
# since cells outside this view may reference it.
word_dir = "/content/drive/MyDrive/"

# Data

In [None]:
# Load the SQuAD question-answering dataset through the `datasets` library.
qadataset = datasets.load_dataset("squad")

# Labeling the Data

In [None]:
# SQuAD's "validation" split is used as the held-out test set in this notebook.
qadataset_train, qadataset_test = qadataset['train'], qadataset['validation']

In [None]:
def make_negative(example):
    """Turn ``example`` into a negative sample by dropping the answer's sentence.

    Sentences are delimited naively by ``"."``. The sentence removed is the one
    in which the first listed answer starts, i.e. the segment whose index equals
    the number of periods that precede ``answers['answer_start'][0]``.

    Args:
        example: Mapping with a ``'context'`` string and an ``'answers'`` mapping
            holding an ``'answer_start'`` list of character offsets.

    Returns:
        The same ``example`` object, with ``'context'`` rewritten and
        ``'label'`` set to ``False``.
    """
    passage = example['context']
    first_answer_offset = example['answers']['answer_start'][0]

    # Periods before the answer offset == index of the segment holding the answer.
    target_index = passage[:first_answer_offset].count(".")
    kept_segments = [
        segment
        for index, segment in enumerate(passage.split("."))
        if index != target_index
    ]

    example['context'] = '.'.join(kept_segments)
    example['label'] = False
    return example

In [None]:
def make_positive(example):
    """Label ``example`` as a positive sample.

    Sets ``example['label']`` to ``True`` in place, leaves every other field
    untouched, and returns the same object.
    """
    example["label"] = True
    return example

In [None]:
# Split the training data 50/50 (seeded): one half is kept as positives,
# the other half is turned into negatives.
qadataset_train_label_split = qadataset_train.train_test_split(
    test_size=0.5,
    shuffle=True,
    seed=109,
)

# Positives keep their context; negatives go through make_negative.
qadataset_train_positive = qadataset_train_label_split['train'].map(make_positive)
qadataset_train_negative = qadataset_train_label_split['test'].map(make_negative)

In [None]:
# Same 50/50 seeded split for the held-out test data.
qadataset_test_label_split = qadataset_test.train_test_split(
    test_size=0.5,
    shuffle=True,
    seed=109,
)

# First half -> positives, second half -> negatives.
qadataset_test_positive = qadataset_test_label_split['train'].map(make_positive)
qadataset_test_negative = qadataset_test_label_split['test'].map(make_negative)

In [None]:
# Recombine positives and negatives into single labeled datasets
# (positives first, then negatives).
qadataset_train = datasets.concatenate_datasets([qadataset_train_positive, qadataset_train_negative])
qadataset_test = datasets.concatenate_datasets([qadataset_test_positive, qadataset_test_negative])

In [None]:
def combine_q_a(example):
    """Write ``"[CLS] <question> [SEP] <context>"`` into ``example['text']``.

    The four pieces are joined with single spaces. The same ``example`` object
    is returned.

    NOTE(review): the tokenizer calls in this notebook pass
    ``add_special_tokens=True``, so the literal ``[CLS]``/``[SEP]`` written here
    presumably end up next to the tokenizer's own special tokens. Confirm this
    is intended before changing it, since any model fine-tuned on this format
    depends on it.
    """
    pieces = ('[CLS]', example['question'], '[SEP]', example['context'])
    example['text'] = ' '.join(pieces)
    return example

In [None]:
# Add the combined question/context 'text' field to every example.
qadataset_train = qadataset_train.map(combine_q_a)
qadataset_test = qadataset_test.map(combine_q_a)

# Tokenization

In [None]:
### Tokenization parameters
# Name of the pretrained checkpoint the tokenizer is loaded from.
classifier_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(classifier_name, do_lower_case=True)
# Batch size applied wherever tokenized data is batched into a tf.data pipeline.
batch_size = 8 
# Passed as the prefetch buffer size of the tf.data pipelines.
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
### Tokenization function
def tokenize_for_bert_classifier(df, should_shuffle=False):
    """Tokenize ``df["text"]`` and wrap the result in a batched ``tf.data.Dataset``.

    Args:
        df: Object exposing ``"text"`` and ``"label"`` columns via ``df[...]``.
        should_shuffle: When true, shuffle with a buffer as large as the number
            of texts before batching.

    Returns:
        A dataset whose elements are
        ``((input_ids, token_type_ids, attention_mask), label)``, batched with
        the module-level ``batch_size`` and prefetched with ``AUTOTUNE``.
    """
    # Every text is padded or truncated to exactly 256 tokens.
    encoded = bert_tokenizer.batch_encode_plus(
        df["text"],
        return_tensors='tf',
        add_special_tokens=True,
        return_token_type_ids=True,
        padding='max_length',
        max_length=256,
        return_attention_mask=True,
        truncation='longest_first',
    )

    # NOTE(review): Hugging Face TF models map positional tuple inputs as
    # (input_ids, attention_mask, token_type_ids); the order used here differs.
    # Confirm it is intended -- a model trained on this pipeline depends on it.
    features = (
        encoded["input_ids"],
        encoded["token_type_ids"],
        encoded["attention_mask"],
    )
    tf_dataset = tf.data.Dataset.from_tensor_slices((features, df["label"]))

    if should_shuffle:
        tf_dataset = tf_dataset.shuffle(buffer_size=len(df["text"]))

    return tf_dataset.batch(batch_size).prefetch(buffer_size=AUTOTUNE)

In [None]:
# Hold out 20% of the labeled training data (seeded split).
qadataset_train = qadataset_train.train_test_split(test_size=0.2, shuffle=True, seed=109)

# Rename the held-out split from 'test' to 'validation'.
qadataset_train['validation'] = qadataset_train.pop('test')

# Only the training split is shuffled when building the tf.data pipelines.
classification_training_data = tokenize_for_bert_classifier(qadataset_train['train'], should_shuffle=True)
classification_validation_data = tokenize_for_bert_classifier(qadataset_train['validation'])

In [None]:
# Tokenize the held-out test set (unshuffled, so row order is preserved).
classification_test_data = tokenize_for_bert_classifier(qadataset_test)

# Filter Data

In [None]:
# NOTE(review): `classifier_model` is not defined in the cells visible here --
# presumably the fine-tuned sequence classifier; confirm it is loaded first.
classification_test_data = tokenize_for_bert_classifier(qadataset_test)
Y_pred = classifier_model.predict(classification_test_data)
# A positive logit counts as a True prediction.
Y_pred = Y_pred['logits'] > 0
Y_pred_flat = [row[0] for row in Y_pred]
qadataset_test = qadataset_test.add_column("predicted label", Y_pred_flat)


def _is_unambiguous_true_positive(example):
    """Keep correctly predicted positives whose answers share one start offset."""
    if example["label"] != example["predicted label"]:
        return False
    if not (example["label"] == True):
        return False
    return len(np.unique(example["answers"]["answer_start"])) == 1


qadataset_test_TP = qadataset_test.filter(_is_unambiguous_true_positive)

In [None]:
# Display the filtered true-positive dataset.
qadataset_test_TP

# Model

In [None]:
# Load a BERT encoder from a checkpoint directory on Google Drive.
# NOTE(review): presumably the encoder weights of the fine-tuned classifier
# ("model_classifier_bert_6") -- confirm.
model = TFBertModel.from_pretrained("/content/drive/MyDrive/Senior Thesis models/model_classifier_bert_6/temp")

# Plots

## Data

In [None]:
# The plots below work on the filtered true-positive examples.
test_sample = qadataset_test_TP
test_sample

### Masking Data

In [None]:
def get_masked_sample(example):
    """Create deletion- and ``[MASK]``-perturbed copies of ``example["text"]``.

    ``example["text"]`` is expected to be laid out as
    ``"[CLS] <question> [SEP] <context>"`` with whitespace-delimited words.
    ``int(0.15 * (n_words - n_question_words) + 1)`` word positions are drawn
    without replacement from the context part only, so the ``[CLS]`` marker,
    the question and the ``[SEP]`` separator are never perturbed.

    Sampling uses NumPy's global random state; call ``np.random.seed`` first
    for reproducible output.

    Args:
        example: Mapping with ``"text"`` and ``"question"`` strings.

    Returns:
        The same ``example`` with two added fields:
        ``"text_with_deletion"`` (chosen words replaced by empty strings) and
        ``"text_with_masking"`` (chosen words replaced by ``"[MASK]"``).
    """
    all_words = example["text"].split()

    question_length = len(example['question'].split())
    full_length = len(all_words)
    # Formula kept as-is so the number of masks matches what later cells expect.
    masking_size = int((full_length - question_length) * 0.15 + 1)

    # Index 0 is "[CLS]", indices 1..question_length hold the question and
    # index question_length + 1 is "[SEP]"; the context starts right after it.
    context_start = min(question_length + 2, full_length)
    candidate_indices = np.arange(context_start, full_length)

    # Never ask for more positions than the context provides (empty context).
    masking_size = min(masking_size, len(candidate_indices))
    if masking_size > 0:
        drawn = np.random.choice(candidate_indices, size=masking_size, replace=False)
        chosen = set(drawn.tolist())
    else:
        chosen = set()

    example["text_with_deletion"] = " ".join(
        "" if j in chosen else word for j, word in enumerate(all_words)
    )
    example["text_with_masking"] = " ".join(
        "[MASK]" if j in chosen else word for j, word in enumerate(all_words)
    )
    return example

In [None]:
# Seed NumPy's global RNG so the randomly chosen word positions are reproducible.
np.random.seed(109)
test_sample = test_sample.map(get_masked_sample)

### Replacement Data

In [None]:
def get_replacement_sample(example):
    """Fill each ``[MASK]`` in ``example["text_with_masking"]`` with a prediction.

    The masked text is tokenized with ``bert_tokenizer`` and scored by
    ``gap_untuned_model``; for every mask position the highest-scoring token is
    decoded and substituted, left to right, for the next ``"[MASK]"`` in the text.

    NOTE(review): ``gap_untuned_model`` is not defined in the cells visible
    here -- presumably a pretrained masked-language model; confirm.

    Args:
        example: Mapping with a ``"text_with_masking"`` string.

    Returns:
        The same ``example`` with ``"text_with_replacement"`` added.
    """
    text = example["text_with_masking"]

    # Truncate to the tokenizer's maximum model length so over-long passages
    # cannot exceed the model's position range. Masks that fall beyond the
    # cut-off receive no prediction and stay as "[MASK]".
    inputs = bert_tokenizer(text, return_tensors="tf", truncation=True)
    logits = gap_untuned_model(**inputs).logits

    # Positions of the [MASK] tokens in the (single) encoded sequence.
    mask_token_index = tf.where((inputs.input_ids == bert_tokenizer.mask_token_id)[0])
    selected_logits = tf.gather_nd(logits[0], indices=mask_token_index)

    # Top-1 prediction per mask position, decoded back to a token string.
    predicted_token_ids = tf.math.argmax(selected_logits, axis=-1).numpy().tolist()
    options = [bert_tokenizer.decode([token_id]) for token_id in predicted_token_ids]

    # Substitute predictions in order; driving the loop by the predictions that
    # actually exist avoids indexing past the end of `options`.
    filled_sentence = text
    for option in options:
        filled_sentence = filled_sentence.replace("[MASK]", option, 1)
    example["text_with_replacement"] = filled_sentence
    return example

In [None]:
# get_replacement_sample itself makes no np.random calls; the seed is set for
# parity with the masking cell.
np.random.seed(109)
test_sample = test_sample.map(get_replacement_sample)

### Preparing the Data

In [None]:
def _frame_for_source(column_name, source_name):
    """Build a two-column frame: the texts of one variant plus its source tag."""
    frame = pd.DataFrame()
    frame["text"] = test_sample[column_name]
    frame["source"] = source_name
    return frame


# One frame per text variant, each tagged with where its text came from.
df_original = _frame_for_source("text", "original")
df_deletion = _frame_for_source("text_with_deletion", "deletion")
df_replacement = _frame_for_source("text_with_replacement", "replacement")

In [None]:
# Stack the three variants in the order replacement, deletion, original;
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df_tsne = pd.concat([df_replacement, df_deletion, df_original], 
                    ignore_index=True)

### BERT Encoding

In [None]:
### Tokenize all the data
# Every text is padded or truncated to exactly 256 tokens.
text_encoding = bert_tokenizer.batch_encode_plus(
    df_tsne['text'],
    return_tensors='tf',
    add_special_tokens=True,
    return_token_type_ids=True,
    padding='max_length',
    max_length=256,
    return_attention_mask=True,
    truncation='longest_first',
)

# Unlabeled, unshuffled pipeline: batches keep the row order of df_tsne.
text_encoding_dataset = tf.data.Dataset.from_tensor_slices(
    (
        text_encoding["input_ids"],
        text_encoding["token_type_ids"],
        text_encoding["attention_mask"],
    )
)
text_encoding_dataset = text_encoding_dataset.batch(batch_size)
text_encoding_dataset = text_encoding_dataset.prefetch(buffer_size=AUTOTUNE)

In [None]:
### Collect the model's `pooler_output` for every text, batch by batch
# NOTE(review): `pooler_output` is presumably the final hidden state of the first
# token passed through BERT's pooling layer, not the raw last layer -- confirm.
# NOTE(review): each batch is the tuple (input_ids, token_type_ids, attention_mask);
# Hugging Face TF models read positional tuples as
# (input_ids, attention_mask, token_type_ids). Confirm the order is intended.
embedding_to_concat = [
    model(batch)['pooler_output'] for batch in tqdm(text_encoding_dataset)
]
text_hidden_layer = tf.concat(embedding_to_concat, axis=0).numpy()

## TSNE

In [None]:
### Get TSNE components
# Project the embeddings to 2 dimensions; random_state is fixed for reproducibility.
text_tsne_representation = TSNE(n_components=2, random_state = 109).fit_transform(text_hidden_layer)

In [None]:
# Split the two t-SNE components into separate coordinate lists.
text_tsne_x, text_tsne_y = (list(axis) for axis in text_tsne_representation.T)

In [None]:
# Store the t-SNE coordinates next to each text and its source tag.
df_tsne["x_tsne"] = text_tsne_x
df_tsne["y_tsne"] = text_tsne_y

In [None]:
### Plot
# sort=False keeps the groups in order of first appearance in df_tsne, so the
# colors below are paired with the sources in that order.
groups = df_tsne.groupby('source', sort = False)
colors = ["#332288", (0.53, 0.8, 0.93, 0.5), "orange"]

# One scatter series per source, all drawn on the same figure.
plt.figure(figsize=(10, 7))
for current_color, (name, group) in zip(colors, groups):
  plt.scatter(group.x_tsne, group.y_tsne, label = name, color = current_color)
plt.legend()
plt.show()

## PCA

In [None]:
### Get PCA components
# Standardize each embedding dimension, then keep the first two principal components.
text_hidden_layer_standardized = StandardScaler().fit_transform(text_hidden_layer)
text_pca_representation = PCA(n_components=2, random_state = 109).fit_transform(text_hidden_layer_standardized)

In [None]:
# Split the two principal components into separate coordinate lists.
text_pca_x, text_pca_y = (list(axis) for axis in text_pca_representation.T)

In [None]:
# Store the PCA coordinates in the same frame as the t-SNE ones.
df_tsne["x_pca"] = text_pca_x
df_tsne["y_pca"] = text_pca_y

In [None]:
### Plot
# All sources on one figure; `colors` is the list defined in the t-SNE plot cell.
groups = df_tsne.groupby('source', sort = False)

plt.figure(figsize=(10, 7))
for current_color, (name, group) in zip(colors, groups):
  plt.scatter(group.x_pca, group.y_pca, label = name, color = current_color)
plt.legend()
plt.show()

In [None]:
### Plot
# Same PCA scatter, but one separate figure per source.
groups = df_tsne.groupby('source', sort = False)


for current_color, (name, group) in zip(colors, groups):
    plt.figure(figsize=(10, 7))
    plt.scatter(group.x_pca, group.y_pca, label = name, color = current_color)
    plt.legend()
    plt.show()

# Fréchet Distance (FID)

In [None]:
# Split the embedding matrix back into its three sources. Rows of
# text_hidden_layer follow the row order of df_tsne (the texts were encoded from
# df_tsne['text'] without shuffling), so selecting by the "source" label stays
# correct for any sample size, unlike fixed row offsets.
source_labels = df_tsne["source"].to_numpy()
replacement_data = text_hidden_layer[source_labels == "replacement"]
deletion_data = text_hidden_layer[source_labels == "deletion"]
original_data = text_hidden_layer[source_labels == "original"]

In [None]:
def calculate_fid(x, y):
    """Return the Fréchet distance between two sets of feature vectors.

    Computes ``||mu_x - mu_y||^2 + Tr(C_x + C_y - 2 * sqrtm(C_x . C_y))`` where
    ``mu`` and ``C`` are the per-column mean and covariance of each input.

    Args:
        x: 2-D array-like; rows are observations, columns are features.
        y: 2-D array-like with the same number of columns as ``x``.

    Returns:
        The distance as a scalar.
    """
    mean_gap = np.mean(x, axis=0) - np.mean(y, axis=0)
    sigma_x = np.cov(x, rowvar=False)
    sigma_y = np.cov(y, rowvar=False)

    root = scipy.linalg.sqrtm(np.dot(sigma_x, sigma_y))
    # sqrtm may hand back a complex array; only its real part is used.
    root = root.real if np.iscomplexobj(root) else root

    squared_mean_distance = np.sum(mean_gap ** 2)
    return squared_mean_distance + np.trace(sigma_x + sigma_y - 2.0 * root)

In [None]:
# Distance between the original embeddings and the deletion-perturbed ones.
calculate_fid(original_data, deletion_data)

# Distance between the original embeddings and the replacement-perturbed ones.
calculate_fid(original_data, replacement_data)