# Setup

In [None]:
# Mount Google Drive at /content/drive so the data and model files used below are reachable.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
import locale


def getpreferredencoding(do_setlocale=True):
    """Report UTF-8 as the preferred encoding; ``do_setlocale`` is accepted but ignored."""
    return "UTF-8"


# Swap the stdlib lookup for the stub above so later callers always see UTF-8.
locale.getpreferredencoding = getpreferredencoding

In [None]:
!pip install transformers==4.22.2

!pip install statsmodels

!pip install datasets

!pip install -U tensorflow==2.10 

!nvidia-smi

In [None]:
# main libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from collections import defaultdict
from tqdm.autonotebook import tqdm
import spacy
import re
import statsmodels
import statsmodels.api as sm
import scipy

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, mean_absolute_percentage_error, r2_score, jaccard_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# specific machine learning functionality
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras import backend as K
import datasets
from datasets import Dataset
from datasets import load_from_disk

# Transformers
import transformers
from transformers import (
    BertTokenizer, 
    TFBertForSequenceClassification, 
    TFBertForMaskedLM, 
    TFBertModel,
    #create_optimizer,
    #DataCollatorForLanguageModeling,
    #PreTrainedTokenizerFast
)

In [None]:
# Runtime diagnostics: library versions, execution mode and visible hardware.
# Eager execution (https://www.tensorflow.org/guide/eager) evaluates operations
# immediately without building graphs. It is left at TensorFlow's default here;
# tf.compat.v1.disable_eager_execution() / tf.compat.v1.enable_eager_execution()
# can be called at this point to override it.

print(f"tensorflow version {tf.__version__}")
print(f"keras version {tf.keras.__version__}")
print(f"Eager Execution Enabled: {tf.executing_eagerly()}")

# A mirrored strategy exposes how many replicas are kept in sync.
strategy = tf.distribute.MirroredStrategy()
print(f"Number of replicas: {strategy.num_replicas_in_sync}")

devices = tf.config.experimental.get_visible_devices()
print(f"Devices: {devices}")
print(tf.config.experimental.list_logical_devices("GPU"))

print(f"GPU Available:  {tf.config.list_physical_devices('GPU')}")
print(f"All Physical Devices {tf.config.list_physical_devices()}")

# Sentinel that lets tf.data tune pipeline parameters itself
# (https://www.tensorflow.org/guide/data_performance).
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
# Root of the mounted Google Drive; dataset and model paths below are built by appending to it.
word_dir = "/content/drive/MyDrive/"

# Preprocessing

In [None]:
# Load the IMDB reviews from Drive, standardise the column names to text/label,
# and turn the sentiment string into a boolean (True == "positive").
df = (
    pd.read_csv(word_dir + "Colab Notebooks/IMDB Dataset.csv")
    .rename(columns={"review": "text", "sentiment": "label"})
    .assign(label=lambda frame: frame["label"] == "positive")
)

In [None]:
# Preview the first rows of the prepared frame.
display(df.head())

In [None]:
# Share of reviews labelled positive across the whole dataset.
positive_rate = df["label"].mean()
print(f"Positive Rate: {positive_rate}")

In [None]:
def get_len(s):
    """Return the number of whitespace-separated tokens in ``s``."""
    return len(s.split())


# Token count of every review, used below to inspect the length distribution.
lengths = df["text"].apply(get_len)

In [None]:
# Histogram of review lengths (in whitespace-separated tokens).
fig, ax = plt.subplots()
ax.hist(lengths)
ax.set_xlabel("Text length", fontsize=20)
ax.set_ylabel("Frequency", fontsize=20)
fig.set_size_inches(15, 10)

In [None]:
# Count how many reviews exceed each candidate sequence-length cut-off.
for threshold in (256, 512, 1024):
    n_longer = (lengths > threshold).sum()
    print(f"Number of data points of lengths high than {threshold}: {n_longer}")

In [None]:
# Split the corpus into two equally sized halves, stratified on the label so
# both keep the same positive rate.
df_gap_filler, df_classification = train_test_split(
    df,
    test_size=0.5,
    random_state=1,
    stratify=df["label"],
)
for split_name, split_frame in (("Gap filler", df_gap_filler), ("Classifier", df_classification)):
    print(f"Positive Rate in {split_name} data: {split_frame['label'].mean()}")

# Train Sentiment Analysis (SA) Classifier

## Tokenization

In [None]:
### Tokenization parameters
# Checkpoint name shared by the tokenizer here and the classifier loaded later.
classifier_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(classifier_name, do_lower_case=True)
# Number of examples per batch in the tf.data pipelines built below.
batch_size = 8 
# Passed as the prefetch buffer size so tf.data picks it automatically.
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
### Tokenization function
def tokenize_for_bert_classifier(df, should_shuffle=False, max_length=256):
    """Tokenize a labelled text frame and wrap it in a batched ``tf.data.Dataset``.

    Args:
        df: DataFrame with a ``"text"`` column of strings and a ``"label"`` column.
        should_shuffle: When True, shuffle with a buffer covering the whole frame.
            Leave False for any dataset whose predictions will be compared
            positionally against ``df["label"]``.
        max_length: Token length every example is padded or truncated to.
            Defaults to 256, the value previously hard-coded here.

    Returns:
        A dataset yielding ``((input_ids, token_type_ids, attention_mask), label)``
        batches of ``batch_size`` examples, with prefetching enabled.
    """
    # Give the tokenizer a plain list of strings rather than relying on it
    # happening to iterate a pandas Series correctly.
    texts = df["text"].tolist()

    # Tokenization
    X_tokenized = bert_tokenizer.batch_encode_plus(
        texts,
        return_tensors='tf',
        add_special_tokens=True,
        return_token_type_ids=True,
        padding='max_length',
        max_length=max_length,
        return_attention_mask=True,
        truncation='longest_first',
    )

    # Creating TF datasets: features are grouped in the order the model expects.
    features = (
        X_tokenized["input_ids"],
        X_tokenized["token_type_ids"],
        X_tokenized["attention_mask"],
    )
    dataset = tf.data.Dataset.from_tensor_slices((features, df["label"].to_numpy()))

    if should_shuffle:
        # A buffer as large as the dataset shuffles across all examples.
        dataset = dataset.shuffle(buffer_size=len(texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)

    return dataset

In [None]:
# Carve the classifier half into train / validation / test. Each cut holds out
# 20% of its input and is stratified on the label; the fixed seed keeps the
# partition reproducible.
split_options = {"test_size": 0.2, "shuffle": True, "random_state": 1}

df_classification_train_all, df_classification_test = train_test_split(
    df_classification,
    stratify=df_classification["label"],
    **split_options,
)

df_classification_train, df_classification_val = train_test_split(
    df_classification_train_all,
    stratify=df_classification_train_all["label"],
    **split_options,
)

In [None]:
# Only the training pipeline is shuffled; validation and test keep the frame
# order so predictions can be compared positionally with the labels.
classification_training_data = tokenize_for_bert_classifier(
    df_classification_train,
    should_shuffle=True,
)
classification_validation_data, classification_test_data = (
    tokenize_for_bert_classifier(frame)
    for frame in (df_classification_val, df_classification_test)
)

## Training

In [None]:
### BERT Setup
# Fine-tuning hyper-parameters read by the compile and fit cells.
epochs = 5
learning_rate = 2e-5


def get_bert_classifier():
    """Load the pretrained checkpoint with a single-output classification head.

    ``from_pt=True`` tells transformers to load the weights from the PyTorch
    checkpoint.
    """
    pretrained_kwargs = {"num_labels": 1, "from_pt": True}
    return TFBertForSequenceClassification.from_pretrained(classifier_name, **pretrained_kwargs)

def get_compiled_bert_classifier(model = None):
    """Build (if needed) and compile the BERT classifier for binary classification.

    Args:
        model: Optional model to compile. When None, a fresh one is created
            with ``get_bert_classifier()``.

    Returns:
        The model compiled with Adam, binary cross-entropy on logits, and a
        binary accuracy metric reported under the name ``"accuracy"``.
    """
    # Free up memory. Use the public tf.keras backend: `K` comes from the
    # private `tensorflow.python.keras` copy, whose global state is separate
    # from the Keras implementation that `tf.keras` models actually use.
    keras.backend.clear_session()

    # Build the model
    if model is None:
        model = get_bert_classifier()

    # Print the model architecture. summary() prints by itself and returns
    # None, so wrapping it in print() would emit a stray "None".
    model.summary()

    # Optimizer
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
    # Loss
    loss = keras.losses.BinaryCrossentropy(from_logits=True)
    # Metric: the head emits raw logits, so the decision boundary is 0. The
    # plain "accuracy" string resolves to binary accuracy with its default 0.5
    # threshold, which miscounts logits in (0, 0.5] as negative predictions.
    accuracy = keras.metrics.BinaryAccuracy(name="accuracy", threshold=0.0)

    # Compile
    model.compile(loss=loss,
                  optimizer=optimizer,
                  metrics=[accuracy])

    return model

In [None]:
# Instantiate the pretrained classifier and compile it for fine-tuning.
classifier_model = get_compiled_bert_classifier()

In [None]:
### Train model
# Set to True to fine-tune and save; otherwise the previously saved model is
# loaded from Drive and replaces the freshly compiled one.
train_model = False
classifier_checkpoint_dir = word_dir + 'Senior Thesis models/model_classifier_bert_1/temp'

if not train_model:
    # The reloaded model is not re-compiled by this cell.
    classifier_model = TFBertForSequenceClassification.from_pretrained(classifier_checkpoint_dir)
else:
    start_time = time.time()
    training_results = classifier_model.fit(
        classification_training_data,
        validation_data=classification_validation_data,
        epochs=epochs,
        verbose=1,
    )
    execution_time = (time.time() - start_time) / 60.0
    print("Training execution time (mins)", execution_time)
    classifier_model.save_pretrained(classifier_checkpoint_dir)

## Evaluation

In [None]:
### Evaluation Function
def evaluate_bert_classifier(bert_model, dataset, Y_true, only_accuracy = False):
    """Print classification metrics of ``bert_model`` on ``dataset``.

    Args:
        bert_model: Model whose ``predict`` output exposes a ``'logits'`` entry.
        dataset: Batched inputs, in the same order as ``Y_true``.
        Y_true: Ground-truth binary labels.
        only_accuracy: When True, stop after printing the accuracy.

    Returns:
        None; all results are printed.
    """
    # A logit above zero is read as the positive class.
    logits = bert_model.predict(dataset)['logits']
    predicted = logits > 0

    print(f"Accuracy: {accuracy_score(Y_true, predicted)}")
    if only_accuracy:
        return

    print(f"F1 score: {f1_score(Y_true, predicted)}")
    print(f"Recall score: {recall_score(Y_true, predicted)}")
    print(f"Precision score: {precision_score(Y_true, predicted)}")

    # Drop the trailing singleton axis so predictions line up with the labels.
    flat_pred = np.asarray([row[0] for row in predicted])

    agrees = Y_true == flat_pred
    disagrees = Y_true != flat_pred
    said_negative = flat_pred == 0
    said_positive = flat_pred == 1

    TN = np.sum(agrees & said_negative)
    TP = np.sum(agrees & said_positive)
    FN = np.sum(disagrees & said_negative)
    FP = np.sum(disagrees & said_positive)

    print(f"TN: {TN}, TP:{TP}, FN:{FN}, FP:{FP}")

In [None]:
# Predictions are matched to labels by position, so the dataset must be built
# with should_shuffle=False. classification_training_data was created with
# should_shuffle=True, so the training frame is re-tokenized in its original
# order for this evaluation.
classification_training_eval_data = tokenize_for_bert_classifier(df_classification_train)
evaluate_bert_classifier(classifier_model, classification_training_eval_data, df_classification_train["label"])

In [None]:
# Validation labels come from the frame the validation dataset was tokenized from.
evaluate_bert_classifier(classifier_model, classification_validation_data, df_classification_val["label"])

In [None]:
# Metrics on the held-out test split; the dataset is unshuffled, so it lines up with the labels.
evaluate_bert_classifier(classifier_model, classification_test_data, df_classification_test["label"])