# Setup

In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files under it.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
import locale


def getpreferredencoding(do_setlocale=True):
    """Report "UTF-8" as the preferred encoding regardless of the real locale.

    ``do_setlocale`` is accepted only so the signature matches
    ``locale.getpreferredencoding``; its value is never used.
    """
    return "UTF-8"


# Monkey-patch the standard library so every later call to
# locale.getpreferredencoding() in this kernel returns "UTF-8".
setattr(locale, "getpreferredencoding", getpreferredencoding)

In [None]:
!pip install transformers==4.22.2

!pip install statsmodels

!pip install datasets

!pip install -U tensorflow==2.10 

!nvidia-smi

In [None]:
# main libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from collections import defaultdict
from tqdm.autonotebook import tqdm
import spacy
import re
import statsmodels
import statsmodels.api as sm
import scipy

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, mean_absolute_percentage_error, r2_score, jaccard_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# specific machine learning functionality
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras import backend as K
import datasets
from datasets import Dataset
from datasets import load_from_disk

# Transformers
import transformers
from transformers import (
    BertTokenizer, 
    TFBertForSequenceClassification, 
    TFBertForMaskedLM, 
    TFBertModel,
    create_optimizer,
    DataCollatorForLanguageModeling
)

In [None]:
# Enable/Disable Eager Execution
# Reference: https://www.tensorflow.org/guide/eager
# TensorFlow's eager execution is an imperative programming environment that evaluates operations immediately,
# without building graphs.
# Both toggles below are left commented out, so TensorFlow's default mode is what runs.

#tf.compat.v1.disable_eager_execution()
#tf.compat.v1.enable_eager_execution()

# Report the library versions and whether eager execution is currently active.
print("tensorflow version", tf.__version__)
print("keras version", tf.keras.__version__)
print("Eager Execution Enabled:", tf.executing_eagerly())

# Create a MirroredStrategy and report how many replicas it keeps in sync.
strategy = tf.distribute.MirroredStrategy()
print("Number of replicas:", strategy.num_replicas_in_sync)

# List the devices TensorFlow can see: visible devices, then logical GPUs.
devices = tf.config.experimental.get_visible_devices()
print("Devices:", devices)
print(tf.config.experimental.list_logical_devices('GPU'))

# Physical GPUs only, then every physical device.
print("GPU Available: ", tf.config.list_physical_devices('GPU'))
print("All Physical Devices", tf.config.list_physical_devices())

# Better performance with the tf.data API
# Reference: https://www.tensorflow.org/guide/data_performance
# Kept as a notebook-level constant; the tokenization-parameters cell assigns the same value again.
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
# Root folder on the mounted Google Drive; the CSV path and the saved-model
# paths in later cells are built by appending to this string (note the trailing slash).
# NOTE(review): the name looks like a typo for "work_dir" -- confirm before
# renaming, because later cells reference it by this name.
word_dir = "/content/drive/MyDrive/"

# Preprocessing

In [None]:
# Load the IMDB reviews CSV from Google Drive.
# Later cells read a "label" column (stratified split) and a "text" column
# (tokenization) from this frame.
# Local alternative: df = pd.read_csv("data/IMDB Dataset.csv")
df = pd.read_csv(word_dir + "Colab Notebooks/IMDB Dataset.csv") 

In [None]:
# Split the reviews into two disjoint halves, stratified on "label" so both
# halves keep the same positive rate; the seed makes the split repeatable.
# Judging by the names, one half tunes the gap filler and the other is kept
# for the classifier.
df_gap_filler, df_classification = train_test_split(
    df,
    test_size=0.5,
    stratify=df["label"],
    random_state=1,
)

# Sanity check: the two positive rates should be (almost) equal.
print(f"Positive Rate in Gap filler data: {np.mean(df_gap_filler.label)}")
print(f"Positive Rate in Classifier data: {np.mean(df_classification.label)}")

# Tune Gap Filler

## Tokenization

In [None]:
### Tokenization parameters
# Checkpoint name the tokenizer is loaded from.
classifier_name = 'bert-base-uncased'
# do_lower_case=True asks the tokenizer to lowercase its input.
bert_tokenizer = BertTokenizer.from_pretrained(classifier_name, do_lower_case=True)
# Examples per batch. NOTE(review): confirm every dataset-building cell uses this value.
batch_size = 8 
# Same value as the AUTOTUNE assigned in the setup cell; repeated here.
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
### BERT LM Setup
# Initial learning rate handed to create_optimizer in get_compiled_bert_LM.
learning_rate = 2e-5
# Number of epochs; used both for the optimizer's step count and for fit().
epochs = 2
# Checkpoint that get_bert_LM() loads the masked-LM model from.
gap_filler_model_name = "bert-base-uncased"
# Placeholder until the train/validation split cell assigns the real dataset;
# get_compiled_bert_LM skips compilation while this is still None.
train_dataset = None

def get_bert_LM():
    """Load a masked-language-model BERT from the ``gap_filler_model_name`` checkpoint.

    Returns:
        The object produced by ``TFBertForMaskedLM.from_pretrained``.
    """
    masked_lm = TFBertForMaskedLM.from_pretrained(gap_filler_model_name)
    return masked_lm

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with ``bert_tokenizer``.

    Every sequence is padded to, and truncated at, 256 tokens, and the
    special-tokens mask is requested alongside the usual tokenizer output.

    Args:
        examples: Mapping with a ``"text"`` key (a batch when used with
            ``Dataset.map(batched=True)``).

    Returns:
        Whatever ``bert_tokenizer`` returns for that text.
    """
    tokenizer_kwargs = dict(
        return_special_tokens_mask=True,
        padding='max_length',
        max_length=256,
        truncation='longest_first',
    )
    return bert_tokenizer(examples["text"], **tokenizer_kwargs)

In [None]:
# TODO (original author's note): maybe move this and everything after it so it
# only executes when training.
# Wrap the gap-filler DataFrame in a Hugging Face Dataset, then tokenize it by
# calling tokenize_function on batches of 8 rows.
raw_dataset = Dataset.from_pandas(df_gap_filler)
tokenized_datasets = raw_dataset.map(tokenize_function, batched=True, batch_size=8)

In [None]:
# Hold out 20% of the tokenized gap-filler examples for evaluation.
# random_state is fixed (same seed as the earlier 50/50 split) so the
# train/eval partition is identical on every run; without it the evaluation
# set changes each time, even when the model is simply reloaded from disk.
train_indices, val_indices = train_test_split(
    list(range(len(tokenized_datasets))), test_size=0.2, random_state=1
)

# Materialize the two subsets by row index.
train_dataset = tokenized_datasets.select(train_indices)
eval_dataset = tokenized_datasets.select(val_indices)

## Training

In [None]:
def get_compiled_bert_LM():
    """Build a fresh masked-LM BERT and compile it when training data exists.

    Reads the notebook-level globals ``train_dataset``, ``batch_size``,
    ``epochs`` and ``learning_rate``.

    Returns:
        The model from ``get_bert_LM()``. It is compiled (optimizer from
        ``create_optimizer``, ``run_eagerly=True``) only when ``train_dataset``
        is set and non-empty; otherwise it is returned uncompiled.
    """
    # Free up memory held by previously built Keras models. The public
    # tf.keras.backend API is used rather than the private
    # tensorflow.python.keras copy.
    tf.keras.backend.clear_session()

    # Build the model
    model = get_bert_LM()

    # Print the model architecture. summary() prints by itself and returns
    # None, so it must not be wrapped in print().
    model.summary()

    if train_dataset is not None and len(train_dataset) > 0:
        # The optimizer takes one step per *batch*, so the schedule length is
        # batches-per-epoch * epochs, not examples * epochs. Floor division
        # assumes the final partial batch is dropped during training
        # (NOTE(review): confirm against the training dataset's settings).
        steps_per_epoch = max(1, len(train_dataset) // batch_size)
        num_train_steps = steps_per_epoch * epochs

        # Compile
        optimizer, _lr_schedule = create_optimizer(
            init_lr=learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=0,
        )
        # No loss is passed to compile() here.
        model.compile(optimizer=optimizer, run_eagerly=True)

    return model

In [None]:
# Build the masked-LM BERT; it comes back compiled for training when
# train_dataset is available at call time.
gap_tuned_model = get_compiled_bert_LM()

In [None]:
# Collator that prepares masked-language-modeling batches as TensorFlow tensors;
# masking settings are left at the library defaults.
data_collator = DataCollatorForLanguageModeling(tokenizer=bert_tokenizer, return_tensors="tf")

In [None]:
# Default tf.data options (nothing is customized here); attached to the
# training and evaluation datasets via with_options().
options = tf.data.Options()

In [None]:
# Shuffled, collated training pipeline. The batch size comes from the
# notebook-level `batch_size` setting instead of a repeated literal, so the
# configuration lives in one place.
tf_train_dataset = gap_tuned_model.prepare_tf_dataset(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
).with_options(options)

In [None]:
# Evaluation pipeline: fixed order, and the final partial batch is dropped.
# The batch size comes from the notebook-level `batch_size` setting instead of
# a repeated literal, so the configuration lives in one place.
tf_eval_dataset = gap_tuned_model.prepare_tf_dataset(
    eval_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
    drop_remainder=True,
).with_options(options)

In [None]:
# Switch: True fine-tunes the gap filler and saves it to Drive;
# False skips training and loads the previously saved weights instead.
train_model = False

if not train_model:
    # Replace the freshly built model with the checkpoint saved on Drive.
    gap_tuned_model = TFBertForMaskedLM.from_pretrained(
        word_dir + 'Senior Thesis models/model_LM_bert_1/temp'
    )
else:
    start_time = time.time()
    history = gap_tuned_model.fit(
        tf_train_dataset,
        validation_data=tf_eval_dataset,
        epochs=epochs,
    )
    # Wall-clock duration of fit(), converted from seconds to minutes.
    execution_time = (time.time() - start_time)/60.0
    print("Training execution time (mins)",execution_time)
    # Persist the tuned weights so later runs can take the load branch.
    gap_tuned_model.save_pretrained(
        word_dir + 'Senior Thesis models/model_LM_bert_1/temp'
    )