# GPT-2 Lite Interview Scripter

## Basic GPT-2 model test
- Import the necessary libraries
- Load the GPT-2 tokenizer, preprocessor, and model
- Run a text generation test

In [None]:
import tensorflow as tf
# Turn on TensorFlow's logging of device placement decisions.
tf.debugging.set_log_device_placement(True)

In [None]:
from tensorflow.python.client import device_lib

# A bare expression is only displayed when it is the last statement of a
# notebook cell, so the version has to be printed explicitly to be visible.
print(tf.__version__)
# Last expression of the cell: the list of local devices is shown as output.
device_lib.list_local_devices()

In [None]:
! pip install -q git+https://github.com/keras-team/keras-nlp.git@google-io-2023 tensorflow-text==2.10

In [None]:
! pip install transformers datasets

In [None]:
import numpy as np
from datasets import load_dataset
import keras_nlp
from tensorflow import keras
from tensorflow.lite.python import interpreter
import time

In [None]:
! pip install tensorflow-text

In [None]:
# The tokenizer, preprocessor and causal LM are all built from one preset.
preset = "gpt2_base_en"

gpt2_tokenizer = keras_nlp.models.GPT2Tokenizer.from_preset(preset)

# Preprocessor configured with sequence_length=256 and add_end_token=True.
gpt2_preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset(
    preset, sequence_length=256, add_end_token=True
)

# The language model is given the preprocessor created above.
gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset(
    preset,
    preprocessor=gpt2_preprocessor,
)

In [None]:
# Time one text generation with the model as loaded from the preset.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()

output = gpt2_lm.generate("My trip to Yosemite was", max_length=200)
print("\nGPT-2 output:")
print(output.numpy().decode("utf-8"))

end = time.perf_counter()
print("TOTAL TIME ELAPSED: ", end - start)

In [None]:
# Time a second generation with a different prompt.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()

output = gpt2_lm.generate("That Italian restaurant is", max_length=200)
print("\nGPT-2 output:")
print(output.numpy().decode("utf-8"))

end = time.perf_counter()
print("TOTAL TIME ELAPSED: ", end - start)

## Fine-tune on the Startup Interviews dataset by Glavin001

- Preprocess the dataset
- Fine-tune the model
- Save the model weights

In [None]:
# Load the dataset and report how long loading takes.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()

raw_dataset = load_dataset("Glavin001/startup-interviews")

end = time.perf_counter()
print("TOTAL TIME ELAPSED: ", end - start)

In [None]:
# Drop the listed columns; 'instruction' and 'output', which the later cells
# read, are not in the list and are therefore kept.
unused_columns = ['input', 'start', 'instruction_length', 'output_length', 'title']
dataset = raw_dataset.remove_columns(unused_columns)

In [None]:
# Keep only the first 500 rows of the 'train' split.
dataset = dataset['train'].select(range(500))

In [None]:
# Show the dataset object as the cell output.
dataset

In [None]:
# Cap that the sentence-splitting loop compares against the number of
# sentences produced for each example.
max_length = 512
# Accumulates the sentences collected from every example.
all_sentences = []
# NOTE(review): `count` and `total` are not read anywhere in the visible cells.
count = 0
total = len(dataset)

In [None]:
def merge_sentences(first_s, second_s):
    """Return the two strings joined by the separator " : "."""
    return " : ".join((first_s, second_s))

# Pair each instruction with its output and merge the pair into one string.
raw_sentences = [
    merge_sentences(instruction, answer)
    for instruction, answer in zip(dataset['instruction'], dataset['output'])
]

In [None]:
# Inspect one merged example (index 3) as the cell output.
raw_sentences[3]

In [None]:
# NLTK's tokenize module supplies the sentence splitter used in the next cell.
from nltk import tokenize
import nltk

# Download the 'punkt' resource -- presumably what sent_tokenize relies on.
nltk.download('punkt')

In [None]:
# Split every merged "instruction : output" string into individual sentences.
# The loop variable has its own name: reusing `raw_sentences` would rebind the
# list to its last element, breaking later uses and re-runs of this cell.
for merged_text in raw_sentences:
    # NLTK's sentence tokenizer turns one text into a list of sentences.
    # Keep at most `max_length` of them; slicing is a no-op for shorter lists.
    # NOTE(review): this caps the number of sentences per example, not the
    # token length of a sentence -- confirm that is the intent.
    sentences = tokenize.sent_tokenize(str(merged_text))[:max_length]
    # Add this example's sentences to the overall collection.
    all_sentences.extend(sentences)

In [None]:
# Build the tf.data input pipeline from the collected sentences.
tf_train_ds = tf.data.Dataset.from_tensor_slices(all_sentences)

# Preprocess each sentence, then group the results into batches of 4.
batched_ds = tf_train_ds.map(
    gpt2_preprocessor, num_parallel_calls=tf.data.AUTOTUNE
).batch(4)

# Full preprocessed dataset, unchanged for any cell that uses it.
processed_ds = batched_ds.cache().prefetch(tf.data.AUTOTUNE)

# Training subset: the first 100 batches. take() is applied before cache()
# because a cache is only finalized when its input is read to the end; with
# take() downstream of cache() the iteration stops early and the cache is
# discarded.
part_of_ds = batched_ds.take(100).cache().prefetch(tf.data.AUTOTUNE)

In [None]:
# The training dataset was already mapped through gpt2_preprocessor when it
# was built, so switch off the model's own preprocessing flag before fitting.
gpt2_lm.include_preprocessing = False

num_epochs = 1

# Polynomial decay of the learning rate from 5e-5 to 2e-5 over all steps.
total_steps = part_of_ds.cardinality() * num_epochs
lr = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=5e-5,
    decay_steps=total_steps,
    end_learning_rate=2e-5,
)

# The loss is computed on logits (from_logits=True).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

gpt2_lm.compile(
    optimizer=keras.optimizers.experimental.Adam(lr),
    loss=loss,
    weighted_metrics=["accuracy"],
)
gpt2_lm.fit(part_of_ds, epochs=num_epochs)

In [None]:
# Time one text generation for a startup-interview style prompt.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()

output = gpt2_lm.generate("How do you determine the effectiveness of implementing suggested changes in a startup?", max_length=200)
print("\nGPT-2 output:")
print(output.numpy().decode("utf-8"))

end = time.perf_counter()
print("TOTAL TIME ELAPSED: ", end - start)

In [None]:
# Save the weights of the model's backbone to "finetuned_model.h5"
# (a relative path, so the file lands in the current working directory).
gpt2_lm.backbone.save_weights("finetuned_model.h5")

In [None]:
# Remove the notebook's references to the tokenizer, preprocessor and model
# (presumably so their memory can be reclaimed before later cells run).
del gpt2_tokenizer
del gpt2_preprocessor
del gpt2_lm