In [1]:
#@title # Setting up the environment { vertical-output: true, display-mode: "form" }

###################
#####  SETUP  #####
###################

#@title Setting up project paths
import os

# Toggle between running on Google Colab (project lives on Google Drive)
# and running on a local machine (project lives in the user's home folder).
colab_setup = False #@param {type:"boolean"}
# Project root used when `colab_setup` is True; overwritten below otherwise.
PROJECT_PATH = "/content/drive/MyDrive/TWM/Graduation-Project/" #@param {"type":"string"}

if colab_setup:
    # Imported lazily: `google.colab` only exists inside a Colab runtime.
    from google.colab import drive
    print("Mounting Google Drive...", end="", flush=True)
    drive.mount('/content/drive')
    print("Done")

else:
    # Set this to the parent directory of the whole project.
    # The home directory is resolved through the standard library instead of
    # hard-coding `C:\Users\<USERNAME>`: reading os.environ['USERNAME'] raises
    # KeyError outside Windows and is wrong for relocated user profiles.
    # On a default Windows profile this yields the same path as before.
    PROJECT_PATH = os.path.join(os.path.expanduser("~"), "Graduation-Project")

print("PROJECT_PATH:", PROJECT_PATH)
# Every relative path used later in the notebook is resolved from the project
# root, so make it the working directory (raises if the folder is missing).
os.chdir(PROJECT_PATH)
# Show the project root's contents as a quick sanity check.
os.listdir()

PROJECT_PATH: C:\Users\LAPTOP\Graduation-Project


['.git',
 '.gitignore',
 '.vscode',
 'chatbot-env',
 'DataEngineering',
 'FineTuning',
 'hierarchy.txt',
 'README.md',
 'requirements.txt',
 'Terminal.ipynb',
 'Testing Interface.ipynb',
 'Utils',
 'WebSocket Interface']

In [2]:
#@title # Environment Watermark
# Stamp the notebook with authorship and runtime details so that the
# recorded outputs can be traced back to the environment that produced them.
# Requires the `watermark` IPython extension to be installed.
%load_ext watermark
# Author / contact stamp.
%watermark --author "Mohamed Hisham" --email "Mohamed00Hisham@gmail.com" --github_username "Mhmd-Hisham"
# Default report: timestamp, Python/IPython versions, compiler, OS and hardware.
%watermark
# Versions of the modules imported in this kernel at the time the cell runs.
%watermark --iversions

Author: Mohamed Hisham

Github username: Mhmd-Hisham

Email: Mohamed00Hisham@gmail.com

Last updated: 2022-10-16T05:38:32.890313+02:00

Python implementation: CPython
Python version       : 3.9.5
IPython version      : 8.5.0

Compiler    : MSC v.1928 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : Intel64 Family 6 Model 165 Stepping 2, GenuineIntel
CPU cores   : 12
Architecture: 64bit

sys: 3.9.5 (tags/v3.9.5:0a7dcbd, May  3 2021, 17:27:52) [MSC v.1928 64 bit (AMD64)]



In [3]:
import Utils.EasyT5 as EasyT5
import multiprocessing
from transformers import T5ForConditionalGeneration
from transformers import T5TokenizerFast as T5Tokenizer

In [4]:
# Project-relative locations (the setup cell changes the working directory
# to PROJECT_PATH, so these resolve from the project root).
# Dataset folder; not referenced by the cells visible in this notebook.
DATASET_PATH = "DataEngineering/FinalDataset/large/" #@param {"type": "string"}
# Folder holding the fine-tuned T5 checkpoints; used to build "checkpoint_name".
MODEL_CHECKPOINTS = "FineTuning/T5/checkpoints/" #@param {"type": "string"}
# TensorBoard log folder; not referenced by the cells visible in this notebook.
TENSORBOARD_LOGS = "FineTuning/T5/TB_LOGS/" #@param {"type":"string"}

In [5]:
# Seed shared by the whole experiment (stored under parameters['general']).
SEED = 512

# Container for every setting of the experiment, grouped by section.
parameters = EasyT5.ExperimentParameters()

# --- trainer: training process / PyTorch Lightning trainer settings ---
parameters['trainer'] = dict(
    # keep only the most recent 'n' epochs
    save_last_n_epochs=3,
    # fixed learning rate for the model
    fixed_learning_rate=1e-4,
    # metric watched by early stopping
    early_stopping_monitor="val_loss",
    # minimum delta between epochs for early stopping to apply
    early_stopping_min_delta=0.01,
    # 0 disables the early stopping feature
    early_stopping_patience_epochs=0,
    # mode of the early stopping criterion
    early_stopping_mode="min",
    # maximum number of epochs to train/fine-tune the model for
    max_epochs=5,
    # floating point precision
    precision=32,
    # training batch size, i.e. the batch size at which
    # the data is loaded into memory
    batch_size=8,
)

# --- general: settings describing the working environment ---
parameters['general'] = dict(
    # output directory
    output_dir="",
    # name/path of the checkpoint to load, either from Hugging Face
    # or from the local disk
    checkpoint_name=f"{MODEL_CHECKPOINTS}-epoch-9-tloss-1.5577-vloss-1.8296",
    # run name shown on TensorBoard
    tensorboard_name="t5-v1_1-base_BatchSize-16_N-Splits-4_DatasetSize-large_Topic-Food&Drink_version_2",
    # number of CPU cores on the current machine
    cpu_cores=multiprocessing.cpu_count(),
    # environment seed
    seed=SEED,
)

# --- encoder: arguments passed to the tokenizer when encoding text ---
parameters['encoder'] = dict(
    # padding method for the input sequences
    padding="max_length",
    # whether long sequences are truncated
    truncation=True,
    # whether special tokens are added to the input sequences
    add_special_tokens=True,
    # maximum length of the input sequence
    max_length=512,
)

# --- generator: arguments passed to the model when generating text ---
parameters['generator'] = dict(
    # number of beams used in beam search (also known as beam width)
    num_beams=2,
    # maximum length of the generated sequences
    max_length=512,
    # repetition penalty applied when the model repeats words
    repetition_penalty=2.5,
    # penalty applied when the model generates lengthy sequences
    length_penalty=1.0,
    # whether to stop the beam search once at least `num_beams` sentences
    # are finished per batch
    early_stopping=True,
    # if set to a float < 1, only the most probable tokens whose
    # probabilities add up to `top_p` or higher are kept for generation
    top_p=0.95,
    # number of highest-probability vocabulary tokens kept for top-k filtering
    top_k=50,
    # number of returned sequences
    num_return_sequences=1,
    # whether special tokens are skipped when generating
    skip_special_tokens=True,
    # whether tokenization spaces are cleaned up before returning the output
    clean_up_tokenization_spaces=True,
)


In [6]:
# Point the experiment at the fine-tuned checkpoint using an absolute path
# built from the project root, so loading does not depend on the current
# working directory.
parameters['general']["checkpoint_name"] = os.path.join(
    PROJECT_PATH,
    MODEL_CHECKPOINTS,
    "-epoch-9-tloss-1.5577-vloss-1.8296",
)
# Must be a plain string: a trailing comma after the literal would turn the
# right-hand side into a one-element tuple.
parameters['general']["tensorboard_name"] = "t5-v1_1-base_BatchSize-16_N-Splits-4_DatasetSize-large_Topic-Food&Drink_version_2"

# load the model
model = EasyT5.EasyT5(parameters)
model.from_pretrained(T5Tokenizer, T5ForConditionalGeneration, return_dict=False)

In [7]:
# You can experiment with different generator parameters like this.
# Start from a copy of the baseline settings, then give the copy its OWN
# 'generator' dict. If `parameters.copy()` is shallow (as dict.copy() is;
# TODO confirm for ExperimentParameters), writing into
# custom_parameters['generator'][...] in place would also change
# parameters['generator'] and silently alter the baseline for later cells.
custom_parameters = parameters.copy()
custom_parameters['generator'] = {
    **parameters['generator'],
    # Other keys that can be overridden here:
    # 'max_length', 'repetition_penalty', 'length_penalty', 'early_stopping'
    'num_beams': 3,
    'top_p': 0.95,
    'top_k': 100,
    'num_return_sequences': 3,
}

# Ask the model to complete the prompt using the customised settings.
suggestion = model.predict("complete: I want to eat ", custom_parameters)
print(suggestion)

['a pizza.', 'a pizza. What kind of food?', 'a pizza with my friends.']
