***IMPORT LIBRARIES***

In [None]:
# ////GENERALS////
import re
import json
import pandas as pd
import numpy as np
from typing_extensions import Type
import matplotlib.pyplot as plt
import warnings

# ////AUTOMATIC LEARNING////
import tensorflow as tf
from datasets import load_dataset,DatasetDict,Dataset
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ModelCheckpoint

# ////TRANSFORMERS BY Hugging Face////
import transformers
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
from transformers import create_optimizer, AdamWeightDecay

# ////NLP MEASURES////
import nltk
from nltk.translate.bleu_score import sentence_bleu,corpus_bleu
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
import evaluate
# Load the METEOR metric once here so the scoring cells further down can reuse it.
meteor_measure = evaluate.load('meteor')

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")

***READ THE DATASET***

In [None]:
# Read the dataset JSON file.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open('PATH TO DATASET', encoding='utf-8') as f:
    dataset = json.load(f)

# Every element stored under the top-level 'pairs' key is one NL/CNL pair.
rows = dataset['pairs']

# Natural language (nl) inputs and their Controlled Natural Language (cnl) targets
nl = [row['NL'] for row in rows]
cnl = [row['CNL'] for row in rows]

***DATASET STRUCTURE***

In [None]:
# Wrap the sentence pairs in a Dataset and split it 85:15 into train/test.
# A fixed seed keeps the split identical across kernel restarts, so the
# held-out examples (and every score computed on them) are reproducible.
SPLIT_SEED = 42
train_dict = Dataset.from_dict({"sentences": nl, "targets": cnl})
# After this line train_dict holds the two splits, addressed as
# train_dict['train'] and train_dict['test'].
train_dict = train_dict.train_test_split(test_size=0.15, seed=SPLIT_SEED)
train_dict['train'][0]

***TRAINING TRANSFORMERS***

In [None]:
# Load the tokenizer of the "t5-small" checkpoint through transformers' AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-small")

In [None]:
# Task prefix prepended to every source sentence before tokenization
prefix = "translate English to CNL: "


def preprocess(data):
    """Tokenize one batch of sentence/target pairs for seq2seq training.

    Args:
        data: Mapping with a "sentences" entry (source texts) and a
            "targets" entry (reference outputs).

    Returns:
        The tokenizer output for the prefixed sentences, with the targets
        passed as ``text_target``; the call uses ``max_length=128`` with
        truncation enabled.
    """
    prefixed_sentences = [prefix + sentence for sentence in data["sentences"]]
    return tokenizer(
        prefixed_sentences,
        text_target=data["targets"],
        max_length=128,
        truncation=True,
    )

In [None]:
# Run preprocess over the split dataset in batched mode; the result is indexed
# by "train" / "test" further down when the TensorFlow datasets are built.
tokenized_cnl = train_dict.map(preprocess, batched=True)
tokenized_cnl

In [None]:
# Load the pretrained seq2seq model. "t5-small" is used here; replace the name
# with 'facebook/bart-base' or with the path of a previously saved model.
model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small") # update model name 'facebook/bart-base' or give path of saved model

In [None]:
# Collator used as collate_fn when the TensorFlow datasets are built below;
# it is configured with the tokenizer and model and returns TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf")

In [None]:
## Build the TensorFlow datasets for both splits.
## NOTE(review): the previous comment stated that no test set is used, but the
## "test" split is prepared here and passed as validation data during training.
# Training split: shuffled, batches of 16.
tf_train_set = model.prepare_tf_dataset(
    tokenized_cnl["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

# Held-out split: order preserved, batches of 16.
tf_test_set = model.prepare_tf_dataset(
    tokenized_cnl["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [None]:
# AdamWeightDecay optimizer with learning rate 2e-5 and weight decay rate 0.01
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

In [None]:
# Compile the model with the optimizer defined above.
# No loss argument is passed here.
# NOTE(review): presumably the model falls back to its built-in loss — confirm.
model.compile(optimizer=optimizer)

In [None]:
# Define the CSVLogger callback.
# append=True means an existing log file is extended rather than overwritten,
# so delete the file before a fresh run if the loss plot should show one run only.
csv_logger = tf.keras.callbacks.CSVLogger('training-t5Small_FullDataset.log', separator=",", append=True)

# Define the EarlyStopping callback: stop once val_loss has not improved for 20
# epochs. restore_best_weights=True rolls the model back to the epoch with the
# lowest val_loss, so the model saved after training is the best one rather than
# the one from 20 epochs past the optimum.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20, restore_best_weights=True)

In [None]:
# Model training.
# fit is called exactly once, with the CSV logger and early stopping attached.
# An additional callback-free fit call would train for the full 200 epochs
# without logging or early stopping, and this call would then merely continue
# from those already-trained weights.
history = model.fit(tf_train_set, validation_data=tf_test_set, epochs=200, callbacks=[csv_logger, es])
model.save_pretrained('content/t5mall_Complete_Dataset/') #save the model in specified path

In [None]:
# Read the per-epoch metrics written by the CSVLogger callback
log_data = pd.read_csv('training-t5Small_FullDataset.log')

# Draw the training and validation loss curves on one set of axes
fig, ax = plt.subplots(figsize=(10, 6))
for column, label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    ax.plot(log_data[column], label=label)
ax.set_title('Training Loss vs Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

In [None]:
# Reload the trained model from the directory it was saved to after training;
# from here on `model` refers to the reloaded copy.
model = TFAutoModelForSeq2SeqLM.from_pretrained('content/t5mall_Complete_Dataset/')
print(model)

***BLEU SCORE FUNCTION***

In [None]:
# Function to compute the BLEU Score
def bleu_score(model, tokenizer, test_dict):
    """Compute cumulative corpus-level BLEU scores for the model's predictions.

    Args:
        model: Seq2seq model forwarded to predict_cnl.
        tokenizer: Tokenizer forwarded to predict_cnl.
        test_dict: Dataset-like object exposing a 'sentences' column (inputs)
            and a 'targets' column (one reference per input).

    Returns:
        dict mapping '1-grams', '1-2-grams', '1-3-grams' and '1-4-grams' to
        the corpus BLEU score computed with uniform weights over the n-gram
        orders named by the key.
    """
    # All sentences are scored; slice this list (e.g. [0:5]) for a quick test run.
    sents = test_dict['sentences']
    predicted = predict_cnl(model, tokenizer, sents)
    targets = test_dict['targets']

    # corpus_bleu expects, per hypothesis, a list of tokenized references;
    # both sides are tokenized on whitespace.
    references = [[target.split()] for target, _ in zip(targets, predicted)]
    hypothesis = [pred.split() for pred in predicted]

    # Each weight tuple spreads a total weight of 1 evenly over the n-gram
    # orders in use; (0.3, 0.3, 0.3, 0) would only sum to 0.9 and therefore
    # not be a proper geometric mean of the three precisions.
    weight_sets = {
        '1-grams': (1.0, 0, 0, 0),
        '1-2-grams': (0.5, 0.5, 0, 0),
        '1-3-grams': (1 / 3, 1 / 3, 1 / 3, 0),
        '1-4-grams': (0.25, 0.25, 0.25, 0.25),
    }
    bleu_dic = {
        name: corpus_bleu(references, hypothesis, weights=weights)
        for name, weights in weight_sets.items()
    }

    return bleu_dic

# Compute the BLEU Score on the training split.
# train_dict holds the result of train_test_split, i.e. the 'train' and 'test'
# splits; the split has to be selected explicitly because the container itself
# has no 'sentences' / 'targets' columns.
# NOTE: bleu_score calls predict_cnl, which is defined in a later cell of this
# notebook; that cell must have been executed before this one.
bleu_train = bleu_score(model, tokenizer, train_dict['train'])
bleu_train

***METEOR SCORE FUNCTION***

In [None]:
# Function to compute the METEOR Score
def compute_meteor(model, test_dict, alpha=0.9, beta=3, gamma=0.5):
    """Compute the METEOR score of the model's predictions on a dataset.

    Args:
        model: Seq2seq model forwarded to predict_cnl.
        test_dict: Dataset-like object exposing a 'sentences' column (inputs)
            and a 'targets' column (one reference per input).
        alpha: METEOR alpha parameter, forwarded to the metric.
        beta: METEOR beta parameter, forwarded to the metric.
        gamma: METEOR gamma parameter, forwarded to the metric.

    Returns:
        The result of ``meteor_measure.compute`` for the predictions.

    Note:
        Uses the notebook-level ``tokenizer`` and ``meteor_measure`` objects.
    """
    sents = test_dict['sentences']

    predicted = predict_cnl(model, tokenizer, sents)
    targets = test_dict['targets']

    # One single-element reference list per prediction
    references = [[target] for target, _ in zip(targets, predicted)]
    hypothesis = list(predicted)

    # Forward alpha/beta/gamma so that the function's arguments actually take
    # effect instead of being silently ignored.
    meteor_dic = meteor_measure.compute(
        predictions=hypothesis,
        references=references,
        alpha=alpha,
        beta=beta,
        gamma=gamma,
    )

    return meteor_dic

# Compute the METEOR Score on the training split.
# train_dict holds the 'train' and 'test' splits produced by train_test_split,
# so the split is selected explicitly; the container itself has no
# 'sentences' / 'targets' columns.
# NOTE: compute_meteor calls predict_cnl, which is defined in a later cell of
# this notebook; that cell must have been executed before this one.
meteor_train = compute_meteor(model, train_dict['train'])
meteor_train


***INFERENCE MULTI SENTENCES***

In [None]:
#Sentence Cleaning function
def clean_sentence(sentence):
    """Strip tokenizer special tokens from a decoded sentence.

    Every occurrence of ``</s>``, ``<s>``, ``<pad>`` and ``<unk>`` is removed
    and surrounding whitespace is trimmed.

    Args:
        sentence: Decoded model output as a string.

    Returns:
        The sentence without the special tokens.
    """
    # Raw string without the backslash: "\/" in a normal string literal is an
    # invalid escape sequence, and "/" needs no escaping in a Python regex.
    special_tokens = r'(</s>|<s>|<pad>|<unk>)'
    return re.sub(special_tokens, "", sentence).strip()

## Quick check of clean_sentence: the sample contains </s>, <s> and <pad> markers
## that should all be removed from the displayed result.
clean_sentence('</s><s>Waiter W is working when waiter W serves a drink.</s><pad><pad><pad><pad><pad><pad><pad><pad>')

In [None]:
#Predicting the CNL function
%%time
def predict_cnl(model, tokenizer, sentences):
    """Translate natural-language sentences into CNL with the given model.

    Args:
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer used to encode the inputs and decode the outputs.
        sentences: Iterable of input sentences (without the task prefix).

    Returns:
        list of decoded predictions, one per input sentence, with the special
        tokens removed by clean_sentence.
    """
    task_prefix = "translate English to CNL: "
    encoded_batch = tokenizer(
        [task_prefix + text for text in sentences],
        return_tensors="tf",
        padding=True,
    )
    generated = model.generate(
        input_ids=encoded_batch["input_ids"],
        attention_mask=encoded_batch["attention_mask"],
        do_sample=False,  # sampling disabled so that batching cannot change the output
        max_length=300,
        min_length=5,
        num_beams=1,
    )

    # Decode every generated sequence, then strip the special tokens
    decoded = map(tokenizer.decode, generated)
    return [clean_sentence(text) for text in decoded]

## Quick check of predict_cnl on a single sentence.
## The result is stored in `translateds` and not displayed by this cell.
toTranslate = ["The pub 1 is close to the pub number 2 and the pub number X, where X is equal to 3, 4."]
translateds = predict_cnl(model, tokenizer, toTranslate)

***PREDICTION ON TEST DATASET***

In [None]:
# Read the external test set from the Excel file 'test.xlsx' into a DataFrame
test_data = pd.read_excel('test.xlsx')

In [None]:
# Input sentences: the 'Test Natural Language' column of the test sheet
sentences_to_predict = test_data['Test Natural Language'].tolist()

# Translate all test sentences with the model in a single predict_cnl call
predicted_cnl = predict_cnl(model, tokenizer, sentences_to_predict)

# Reference translations: the 'CNL' column, to be compared with the predictions
actual_cnl = test_data['CNL'].tolist()

In [None]:
# Put reference and predicted CNL side by side, one row per test sentence
results_df = pd.DataFrame({'Actual CNL': actual_cnl, 'Predicted CNL': predicted_cnl})

# Persist the comparison as CSV (without the DataFrame index column)
results_df.to_csv('cnl_predictions_t5Small_Complete.csv', index=False)

In [None]:
def sentence_to_words(sentence):
    """Split a sentence into its whitespace-separated words.

    Uses ``str.split()`` without an argument so that repeated spaces, tabs or
    leading/trailing whitespace never produce empty-string "words"; this is
    also the tokenization used for the BLEU computation in this notebook.

    Args:
        sentence: Sentence as a string.

    Returns:
        list of the words in the sentence (empty for a blank sentence).
    """
    return sentence.split()

def sentences_to_word_lists(sentences):
    """Apply sentence_to_words to every sentence.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        list with one word list per input sentence, in the same order.
    """
    return [sentence_to_words(sentence) for sentence in sentences]


def calculate_metrics_word_level(actual, predicted):
    """Score predicted sentences against references on their word sets.

    Both sentence lists are split into words and encoded with a
    MultiLabelBinarizer fitted on the joint vocabulary, giving one binary
    indicator row per sentence. The sklearn metrics are then computed on these
    indicator matrices.

    NOTE(review): with indicator-matrix input sklearn's accuracy_score is
    believed to be subset accuracy (the whole row has to match) — confirm that
    this is the intended meaning of "word-level accuracy".

    Args:
        actual: Reference sentences.
        predicted: Predicted sentences, aligned with ``actual``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and F1
        are micro-averaged with ``zero_division=0``.
    """
    actual_tokens = sentences_to_word_lists(actual)
    predicted_tokens = sentences_to_word_lists(predicted)

    # Joint vocabulary of every word seen in either the references or the predictions
    vocabulary = set()
    for words in actual_tokens + predicted_tokens:
        vocabulary.update(words)

    binarizer = MultiLabelBinarizer()
    binarizer.fit([list(vocabulary)])

    y_true = binarizer.transform(actual_tokens)
    y_pred = binarizer.transform(predicted_tokens)

    micro_options = {'average': 'micro', 'zero_division': 0}
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, **micro_options),
        recall_score(y_true, y_pred, **micro_options),
        f1_score(y_true, y_pred, **micro_options),
    )

In [None]:
# Sanity check: both lists should have the same length before they are scored together
print("Number of actual CNL sentences: ", len(actual_cnl))
print("Number of predicted CNL sentences: ", len(predicted_cnl))

In [None]:
# Word-level accuracy / precision / recall / F1 of the predictions on the test sheet
accuracy, precision, recall, f1 = calculate_metrics_word_level(actual_cnl, predicted_cnl)
print('Accuracy word_level:', accuracy)
print('Precision word_level:', precision)
print('Recall word_level:', recall)
print('F1 score word_level:', f1)

In [None]:
# To calculate BLEU & METEOR, load the test dataset from the provided Excel file
# and wrap it in a Dataset with the 'sentences' / 'targets' columns that the
# scoring functions read.
# NOTE(review): 'test.xlsx' is read a second time here; each scoring function
# also runs its own prediction pass over the test sentences.
test_data = pd.read_excel('test.xlsx')
test_nl = test_data['Test Natural Language'].tolist()
test_cnl = test_data['CNL'].tolist()
test_dict = Dataset.from_dict({"sentences": test_nl, "targets": test_cnl})

# Compute BLEU Score
bleu_results = bleu_score(model, tokenizer, test_dict)
print("BLEU Scores:", bleu_results)

# Compute METEOR Score
meteor_results = compute_meteor(model, test_dict)
print("METEOR Score:", meteor_results)
