# Train the model

In [None]:
# Optional: mount Google Drive so the paths under /content/drive used by later
# cells resolve. The google.colab package only exists on Colab, so outside Colab
# this cell reports that it was skipped instead of failing with ImportError.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available; skipping the Google Drive mount.")
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install torch

In [None]:
import os
import re
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [None]:
def read_txt(file_path, encoding="utf-8"):
    """Return the full contents of a text file as one string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The decoded contents of the file.
    """
    with open(file_path, "r", encoding=encoding) as file:
        return file.read()

def read_documents_from_directory(directory):
    """Concatenate the contents of every ``.txt`` entry directly inside a directory.

    Entries are read in sorted filename order. ``os.listdir`` returns names in
    arbitrary order, so without sorting the combined text could differ from
    run to run on the same data.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        The file contents joined back to back with no separator; an empty
        string when the directory has no ``.txt`` entries.
    """
    txt_names = sorted(
        name for name in os.listdir(directory) if name.endswith(".txt")
    )
    # Join once instead of growing a string with += inside the loop.
    return "".join(read_txt(os.path.join(directory, name)) for name in txt_names)

In [None]:
pip install transformers[torch]
pip install accelerate -U

In [None]:
#Configure the training hyperparameters

import logging
import os

def train_chatbot(directory, model_output_path, base_model="gpt2",
                  block_size=128, num_train_epochs=100):
    """Fine-tune a GPT-2 language model on the ``.txt`` files of a directory.

    The text of all files is combined, runs of newlines are collapsed, and the
    result is written to ``train.txt`` in the current working directory. The
    model is then trained on fixed-size token blocks of that file, and the
    trained model and tokenizer are saved to ``model_output_path``.

    Args:
        directory: Directory containing the training ``.txt`` files.
        model_output_path: Directory that receives checkpoints, the final
            model and the tokenizer.
        base_model: Name or path of the pretrained checkpoint to start from.
        block_size: Number of tokens per training example.
        num_train_epochs: Number of passes over the training data.

    Raises:
        ValueError: If no training text is found, or the text is too short to
            produce a single block of ``block_size`` tokens.
    """
    # Read documents from the directory
    combined_text = read_documents_from_directory(directory)
    train_text = re.sub(r'\n+', '\n', combined_text).strip()  # Remove excess newline characters
    if not train_text:
        raise ValueError(f"No training text found in .txt files under {directory!r}")

    # Save the training data as a text file with utf-8 encoding
    train_file = "train.txt"
    with open(train_file, "w", encoding="utf-8") as f:
        f.write(train_text)

    # Set up the tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained(base_model)
    model = GPT2LMHeadModel.from_pretrained(base_model)

    # Prepare the dataset.
    # overwrite_cache=True: train.txt is rewritten on every call, so a token
    # cache left next to it by a previous run must not be reused.
    # NOTE(review): TextDataset is reported as deprecated by recent
    # transformers releases - confirm against the installed version.
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_file,
        block_size=block_size,
        overwrite_cache=True,
    )
    if len(train_dataset) == 0:
        raise ValueError(
            f"Training text is too short to fill one block of {block_size} tokens"
        )

    # mlm=False selects causal (next-token) language modeling rather than
    # masked language modeling, which is what GPT-2 is trained with.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Set up the training arguments
    training_args = TrainingArguments(
        output_dir=model_output_path,
        overwrite_output_dir=True,
        per_device_train_batch_size=4,
        num_train_epochs=num_train_epochs,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir='/content/logs',
        logging_steps=200,
        logging_first_step=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    # The log file is named after the last component of the logging directory
    # and lives inside it (e.g. /content/logs/logs).
    log_dir = training_args.logging_dir
    log_file_name = os.path.basename(log_dir)

    # logging.basicConfig opens the file immediately, so the directory has to
    # exist before it is called.
    os.makedirs(log_dir, exist_ok=True)

    # Configure the logging module to save logs in UTF-8 format
    logging.basicConfig(
        filename=os.path.join(log_dir, log_file_name),
        filemode='w',
        level=logging.DEBUG,
        format='%(asctime)s - %(levelname)s - %(message)s',
        encoding='utf-8'
    )

    # Train the model, then persist the model and its tokenizer together
    trainer.train()
    trainer.save_model(model_output_path)
    tokenizer.save_pretrained(model_output_path)

In [None]:
if __name__ == "__main__":
    # Destination for the fine-tuned model and tokenizer
    model_output_path = "/content/gpt2model"
    # Folder holding the training dataset files
    directory = "/content/drive/MyDrive/0Colab_Notebooks/0jia_gpt_test6/training"

    # Fine-tune on the training folder and write the result to disk
    train_chatbot(directory=directory, model_output_path=model_output_path)

    # Read the saved tokenizer and model back from disk
    tokenizer = GPT2Tokenizer.from_pretrained(model_output_path)
    model = GPT2LMHeadModel.from_pretrained(model_output_path)


# Generate response

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [None]:
def generate_response(model, tokenizer, prompt, max_length):
    """Sample a continuation of ``prompt`` and return only the newly generated text.

    Args:
        model: Causal language model exposing ``generate`` (e.g. GPT2LMHeadModel),
            whose output sequences start with the prompt tokens.
        tokenizer: Tokenizer matching ``model``.
        prompt: Text the model should continue.
        max_length: Maximum total sequence length in tokens, prompt included.

    Returns:
        The generated continuation without the prompt and without special
        tokens, stripped of surrounding whitespace, then of surrounding
        double quotes, then of surrounding commas.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # Keep the inputs on the same device as the model so generation also works
    # when the model has been moved to a GPU.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)

    # Every prompt token is real input; GPT-2 has no pad token, so EOS is reused
    attention_mask = torch.ones_like(input_ids)
    pad_token_id = tokenizer.eos_token_id

    # Configure the hyperparameters when generating response
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        attention_mask=attention_mask,
        pad_token_id=pad_token_id,
        do_sample=True,
        temperature=0.3,
        top_k=20,
        top_p=0.2,
        repetition_penalty=1.0
    )

    # Drop the prompt by token count rather than by character count: decoded
    # text is not guaranteed to reproduce the prompt character for character,
    # so slicing the decoded string by len(prompt) can cut at the wrong place.
    prompt_token_count = input_ids.shape[-1]
    new_token_ids = output[0][prompt_token_count:]

    # skip_special_tokens keeps markers such as the end-of-text token out of the reply
    generated_text = tokenizer.decode(new_token_ids, skip_special_tokens=True)

    return generated_text.strip().strip('"').strip(',')

**Remark:** The fine-tuned model and tokenizer are available in this Google Drive folder: https://drive.google.com/drive/folders/1myVJhNC6R7wIznBGxIn-jJdlDf3C4rQn?usp=sharing

In [None]:
# Directory that holds the fine-tuned model and its tokenizer
model_path = "/content/drive/MyDrive/0Colab_Notebooks/0jia_gpt_test6/gpt2model"

# Restore the fine-tuned tokenizer and model from that directory
my_chat_tokenizer = GPT2Tokenizer.from_pretrained(model_path)
my_chat_model = GPT2LMHeadModel.from_pretrained(model_path)

# Question to put to the fine-tuned model
prompt = "What are the educational goals of Sunway University, and how are they elaborated upon?"

# Ask the model and show its answer
response = generate_response(
    model=my_chat_model,
    tokenizer=my_chat_tokenizer,
    prompt=prompt,
    max_length=150,
)
print("Generated response:", response)

# Evaluate the performance of the fine-tuned model based on several performance metrics

In [None]:
!pip install rouge

In [None]:
# Define a function to calculate similarity scores (Jaccard Similarity score, BLEU score and ROUGE score)
def calculate_similarity_score(response, correct_answer):
    """Score a generated response against the reference answer.

    Args:
        response: Text produced by the model.
        correct_answer: Reference (ground-truth) text.

    Returns:
        A tuple ``(jaccard_similarity, bleu_score, rouge_scores)`` where
        ``jaccard_similarity`` is the Jaccard index of the two sets of
        whitespace-separated tokens, ``bleu_score`` is the sentence-level BLEU
        of the response against the reference, and ``rouge_scores`` is a
        one-element list holding a dict with the keys ``rouge-1``, ``rouge-2``
        and ``rouge-l``, each mapping to ``r``/``p``/``f`` values.
    """
    # Imported here so the function does not depend on names that are only
    # created by a later notebook cell.
    from nltk.translate.bleu_score import sentence_bleu
    from rouge import Rouge

    response_tokens = response.split()
    answer_tokens = correct_answer.split()

    # Compare sets of tokens. Building the sets from the raw strings would
    # compare individual characters, which almost any two sentences share.
    response_vocab = set(response_tokens)
    answer_vocab = set(answer_tokens)
    all_tokens = response_vocab | answer_vocab
    if all_tokens:
        jaccard_similarity = len(response_vocab & answer_vocab) / len(all_tokens)
    else:
        # Both texts are empty: report 0.0 instead of dividing by zero
        jaccard_similarity = 0.0

    bleu_score = sentence_bleu([answer_tokens], response_tokens)

    try:
        rouge_scores = Rouge().get_scores(response, correct_answer)
    except ValueError:
        # NOTE(review): the rouge package is expected to raise ValueError for an
        # empty hypothesis or reference - confirm. Zero scores keep one
        # unusable response from aborting a whole evaluation run.
        rouge_scores = [{
            metric: {"r": 0.0, "p": 0.0, "f": 0.0}
            for metric in ("rouge-1", "rouge-2", "rouge-l")
        }]

    return jaccard_similarity, bleu_score, rouge_scores

In [None]:
import timeit
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from collections import Counter

# Run the evaluation: generate a response for every test question and score it
if __name__ == "__main__":
    import ast  # only needed to parse the test file below

    rouge = Rouge()

    model_path = "/content/drive/MyDrive/0Colab_Notebooks/0jia_gpt_test6/gpt2model" # path that contains the fine-tuned model
    test_path = "/content/drive/MyDrive/0Colab_Notebooks/0jia_gpt_test6/testing/rephrasedQ&A.txt" # path that contains the testing dataset file

    # Load the fine-tuned model and tokenizer
    my_chat_model = GPT2LMHeadModel.from_pretrained(model_path)
    my_chat_tokenizer = GPT2Tokenizer.from_pretrained(model_path)

    # Per-question results, filled in the same order as the test questions
    bot_responses = []
    response_times = []
    jaccard_similarities = []
    bleu_scores = []
    rouge_1_scores = []
    rouge_2_scores = []
    rouge_l_scores = []

    # Read the testing dataset.
    # SECURITY: this used to call eval() on the file contents, which executes
    # any code the file contains. ast.literal_eval only accepts Python literals.
    # Assumes the file holds a literal sequence of (question, answer) pairs -
    # TODO confirm against the dataset file.
    with open(test_path, 'r', encoding='utf-8') as file:
        test_data = ast.literal_eval(file.read())

    # Split the pairs into the prompts and their reference answers
    test_questions = [item[0] for item in test_data]
    groundtruth = [item[1] for item in test_data]

    # Compare each generated response with its reference answer
    for question, correct_answer in zip(test_questions, groundtruth):
        start_time = timeit.default_timer() # start time
        response = generate_response(my_chat_model, my_chat_tokenizer, question, 150) # generate a response based on the prompt asked
        end_time = timeit.default_timer() # stop time

        # Keep the response and how long it took to generate
        bot_responses.append(response)
        response_times.append(end_time - start_time)

        # Calculate the similarity scores for this response
        jaccard_similarity, bleu_score, rouge_scores = calculate_similarity_score(response, correct_answer)

        # Store the scores; only the F-measure of each ROUGE variant is kept
        jaccard_similarities.append(jaccard_similarity)
        bleu_scores.append(bleu_score)
        rouge_1_scores.append(rouge_scores[0]['rouge-1']['f'])
        rouge_2_scores.append(rouge_scores[0]['rouge-2']['f'])
        rouge_l_scores.append(rouge_scores[0]['rouge-l']['f'])

In [None]:
# Token-overlap precision, recall and F1 over the whole test set.
# num_c : per-example number of tokens shared by the ground truth and the predicted answer
# num_p : per-example number of predicted tokens
# num_g : per-example number of ground truth tokens
num_c = []
num_p = []
num_g = []
num_total = len(bot_responses)

for a in range(num_total):
    gold_tokens = str(groundtruth[a]).split()
    pred_tokens = str(bot_responses[a]).split()

    # Multiset intersection: a token counts as shared as many times as it
    # occurs in both texts
    common = Counter(gold_tokens) & Counter(pred_tokens)

    num_c.append(sum(common.values()))
    num_p.append(len(pred_tokens))
    num_g.append(len(gold_tokens))

# Totals are taken once after the loop, so they are also defined (as 0) when
# there are no test examples
num_c_sum = sum(num_c)
num_p_sum = sum(num_p)
num_g_sum = sum(num_g)

# Precision = shared / predicted, recall = shared / ground truth.
# Each ratio is reported as 0.0 when its denominator is zero.
precision = num_c_sum / num_p_sum if num_p_sum else 0.0
recall = num_c_sum / num_g_sum if num_g_sum else 0.0
if precision + recall:
    f1_score = (2 * precision * recall) / (precision + recall)
else:
    f1_score = 0.0

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1_score}')


Precision: 0.42658009173765415
Recall: 0.48923959827833574
F1 Score: 0.45576629327902246


In [None]:
import pandas as pd
import datetime
# Collect the per-question results in one table, one row per test question
results = pd.DataFrame({
    "Question": test_questions,
    "Bot Response": bot_responses,
    "Expected Response": groundtruth,
    "Jaccard Similarity": jaccard_similarities,
    "BLEU Score": bleu_scores,
    "ROUGE-1": rouge_1_scores,
    "ROUGE-2": rouge_2_scores,
    "ROUGE-L": rouge_l_scores,
    "Response Time": response_times
})

# Save the results to a CSV file; the timestamp in the name keeps earlier
# runs from being overwritten
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
results_path = f'gpt2_results_{timestamp}.csv'
results.to_csv(results_path, index=False)
# Report the name that was actually written, including the timestamp
print(f"CSV file '{results_path}' has been created.")