In [1]:
import json

input_file = "/content/training_data_multiarith.jsonl"  # Replace with the path to your JSONL file
output_file = "output.txt"  # Output file path

# Read all JSONL lines up front; `data` is kept as a list of raw lines so it
# can be inspected afterwards. JSON text is UTF-8, so decode it explicitly
# instead of relying on the platform default encoding.
with open(input_file, "r", encoding="utf-8") as f:
    data = f.readlines()

# Emit each record as a "[Q] <prompt>" line, an "[A] <completion>" line and a
# blank separator line.
with open(output_file, "w", encoding="utf-8") as f:
    for line in data:
        stripped = line.strip()
        if not stripped:
            # A blank line (e.g. a trailing newline at end of file) is not
            # valid JSON and would make json.loads raise; skip it.
            continue
        json_data = json.loads(stripped)
        prompt = json_data["prompt"]
        completion = json_data["completion"]
        f.write("[Q] " + prompt + "\n")
        f.write("[A] " + completion + "\n")
        f.write("\n")

In [20]:
# Preview only the first few raw JSONL lines instead of dumping the whole list
# into the cell output.
data[:3]

['{"prompt": "For Halloween Debby and her sister combined the candy they received. Debby had 32 pieces of candy while her sister had 42. If they ate 35 pieces the first night, how many pieces do they have left?", "completion": "First, combine the candy Debby and her sister had. 32 + 42 = <<32+42=74>>74 pieces of candy.\\nSecond, subtract the candy they ate the first night from their total candy. 74 - 35 = <<74-35=39>>39 pieces of candy. Answer: \\\\boxed{39}.--> 39 END"}\n',
 '{"prompt": "A pet store had 13 siamese cats and 5 house cats. During a sale they sold 10 cats. How many cats do they have left?", "completion": "The pet store started with 13 Siamese cats + 5 house cats = <<13+5=18>>18 cats in total.\\nAfter selling 10 cats, the pet store has 18 cats - 10 cats = <<18-10=8>>8 cats left. Answer: \\\\boxed{8}.--> 8 END"}\n',
 '{"prompt": "Luke was trying to expand his game collection. He bought 2 games from a friend and bought 2 more at a garage sale. If 2 of the games didn\'t work,

In [2]:
import pandas as pd
import numpy as np
import re
import os

In [3]:
def read_txt(file_path):
    """Return the entire contents of the text file at ``file_path`` as one string."""
    with open(file_path, "r") as handle:
        return handle.read()

def read_documents_from_directory(file_path='/content/output.txt'):
    """Read the training text file and return its contents as a string.

    Args:
        file_path: Path of the text file to read. Defaults to
            '/content/output.txt', the location this function previously had
            hard-coded, so existing zero-argument calls behave the same.

    Returns:
        The file contents as returned by ``read_txt``.
    """
    return read_txt(file_path)

In [4]:
# Load the Q/A text, squeeze every run of consecutive newlines down to a
# single newline, then trim leading and trailing whitespace.
text_data = re.sub(r'\n+', '\n', read_documents_from_directory()).strip()

In [5]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [6]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a ``TextDataset`` over ``file_path``.

    Args:
        file_path: Path of the text file, forwarded to ``TextDataset``.
        tokenizer: Tokenizer forwarded to ``TextDataset``.
        block_size: Forwarded to ``TextDataset`` as ``block_size``
            (defaults to 128).

    Returns:
        The constructed ``TextDataset``.
    """
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)

def load_data_collator(tokenizer, mlm = False):
    """Create a ``DataCollatorForLanguageModeling`` for ``tokenizer``.

    Args:
        tokenizer: Tokenizer forwarded to the collator.
        mlm: Forwarded to the collator as ``mlm`` (defaults to False).

    Returns:
        The constructed ``DataCollatorForLanguageModeling``.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

In [7]:
def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a pretrained GPT-2 checkpoint on a plain-text file.

    Args:
        train_file_path: Path of the text file used to build the training
            dataset (via ``load_dataset``).
        model_name: Name or path given to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory where the tokenizer, the model and the
            ``Trainer`` output are written.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``. It used to be
            accepted but dropped, so the library default was applied
            regardless of the value the caller passed.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Store the tokenizer alongside the model so output_dir can later be
    # loaded as a self-contained checkpoint directory.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Writes the not-yet-fine-tuned weights to output_dir; trainer.save_model()
    # below is called without a path after training.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [8]:
# Settings for the fine-tuning run; each is passed to train() under the same
# keyword name.

# Plain-text training corpus.
train_file_path = "/content/output.txt"
# Pretrained checkpoint name handed to from_pretrained() inside train().
model_name = 'gpt2'
# Directory where train() saves the tokenizer and the model.
output_dir = '/content/custom_q_and_a'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 50
save_steps = 50000

In [9]:
# Run fine-tuning with the module-level settings assigned earlier in the notebook.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.952
1000,0.3746
1500,0.2199
2000,0.1519
2500,0.1216


In [28]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer
def load_model(model_path):
    """Load a ``GPT2LMHeadModel`` from ``model_path`` via ``from_pretrained``."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Load a ``GPT2Tokenizer`` from ``tokenizer_path`` via ``from_pretrained``."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)

def generate_text(model_path, sequence, max_length, model=None, tokenizer=None):
    """Generate a sampled continuation of ``sequence``.

    Args:
        model_path: Directory to load the model and tokenizer from when they
            are not supplied by the caller.
        sequence: Prompt text to continue.
        max_length: Passed to ``model.generate`` as ``max_length``.
        model: Optional already-loaded model. When given, ``model_path`` is
            not used to load a model, which avoids re-reading the weights
            from disk on every call (e.g. inside an evaluation loop).
        tokenizer: Optional already-loaded tokenizer, same idea as ``model``.

    Returns:
        The decoded first generated sequence (prompt included), with special
        tokens skipped.
    """
    if model is None:
        model = load_model(model_path)
    if tokenizer is None:
        tokenizer = load_tokenizer(model_path)

    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask, which is passed to generate() explicitly because the pad token is
    # set to the EOS token below.
    encoded = tokenizer(f'{sequence}', return_tensors='pt')
    # Keep the inputs on the same device as the model; this matters when the
    # caller supplies a model that has been moved off the CPU.
    input_ids = encoded['input_ids'].to(model.device)
    attention_mask = encoded['attention_mask'].to(model.device)

    final_outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    return tokenizer.decode(final_outputs[0], skip_special_tokens=True)

In [34]:
# Generate an answer for a single example question.
# Directory the model and tokenizer are loaded from by generate_text().
model2_path = "/content/custom_q_and_a"
# The "[Q] " prefix matches the question marker written into the training text.
sequence2 = "[Q] Kaleb had 12 dollars. If he spent 8 bucks on a new game, how many 2 dollar toys could he buy with the money he had left?"
# Forwarded by generate_text() to model.generate() as max_length.
max_len = 150
answer = generate_text(model2_path, sequence2, max_len)
print(answer)

[Q] Kaleb had 12 dollars. If he spent 8 bucks on a new game, how many 2 dollar toys could he buy with the money he had left?
[A] Kaleb originally had 12 dollars.
When he spent 8 dollars, he was left with 12 - 8 = <<12-8=4>>4 dollars.
When he bought a game, he got 4 more + 18 = <<4+18=24>>24 dollars. Answer: \boxed{24}.--> 24 END

[Q] A store had 40 oranges in a bin. If they threw away 20 of the old ones and put 24 new ones in the bin how many would be in the bin?
[A] First


In [24]:
# Load the evaluation set; the context manager closes the file handle as soon
# as the JSON has been parsed instead of leaving it open.
with open('/content/multiarith.json') as f:
    test_data = json.load(f)

In [33]:
correct_predictions = 0
model2_path = "/content/custom_q_and_a"
max_len = 150

# The evaluation set is the last 30% of the examples. The split index is
# computed once so the loop bounds and the accuracy denominator always agree.
examples = test_data['data']
split_index = int(len(examples)*0.7)

for i in range(split_index, len(examples)):
    new_prompt = examples[i]['question']
    answer = generate_text(model2_path, "[Q] "+new_prompt, max_len)
    try:
        # Training completions end with "--> <number> END"; take the text
        # between the first "-->" and the following "END". Parsing as float
        # accepts both integer and decimal outputs.
        pred = float(answer.split("-->")[1].split("END")[0].strip())
    except (IndexError, ValueError):
        # No "-->" marker, or the extracted text is not a number: count the
        # example as wrong. None cannot collide with a real answer the way a
        # numeric sentinel could.
        pred = None
    ans = float(examples[i]['answer'])
    if pred is not None and pred == ans:
        correct_predictions += 1

total_data_points = len(examples) - split_index
print("Accuracy = ",(correct_predictions/total_data_points)*100, "%")

Accuracy =  9.444444444444445 %
