In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate

In [2]:
import torch
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [None]:
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

____

__Start training from the last checkpoint__

In [None]:
model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/NLP/NewModel")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [4]:
train_dataset = TextDataset(
    tokenizer = tokenizer,
    file_path = "/content/drive/MyDrive/NLP/merged.txt",
    block_size = 128
)



___

In [None]:
train_dataset = TextDataset(
    tokenizer = tokenizer,
    file_path = '/content/drive/MyDrive/NLP/train.csv',
    block_size = 128
)

In [5]:
validation_dataset = TextDataset(
    tokenizer = tokenizer,
    file_path = '/content/drive/MyDrive/NLP/validation.txt',
    block_size = 128
)

In [None]:
test_dataset = TextDataset(
    tokenizer = tokenizer,
    file_path = '/content/drive/MyDrive/NLP/test.csv',
    block_size = 128
)

In [6]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer, mlm = False
)

In [7]:
training_args = TrainingArguments(
    output_dir = "/content/drive/MyDrive/NLP/model",
    overwrite_output_dir = False,
    num_train_epochs = 3,
    per_device_train_batch_size = 3,
    gradient_accumulation_steps = 3,
    save_steps = 600,
    save_total_limit = 2,
    logging_dir = "/content/drive/MyDrive/NLP/logs",
    save_strategy = 'steps',
    evaluation_strategy = 'steps',
    eval_steps = 600,
    logging_steps = 100,
    do_train = True,
    do_eval = True,
    load_best_model_at_end = True,
    remove_unused_columns = True
)

In [8]:
trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = train_dataset,
    eval_dataset = validation_dataset
)

In [9]:
trainer.train()

Step,Training Loss,Validation Loss
600,3.7826,3.852644
1200,3.7026,3.802611
1800,3.6189,3.768546
2400,3.4751,3.753518
3000,3.4392,3.745816
3600,3.4709,3.738574
4200,3.4573,3.72755
4800,3.3749,3.727616
5400,3.3752,3.723081
6000,3.3731,3.721565


TrainOutput(global_step=6621, training_loss=3.50673429520536, metrics={'train_runtime': 3035.6443, 'train_samples_per_second': 19.629, 'train_steps_per_second': 2.181, 'total_flos': 3892336754688000.0, 'train_loss': 3.50673429520536, 'epoch': 3.0})

In [None]:
results = trainer.evaluate(eval_dataset=test_dataset)

In [None]:
results

{'eval_loss': 3.6007895469665527,
 'eval_runtime': 7.3307,
 'eval_samples_per_second': 96.171,
 'eval_steps_per_second': 12.141,
 'epoch': 3.0}

In [None]:
def calculate_perplexity(model, tokenizer, test_dataset):
  perplexities = []

  for example in test_dataset:
    input_ids = example['input_ids']
    attention_mask = example['attention_mask']

    with torch.no_grad():
      # Remove the extra square brackets around input_ids and attention_mask
      loss = model(torch.tensor(input_ids), attention_mask=torch.tensor(attention_mask))[0]
      perplexity = torch.exp(loss)
      perplexities.append(perplexity.item())

  return np.mean(perplexities)

perplexity = calculate_perplexity(model, tokenizer, test_dataset)
print(f"Perplexity on test dataset: {perplexity}")

In [10]:
trainer.save_model('/content/drive/MyDrive/NLP/model1')

In [16]:
model_dir = "/content/drive/MyDrive/NLP/model1"
model = GPT2LMHeadModel.from_pretrained(model_dir)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def chat_with_model():
  while True:
    user_input = input("You: ")
    input_ids = tokenizer.encode(user_input, return_tensors='pt')

    response_ids = model.generate(input_ids, max_length=100, num_return_sequences=1,
                                  temperature=0.9, top_k=20, do_sample=True,
                                  pad_token_id=model.config.eos_token_id)

    response = tokenizer.decode(response_ids[0], skip_special_tokens=True)
    print(f"Model: {response}")

chat_with_model()

You: How to achieve financial freedom?
Model: How to achieve financial freedom? How to turn it into money? How to turn it into a safe haven? How to use it as a tool of control?

There are so many ways to achieve financial freedom. I have read some of them and can tell you the one I like the most:
I started using crypto to trade.  I use a phone called The Kraken because I can do a lot of trading and I like the look of it when I open a trade.  I
You: How to set up a hedge fund?
Model: How to set up a hedge fund?
Richard Wilson: I’m very familiar with the hedge funds market. I’ve been in the business for a long time. There is a lot of money to be done.
Richard Wilson: How do you set up a fund so that it’s a hedge fund and not just a hedge fund?
Syed Ali: We are all in this together. We make a ton.
Richard Wilson: How do you find the
You: Give me a few advices on how to trade?
Model: Give me a few advices on how to trade?
What kind of equipment are you using?
You need to know how to trade 

KeyboardInterrupt: ignored