<a href="https://colab.research.google.com/github/Madhusri02/Poem_Generator/blob/master/POEM_GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvidia-smi
#for gpu

/bin/bash: line 1: nvidia-smi: command not found


In [18]:
!pip install tokenizer
!pip install transformers



In [19]:
import pandas as pd
import numpy as np
import random
import os

# time related modules
import time
import datetime


# PyTorch is an open source machine learning library based on the Torch library, used for applications such as computer vision and natural language processing, primarily developed by Facebook's AI Research lab.
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler
import plotly.express as px

In [20]:
df = pd.read_csv('/content/drive/MyDrive/PoetryFoundationData.csv')
df = df.fillna('')
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Poem,Poet,Tags
0,0,\r\r\n Objects Used to Prop...,"\r\r\nDog bone, stapler,\r\r\ncribbage board, ...",Michelle Menting,
1,1,\r\r\n The New Church\r\r\n...,"\r\r\nThe old cupola glinted above the clouds,...",Lucia Cherciu,
2,2,\r\r\n Look for Me\r\r\n ...,\r\r\nLook for me under the hood\r\r\nof that ...,Ted Kooser,
3,3,\r\r\n Wild Life\r\r\n ...,"\r\r\nBehind the silo, the Mother Rabbit\r\r\n...",Grace Cavalieri,
4,4,\r\r\n Umbrella\r\r\n ...,\r\r\nWhen I push your button\r\r\nyou fly off...,Connie Wanek,


In [21]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
special_tokens_dict = {
    'bos_token': '',
    'eos_token': '',
    'pad_token': ''}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

In [22]:
class Poem_dset(Dataset):
  def __init__(self , data , tokenizer , gpt2_type = 'gpt2' , max_len = 1024):
    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []

        # Iterate over data, tokenize each sequence and append its input_id and attention_mask to respective lists
    for i in data:
      encodings_dict = tokenizer('' + i + '',truncation=True,
                                     max_length=max_len,
                                     padding='max_length')

      self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
      self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def ___len__ (self):
      return len(self.input_ids)

    def __get_item__(self , indx):
      return self.input_ids[indx] , self.attn_masks[indx]

poem_stanza_dataset = Poem_dset(df['Poem'].values, tokenizer, max_len= 1024)

In [23]:
def dset_train(split , data_set):
  l =len(data_set)
  train_size = int(split * l)
  val_data_size = l - train_size
  return train_size , val_data_size


# function call for splitting the data
poem_train_size , poem_validation_size = dset_train(0.8 , df)

# to split data randomly for validation and training
poem_train_dataset, poem_val_dataset = random_split(df, [poem_train_size, poem_validation_size])


In [24]:
# initializing the randomizer and pytorch with seed varible
random_seed = 73
torch.cuda.manual_seed_all(random_seed)
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

<torch._C.Generator at 0x79bee1914b10>

In [25]:
# loading data
batchsize  = 2
poem_train_data = DataLoader(poem_train_dataset , sampler = RandomSampler(poem_train_dataset) , batch_size = batchsize)
poem_validation_data = DataLoader(poem_val_dataset , sampler = SequentialSampler(poem_val_dataset) , batch_size = batchsize)

In [26]:

# helper function for logging time
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

# hyperparameters
learning_rate = 1e-3
eps = 1e-8
warmup_steps = 50
device = torch.device('cuda')

**MODEL**

In [27]:
epochs = 10
max_len = 1024
configuration = GPT2Config(vocab_size=len(tokenizer), n_positions= max_len).from_pretrained('gpt2', output_hidden_states=True)
model = GPT2LMHeadModel.from_pretrained('gpt2', config=configuration)
model.resize_token_embeddings(len(tokenizer))
model.cuda()
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=eps)
total_steps = len(poem_train_data) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_steps)
model = model.to(device)

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

**TRAINING THE MODEL**

In [None]:
#TO HANDLE LOSSES
loses = []
valid_loss = []
start_time = time.time()
for i in range(0,epochs):
  print(f'Epoch {i+1} of {epochs}')
  t0 = time.time()

    # Reset the total training loss for this epoch
  total_train_loss = 0
  model.train()

  #training the data
  for step, batch in enumerate(poem_train_data):
    # Move the input ids, labels and masks to the GPU
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        # Clear out the gradients from the previous training step
        model.zero_grad()

        # Forward pass: compute the outputs of the model by passing in the input
        outputs = model(b_input_ids, labels=b_labels, attention_mask=b_masks, token_type_ids=None)

        # Extract the loss from the outputs
        loss = outputs[0]

        # Extract and accumulate the total loss
        batch_loss = loss.item()
        total_train_loss += batch_loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        loses.append(loss.item())


        if step % 50 == 0:
            print(f"Step: {step}, Loss: {loss.item():.4f}")

        # Break the loop after 1000 steps.
        if step > 1000:
            break
  avg_train_loss = total_train_loss / len(poem_train_dataloader)

  training_time = format_time(time.time() - t0)

  print(f'Average Training Loss: {avg_train_loss}. Epoch Training Time: {training_time}')
  model.eval()

    # Reset the total validation loss
  total_eval_loss = 0
  nb_eval_steps = 0

  # Loop over each batch from the validation data loader
  for batch in poem_validation_data:
    b_input_ids = batch[0].to(device)
    b_labels = batch[0].to(device)
    b_masks = batch[1].to(device)

    with torch.no_grad():
      # Forward pass
      outputs  = model(b_input_ids, attention_mask=b_masks, labels=b_labels)
      loss = outputs[0]
      batch_loss = loss.item()
      total_eval_loss += batch_loss
      valid_loss.append(batch_loss)
    avg_val_loss = total_eval_loss / len(poem_val_data)

    print(f'Average Validation Loss: {avg_val_loss}')
print(f'Total Training Time: {format_time(time.time()-start_time)}')

In [None]:
torch.save(model.state_dict(),  'poem_stanza_model.pth')

In [None]:
device = torch.device("cuda"
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

poem_stanza_model = model.to(device)

prompt = " I miss home"
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

sample_outputs = model.generate(
                                generated,
                                do_sample=True,
                                top_k=50,
                                max_length=MAX_LEN,
                                top_p=0.95,
                                num_return_sequences=3
                                )

print(tokenizer.decode(sample_outputs[0], skip_special_tokens=True))

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
