In [1]:
#from google.colab import drive
#drive.mount('/content/drive')

In [3]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv
import nltk
import os

### Loading Data

In [57]:
# Every entry of topics/ is treated as a topic directory by the loading loop
# further down; the entry name becomes the topic label.
topics=os.listdir('topics/')

In [58]:
# Empty frame that fixes the column layout (title, content, topic) before any file is read.
poems=pd.DataFrame(columns=['title','content','topic'])

In [59]:
# Confirm the frame is still empty and carries the expected columns.
poems

Unnamed: 0,title,content,topic


In [60]:
# Read every poem file once and build the frame in a single step. Collecting
# plain rows first avoids re-copying the whole frame on each pd.concat call,
# and rebuilding `poems` from scratch makes this cell safe to re-run without
# duplicating rows.
records = []
for topic in topics:
    path = os.path.join('topics', topic)

    for f in os.listdir(path):
        # The file name doubles as the poem title; the folder name is the topic.
        with open(os.path.join(path, f), 'r') as file:
            records.append([f, file.read(), topic])

poems = pd.DataFrame(records, columns=['title','content','topic'])

In [61]:
# Inspect the loaded corpus (pandas abbreviates the display of long frames).
poems

Unnamed: 0,title,content,topic
0,ButterflyPoemsVersesOnAButterflyPoembyJosephWa...,Fair Child of Sun and Summer! we behold\nWith ...,butterfly
1,ButterflyPoemsChristmasButterflyPoembyMichaelS...,Perhaps it had arrived undercover\nbetween the...,butterfly
2,ButterflyPoemsBleedingButterflyPoembyButterfli...,I'm a bleeding butterfly.\nMy wings are made o...,butterfly
3,ButterflyPoemsButterflyPoembyFrankLisaIndiRaFr...,Feel like a butterfly; mostly\nhalf finished\n...,butterfly
4,ButterflyPoemsButterflyEffectPoembyHarunAlNasi...,Butterfly Effect\nMy very birth sent forth a t...,butterfly
...,...,...,...
14330,AngerPoemsAngerPoembyVidyadharDurgekar.txt,"Searing reason and rationale,\nWith the seethi...",anger
14331,AngerPoemsTheAngerInMePoembyJonathanPendley.txt,This ill temper I have is making me mad\nI blo...,anger
14332,AngerPoemsAMothersAngerPoembywardhajawdat.txt,"you....!\nissue ofmine\nborne with love,\nbirt...",anger
14333,AngerPoemsAngerPoembyScarlet.txt,it boils within\nslowly poisoning all that it...,anger


### Creating a Test Set in order to compare generated text with the original

In [62]:
# The model accepts at most 1024 tokens per sequence, so keep only poems with
# fewer than 250 space-separated words. Word count is only a rough stand-in
# for token count.
poems = poems.loc[poems['content'].map(lambda text: len(text.split(' '))) < 250]

In [63]:
# Hold out 20 poems for evaluation. A fixed random_state makes the split
# reproducible across kernel restarts, so generated endings can be compared
# against the same poems from run to run.
test_set = poems.sample(n=20, random_state=42)
# Everything that was not sampled stays in the training pool.
poems = poems.drop(index=test_set.index)

In [64]:
#Reset the indexes so both frames are numbered from 0 again.
#reset_index() is called without drop=True, so the previous row labels are kept in a new 'index' column.
test_set = test_set.reset_index()
poems = poems.reset_index()

In [65]:
# Test set only: move the final 10 whitespace-separated words of each poem into
# True_end_content and keep the remaining words (re-joined with single spaces)
# as content. Both columns are computed from the original content.
test_set = test_set.assign(
    True_end_content=test_set['content'].str.split().str[-10:].map(' '.join),
    content=test_set['content'].str.split().str[:-10].map(' '.join),
)

In [66]:
# Inspect the held-out poems: truncated content next to the removed ending.
test_set

Unnamed: 0,index,title,content,topic,True_end_content
0,173,PoemPoemsInAPoemPoembyRobertFrost.txt,The sentencing goes blithely on its way And ta...,poem,takes the stroke and time In having its undevi...
1,10901,GreenPoemsTheGardenerLxxxivOverTheGreenPoembyR...,Over the green and yellow rice-fields sweep th...,green,"flood. Brothers, let us squander our morning i..."
2,14028,MurderPoemsmurderPoemASoulStealerPoembyAceOfBl...,"It comes upon me, time and time again. It's th...",murder,you a serial killer. I call you a soul stealer.
3,1510,AngelPoemsStPeterAndTheAngelPoembyDeniseLevert...,"Delivered out of raw continual pain, smell of ...",angel,"the next door, the next terrors of freedom and..."
4,5589,TeacherPoemsMyTeacherPoembyRUPPOKHAREL.txt,Same smiling face of his Same eyes falling on ...,teacher,to remember- salute his stature; The teacher's...
5,9724,DeathPoemsDonTFearDeathPoembyAleksandrAleksand...,Don't fear death in earthly travels. Don't fea...,death,you are not condemned to slow And everlasting ...
6,2193,RainPoemsAprilRainPoembyMathildeBlind.txt,"The April rain, the April rain, Comes slanting...",rain,"The rain-clouds flash with April mirth, Like L..."
7,3476,SleepPoemsWhileTheFatesSleepPoembyLucyMaudMont...,"Come, let us to the sunways of the west, Haste...",sleep,as lovers danced of yore­ The fates will waken...
8,7785,SympathyPoemsSympathyСочувствиеPoembyAnnaFokin...,"Не падай духом никогда, Пока звезда в ночи сия...",sympathy,"Хоть долог тяжкий путь смирения, Но сердце жиз..."
9,6907,NaturePoemsOhHowSilentIsTheNaturePoembyNikolai...,"Oh, how silent is the nature, It only looks an...",nature,"dazzling comets To a point, that is not in sight."


In [67]:
class PoemText(Dataset):
    """Tokenized poems wrapped as a torch ``Dataset`` for GPT-2 fine-tuning.

    Every poem is rendered as ``<|{control_code}|>{poem}<|endoftext|>`` and
    encoded with the pretrained tokenizer. The poems are read from the
    notebook-level ``poems['content']`` column, not from an argument, so that
    frame must exist before this class is instantiated.

    Args:
        control_code: Value interpolated between ``<|`` and ``|>`` at the start
            of every sample. It is formatted with an f-string, so pass a short
            string; any other object is inserted as its ``str()`` form.
        truncate: When true, keep only the first 20000 encoded poems.
        gpt2_type: Name of the pretrained tokenizer to load.
        max_length: Poems are cut to this many characters before encoding, and
            each encoded sample is additionally capped at this many tokens.
    """

    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024):

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.text = []

        for row in poems['content']:
            token_ids = self.tokenizer.encode(
                f"<|{control_code}|>{row[:max_length]}<|endoftext|>"
            )
            # Slicing the poem by characters does not bound the number of
            # tokens, so an encoded sample can still exceed max_length. Cap it
            # here, keeping the final id so the sample still ends with the
            # close of the <|endoftext|> marker.
            if len(token_ids) > max_length:
                token_ids = token_ids[:max(max_length - 1, 0)] + token_ids[-1:]
            self.text.append(torch.tensor(token_ids))
        if truncate:
            self.text = self.text[:20000]
        self.text_count = len(self.text)

    def __len__(self):
        """Return the number of encoded poems kept in the dataset."""
        return self.text_count

    def __getitem__(self, item):
        """Return the 1-D tensor of token ids stored at position ``item``."""
        return self.text[item]


In [68]:
# The first argument is the control code that PoemText interpolates into every
# sample as <|code|>; the poems themselves are read from poems['content'] inside
# the class. Passing the Series here would paste its printed form into each
# training example, so a short string tag is passed instead.
dataset = PoemText("poem", truncate=True, gpt2_type="gpt2")

Token indices sequence length is longer than the specified maximum sequence length for this model (1110 > 1024). Running this sequence through the model will result in indexing errors


### Training the model

In [41]:
#Get the tokenizer for the pretrained 'gpt2' checkpoint (the model itself is loaded in the next cell)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [42]:
#Load the pretrained 'gpt2' language-model head checkpoint
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [43]:
#Pack several short samples into one sequence so each forward pass is better filled
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    """Try to merge ``new_tensor`` into the running ``packed_tensor``.

    Both tensors are indexed along dimension 1 for their length.

    Returns a triple ``(tensor, carry_on, remainder)``:
      * nothing packed yet -> ``(new_tensor, True, None)``
      * combined length exceeds ``max_seq_len`` ->
        ``(packed_tensor, False, new_tensor)``; the pack is full and the new
        tensor is handed back as the remainder
      * otherwise -> ``(merged, True, None)``, where the new tensor is placed
        in front of the existing pack with the pack's first column dropped
    """
    # First sample of a fresh pack.
    if packed_tensor is None:
        return new_tensor, True, None

    combined_len = new_tensor.shape[1] + packed_tensor.shape[1]
    if combined_len <= max_seq_len:
        merged = torch.cat((new_tensor, packed_tensor[:, 1:]), dim=1)
        return merged, True, None

    # Pack is full: return it unchanged together with the sample that did not fit.
    return packed_tensor, False, new_tensor

In [44]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=5, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=False,
):
    """Fine-tune ``model`` on ``dataset`` using packed sequences and gradient accumulation.

    Samples are drawn one at a time in shuffled order and merged with
    ``pack_tensor`` until the pack is full; each full pack is one forward and
    backward pass. Gradients from ``batch_size`` passes are summed (the loss is
    not rescaled) before a single optimizer and scheduler step.

    Args:
        dataset: Dataset of token-id tensors, wrapped in a ``DataLoader`` with
            ``batch_size=1``.
        model: Model called as ``model(ids, labels=ids)`` whose first output is
            the loss.
        tokenizer: Accepted for interface compatibility; not used here.
        batch_size: Number of forward passes accumulated per optimizer step.
        epochs: Number of passes over the dataset.
        lr: Learning rate given to the optimizer.
        max_seq_len: Accepted for interface compatibility; not used here (see
            the review note on the packing limit below).
        warmup_steps: Warm-up steps of the linear learning-rate schedule.
        gpt2_type: Accepted for interface compatibility; not used here.
        output_dir: Directory for per-epoch checkpoints.
        output_prefix: File-name prefix for per-epoch checkpoints.
        test_mode: Accepted for interface compatibility; not used here.
        save_model_on_epoch: When true, save ``model.state_dict()`` after each
            epoch as ``{output_prefix}-{epoch}.pt``.

    Returns:
        The trained model, on the device used for training.
    """
    # NOTE(review): the packing limit has always been fixed at 768 and
    # max_seq_len is ignored. Wire the parameter through only after confirming
    # callers expect its default of 400.
    pack_limit = 768

    # Fall back to CPU so the function also runs on machines without a GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.train()

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

    # The linear schedule needs a positive step count: a negative value makes
    # its factor drop to 0 as soon as warm-up ends. The number of packs is not
    # known in advance, so use an upper bound (one pack per sample).
    steps_per_epoch = -(-len(train_dataloader) // batch_size)  # ceil division
    total_steps = max(1, epochs * steps_per_epoch)

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    loss = 0
    pending_grads = 0  # backward passes accumulated since the last optimizer step

    def apply_gradients():
        """Take one optimizer/scheduler step and clear the accumulated gradients."""
        nonlocal pending_grads
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        pending_grads = 0

    def run_packed(packed):
        """Forward/backward one packed sequence; step once batch_size passes are accumulated."""
        nonlocal pending_grads
        packed = packed.to(device)
        batch_loss = model(packed, labels=packed)[0]
        batch_loss.backward()
        pending_grads += 1
        if pending_grads >= batch_size:
            apply_gradients()
        return batch_loss

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        input_tensor = None
        for entry in tqdm(train_dataloader):
            input_tensor, carry_on, remainder = pack_tensor(entry, input_tensor, pack_limit)

            if carry_on:
                continue

            loss = run_packed(input_tensor)
            # The sample that did not fit starts the next pack instead of being dropped.
            input_tensor = remainder

        # Train on whatever is still packed when the data runs out.
        if input_tensor is not None:
            loss = run_packed(input_tensor)
        # Apply gradients left over from an incomplete accumulation window.
        if pending_grads:
            apply_gradients()

        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model