In [20]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv
import os

In [21]:
import torch
print(torch.cuda.is_available())

True


In [22]:
TRAIN = True

In [23]:
# df = pd.read_csv('test.csv', header=None, names=['haiku'], delimiter="@")
df = pd.read_parquet('dataset2.parquet')
df = df.rename(columns={"text":"haiku"})

In [24]:
df.head()

Unnamed: 0,source,haiku,text_phonemes,keywords,keyword_phonemes,gruen_score,text_punc
0,bfbarry,Delicate savage. / You'll never hold the cinde...,deh|lax|kaxt sae|vaxjh / yuwl neh|ver hhowld d...,cinder,sihn|der,0.639071,
1,bfbarry,A splash and a cry. / Words pulled from the ri...,ax splaesh aend ax kray / werdz puhld frahm dh...,the riverside,dhax rih|ver|sayd,0.563353,
2,bfbarry,"Steamy, mist rising. / Rocks receiving downwar...",stiy|miy mihst ray|zaxng / raaks rax|siy|vaxng...,mist rising,mihst ray|zaxng,0.538326,
3,bfbarry,You were broken glass. / But I touched you eve...,yuw wer brow|kaxn glaes / baht ay tahcht yuw i...,broken glass,brow|kaxn glaes,0.703446,
4,bfbarry,Eyes dance with firelight. / The Moon and I ar...,ayz daens wihdh faxr|layt / dhax muwn aend ay ...,eyes dance,ayz daens,0.830985,


In [25]:
df.shape

(49024, 7)

In [26]:
df_test = df.sample(n = int(0.2 * len(df)))
df_train = df.loc[~df.index.isin(df_test.index)]

#Reset the indexes
# df_test = df_test.reset_index()
# df_train = df_train.reset_index()

In [27]:
df_train.shape

(39220, 7)

In [28]:
df_test.shape

(9804, 7)

In [29]:
# class Haiku(Dataset):
#     def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024):

#         self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
#         self.haiku = []

#         for row in df['haiku']:
#             self.haiku.append(torch.tensor(
#                 self.tokenizer.encode(f"<|{control_code}|>{row[:max_length]}<|endoftext|>")
#             ))
                
#         if truncate:
#             self.haiku = self.haiku[:20000]
            
#         self.haiku_count = len(self.haiku)
        
#     def __len__(self):
#         return self.haiku_count

#     def __getitem__(self, item):
#         return self.haiku[item]

class Haiku(Dataset):  # GPT suggested
    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024):
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.haiku = []

        for row in df['haiku']:
            # Ensure line breaks are preserved
            formatted_row = row[:-1].replace("/", "\\n")
            tokenized_haiku = self.tokenizer.encode(
                f"<|{control_code}|>{formatted_row}<|endoftext|>",
                truncation=True,
                max_length=max_length
            )
            self.haiku.append(torch.tensor(tokenized_haiku))
                
        if truncate:
            self.haiku = self.haiku[:20000]
            
        self.haiku_count = len(self.haiku)
        
    def __len__(self):
        return self.haiku_count

    def __getitem__(self, item):
        return self.haiku[item]

In [30]:
haiku = Haiku(df['haiku'], truncate=True, gpt2_type="gpt2")

In [31]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [32]:
# Accumulated batch size (since GPT2 is so big)
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    if packed_tensor is None:
        return new_tensor, True, None
    if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
        return packed_tensor, False, new_tensor
    else:
        packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return packed_tensor, True, None

In [33]:
def train(
    dataset, model, tokenizer,
    batch_size=24, epochs=10, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=False,
):

    acc_steps = 100
    device=torch.device("cuda")
    model = model.cuda()
    model.train()
    
    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    loss=0
    accumulating_batch_count = 0
    input_tensor = None

    for epoch in range(epochs):
        print(f"Training epoch {epoch}")
        print(loss)
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

            if carry_on and idx != len(train_dataloader) - 1:
                continue

            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            accumulating_batch_count += 1
            input_tensor = None
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model

In [34]:
#Train the model on the specific data we have
if TRAIN:
    model = train(haiku, model, tokenizer)



Training epoch 0
0


20000it [05:07, 65.12it/s]


Training epoch 1
tensor(0.3244, device='cuda:0', grad_fn=<NllLossBackward0>)


20000it [05:08, 64.89it/s]


Training epoch 2
tensor(0.2450, device='cuda:0', grad_fn=<NllLossBackward0>)


20000it [05:09, 64.58it/s]


Training epoch 3
tensor(0.2689, device='cuda:0', grad_fn=<NllLossBackward0>)


20000it [04:55, 67.68it/s]


Training epoch 4
tensor(0.2850, device='cuda:0', grad_fn=<NllLossBackward0>)


20000it [05:08, 64.74it/s]


Training epoch 5
tensor(0.2921, device='cuda:0', grad_fn=<NllLossBackward0>)


20000it [05:09, 64.68it/s]


Training epoch 6
tensor(0.2578, device='cuda:0', grad_fn=<NllLossBackward0>)


20000it [05:08, 64.79it/s]


Training epoch 7
tensor(0.3231, device='cuda:0', grad_fn=<NllLossBackward0>)


20000it [05:09, 64.67it/s]


Training epoch 8
tensor(0.2539, device='cuda:0', grad_fn=<NllLossBackward0>)


20000it [05:09, 64.67it/s]


Training epoch 9
tensor(0.3230, device='cuda:0', grad_fn=<NllLossBackward0>)


20000it [05:09, 64.61it/s]


In [35]:
#Save the model to a pkl or something so it can be reused later on
if TRAIN:
    torch.save(model, 'model.pt')

In [36]:
model = torch.load('model.pt')

In [37]:
model_prev = GPT2LMHeadModel.from_pretrained('gpt2')

In [None]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=30, #maximum number of words
    top_p=0.8,
    temperature=1.,
):

    model.eval()

    generated_num = 0
    generated_list = []

    filter_value = -float("Inf")

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False

            generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

            for i in range(entry_length):
                outputs = model(generated, labels=generated)
                loss, logits = outputs[:2]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token in tokenizer.encode("<|endoftext|>"):
                    entry_finished = True

                if entry_finished:
                    generated_num = generated_num + 1

                    output_list = list(generated.squeeze().numpy())
                    output_text = tokenizer.decode(output_list)
                    generated_list.append(output_text)
                    break
            # halo syg :D D:
            if not entry_finished:
                output_list = list(generated.squeeze().numpy())
                output_text = f"{tokenizer.decode(output_list)}<|endoftext| >" 
                generated_list.append(output_text)
                
    return generated_list

In [45]:
x = generate(model.to("cpu"), tokenizer, "Mountains high and low", entry_count=1)
print(x)

100%|██████████| 1/1 [00:02<00:00,  2.95s/it]

['Mountains high and low. Pushing to create more of the beaches in the sandy mountain areas and the shadow dunes.\nJungle land, beautiful from the sun.<|endoftext| >']





In [46]:
print(generate(model_prev.to("cpu"), tokenizer, "Mountains high and low", entry_count=1))

100%|██████████| 1/1 [00:03<00:00,  3.01s/it]

["Mountains high and low above a volcano on land, across the Ionian Plateau, about 5 million years ago. Those glaciers are named after the father of today's largest<|endoftext| >"]





In [41]:
# Function to generate multiple sentences. Test data should be a dataframe
def text_generation(test_data):
    generated_haiku = []
    for i in range(len(test_data[:10])):
        x = generate(model.to("cpu"), tokenizer, test_data["haiku"].iloc[i], entry_count=1)
        generated_haiku.append(x)
    return generated_haiku

In [42]:
generated_haiku = text_generation(df_test)

100%|██████████| 1/1 [00:03<00:00,  3.45s/it]
100%|██████████| 1/1 [00:03<00:00,  3.70s/it]
100%|██████████| 1/1 [00:03<00:00,  3.65s/it]
100%|██████████| 1/1 [00:03<00:00,  3.44s/it]
100%|██████████| 1/1 [00:03<00:00,  3.89s/it]
100%|██████████| 1/1 [00:03<00:00,  3.79s/it]
100%|██████████| 1/1 [00:03<00:00,  3.79s/it]
100%|██████████| 1/1 [00:03<00:00,  3.60s/it]
100%|██████████| 1/1 [00:03<00:00,  3.59s/it]
100%|██████████| 1/1 [00:00<00:00, 10.20it/s]


In [43]:
print(generated_haiku)

[["Winter wind. / Sandpiper's Chase. / Their voices. / They were broken. / The thousand-year war. / And the flag. / But I don't know. / I'll never hold the<|endoftext| >"], ['What you know about? / Waking up everyday, like. / You on a mission? / A new world. / Words pulled from the riks. / Today I wake up. / And think about you. / I see you<|endoftext| >'], ["Lawn daisies. / A boy tosses the ball. / To himself. / Just his paws. / The little black kitten. / He'd been crying. / But I'm all right. / I'll sleep with you<|endoftext| >"], ["[ crochet hook] / Pulling the yarn through? / Deep November. / Sweet. / Because we're tired of talking about this ugly problem. / We'll just continue working until we're done with the last poo<|endoftext| >"], ['I forgot that I. / Got blue hair dye all over. / My white Halsey shirt. / Ties everything together. / I gotta go to the church tonight. / To get to a long-term relationship. / I want to be<|endoftext| >'], ["Yes, they are really. / That stupid at 

In [44]:
# Loop to keep only generated text and add it as a new column in the dataframe
my_generations = []

for i in range(len(generated_haiku)):
    a = df_test["haiku"][i].split()[-30:]  # Get the matching string we want (30 words)
    b = " ".join(a)
    c = " ".join(generated_haiku[i])  # Get all that comes after the matching string
    my_generations.append(c.split(b)[-1])

df_test["generated_haiku"] = my_generations

KeyError: 0

In [None]:
# Finish the sentences when there is a point, remove after that
final = []

for i in range(len(df_test)):
    to_remove = df_test["generated_haiku"][i].split(".")[-1]
    final.append(df_test["generated_haiku"][i].replace(to_remove, ""))

df_test["generated_haiku"] = final
df_test.head()

In [None]:
df_test['generated_haiku'][7]

In [None]:
df_test['haiku'][7]