In [1]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
print(torch.cuda.is_available())

True


In [3]:
TRAIN = True

In [4]:
df = pd.read_csv('test.csv', header=None, names=['haiku'], delimiter="@")

In [5]:
df.head()

Unnamed: 0,haiku
0,tomato
1,tomato
2,tomato
3,tomato
4,tomato


In [6]:
df.shape

(6143, 1)

In [7]:
df_test = df.sample(n = int(0.2 * len(df)))
df_train = df.loc[~df.index.isin(df_test.index)]

#Reset the indexes
# df_test = df_test.reset_index()
# df_train = df_train.reset_index()

In [8]:
df_train.shape

(4915, 1)

In [9]:
df_test.shape

(1228, 1)

In [10]:
# class Haiku(Dataset):
#     def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024):

#         self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
#         self.haiku = []

#         for row in df['haiku']:
#             self.haiku.append(torch.tensor(
#                 self.tokenizer.encode(f"<|{control_code}|>{row[:max_length]}<|endoftext|>")
#             ))
                
#         if truncate:
#             self.haiku = self.haiku[:20000]
            
#         self.haiku_count = len(self.haiku)
        
#     def __len__(self):
#         return self.haiku_count

#     def __getitem__(self, item):
#         return self.haiku[item]

class Haiku(Dataset):  # GPT suggested
    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024):
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.haiku = []

        for row in df['haiku']:
            # Ensure line breaks are preserved
            formatted_row = row[:-1].replace("/", "\\n")
            tokenized_haiku = self.tokenizer.encode(
                f"<|{control_code}|>{formatted_row}<|endoftext|>",
                truncation=True,
                max_length=max_length
            )
            self.haiku.append(torch.tensor(tokenized_haiku))
                
        if truncate:
            self.haiku = self.haiku[:20000]
            
        self.haiku_count = len(self.haiku)
        
    def __len__(self):
        return self.haiku_count

    def __getitem__(self, item):
        return self.haiku[item]

In [11]:
haiku = Haiku(df['haiku'], truncate=True, gpt2_type="gpt2")

In [12]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [13]:
# Accumulated batch size (since GPT2 is so big)
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    if packed_tensor is None:
        return new_tensor, True, None
    if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
        return packed_tensor, False, new_tensor
    else:
        packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return packed_tensor, True, None

In [14]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=20, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=False,
):

    acc_steps = 100
    device=torch.device("cuda")
    model = model.cuda()
    model.train()
    
    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    loss=0
    accumulating_batch_count = 0
    input_tensor = None

    for epoch in range(epochs):
        print(f"Training epoch {epoch}")
        print(loss)
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

            if carry_on and idx != len(train_dataloader) - 1:
                continue

            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            accumulating_batch_count += 1
            input_tensor = None
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model

In [15]:
#Train the model on the specific data we have
if TRAIN:
    model = train(haiku, model, tokenizer)



Training epoch 0
0


6143it [00:41, 146.36it/s]


Training epoch 1
tensor(0.7854, device='cuda:0', grad_fn=<NllLossBackward0>)


6143it [00:40, 151.29it/s]


Training epoch 2
tensor(0.3107, device='cuda:0', grad_fn=<NllLossBackward0>)


6143it [00:40, 152.25it/s]


Training epoch 3
tensor(0.1201, device='cuda:0', grad_fn=<NllLossBackward0>)


6143it [00:40, 152.04it/s]


Training epoch 4
tensor(0.0756, device='cuda:0', grad_fn=<NllLossBackward0>)


6143it [00:40, 150.50it/s]


Training epoch 5
tensor(0.0835, device='cuda:0', grad_fn=<NllLossBackward0>)


6143it [00:40, 151.00it/s]


Training epoch 6
tensor(0.0761, device='cuda:0', grad_fn=<NllLossBackward0>)


6143it [00:40, 153.42it/s]


Training epoch 7
tensor(0.0803, device='cuda:0', grad_fn=<NllLossBackward0>)


6143it [00:40, 150.77it/s]


Training epoch 8
tensor(0.0927, device='cuda:0', grad_fn=<NllLossBackward0>)


6143it [00:40, 151.15it/s]


Training epoch 9
tensor(0.0937, device='cuda:0', grad_fn=<NllLossBackward0>)


6143it [00:40, 151.11it/s]


Training epoch 10
tensor(0.0759, device='cuda:0', grad_fn=<NllLossBackward0>)


6143it [00:40, 151.51it/s]


Training epoch 11
tensor(0.0654, device='cuda:0', grad_fn=<NllLossBackward0>)


6143it [00:40, 152.17it/s]


Training epoch 12
tensor(0.0881, device='cuda:0', grad_fn=<NllLossBackward0>)


6143it [00:40, 153.39it/s]


Training epoch 13
tensor(0.0866, device='cuda:0', grad_fn=<NllLossBackward0>)


6143it [00:40, 153.06it/s]


Training epoch 14
tensor(0.0688, device='cuda:0', grad_fn=<NllLossBackward0>)


6143it [00:40, 152.54it/s]


Training epoch 15
tensor(0.1081, device='cuda:0', grad_fn=<NllLossBackward0>)


6143it [00:40, 150.81it/s]


Training epoch 16
tensor(0.0827, device='cuda:0', grad_fn=<NllLossBackward0>)


6143it [00:40, 151.71it/s]


Training epoch 17
tensor(0.0908, device='cuda:0', grad_fn=<NllLossBackward0>)


6143it [00:39, 153.74it/s]


Training epoch 18
tensor(0.0872, device='cuda:0', grad_fn=<NllLossBackward0>)


6143it [00:40, 151.24it/s]


Training epoch 19
tensor(0.0840, device='cuda:0', grad_fn=<NllLossBackward0>)


6143it [00:40, 150.06it/s]


In [16]:
#Save the model to a pkl or something so it can be reused later on
if TRAIN:
    torch.save(model, 'model.pt')

In [17]:
model = torch.load('model.pt')

In [18]:
model_prev = GPT2LMHeadModel.from_pretrained('gpt2')

In [19]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=30, #maximum number of words
    top_p=0.8,
    temperature=1.,
):

    model.eval()

    generated_num = 0
    generated_list = []

    filter_value = -float("Inf")

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False

            generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

            for i in range(entry_length):
                outputs = model(generated, labels=generated)
                loss, logits = outputs[:2]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token in tokenizer.encode("<|endoftext|>"):
                    entry_finished = True

                if entry_finished:
                    generated_num = generated_num + 1

                    output_list = list(generated.squeeze().numpy())
                    output_text = tokenizer.decode(output_list)
                    generated_list.append(output_text)
                    break
            
            if not entry_finished:
                output_list = list(generated.squeeze().numpy())
                output_text = f"{tokenizer.decode(output_list)}<|endoftext| >" 
                generated_list.append(output_text)
                
    return generated_list

In [39]:
x = generate(model.to("cpu"), tokenizer, "Mountains high and low", entry_count=1)
print(x)

100%|██████████| 1/1 [00:03<00:00,  3.37s/it]

['Mountains high and low on the mountain, or at least low enough to drive snow off the mountaintops. The mountain is about 30 minutes north of the capital (where<|endoftext| >']





In [40]:
print(generate(model_prev.to("cpu"), tokenizer, "Mountains high and low", entry_count=1))

100%|██████████| 1/1 [00:03<00:00,  3.14s/it]

['Mountains high and low in altitude, India is known for its long lines of rivers. The capital city of Madhya Pradesh is known for its abundant forests. South Asia is<|endoftext| >']





In [41]:
# Function to generate multiple sentences. Test data should be a dataframe
def text_generation(test_data):
    generated_haiku = []
    for i in range(len(test_data[:10])):
        x = generate(model.to("cpu"), tokenizer, test_data["haiku"].iloc[i], entry_count=1)
        generated_haiku.append(x)
    return generated_haiku

In [42]:
generated_haiku = text_generation(df_test)

100%|██████████| 1/1 [00:03<00:00,  3.18s/it]
100%|██████████| 1/1 [00:03<00:00,  3.31s/it]
100%|██████████| 1/1 [00:03<00:00,  3.15s/it]
100%|██████████| 1/1 [00:03<00:00,  3.24s/it]
100%|██████████| 1/1 [00:03<00:00,  3.37s/it]
100%|██████████| 1/1 [00:03<00:00,  3.20s/it]
100%|██████████| 1/1 [00:03<00:00,  3.13s/it]
100%|██████████| 1/1 [00:03<00:00,  3.19s/it]
100%|██████████| 1/1 [00:03<00:00,  3.17s/it]
100%|██████████| 1/1 [00:03<00:00,  3.23s/it]


In [43]:
print(generated_haiku)

[['tomato\n\n\nOffline\n\n\nActivity: 1518\n\nMerit: 1025\n\n\nLegendaryActivity: 1518Merit: 1025 Re: [ANN<|endoftext| >'], ['tomato, cherries, zucchini, cilantro, and tomato.\n\nThese tend to have a long, glossy texture. In fact, I<|endoftext| >'], ['tomato3b | 0x0 | 1x1 | 2x2 | 3x4 | 4x5 | 6x6 | 7x7<|endoftext| >'], ['tomato" is a gentle fizzy tomato soup, that is that it is blended with milk and herbs to make it so moist and bright, and it is<|endoftext| >'], ['tomato\n\nCuisinart\n\nVanilla\n\nPumpkin\n\nTomato\n\nTomatillo\n\nHickory\n\n<|endoftext| >'], ['tomato-8.jpg?crop=0,0,0,2160,1229&wid=300&hei=225&scl=9<|endoftext| >'], ['tomato sauce (1 large egg, 1 small red onion, 1 large tomato, 1 small tomato, 1 medium tomato, 1 large tomato, 1 large tomato<|endoftext| >'], ["tomato,\n\nbut not my armpit,\n\nbut my waist,\n\nand my back.\n\nI won't. I'll<|endoftext| >"], ['tomato (2x), black pepper (2x), jalapeño (1x), and red pepper (1x)\n\n1 package tomato<|endoftext| >'], ['tomato 

In [44]:
# Loop to keep only generated text and add it as a new column in the dataframe
my_generations = []

for i in range(len(generated_haiku)):
    a = df_test["haiku"][i].split()[-30:]  # Get the matching string we want (30 words)
    b = " ".join(a)
    c = " ".join(generated_haiku[i])  # Get all that comes after the matching string
    my_generations.append(c.split(b)[-1])

df_test["generated_haiku"] = my_generations

KeyError: 0

In [None]:
# Finish the sentences when there is a point, remove after that
final = []

for i in range(len(df_test)):
    to_remove = df_test["generated_haiku"][i].split(".")[-1]
    final.append(df_test["generated_haiku"][i].replace(to_remove, ""))

df_test["generated_haiku"] = final
df_test.head()

In [None]:
df_test['generated_haiku'][7]

In [None]:
df_test['haiku'][7]