In [20]:
import torch
import numpy as np
import pandas as pd
import ast
import matplotlib.pyplot as plt
import os
import torchtext
import pyarrow
import sklearn

In [88]:
# Reading the new training data file
path = os.getcwd()
data_gutenberg_small = pd.read_parquet(path+'/dev.parquet')

In [89]:
# Examining the new training data
print(f"Columns in dataset are: {data_gutenberg_small.columns.to_list()}")
print(f"Number of books: {len(data_gutenberg_small['title'].unique())}")
print("Let's look at first 5 random rows of the dataset")
print(data_gutenberg_small.head(5))

Columns in dataset are: ['Unnamed: 0', 'title', 'text', 'genres']
Number of books: 20
Let's look at first 5 random rows of the dataset
   Unnamed: 0                               title  \
0        6140                   fruits of culture   
1        3854  the confessions of artemas quibble   
2        1427                        sir mortimer   
3        3987                 nouvelles histoires   
4        1183   autobiography and selected essays   

                                                text  \
0  Produced by Bryan Ness, Jana Srna and the Onli...   
1  Produced by an anonymous volunteer\n\n\n\n\nTr...   
2  Produced by Rick Niles, John Hagerson, Rick Ni...   
3  Produced by Chuck Greif and www.ebooksgratuits...   
4  Produced by Donald Lainson\n\n\n\n\n\nAUTOBIOG...   

                                              genres  
0  {'literary-fiction', 'classics', 'literature',...  
1         {'classics', 'literary-fiction', 'novels'}  
2                        {'adventure', 'hist

In [95]:
# Cleaning data using Setu's code
data_gutenberg_clean = data_gutenberg_small.drop(columns=['Unnamed: 0'])

### Pre-Processing text

In [91]:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
def clean_text(text):
    '''
    Function to clean text of books. Removes email addresses, new lines, html tags, and extra spaces.

    Input: Text (String)
    Output: Cleaned Text (String)
    '''
    cleaned_text = text.lower()
    cleaned_text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', ' ', text)
    cleaned_text = re.sub(r'^.*?(?=\n\n\n)', ' ', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'<a\s+(?:[^>]*?\s+)?href="([^"]*)"[^>]*>.*?</a>', ' ', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'\n', ' ', cleaned_text)
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    cleaned_text = re.sub(r'[^\w\s.?!]', ' ', cleaned_text)
    cleaned_text = re.sub(r' +', ' ', cleaned_text)
    cleaned_text = cleaned_text.lower()
    return cleaned_text

def remove_first_row(group):
    return group.iloc[1:]

def lowercase(text):
    return text.lower()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eshan23/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/eshan23/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [96]:
# Apply the cleaning function to the training and validation data
# First we will remove the first row of each book as it contains the title of the book
data_gutenberg_clean['cleaned_text'] = data_gutenberg_clean['text'].apply(lambda x:clean_text(x))
data_gutenberg_clean.loc[:,'genres'] = data_gutenberg_clean.loc[:,'genres'].apply(lambda x:clean_text(x))
data_gutenberg_clean.head(5)

Unnamed: 0,title,text,genres,cleaned_text
0,fruits of culture,"Produced by Bryan Ness, Jana Srna and the Onli...",literary fiction classics literature fiction ...,transcriber s note this e book belongs to tol...
1,the confessions of artemas quibble,Produced by an anonymous volunteer\n\n\n\n\nTr...,classics literary fiction novels,transcriber s note quotation marks have been ...
2,sir mortimer,"Produced by Rick Niles, John Hagerson, Rick Ni...",adventure historical,sir mortimer a novel by mary johnston author ...
3,nouvelles histoires,Produced by Chuck Greif and www.ebooksgratuits...,contemporary fiction childrens school humor,edgar allan poe nouvelles histoires extraordi...
4,autobiography and selected essays,Produced by Donald Lainson\n\n\n\n\n\nAUTOBIOG...,biography non fiction,autobiography and selected essays by thomas h...


In [97]:
# Now we, will split massive text into smaller chunks (sentences)
# First, we drop title and text columns
data_gutenberg_clean = data_gutenberg_clean.drop(columns=['title', 'text'])
data_gutenberg_clean.head(5)

Unnamed: 0,genres,cleaned_text
0,literary fiction classics literature fiction ...,transcriber s note this e book belongs to tol...
1,classics literary fiction novels,transcriber s note quotation marks have been ...
2,adventure historical,sir mortimer a novel by mary johnston author ...
3,contemporary fiction childrens school humor,edgar allan poe nouvelles histoires extraordi...
4,biography non fiction,autobiography and selected essays by thomas h...


In [115]:
data_gutenberg_clean.head(1)['genres'].values[0]

' literary fiction classics literature fiction short stories th century philosophy historical fiction adult fiction drama plays '

### Load the GPT2 Tokenizer

In [74]:
from transformers import GPT2Tokenizer, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<BOS>', eos_token='<EOS>', pad_token='<PAD>')

In [216]:
# Defining a dataset class to be used by the dataloader
from torch.utils.data import Dataset, DataLoader
class GPT2Dataset(Dataset):

    def __init__(self, df, tokenizer, max_length=768):

        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []

        for index, row in df.iterrows():
            genres = row['genres']
            cleaned_text = row['cleaned_text']
            tokenized_segments = self.tokenize_with_genre(genres, cleaned_text, max_length)
            self.input_ids.append(torch.tensor(tokenized_segments['input_ids']))
            self.attn_masks.append(torch.tensor(tokenized_segments['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx] 

    def tokenize_with_genre(self, genres, cleaned_text, max_length):
        genre_tokens = self.tokenizer(genres)['input_ids']
        #print(genre_tokens)
        genre_length = len(genre_tokens)
        text_length = max_length - genre_length  # Remaining length for text tokens

        # Initialize list to store tokenized segments
        combined_tokens = []

        # Tokenize cleaned_text into chunks of length text_length
        for i in range(0, len(cleaned_text), text_length):
            #print(len(cleaned_text))
            # Get a chunk of text
            chunk = cleaned_text[i:i + text_length]

            # Tokenize the chunk
            text_tokens = self.tokenizer(chunk, truncation=True, max_length=text_length, padding="max_length")['input_ids']

            # Combine genre tokens with text tokens
            segment_tokens = genre_tokens + text_tokens

            # Add the combined tokens to the list
            combined_tokens.append(segment_tokens)

            # Print the sizes of genre_tokens, text_tokens, and segment_tokens
            #print("Size of genre_tokens:", len(genre_tokens))
            #print("Size of text_tokens:", len(text_tokens))
            #print("Size of segment_tokens:", len(segment_tokens))

        # Pad the segments to ensure they all have the same length
        max_segment_length = max(map(len, combined_tokens))
        combined_tokens = [segment + [0] * (max_segment_length - len(segment)) for segment in combined_tokens]

        # Create attention masks
        attention_masks = [[1] * len(segment) for segment in combined_tokens]

        # Convert lists to tensors
        input_ids = torch.tensor(combined_tokens)
        attn_masks = torch.tensor(attention_masks)

        encodings_dict = {
            'input_ids': input_ids,
            'attention_mask': attn_masks
        }
        return encodings_dict

In [237]:
# Running on a sample dataset to test the GPT2 class
sample_dataset = data_gutenberg_clean.sample(1) # For entire dataset, enter (length of dataset)
dataset = GPT2Dataset(sample_dataset, tokenizer)

  self.input_ids.append(torch.tensor(tokenized_segments['input_ids']))
  self.attn_masks.append(torch.tensor(tokenized_segments['attention_mask']))


In [238]:
# Printing input_ids to check that the concatenation of genre and text tokens is working
# Initial x tokens corresponding to genre should be similar
print(dataset[0][0][0][:25])
print(dataset[0][0][1][:25])

tensor([2106,  220, 2615,  257, 1181,  287, 2471, 4891, 1956,  422, 6685,  286,
        1149,  829,  288,   13, 1281,  261,  287,  262, 4808, 2502, 1044, 4911,
          62])
tensor([ 2106,   220,    69,   473,  9107,   330,   290,  6693,  4819,    13,
          340,   373,  1900,   284,   262, 15469,   326,   262, 45630,   272,
         5342,   287,   502,    87,  3713])


In [239]:
## Testing the size of output tensors
print(dataset[0][0].size())
print(dataset[0][1].size())

print(dataset[0][0][0].size())

input_id, attention = (dataset[0])
print(input_id.size(), attention.size())
print(input_id[0])

torch.Size([133, 768])
torch.Size([133, 768])
torch.Size([768])
torch.Size([133, 768]) torch.Size([133, 768])
tensor([ 2106,   220,  2615,   257,  1181,   287,  2471,  4891,  1956,   422,
         6685,   286,  1149,   829,   288,    13,  1281,   261,   287,   262,
         4808,  2502,  1044,  4911,    62,  1312,   703,   262,  7674,   373,
         9477,   287,  5336, 46754,  4861,   287,   262,  1903,   277,  2135,
          444,   612,   373,   257,  2156,   319,   262, 24287,  5228,   286,
         4283,  1122,   290, 20518,  1122,   286, 11091, 27070,  2181,  5736,
          329,   262,  2278,   543,   373,  1444,   262,  1230, 25523,  2156,
           13,   262,  2728,   286,   428, 24781,   341,   373,   326,   262,
         2386,   361,  3317, 15469,   290,   511,  4172,   257,  2888,   286,
         8681,   290,   465,  3656,   262, 16503,  2585, 22397,   282,   290,
         1811, 14494, 13469,   270,  3166,   286,   262,  2717,  1230, 49330,
          612,    13,   287,   8

- Note: There are probably more efficiencies to be gained by defining an input_token_type_id that captures the difference between genre and text
- Defining a function to generate input for encoder: can try this later


In [161]:
# TO BE INCORPORATED LATER
def generate_input_encoding(genre, input_text, max_length):
    # Tokenize the genre and input text
    genre_tokens = tokenizer(genre)['input_ids']
    text_tokens = tokenizer(input_text)['input_ids']

    # Define the token type IDs
    genre_token_type_id = [0] * len(genre_tokens)  # Token type ID for genre
    text_token_type_id = [1] * len(text_tokens)   # Token type ID for text

    # Concatenate the token type IDs and input tokens
    input_token_type_ids = genre_token_type_id + text_token_type_id
    input_tokens = genre_tokens + text_tokens

    # Add padding if necessary
    if len(input_tokens) < max_length:
        input_tokens += [tokenizer.pad_token_id] * (max_length - len(input_tokens))
        input_token_type_ids += [tokenizer.pad_token_id] * (max_length - len(input_token_type_ids))
    elif len(input_tokens) > max_length:
        input_tokens = input_tokens[:max_length]
        input_token_type_ids = input_token_type_ids[:max_length]

    return input_tokens, input_token_type_ids

In [163]:
# Split into training and validation sets
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

   18 training samples
    2 validation samples


In [165]:
# Create the DataLoaders for our training and validation datasets.
# We'll take training samples in random order. 

#batch_size = 2
#train_dataloader = DataLoader(
#            train_dataset,  # The training samples.
#            batch_size = batch_size # Trains with this batch size.
#        )

# For validation the order doesn't matter, so we'll just read them sequentially.
#validation_dataloader = DataLoader(
#            val_dataset, # The validation samples.
#            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
#            batch_size = batch_size # Evaluate with this batch size.
#        )

### Finally...Finetuning GPT-2

In [221]:
# I'm not really doing anything with the config buheret
import random
torch.manual_seed(42)
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup

configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
#model.resize_token_embeddings(len(tokenizer))

In [171]:
# Running this on mps (no better performance than cpu)
device = torch.device('cuda' if torch.cuda.is_available() else 'mps')
print(f"torch.device: {device}")
model.to(device)

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)

torch.device: mps


<torch._C.Generator at 0x1210e8f10>

In [173]:
# some parameters I cooked up that work reasonably well

epochs = 3
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8

# this produces sample output every 100 steps
sample_every = 100

In [175]:
# Note: Incorporated the latest version of the optimizer from pytorch
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = epsilon)

In [176]:
# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_steps, 
                                            num_training_steps = total_steps)

In [245]:
import time

# Function to track time
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

## Defining the training function
def train(
    dataset,
    model,
    tokenizer,
    batch_size=2,  # Edit later
    epochs=4,
    lr=2e-5,
    max_seq_len=400,
    warmup_steps=5000,
    gpt2_type="gpt2",
    device=device,
    output_dir=".",
    output_prefix="wreckgar",
    test_mode=False,
    save_model_on_epoch=False,
):

    model = model.to(device)
    model.train()

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        total_loss = 0.0
        for batch in tqdm(train_dataloader):
            input_ids = batch[0].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, labels=input_ids)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()

            optimizer.step()
            scheduler.step()

        avg_loss = total_loss / len(train_dataloader)
        print(f"Avg Training Loss for Epoch {epoch}: {avg_loss}")

        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )

    return model


In [243]:
from tqdm import tqdm, trange
import torch.nn.functional as F

gpt2_type = "gpt2"
model = train(
    dataset,
    GPT2LMHeadModel.from_pretrained(gpt2_type),
    GPT2Tokenizer.from_pretrained(gpt2_type),
    batch_size=2,
    epochs=3,
    lr=3e-5,
    max_seq_len=140,
    warmup_steps=5000,
    gpt2_type=gpt2_type,
    device=device,
    output_dir="trained_models",
    output_prefix="gutenberg_small",
    save_model_on_epoch=True
)

Training epoch 0


  0%|          | 0/1 [00:14<?, ?it/s]


RuntimeError: MPS backend out of memory (MPS allocated: 12.64 GB, other allocations: 3.95 GB, max allowed: 18.13 GB). Tried to allocate 3.51 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [218]:
# Training the model 
total_t0 = time.time()

training_stats = []

model = model.to(device)

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()        

        outputs = model(b_input_ids,
                          labels=b_labels, 
                          attention_mask = b_masks,
                          token_type_ids=None # Can incorporate these later
                        )

        loss = outputs[0]  

        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Get sample every x batches.
        if step % sample_every == 0 and not step == 0:

            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

            model.eval()

            sample_outputs = model.generate(
                                    bos_token_id=random.randint(1,30000),
                                    do_sample=True,   
                                    top_k=50, 
                                    max_length = 200,
                                    top_p=0.95, 
                                    num_return_sequences=1
                                )
            for i, sample_output in enumerate(sample_outputs):
                  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
            
            model.train()

        loss.backward()

        optimizer.step()

        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)       
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)
        
        with torch.no_grad():        

            outputs  = model(b_input_ids, 
#                           token_type_ids=None, 
                            attention_mask = b_masks,
                            labels=b_labels)
          
            loss = outputs[0]  
            
        batch_loss = loss.item()
        total_eval_loss += batch_loss        

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    validation_time = format_time(time.time() - t0)    

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))



Training...


RuntimeError: stack expects each tensor to be equal size, but got [479, 768] at entry 0 and [401, 768] at entry 1