In [1]:
# IMPORTS
import re
import os
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

In [2]:
#DIRECTORIES
DATASET = "../input/harry-potter-gru-text-generator"
DATA_PATH = "../input/harry-potter-philosophers-stone-preprocessed/Harry_Potter_philosophers_stone.txt"
SAVED_MODEL_PATH = "../input/harry-potter-gru-text-generator/Best_weights.hdf5"
CHECKPOINT_PATH = "Best_weights.hdf5"

In [None]:
#Copy file from Input to Output(to easier create a new dataset with updated weights)
for file in os.listdir(DATASET):
    if file.endswith('hdf5') == False:
        path = os.path.join(DATASET, file)
        !cp -r $path ./

In [3]:
#Load the file
text = open(DATA_PATH, "r", encoding="utf-8").read().lower()
words = text.split()

In [4]:
#text preprocessing
endings  = ('.', '!', '?')

for idx, word in enumerate(words):
    if word.endswith(endings) and word not in endings:
        words[idx] = re.sub('[.!?]', '', word)
        words.insert(idx+1, word[-1])
    if words[idx].startswith('.') and word not in endings:
        words[idx] = re.sub('[.]', '', word)
        words.insert(idx-1, '.')
    if re.search('.[.].', words[idx]):
        w = word.split('.')
        words[idx] = '.'
        words.insert(idx-1, w[0])
        words.insert(idx+1, w[-1])

In [5]:
sentences = re.split('[.!?]', text)
sent = []
sent = [re.sub('[\n]', '', sentence) for sentence in sentences]

new_text = ''.join(sent)
new_text = re.sub('  ', ' ', new_text)
words = new_text.split()

In [6]:
#Transformers
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/336M [00:00<?, ?B/s]

In [12]:
tokenizer.pad_token = tokenizer.eos_token
data_token = tokenizer(sent, return_tensors="pt", padding=True, truncation=True, max_length=85)

In [13]:
import torch
from torch.utils.data import Dataset, random_split

MAX_LENGTH = 85

class Harry_dataset(Dataset):
    def __init__(self, sentences, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for sentence in sentences:
            encodings_dict = tokenizer('<|startoftext|>' + sentence + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]
    

dataset = Harry_dataset(sent, tokenizer, max_length=MAX_LENGTH)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [28]:
training_args = TrainingArguments(output_dir='./', num_train_epochs=10, logging_steps=50, save_steps=5000,
                                  per_device_train_batch_size=32, per_device_eval_batch_size=64,
                                  warmup_steps=10, weight_decay=0.05, report_to = 'none')

PyTorch: setting up devices


In [29]:
Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

***** Running training *****
  Num examples = 5895
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1850


Step,Training Loss
50,0.7529
100,0.7444
150,0.7505
200,0.7145
250,0.6954
300,0.6869
350,0.6743
400,0.6622
450,0.6558
500,0.6468




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1850, training_loss=0.6182303299774995, metrics={'train_runtime': 488.8808, 'train_samples_per_second': 120.582, 'train_steps_per_second': 3.784, 'total_flos': 1278606145536000.0, 'train_loss': 0.6182303299774995, 'epoch': 10.0})

In [39]:
begin = 'Mr. and Mrs. Dursley of number four Privet Drive were proud to say that they were perfectly normal thank you very much. They were the last people youd expect to be involved in anything strange or mysterious because they just didnt hold with such nonsense'
generated = tokenizer(f"<|startoftext|> {begin}", return_tensors="pt").input_ids.cuda()

sample_outputs = model.generate(generated, top_k=5, max_length=200, top_p=0.95, temperature=1.95)
                                
tokenizer.decode(sample_output, skip_special_tokens=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'<|startoftext|> Mr. and Mrs. Dursley of number four Privet Drive were proud to say that they were perfectly normal thank you very much. They were the last people youd expect to be involved in anything strange or mysterious because they just didnt hold with such nonsense. a happy spell harry found them easy once harry had been informed that a large parcel called bdumbledore himself had taken possession o madam gam flitwick from professor dumbledore because they had been given restricted passage when a horrible surprise '