# Parse data and clean it

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import requests
from multiprocessing.pool import ThreadPool
from bs4 import BeautifulSoup
import os
from tqdm import tqdm
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
class CleanHTML(object):

    def __init__(self, html):
        self.html_ = str(html)
        self.soup_ = None

    def get_soup(self):
        return self.soup_

    def process_html_tags(self, safe=None, unsafe=None):
        SAFE_TAGS = []

        safe = safe or SAFE_TAGS.copy()

        extract = ['script', 'style']
        save_attrs = ['highlight', 'iframe']

        safe = set(safe) - set(unsafe or [])

        soup = BeautifulSoup(self.html_, 'html5lib')

        for tag in soup.findAll():
            if tag.name not in save_attrs:
                tag.attrs = []

            if tag.name.lower() in extract:
                tag.extract()

            elif not tag.name.lower() in safe:
                tag.replaceWithChildren()
        self.soup_ = str(soup)

In [None]:
import re 

def clean_html(text):
    try:
        if (text):
            tmp_obj = CleanHTML(text)
            tmp_obj.process_html_tags(None, None)
            tmp_var = tmp_obj.get_soup()
            # remove reference
            tmp_obj = None
            return tmp_var
        else:
            return None
    except Exception as ex:
        raise ex

def remove_url_if_possible(text):
    if (text):
        result = re.sub(
            r'(http|ftp|https):\/\/([\w\-_]+(?:(?:\.[\w\-_]+)+))([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?',
            ' ', text)
        return result
    else:
        return None

def remove_multy_spaces(text):
    try:
        text = re.sub(r'\s+', ' ', text)
        return text
    except Exception as ex:
        return None

In [None]:
main_url = 'https://www.hplovecraft.com/writings/fiction/'
texts_url = 'https://www.hplovecraft.com/writings/texts/fiction'

In [None]:
body = requests.post(main_url).text

In [None]:
soup = BeautifulSoup(body)

In [None]:
urls = []
for h in soup.findAll('li'):
    
    a = h.find('a')
    try:
          
        if 'href' in a.attrs:
              
            url = a.get('href')
              
            urls.append(url)
    except:
        pass
urls = [os.path.join(texts_url,i) for i in urls]

In [None]:
len(urls)

105

In [None]:
def get_url_body(url):
  body = requests.post(url).text
  return body

In [None]:
with ThreadPool(6) as pool:
  parsed = list(tqdm(pool.imap(get_url_body, urls), total=len(urls)))

100%|██████████| 105/105 [00:22<00:00,  4.69it/s]


In [None]:
def parse_text(parsed):
  soup = BeautifulSoup(parsed)
  text = None
  for i in soup.findAll('tr'):
     res = i.find('div', attrs={'align':"justify"})
     if res:
       text = remove_url_if_possible(clean_html(res)).strip()
       break
  return text


In [None]:
texts = [parse_text(i) for i in parsed]
texts = [i for i in texts if not(i is None)]

In [None]:
texts = [remove_multy_spaces(i) for i in texts]

In [None]:
sentences = [sent_tokenize(i) for i in texts]

In [None]:
min([len(i) for i in sentences]), max([len(i) for i in sentences])

(8, 1751)

# Fine-tuning GPT2

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 31.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 68.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 61.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.8 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninsta

In [None]:
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

In [None]:
!nvidia-smi

Sat Jul 30 09:58:11 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8     9W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
torch.manual_seed(0)


<torch._C.Generator at 0x7f2ed8deb2f0>

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium', bos_token='<start>',
                                          eos_token='<end>', pad_token='<pad>')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').cuda()
model.resize_token_embeddings(len(tokenizer))

Downloading vocab.json:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Embedding(50260, 1024)

In [None]:
sentences = np.hstack(sentences)

In [None]:
max_length = max([len(tokenizer.encode(sentence)) for sentence in sentences])


In [None]:
max_length

299

In [None]:
class LovecraftDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length
                 ):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<start>' + txt + '<end>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [None]:
dataset = LovecraftDataset(sentences, tokenizer, max_length=max_length)


In [None]:
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [None]:
torch.cuda.empty_cache()
import gc 
gc.collect();


In [None]:
training_args = TrainingArguments(output_dir='drive/MyDrive/model_gpt_lovecraft', num_train_epochs=2, logging_steps=1000, save_steps=10000,
                                  per_device_train_batch_size=1, per_device_eval_batch_size=1,
                                  warmup_steps=100, weight_decay=0.06, logging_dir='./logs', report_to = 'none')

In [None]:
trainer = Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])})

In [None]:
trainer.train()

***** Running training *****
  Num examples = 25799
  Num Epochs = 2
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 51598


Step,Training Loss
1000,0.4645
2000,0.4721
3000,0.443
4000,0.433
5000,0.4289
6000,0.4275
7000,0.4233
8000,0.4367
9000,0.4369
10000,0.4321


Saving model checkpoint to drive/MyDrive/model_gpt_lovecraft/checkpoint-10000
Configuration saved in drive/MyDrive/model_gpt_lovecraft/checkpoint-10000/config.json
Model weights saved in drive/MyDrive/model_gpt_lovecraft/checkpoint-10000/pytorch_model.bin
Saving model checkpoint to drive/MyDrive/model_gpt_lovecraft/checkpoint-20000
Configuration saved in drive/MyDrive/model_gpt_lovecraft/checkpoint-20000/config.json
Model weights saved in drive/MyDrive/model_gpt_lovecraft/checkpoint-20000/pytorch_model.bin
Saving model checkpoint to drive/MyDrive/model_gpt_lovecraft/checkpoint-30000
Configuration saved in drive/MyDrive/model_gpt_lovecraft/checkpoint-30000/config.json
Model weights saved in drive/MyDrive/model_gpt_lovecraft/checkpoint-30000/pytorch_model.bin
Saving model checkpoint to drive/MyDrive/model_gpt_lovecraft/checkpoint-40000
Configuration saved in drive/MyDrive/model_gpt_lovecraft/checkpoint-40000/config.json
Model weights saved in drive/MyDrive/model_gpt_lovecraft/checkpoint-

TrainOutput(global_step=51598, training_loss=0.38298376347973456, metrics={'train_runtime': 20723.7711, 'train_samples_per_second': 2.49, 'train_steps_per_second': 2.49, 'total_flos': 2.798400475086029e+16, 'train_loss': 0.38298376347973456, 'epoch': 2.0})

In [None]:
trainer.save_model('drive/MyDrive/model_gpt_lovecraft/trained_model')

Saving model checkpoint to drive/MyDrive/model_gpt_lovecraft/trained_model
Configuration saved in drive/MyDrive/model_gpt_lovecraft/trained_model/config.json
Model weights saved in drive/MyDrive/model_gpt_lovecraft/trained_model/pytorch_model.bin


In [None]:
result = trainer.evaluate(val_dataset)


***** Running Evaluation *****
  Num examples = 2867
  Batch size = 1


In [None]:
result

{'epoch': 2.0,
 'eval_loss': 0.40212735533714294,
 'eval_runtime': 225.6686,
 'eval_samples_per_second': 12.704,
 'eval_steps_per_second': 12.704}

# Generate

In [None]:
generated = tokenizer("Ctulhu", return_tensors="pt").input_ids.cuda()


In [None]:
sample_outputs = model.generate(generated, dtop_p=0.99, temperature=1.9, do_sample=True,
                                max_length=300, num_return_sequences=20)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
    print('*'*30)

0: Ctulhu has been talking long to another elder.
******************************
1: Ctulhu now sank flat upon his shoulder with an appearance of fatigue; but that his worn face had come open was of more interest to all than his exhausted form on either of the three occasions brought out.
******************************
2: Ctulhu himself would come along in his motor at a time when Lake, of Khephrenium origin, was abroad; so that a motor would probably take care of the long bus trip down and down to Teloe across the regions of air or into distant gulfs of earth from Gondral Desert, according to Lake’s suggested theories.
******************************
3: Ctulhu is a very strange, grotesque God, indeed, and he will come to the world many times—if through men, but very shortly after men’s ultimate defeat at sea against him; which men know, though from some other god or forces that none of me know.
******************************
4: Ctulhu could never get out, though I fancy she would have t

In [None]:
generated = tokenizer("The old folk", return_tensors="pt").input_ids.cuda()


In [None]:
sample_outputs = model.generate(generated, dtop_p=0.99, temperature=1.9, do_sample=True,
                                max_length=300, num_return_sequences=20)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
    print('*'*30)

0: The old folk like those to whom that amulet had come often turned heads in tales of what the evil-faced merchant implied had struck poor Arthur Munroe hard in 1786; while the bearded farmer wondered whether some devilish deed could explain his frequent trips and travels that went on to that of great-great-grandfather Wilbour to Albany, and perhaps elsewhere to London and Salem and Cape Girardeau in some secret, forgotten cellar room, with never being told about the matter.
******************************
1: The old folk of the land used curious stones and made other wonders behind that black key with traces of it, but this little daemon he alone dared touch.
******************************
2: The old folk talked less, but there seemed a kind of suspicion that their silent and flaky faces were likethose before that terrible night in the village, as if evil laughter came from beyond the trees with that wailing callous of years far below the waves.
******************************
3: The o

# Conclusion
As it can be seen, tuned gpt-2 learns the structure and style of author's writing. In particular, the examples of it are the following:

1) *The old folk used to stand around each nameless corpse like the villagers did around Gawd’s statue, and there were also small farms where gugs had been shot on Sabbat-responsible days and the heads of beasts were devoured.*

2) *The old folk like those to whom that amulet had come often turned heads in tales of what the evil-faced merchant implied had struck poor Arthur Munroe hard in 1786; while the bearded farmer wondered whether some devilish deed could explain his frequent trips and travels that went on to that of great-great-grandfather Wilbour to Albany, and perhaps elsewhere to London and Salem and Cape Girardeau in some secret, forgotten cellar room, with never being told about the matter.*

3) *Ctulhu is a very strange, grotesque God, indeed, and he will come to the world many times—if through men, but very shortly after men’s ultimate defeat at sea against him; which men know, though from some other god or forces that none of me know.*

From the examples mentioned above it's clear, that generated text has attirbutes specific to the author's style (used words, names of characters from his compositions, overall context of the text which has a shade of horror)

