# Parse data and clean it

In [1]:
import requests
from multiprocessing.pool import ThreadPool
from bs4 import BeautifulSoup
import os
from tqdm import tqdm
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
class CleanHTML(object):

    def __init__(self, html):
        self.html_ = str(html)
        self.soup_ = None

    def get_soup(self):
        return self.soup_

    def process_html_tags(self, safe=None, unsafe=None):
        SAFE_TAGS = []

        safe = safe or SAFE_TAGS.copy()

        extract = ['script', 'style']
        save_attrs = ['highlight', 'iframe']

        safe = set(safe) - set(unsafe or [])

        soup = BeautifulSoup(self.html_, 'html5lib')

        for tag in soup.findAll():
            if tag.name not in save_attrs:
                tag.attrs = []

            if tag.name.lower() in extract:
                tag.extract()

            elif not tag.name.lower() in safe:
                tag.replaceWithChildren()
        self.soup_ = str(soup)

In [3]:
import re 

def clean_html(text):
    try:
        if (text):
            tmp_obj = CleanHTML(text)
            tmp_obj.process_html_tags(None, None)
            tmp_var = tmp_obj.get_soup()
            # remove reference
            tmp_obj = None
            return tmp_var
        else:
            return None
    except Exception as ex:
        raise ex

def remove_url_if_possible(text):
    if (text):
        result = re.sub(
            r'(http|ftp|https):\/\/([\w\-_]+(?:(?:\.[\w\-_]+)+))([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?',
            ' ', text)
        return result
    else:
        return None

def remove_multy_spaces(text):
    try:
        text = re.sub(r'\s+', ' ', text)
        return text
    except Exception as ex:
        return None

In [4]:
main_url = 'https://www.hplovecraft.com/writings/fiction/'
texts_url = 'https://www.hplovecraft.com/writings/texts/fiction'

In [5]:
body = requests.post(main_url).text

In [6]:
soup = BeautifulSoup(body)

In [7]:
urls = []
for h in soup.findAll('li'):
    
    a = h.find('a')
    try:
          
        if 'href' in a.attrs:
              
            url = a.get('href')
              
            urls.append(url)
    except:
        pass
urls = [os.path.join(texts_url,i) for i in urls]

In [8]:
len(urls)

105

In [9]:
def get_url_body(url):
  body = requests.post(url).text
  return body

In [10]:
with ThreadPool(6) as pool:
  parsed = list(tqdm(pool.imap(get_url_body, urls), total=len(urls)))

100%|██████████| 105/105 [00:05<00:00, 20.73it/s]


In [11]:
def parse_text(parsed):
  soup = BeautifulSoup(parsed)
  text = None
  for i in soup.findAll('tr'):
     res = i.find('div', attrs={'align':"justify"})
     if res:
       text = remove_url_if_possible(clean_html(res)).strip()
       break
  return text


In [12]:
texts = [parse_text(i) for i in parsed]
texts = [i for i in texts if not(i is None)]

In [13]:
texts = [remove_multy_spaces(i) for i in texts]

In [14]:
sentences = [sent_tokenize(i) for i in texts]

In [17]:
min([len(i) for i in sentences]), max([len(i) for i in sentences])

(8, 1751)

# Fine-tuning GPT2

In [19]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 29.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 26.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 70.9 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninsta

In [20]:
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

In [18]:
!nvidia-smi

Sat Jul 30 06:31:40 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   56C    P8    10W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [21]:
torch.manual_seed(0)


<torch._C.Generator at 0x7fe96ede07f0>

In [20]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium', bos_token='<start>',
                                          eos_token='<end>', pad_token='<pad>')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').cuda()
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50260, 1024)

In [21]:
sentences = np.hstack(sentences)

In [22]:
max_length = max([len(tokenizer.encode(sentence)) for sentence in sentences])


In [23]:
max_length

299

In [24]:
class LovecraftDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length
                 ):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<start>' + txt + '<end>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [25]:
dataset = LovecraftDataset(sentences, tokenizer, max_length=max_length)


In [26]:
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [27]:
torch.cuda.empty_cache()
import gc 
gc.collect();


In [28]:
training_args = TrainingArguments(output_dir='drive/MyDrive/model_gpt_lovecraft', num_train_epochs=2, logging_steps=1000, save_steps=10000,
                                  per_device_train_batch_size=1, per_device_eval_batch_size=1,
                                  warmup_steps=100, weight_decay=0.06, logging_dir='./logs', report_to = 'none')

In [29]:
Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

***** Running training *****
  Num examples = 25799
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 25799


Step,Training Loss
1000,0.6443
2000,0.4716
3000,0.4417
4000,0.4324
5000,0.4271
6000,0.4249
7000,0.4211
8000,0.4334
9000,0.4331
10000,0.4282


Saving model checkpoint to drive/MyDrive/model_gpt_lovecraft/checkpoint-10000
Configuration saved in drive/MyDrive/model_gpt_lovecraft/checkpoint-10000/config.json
Model weights saved in drive/MyDrive/model_gpt_lovecraft/checkpoint-10000/pytorch_model.bin
Saving model checkpoint to drive/MyDrive/model_gpt_lovecraft/checkpoint-20000
Configuration saved in drive/MyDrive/model_gpt_lovecraft/checkpoint-20000/config.json
Model weights saved in drive/MyDrive/model_gpt_lovecraft/checkpoint-20000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=25799, training_loss=0.4293358824715983, metrics={'train_runtime': 10489.7958, 'train_samples_per_second': 2.459, 'train_steps_per_second': 2.459, 'total_flos': 1.3992002375430144e+16, 'train_loss': 0.4293358824715983, 'epoch': 1.0})

# Generate

In [27]:
model = GPT2LMHeadModel.from_pretrained('drive/MyDrive/model_gpt_lovecraft/checkpoint-10000',
                                        local_files_only=True)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium', bos_token='<start>',
                                          eos_token='<end>', pad_token='<pad>')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [28]:
generated = tokenizer("<start> ", return_tensors="pt").input_ids.cpu()


In [29]:
sample_outputs = model.generate(generated, do_sample=True, top_k=50, 
                                max_length=100, top_p=0.95, temperature=1.9, num_return_sequences=5)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [30]:
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
    print('*'*30)

0:  ~~~~~~~~.” The old man paused as Wilcox made an appearance with two others. 3 copies were given each, accompanied on each one by a blank blank message printed out by Joseph Curwen, Phaesarian 0 copies of those earlier copies received. ~~~~~~~~ ——— It was then that the two
******************************
1:  ‘It’s the right place for men”—as the police and district officers now agree— and that I know where to bring home the men so you’ll know when there are too dangerous for the local criminals to handle. In these parts there seem more such houses and shops now than formerly..—and..” Mevana came within an opening in these walls when
******************************
2:  ’Nyl ‘Tohama, the way?” “And, for a boy, I cannot go up a ladder of even my size. d? ‘da da da dazhin? d’ autodecabatas. de * * * ** * ‣da da dazhin?” * * * * Ummm,
******************************
3:  ’Bare to our heads must go is thy life shall go. ATD Nd auß ATd nn Nu. DN DN –AT’N’S A DE M D E PEUN’C DE M E X Nd Nn au l

In [31]:
generated = tokenizer("The ancients", return_tensors="pt").input_ids.cpu()


In [32]:
sample_outputs = model.generate(generated, dtop_p=0.95, temperature=1.9, do_sample=True,
                                max_length=100, num_return_sequences=5)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [33]:
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
    print('*'*30)

0: The ancients are so vain’ to find time—for in this world the time you do know or know how is not at once—I suppose that our generation�ks have had that idea about time. The picture has ceased to exist—a useless but incredibly accurate and faithful copy of some image of our own.” And suddenly I screamed aloud and became frightened, for before my mind and my mind only were still frozen in horror
******************************
1: The ancients called forth from the cavern depths above a dark, nebous fear, saying, that they had felt as bad, and did all them bad with wilders like wild dogs which grow in love too much into cats; and that something was, or had formerly been thought within their ranks..,! “Oh dear,” began Nodens
******************************
2: The ancients who saw him are silent, and all of one heart; because that is what has told, and those are the men behind that whisper—those are those whines and murmure whereon many have forgotten the tale, what things of an early gene