In [1]:
%env FASTAI_HOME=.
# from https://medium.com/@pierre_guillou/faster-than-training-from-scratch-fine-tuning-the-english-gpt-2-in-any-language-with-hugging-f2ec05c98787

In [2]:
# Freeze versions of dependencies for now
!pip install fastai2==0.0.30 fastcore==1.0.0 tokenizers==0.8.1rc2 transformers==3.3.1 torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html


from fastai2.text.all import *
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
from tokenizers import ByteLevelBPETokenizer

import logging
# Root logger: INFO level, timestamped records separated by em dashes.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s — %(levelname)s — %(name)s — %(message)s",
)

Looking in links: https://download.pytorch.org/whl/torch_stable.html


In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cuda


In [4]:
# Working directory for the corpus: <configured data path>/<lang>wiki
lang = 'es'
name = lang + 'wiki'
config = Config()
data_path = config['data_path']
path_data = data_path / name
path_data.mkdir(parents=True, exist_ok=True)
path_data

Path('data/eswiki')

# Wiki download and extraction

In [5]:
# source: https://github.com/fastai/course-nlp/blob/master/nlputils.py

from fastai2.basics import *
import re
import pandas as pd


def get_wiki(path,lang):
    """Download the latest `lang` Wikipedia dump and extract it to the single file `path/{lang}wiki`.

    Steps (each one is skipped when its output is already present):
      1. download and bunzip `{lang}wiki-latest-pages-articles.xml` into `path`;
      2. run wikiextractor on it, which writes `path/text/AA/wiki_00`;
      3. move that file to `path/{lang}wiki` and delete `path/text`.

    Args:
        path: directory (a `Path`) that holds the dump and receives the result.
        lang: Wikipedia language code, e.g. 'es'.

    Returns:
        The `Path` of the extracted corpus file, `path/{lang}wiki`, also when it
        already existed and nothing was done.
    """
    name = f'{lang}wiki'
    dest = path/name
    if dest.exists():
        print(f"{path/name} already exists; not downloading")
        return dest

    xml_fn = f"{lang}wiki-latest-pages-articles.xml"
    zip_fn = f"{xml_fn}.bz2"

    if not (path/xml_fn).exists():
        print("downloading...")
        download_url(f'https://dumps.wikimedia.org/{name}/latest/{zip_fn}', path/zip_fn)
        print("unzipping...")
        bunzip(path/zip_fn)

    # Extraction
    extracted = path/'text/AA/wiki_00'
    if not extracted.exists():
        print("extracting...")
        try:
            # Only probes whether the package is installed; the extractor itself runs as a subprocess.
            from wikiextractor import WikiExtractor
        except ImportError:
            os.system('pip install wikiextractor')
        os.system(f"python -m wikiextractor.WikiExtractor --processes 4 --no_templates --min_text_length 1800 --filter_disambig_pages --log_file log -b 100G -q {path/xml_fn}")

    # `dest` must not exist as a directory: shutil.move would then put the dump at
    # dest/wiki_00, whereas split_wiki opens `path/name` itself as the corpus file.
    dest.parent.mkdir(exist_ok=True, parents=True)
    shutil.move(str(extracted), str(dest))
    shutil.rmtree(path/'text')
    return dest

def split_wiki(path,lang):
    """Split the extracted corpus file `path/{lang}wiki` into one text file per article.

    Every `<doc id=... title="...">` header starts a new file `path/docs/{title}.txt`
    ('/' in the title is replaced by '_'); the lines that follow, up to the next
    header, are written to it. Articles whose title is longer than 150 characters
    are skipped entirely.

    Args:
        path: directory (a `Path`) containing the corpus file.
        lang: Wikipedia language code, e.g. 'es'.

    Returns:
        The `Path` of the `docs` directory. If it already exists nothing is done.
    """
    dest = path/'docs'
    name = f'{lang}wiki'
    if dest.exists():
        print(f"{dest} already exists; not splitting")
        return dest

    dest.mkdir(exist_ok=True, parents=True)
    title_re = re.compile(rf'<doc id="\d+" url="https://{lang}.wikipedia.org/wiki\?curid=\d+" title="([^"]+)">')
    f = None  # handle of the article currently being written; None while skipping

    try:
        with (path/name).open(encoding='utf-8') as lines:
            for i,l in enumerate(lines):
                if i%100000 == 0: print(i)
                if l.startswith('<doc id="'):
                    # A new article starts: the previous one is finished in every case.
                    if f:
                        f.close()
                        f = None
                    title = title_re.findall(l)[0].replace('/','_')
                    # Over-long titles are skipped; `f` stays None so the article body
                    # is dropped instead of being appended to the previous article.
                    if len(title)>150: continue
                    f = (dest/f'{title}.txt').open('w', encoding='utf-8')
                elif f:
                    f.write(l)
    finally:
        if f: f.close()
    return dest

def clean_files(dest):
    """Rewrite every file in `dest` in place, keeping only the article body.

    For each file the first line (the title) and the closing `</doc>` tag are
    dropped and surrounding whitespace is stripped. A file that contains no
    `</doc>` tag is left untouched; it is presumably already cleaned, which makes
    the function safe to run twice on the same directory.

    Args:
        dest: directory whose entries are listed with `dest.ls()`.
    """
    doc_re = re.compile(rf'([\w\W]*)<\/doc>') # everything before the last </doc>

    for l in dest.ls():
        with l.open('r+', encoding="utf-8") as f:
            # skip the first line, which is the title
            f.readline()
            found = doc_re.findall(f.read())
            if not found: continue
            text = found[0].strip()
            # replace the file content by the cleaned text
            f.seek(0)
            f.truncate()
            f.write(text)
        
def get_one_clean_file(dest,lang):
    """Concatenate the cleaned body of every article in `dest` into one UTF-8 text file.

    For each file the first line (the title) and the closing `</doc>` tag are
    dropped and surrounding whitespace is stripped; each body is followed by a
    newline. The result is written to `dest.parent/all_texts_{lang}wiki.txt`.

    Args:
        dest: directory of per-article files, listed with `dest.ls()`.
        lang: Wikipedia language code used in the output file name.

    Raises:
        IndexError: if a file contains no `</doc>` tag.
    """
    fname = f'all_texts_{lang}wiki.txt'
    doc_re = re.compile(rf'([\w\W]*)<\/doc>') # everything before the last </doc>

    # Write with the same encoding the articles are read with, independent of the locale.
    with open(dest.parent/fname, 'w', encoding="utf-8") as fp:
        for i,l in enumerate(dest.ls()):
            # read the content without the first line, which is the title
            with l.open('r', encoding="utf-8") as f:
                f.readline()
                text = f.read()
            # drop </doc> and the whitespace at the head and tail
            text = doc_re.findall(text)[0].strip()
            fp.write(text)
            fp.write("\n")
            if not (i % 1000): print(i)

    print(f"all texts from wikipedia {lang} in the file {dest.parent/fname}\n")

def get_one_clean_csv_file(dest,lang):
    """Collect the cleaned body of every article in `dest` into a one-column CSV file.

    For each file the first line (the title) and the closing `</doc>` tag are
    dropped and surrounding whitespace is stripped. The bodies become the rows of
    a `text` column saved to `dest.parent/all_texts_{lang}wiki.csv` without index.

    Args:
        dest: directory of per-article files, listed with `dest.ls()`.
        lang: Wikipedia language code used in the output file name.

    Raises:
        IndexError: if a file contains no `</doc>` tag.
    """
    fname = f'all_texts_{lang}wiki.csv'
    doc_re = re.compile(rf'([\w\W]*)<\/doc>') # everything before the last </doc>

    all_texts = []
    for l in dest.ls():
        # Read-only access is enough; the context manager closes the handle even on error.
        with l.open('r', encoding="utf-8") as f:
            f.readline()  # skip the first line, which is the title
            text = f.read()
        all_texts.append(doc_re.findall(text)[0].strip())

    # One row per article
    df = pd.DataFrame(all_texts, columns = ['text'])

    # save
    df.to_csv(dest.parent/fname, index=False)
    print(f"all texts from wikipedia {lang} in the file {dest.parent/fname}\n")
                         
def get_num_tokens(dest):
    """Count the files in `dest` and the whitespace-separated words they contain.

    Args:
        dest: directory whose entries are listed with `dest.ls()`.

    Returns:
        `(num_files, num_tokens)`; `(0, 0)` for an empty directory.
    """
    files = dest.ls()
    num_tokens = 0

    for l in files:
        with l.open('r', encoding="utf-8") as f:
            num_tokens += len(f.read().split())

    # Taken from the listing, not from the loop index, so an empty directory works too.
    return len(files), num_tokens

In [6]:
#get_wiki(path_data, lang)

In [7]:
# Corpus preparation steps. The calls themselves are commented out below, so
# running this cell only prints the two progress messages.
print("create one text file by article")
#dest = split_wiki(path_data,lang)
print("get all articles in one text file and one csv file")
#get_one_clean_file(dest,lang)
#get_one_clean_csv_file(dest,lang)

create one text file by article
get all articles in one text file and one csv file


In [8]:
%%time
# Size of downloaded data in the docs folder
#num_files, num_tokens = get_num_tokens(dest)
#print(f'{num_files} files - {num_tokens} tokens')

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.72 µs


# Tokenization

In [9]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel

# The GPT2 Model transformer with a language modeling head on top
# (linear layer with weights tied to the input embeddings)

# GPT2Tokenizer: https://huggingface.co/transformers/model_doc/gpt2.html#gpt2tokenizer
# GPT2TokenizerFast: https://huggingface.co/transformers/model_doc/gpt2.html#gpt2tokenizerfast
# GPT2LMHeadModel: https://huggingface.co/transformers/model_doc/gpt2.html#transformers.GPT2LMHeadModel

# Name of the pretrained checkpoint; the tokenizer and the model are both loaded from it
pretrained_weights = 'gpt2'
tokenizer_en = GPT2TokenizerFast.from_pretrained(pretrained_weights)
model_en = GPT2LMHeadModel.from_pretrained(pretrained_weights)

In [10]:
# Use the end-of-text token as padding token; this avoids the pad_token warning
# raised by GPT2TokenizerFast.
# source: https://github.com/huggingface/transformers/issues/2648#issuecomment-616177044
tokenizer_en.pad_token = tokenizer_en.eos_token
# Display the configuration of the pretrained model
model_en.config

GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "vocab_size": 50257
}

In [None]:
# 1. Get the pre-trained GPT2 Tokenizer (pre-trained with an English
# corpus) from the Transformers library (Hugging Face)
from tokenizers import ByteLevelBPETokenizer

pretrained_weights = 'gpt2'
tokenizer_en = GPT2TokenizerFast.from_pretrained(pretrained_weights)
tokenizer_en.pad_token = tokenizer_en.eos_token

# 2. Train a Byte Level BPE (BBPE) tokenizer on the Wikipedia corpus
# of language `lang` by using the Tokenizers library (Hugging Face)

# 2.1 Reuse the vocab size of the English GPT2 tokenizer for the new tokenizer
ByteLevelBPE_tokenizer_es_vocab_size = tokenizer_en.vocab_size

# 2.2 ByteLevelBPETokenizer Represents a Byte-level BPE
# as introduced by OpenAI with their GPT-2 model
ByteLevelBPE_tokenizer_es = ByteLevelBPETokenizer()

# 2.3 Get list of paths to corpus files
# and customize training with <|endoftext|> special GPT-2 token.
# The file name follows `lang`, matching the name used by get_one_clean_file.
paths = [str(path_data/f'all_texts_{lang}wiki.txt')]
ByteLevelBPE_tokenizer_es.train(files=paths,
                    vocab_size=ByteLevelBPE_tokenizer_es_vocab_size,
                    min_frequency=2,
                    special_tokens=["<|endoftext|>"])

# Get sequence length max of 1024
ByteLevelBPE_tokenizer_es.enable_truncation(max_length=1024)

# 2.4 save tokenizer (mkdir with exist_ok=True is already a no-op for an existing directory)
path_to_ByteLevelBPE_tokenizer_es_rep = path_data/'ByteLevelBPE_tokenizer_es'
path_to_ByteLevelBPE_tokenizer_es_rep.mkdir(exist_ok=True, parents=True)
ByteLevelBPE_tokenizer_es.save_model(str(path_to_ByteLevelBPE_tokenizer_es_rep))

In [None]:
# Load the tokenizer ByteLevelBPE_tokenizer_es
from tokenizers import ByteLevelBPETokenizer

# Get the path to ByteLevelBPE_tokenizer_es config files
path_to_ByteLevelBPE_tokenizer_es_rep = path_data/'ByteLevelBPE_tokenizer_es'

# Rebuild the tokenizer from the vocab.json / merges.txt files in that directory
ByteLevelBPE_tokenizer_es = ByteLevelBPETokenizer(
    vocab_file=f'{path_to_ByteLevelBPE_tokenizer_es_rep}/vocab.json',
    merges_file=f'{path_to_ByteLevelBPE_tokenizer_es_rep}/merges.txt'
)

# Get sequence length max of 1024
ByteLevelBPE_tokenizer_es.enable_truncation(max_length=1024)


# 3. Import the tokenizer config files in Spanish into the pre-trained GPT2 Tokenizer
# (<|endoftext|> is also used as padding token)
tokenizer_es = GPT2TokenizerFast.from_pretrained(
    str(path_to_ByteLevelBPE_tokenizer_es_rep), 
    pad_token='<|endoftext|>')
# Get sequence length max of 1024
tokenizer_es.model_max_length = 1024

In [None]:
# Tokens of the Spanish vocabulary ordered by ascending token id
ByteLevelBPE_tokenizer_es_vocab = ByteLevelBPE_tokenizer_es.get_vocab()
ByteLevelBPE_tokenizer_es_vocab_ls = sorted(
    ByteLevelBPE_tokenizer_es_vocab,
    key=ByteLevelBPE_tokenizer_es_vocab.get,
)
len(ByteLevelBPE_tokenizer_es_vocab_ls),ByteLevelBPE_tokenizer_es_vocab_ls[:5]

In [None]:
from fastai2.text.all import *

# 1. GPT2TokenizerFast (imported GPT-2 tokenizer) → fastai Tokenizer
class TransformersTokenizer(Transform):
    "Wrap a Hugging Face `tokenizer` as a fastai `Transform` (text -> token ids and back)."
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def encodes(self, x):
        "Tokenize `x` and return the ids of its tokens as a tensor."
        token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(x))
        return tensor(token_ids)

    def decodes(self, x):
        "Decode the token ids in `x` back to text, wrapped in a `TitledStr`."
        ids = x.cpu().numpy()
        return TitledStr(self.tokenizer.decode(ids))

# fastai wrappers around the English and the Spanish tokenizer
tokenizer_fastai_en = TransformersTokenizer(tokenizer_en)
tokenizer_fastai_es = TransformersTokenizer(tokenizer_es)

# Get a detached copy of the weights of the old wte (input embedding matrix)
old_wgts = model_en.transformer.get_input_embeddings().weight.clone().detach()

# Get the mean embedding vector of the old wte (mean over dimension 0)
wgts_m = old_wgts.mean(0)

# Initialize vocab size and weights of the new wte
# (zeros, one row per token of the Spanish vocab, same embedding width as the old wte)
new_vocab_size = tokenizer_fastai_es.tokenizer.vocab_size
new_wgts = old_wgts.new_zeros(new_vocab_size,old_wgts.size(1))

# Get the new wte keeping the embedding vectors of tokens in common in the 2 vocabs
# A token present in the new vocab but not in the old one gets the mean embedding vector of the old wte
old_vocab = tokenizer_fastai_en.tokenizer.get_vocab()
new_vocab = tokenizer_fastai_es.tokenizer.get_vocab()
same_tokens_list = list()
different_tokens_list = list()
    
for w,idx_new in new_vocab.items():    
    # -1 marks a token that does not exist in the old vocab
    idx_old = old_vocab.get(w, -1)
    if idx_old>=0:
        new_wgts[idx_new] = old_wgts[idx_old]
        same_tokens_list.append((w,idx_new))
    else:
        new_wgts[idx_new] = wgts_m
        different_tokens_list.append((w,idx_new))

# Set up the new wte in the model
new_wte = nn.Embedding(new_vocab_size,old_wgts.size(1))
#new_wte.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
new_wte.weight.data = new_wgts
model_en.transformer.set_input_embeddings(new_wte)
print(f'Spanish wte matrix setup done!\n\nWe kept {len(same_tokens_list)} embeddings vectors from the English one.\nWe did not kept {len(different_tokens_list)} embeddings vectors from the English one (instead, we used the old wte mean vector).\n')

# Show the first tokens that are shared / not shared between the 2 vocabs
num = 15
print(f'{num} first tokens IN common between the 2 vocabs:\n{same_tokens_list[:num]}\n')
print(f'{num} first tokens NOT in common between the 2 vocabs:\n{different_tokens_list[:num]}')

# save new_wgts
torch.save(new_wgts, path_data/'new_wte_wgts.es')
# save same_tokens_list and different_tokens_list
torch.save(same_tokens_list, path_data/'same_tokens_list.es')
torch.save(different_tokens_list, path_data/'different_tokens_list.es')

In [None]:
# load new_wgts
new_wgts = torch.load(path_data/'new_wte_wgts.es')
# load same_tokens_list and different_tokens_list
same_tokens_list = torch.load(path_data/'same_tokens_list.es')
different_tokens_list = torch.load(path_data/'different_tokens_list.es')

# Set up the new wte in the model.
# The embedding is sized from the loaded matrix itself (rows = vocab size,
# columns = embedding width), so this cell does not rely on `new_vocab_size`
# and `old_wgts` being left over from the cell that computed the weights.
new_wte = nn.Embedding(new_wgts.size(0), new_wgts.size(1))
new_wte.weight.data = new_wgts
model_en.transformer.set_input_embeddings(new_wte)
print(f'Spanish wte matrix setup done!\n\nWe kept {len(same_tokens_list)} embeddings vectors from the English one.\nWe did not kept {len(different_tokens_list)} embeddings vectors from the English one (instead, we used the old wte mean vector).\n')

# Show the first tokens that are shared / not shared between the 2 vocabs
num = 15
print(f'{num} first tokens IN common between the 2 vocabs:\n{same_tokens_list[:num]}\n')
print(f'{num} first tokens NOT in common between the 2 vocabs:\n{different_tokens_list[:num]}')

In [None]:
# Check that the rows of the new wte matrix are the expected ones:
#  - a token that also exists in the old vocab must carry its old embedding vector,
#  - any other token must carry the mean vector of the old wte.
old_vocab = tokenizer_fastai_en.tokenizer.get_vocab()

def _count_embedding_mismatches(tokens_list):
    """Print and count the `(token, id)` pairs of `tokens_list` whose row is unexpected.

    A pair is a mismatch when the Spanish tokenizer maps the id back to a
    different token, or when the row of `new_wgts` differs from the expected
    vector (old embedding if the token is in `old_vocab`, mean vector otherwise).
    """
    mismatches = 0
    for (tok,idx) in tokens_list:
        w = tokenizer_fastai_es.tokenizer.convert_ids_to_tokens(idx)
        tens_a = new_wgts[idx]
        idx_old = old_vocab.get(w, -1)
        tens_b = old_wgts[idx_old] if idx_old >= 0 else wgts_m
        identical = torch.all(tens_a.eq(tens_b))
        if (not identical) or (w != tok):
            print('idx,tok:',idx,tok)
            print('idx,w:',idx,w)
            print('idx_old:',idx_old)
            print('identical?',identical)
            mismatches += 1
    return mismatches

# Tokens in common between the 2 vocabs
count = _count_embedding_mismatches(same_tokens_list)
if count == 0:
    print(f'Great! All the embeddings vetors of the {len(same_tokens_list)} common tokens are the ones of the old wte matrix :-)\n')

# Tokens NOT in common (the Spanish tokenizer is used here as well)
count = _count_embedding_mismatches(different_tokens_list)
if count == 0:
    print(f'Great! All the embeddings vetors of the {len(different_tokens_list)} NOT common tokens are the old wte mean vetor :-)\n')

In [None]:
# Make lm_head use the very same weight object as the (new) wte
model_en.lm_head.weight = model_en.transformer.wte.weight
# Check actual weight of wte and lm_head and if wte = lm_head
tens_a = model_en.transformer.wte.weight
tens_b = model_en.lm_head.weight
model_en.transformer.wte.weight,model_en.lm_head.weight,torch.all(tens_a.eq(tens_b))

# Create fastai v2 Datasets and Dataloaders

In [None]:
# Read the article texts from all_texts_{lang}wiki.csv
# (presumably the file written by get_one_clean_csv_file — confirm the location)
lang = 'es'
fname = f'all_texts_{lang}wiki.csv'
df = pd.read_csv(path_data/fname)
# number of rows
len(df)

In [None]:
# Small sample (first 1000 rows) for quick experiments
df_sample = df[:1000]

# 80% of the sample for training, the rest for validation
num = int(0.8*len(df_sample))

# A permutation uses every row exactly once, so the two index sets are disjoint.
# (np.random.randint draws with replacement: rows would repeat, others would be
# left out, and the same article could land in both training and validation.)
idxs = np.random.permutation(len(df_sample))
idxs_train = idxs[:num]
idxs_val = idxs[num:]

# Training texts first, then validation texts ...
all_texts = np.concatenate([df_sample.iloc[idxs_train].text.values, df_sample.iloc[idxs_val].text.values])
# ... so the splits are positions in `all_texts`, not row ids of `df_sample`
splits = [list(range(num)), list(range(num, len(all_texts)))]
tls = TfmdLists(all_texts, TransformersTokenizer(tokenizer_es), splits=splits, dl_type=LMDataLoader)

In [None]:
# 80% of the rows for training, the rest for validation
num = int(0.8*len(df))

# A permutation uses every row exactly once, so the two index sets are disjoint.
# (np.random.randint draws with replacement: rows would repeat, others would be
# left out, and the same article could land in both training and validation.)
idxs = np.random.permutation(len(df))
idxs_train = idxs[:num]
idxs_val = idxs[num:]

#save idxs train and valid
torch.save(idxs_train, path_data/'idxs_train.pt')
torch.save(idxs_val, path_data/'idxs_val.pt')

In [None]:
# load idxs train and valid
idxs_train = torch.load(path_data/'idxs_train.pt')
idxs_val = torch.load(path_data/'idxs_val.pt')
# Training texts first, then validation texts ...
all_texts = np.concatenate([df.iloc[idxs_train].text.values, df.iloc[idxs_val].text.values])
# ... so the splits are positions in `all_texts`: the first len(idxs_train) items
# are the training set and the remaining ones the validation set. (The saved
# indices are row ids of `df`; they do not address the reordered `all_texts`.)
num_train = len(idxs_train)
splits = [list(range(num_train)), list(range(num_train, len(all_texts)))]
tls = TfmdLists(all_texts, TransformersTokenizer(tokenizer_es), splits=splits, dl_type=LMDataLoader)
#Since the GPT-2 model was trained with sequences of size 1024, we use this sequence length (it's a stateless model, so it will change the perplexity if we use less).
bs,sl = 8,1024
dls = tls.dataloaders(bs=bs, seq_len=sl)

In [None]:
def tokenize(text):
    "Return the token ids of `text` as a tensor, using the Spanish tokenizer `tokenizer_es`."
    # The notebook defines `tokenizer_en` and `tokenizer_es` but no plain `tokenizer`;
    # the texts tokenized here are the Spanish ones.
    toks = tokenizer_es.tokenize(text)
    return tensor(tokenizer_es.convert_tokens_to_ids(toks))
# Tokenize every text once and save the result so it can be reloaded later
tokenized = [tokenize(t) for t in progress_bar(all_texts)]
torch.save(tokenized, path_data/'tokenized_gpt2.es')

In [None]:
# Reload the pre-tokenized texts
tokenized_es = torch.load(path_data/'tokenized_gpt2.es')

class TransformersTokenizer(Transform):
    "fastai `Transform` around a Hugging Face `tokenizer` that also accepts pre-tokenized items."
    def __init__(self, tokenizer): self.tokenizer = tokenizer
    def encodes(self, x):
        "Return `x` unchanged if it is already a tensor of ids, otherwise tokenize it."
        if isinstance(x, Tensor): return x
        # Use the tokenizer given to the constructor rather than a notebook-level helper
        toks = self.tokenizer.tokenize(x)
        return tensor(self.tokenizer.convert_tokens_to_ids(toks))

    def decodes(self, x): return TitledStr(self.tokenizer.decode(x.cpu().numpy()))

In [None]:
# Datasets and dataloaders built from the pre-tokenized texts
tls2 = TfmdLists(tokenized_es, TransformersTokenizer(tokenizer_es), splits=splits, dl_type=LMDataLoader)
# The dataloaders must come from `tls2` (not `tls`), otherwise the pre-tokenized data is never used
dls2 = tls2.dataloaders(bs=bs, seq_len=sl)
dls2.show_batch(max_n=5)

In [None]:
class DropOutput(Callback):
    "Replace the prediction by its first element after each forward pass."
    def after_pred(self):
        self.learn.pred = self.pred[0]

def splitter(model):
    "Split a GPT2 `model` in 4 parameter groups for differential learning rates."
    blocks = model.transformer.h

    # Groups 1 to 3: decoder blocks 0-3, 4-7 and 8-11
    groups = [
        nn.Sequential(*(blocks[i] for i in range(first, first + 4)))
        for first in (0, 4, 8)
    ]

    # Group 4: embeddings matrices wte and wpe + LayerNorm at the model output
    groups.append(nn.Sequential(model.transformer.wte,
                                model.transformer.wpe,
                                model.transformer.ln_f))

    return L(groups).map(params)

# Learner: basic class for handling the training loop
# source: https://dev.fast.ai/learner#Learner
# DropOutput keeps only the first element of the model output as prediction;
# `splitter` defines the parameter groups used for freezing and learning rates.
learn = Learner(dls, model_en, loss_func=CrossEntropyLossFlat(),
                splitter = splitter,
                cbs=[DropOutput], 
                metrics=[accuracy, Perplexity()]).to_fp16()
# Create the optimizer now so its parameter groups can be inspected before training
learn.create_opt()
print(f'number of parameters groups: {len(learn.opt.param_groups)}')

# ... and the list of Learning Rates (before they are updated by the Optimizer of the function fit_one_cycle())
for i,h in enumerate(learn.opt.hypers):
    print(i,h)

In [None]:
# loss, accuracy, Perplexity() of validation dataset
# (run before any of the training steps below)
learn.validate()

In [None]:
#Freeze all layers but the last layers group (do not freeze wte, wpe embeddings matrices and last LayerNorm)
learn.freeze()
learn.summary()

In [None]:
# Learning rate finder, run before choosing the rates used below
learn.lr_find()

In [None]:
# Stage 1: 1 epoch at lr 2e-3 with the freezing set by learn.freeze() above
learn.fit_one_cycle(1, 2e-3)
learn.recorder.plot_loss()

In [None]:
# Save a checkpoint and reload it right away
learn.save(path_data/'GPT2_es_1epoch_lr2e-3')
learn = learn.load(path_data/'GPT2_es_1epoch_lr2e-3')

In [None]:
# Stage 2: 1 more epoch; presumably the last 2 layers groups are trainable after
# freeze_to(-2) — confirm against the fastai documentation.
# The slice gives a learning-rate range from 1e-3/(2.6**4) up to 1e-3.
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(1e-3/(2.6**4),1e-3))
learn.recorder.plot_loss()

In [None]:
# Save a checkpoint and reload it right away
learn.save(path_data/'GPT2_es_2epoch_lr1e-3')
learn = learn.load(path_data/'GPT2_es_2epoch_lr1e-3')

In [None]:
# Stage 3: 1 more epoch; presumably the last 3 layers groups are trainable after
# freeze_to(-3) — confirm. Learning-rate range from 5e-4/(2.6**4) up to 5e-4.
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(5e-4/(2.6**4),5e-4))
learn.recorder.plot_loss()

In [None]:
# Save a checkpoint and reload it right away
learn.save(path_data/'GPT2_es_3epoch_lr5e-4')
learn = learn.load(path_data/'GPT2_es_3epoch_lr5e-4')

In [None]:
# Stage 4: unfreeze everything and train 2 more epochs (5 epochs in total),
# learning-rate range from 1e-4/(2.6**4) up to 1e-4.
learn.unfreeze()
learn.fit_one_cycle(2, slice(1e-4/(2.6**4),1e-4))
learn.recorder.plot_loss()

In [None]:
# Save the final checkpoint and reload it
learn.save(path_data/'GPT2_es_5epoch_lr1e-4_v2')
learn = learn.load(path_data/'GPT2_es_5epoch_lr1e-4_v2')