In [None]:
%env FASTAI_HOME=/content
# FASTAI_HOME is set before fastai is imported; the data directory resolved later
# in this notebook (path_data) ends up under /content/data.
# Adapted from https://medium.com/@pierre_guillou/faster-than-training-from-scratch-fine-tuning-the-english-gpt-2-in-any-language-with-hugging-f2ec05c98787

env: FASTAI_HOME=/content


In [None]:
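# Install dependencies. Only fastcore is pinned (1.0.0); fastai2, tokenizers and transformers are unpinned and resolve to the latest release.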
!pip install fastai2 fastcore==1.0.0
!pip install tokenizers
!pip install transformers

from fastai2.text.all import *
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
from tokenizers import ByteLevelBPETokenizer

import logging
# Configure the root logger: show INFO and above, one record per line as
# "timestamp — level — logger name — message" with a month/day/year timestamp.
logging.basicConfig(
        format="%(asctime)s — %(levelname)s — %(name)s — %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
    )

Collecting fastai2
[?25l  Downloading https://files.pythonhosted.org/packages/cc/50/2f37212be57b7ee3e9c947336f75a66724468b21a3ca68734eaa82e7ebf3/fastai2-0.0.30-py3-none-any.whl (179kB)
[K     |█▉                              | 10kB 17.3MB/s eta 0:00:01[K     |███▋                            | 20kB 1.7MB/s eta 0:00:01[K     |█████▌                          | 30kB 2.3MB/s eta 0:00:01[K     |███████▎                        | 40kB 2.6MB/s eta 0:00:01[K     |█████████▏                      | 51kB 2.0MB/s eta 0:00:01[K     |███████████                     | 61kB 2.2MB/s eta 0:00:01[K     |████████████▉                   | 71kB 2.5MB/s eta 0:00:01[K     |██████████████▋                 | 81kB 2.8MB/s eta 0:00:01[K     |████████████████▍               | 92kB 2.9MB/s eta 0:00:01[K     |██████████████████▎             | 102kB 2.8MB/s eta 0:00:01[K     |████████████████████            | 112kB 2.8MB/s eta 0:00:01[K     |██████████████████████          | 122kB 2.8MB/s eta 0

In [None]:
# Language code of the Wikipedia edition to process; reused by later cells
lang = 'es'
# Name of the Wikimedia dump for that language, e.g. 'eswiki'
name = f'{lang}wiki'
# fastai configuration object; its 'data_path' entry is the root folder for datasets
config = Config()
data_path = config['data_path']
# Everything for this language is stored under <data_path>/<lang>wiki
path_data = data_path/name
path_data.mkdir(exist_ok=True, parents=True)
# Last expression: display the resolved folder as the cell output
path_data

Path('/content/data/eswiki')

In [None]:
# source: https://github.com/fastai/course-nlp/blob/master/nlputils.py

from fastai2.basics import *
import re
import pandas as pd


def get_wiki(path,lang):
    """Download the latest Wikipedia dump for `lang` and extract it to plain text.

    The final result is a single file ``path/f'{lang}wiki'`` holding every kept
    article wrapped in ``<doc ...> ... </doc>`` markers.

    Args:
        path: `Path` of the working folder; the dump, the log file and the
            extracted text are all written here.
        lang: Wikipedia language code (e.g. ``'es'``).

    Returns:
        None. Returns early (after printing a notice) when the extracted file
        already exists.

    Raises:
        subprocess.CalledProcessError: if the wikiextractor process exits with a
            non-zero status.

    Notes:
        `download_url` and `bunzip` are expected to be in scope already (this
        notebook star-imports fastai).
    """
    # Local imports keep the function self-contained within the notebook
    import subprocess, sys

    name = f'{lang}wiki'
    if (path/name).exists():
        print(f"{path/name} already exists; not downloading")
        return

    xml_fn = f"{lang}wiki-latest-pages-articles.xml"
    zip_fn = f"{xml_fn}.bz2"

    if not (path/xml_fn).exists():
        print("downloading...")
        download_url(f'https://dumps.wikimedia.org/{name}/latest/{zip_fn}', path/zip_fn)
        print("unzipping...")
        bunzip(path/zip_fn)

    print("extracting...")
    try:
        from wikiextractor import WikiExtractor  # noqa: F401 -- availability check only
    except ImportError:
        # Install into the interpreter that runs this kernel, not whichever `pip` is on PATH
        subprocess.run([sys.executable, '-m', 'pip', 'install', 'wikiextractor'])

    # Run the extractor with `path` as its working directory instead of calling
    # os.chdir(): the notebook's own cwd is never modified (so it cannot be left
    # changed after a failure) and a relative `path` keeps resolving correctly below.
    extract_cmd = [
        sys.executable, '-m', 'wikiextractor.WikiExtractor',
        '--processes', '4', '--no_templates',
        '--min_text_length', '1800', '--filter_disambig_pages',
        '--log_file', 'log', '-b', '100G', '-q', xml_fn,
    ]
    subprocess.run(extract_cmd, cwd=str(path), check=True)

    # With a 100G chunk size the extractor is expected to emit one file, text/AA/wiki_00
    shutil.move(str(path/'text/AA/wiki_00'), str(path/name))
    shutil.rmtree(path/'text')

def split_wiki(path,lang):
    """Split the extracted dump ``path/f'{lang}wiki'`` into one text file per article.

    Each article starts at a ``<doc id=... title="...">`` header line. The lines
    that follow (up to the next header) are written to ``path/'docs'/f'{title}.txt'``,
    with ``/`` in the title replaced by ``_``.

    Articles whose title is longer than 150 characters, or whose header does not
    match the expected pattern, are skipped entirely: nothing is written for
    them and their lines are not appended to the previous article.

    Args:
        path: `Path` of the folder containing the extracted dump file.
        lang: Wikipedia language code used in the dump name and in article URLs.

    Returns:
        The `Path` of the ``docs`` folder. If it already exists, nothing is
        split and the existing folder is returned.
    """
    dest = path/'docs'
    name = f'{lang}wiki'
    if dest.exists():
        print(f"{dest} already exists; not splitting")
        return dest

    dest.mkdir(exist_ok=True, parents=True)
    # Dots are escaped so they only match literal dots in the article URL
    title_re = re.compile(rf'<doc id="\d+" url="https://{re.escape(lang)}\.wikipedia\.org/wiki\?curid=\d+" title="([^"]+)">')

    out = None  # handle of the article currently being written; None while skipping
    try:
        with (path/name).open(encoding="utf-8") as lines:
            for i,l in enumerate(lines):
                if i%100000 == 0: print(i)
                if l.startswith('<doc id="'):
                    # A new article begins: always finish the previous one first
                    if out:
                        out.close()
                        out = None
                    m = title_re.search(l)
                    if m is None: continue
                    title = m.group(1).replace('/','_')
                    # presumably guards against over-long file names -- skip the article
                    if len(title)>150: continue
                    out = (dest/f'{title}.txt').open('w', encoding="utf-8")
                elif out:
                    out.write(l)
    finally:
        if out: out.close()
    return dest

def clean_files(dest):
    """Rewrite every article file in `dest` so that it contains only the article body.

    For each file, the first line (the article title) is dropped, everything
    from the last ``</doc>`` marker onwards is removed, and the surrounding
    whitespace is stripped. The result overwrites the file.

    A file with no ``</doc>`` marker is left untouched. It is assumed to have
    been cleaned already, so calling this function twice is safe and does not
    eat the first line of the text.

    Args:
        dest: `Path` of the folder holding one text file per article.

    Returns:
        None.
    """
    for l in sorted(dest.iterdir()):
        if not l.is_file(): continue
        with l.open('r', encoding="utf-8") as f:
            f.readline()  # discard the title line
            text = f.read()
        # Same cut as the greedy regex ([\w\W]*)</doc>: keep what precedes the LAST marker
        end = text.rfind('</doc>')
        if end == -1: continue
        l.write_text(text[:end].strip(), encoding="utf-8")
        
def get_one_clean_file(dest,lang):
    """Concatenate the cleaned text of every article in `dest` into a single file.

    The output is ``dest.parent/f'all_texts_{lang}wiki.txt'`` (UTF-8), with each
    article's text followed by a newline. Articles are processed in sorted
    file-name order so the output is reproducible, and each one is written as
    soon as it is read rather than building the whole corpus in memory.

    For a raw article file, the title line is dropped and the text is cut at
    the last ``</doc>`` marker. A file with no ``</doc>`` marker is assumed to
    be already cleaned and is used whole.

    Args:
        dest: `Path` of the folder holding one text file per article.
        lang: Wikipedia language code, used in the output file name.

    Returns:
        None.
    """
    fname = f'all_texts_{lang}wiki.txt'

    with open(dest.parent/fname, 'w', encoding="utf-8") as fp:
        for i,l in enumerate(sorted(dest.iterdir())):
            with l.open('r', encoding="utf-8") as f:
                first_line = f.readline()
                rest = f.read()
            # Same cut as the greedy regex ([\w\W]*)</doc>: keep what precedes the LAST marker
            end = rest.rfind('</doc>')
            if end != -1:
                text = rest[:end].strip()
            else:
                # No marker: the first line is content, not a title
                text = (first_line + rest).strip()
            fp.write(text)
            fp.write("\n")
            if not (i % 1000): print(i)

    print(f"all texts from wikipedia {lang} in the file {dest.parent/fname}\n")

def get_one_clean_csv_file(dest,lang):
    """Write the cleaned text of every article in `dest` to a one-column CSV file.

    The output is ``dest.parent/f'all_texts_{lang}wiki.csv'`` with a single
    ``text`` column and one row per article, in sorted file-name order so the
    row order is reproducible.

    For a raw article file, the title line is dropped and the text is cut at
    the last ``</doc>`` marker. A file with no ``</doc>`` marker is assumed to
    be already cleaned and is used whole.

    Args:
        dest: `Path` of the folder holding one text file per article.
        lang: Wikipedia language code, used in the output file name.

    Returns:
        None.
    """
    fname = f'all_texts_{lang}wiki.csv'

    all_texts = list()
    for l in sorted(dest.iterdir()):
        with l.open('r', encoding="utf-8") as f:
            first_line = f.readline()
            rest = f.read()
        # Same cut as the greedy regex ([\w\W]*)</doc>: keep what precedes the LAST marker
        end = rest.rfind('</doc>')
        if end != -1:
            text = rest[:end].strip()
        else:
            # No marker: the first line is content, not a title
            text = (first_line + rest).strip()
        all_texts.append(text)

    # Create the pandas DataFrame
    df = pd.DataFrame(all_texts, columns = ['text'])

    # save
    df.to_csv(dest.parent/fname, index=False)
    print(f"all texts from wikipedia {lang} in the file {dest.parent/fname}\n")
                         
def get_num_tokens(dest):
    """Count the article files in `dest` and their whitespace-separated tokens.

    Args:
        dest: `Path` of the folder holding one text file per article.

    Returns:
        A tuple ``(num_files, num_tokens)``. An empty folder yields ``(0, 0)``.
    """
    num_files = 0
    num_tokens = 0

    for l in dest.iterdir():
        with l.open('r', encoding="utf-8") as f:
            # Counting per line gives the same total as splitting the whole file,
            # without holding the full text in memory
            num_tokens += sum(len(line.split()) for line in f)
        num_files += 1

    return num_files, num_tokens

In [None]:
# Download and extract the `lang` Wikipedia dump into path_data
# (get_wiki returns early if the extracted file is already there)
get_wiki(path_data, lang)

downloading...


unzipping...
extracting...


In [None]:
# create one text file per article in the 'docs' subfolder of path_data
dest = split_wiki(path_data,lang)
# gather all articles into one text file and one csv file, both written next to the docs folder
get_one_clean_file(dest,lang)
get_one_clean_csv_file(dest,lang)

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000
4200000
4300000
4400000
4500000
4600000
4700000
4800000
4900000
5000000
5100000
5200000
5300000
5400000
5500000
5600000
5700000
5800000
5900000
6000000
6100000
6200000
6300000
6400000
6500000
6600000
6700000
6800000
6900000
7000000
7100000
7200000
7300000
7400000
7500000
7600000
7700000
7800000
7900000
8000000
8100000
8200000
8300000
8400000
8500000
8600000
8700000
8800000
8900000
9000000
9100000
9200000
9300000
9400000
9500000
9600000
9700000
9800000
9900000
10000000
10100000
10200000
10300000
10400000
10500000
10600000
10700000
10800000
10900000
11000000
11100000
11200000
11300000
11400000
11500000
11600000
11700000
11800000
11900000
12000000
12100000
12200000
12300000

In [None]:
%%time
# Size of the corpus in the docs folder: number of article files and
# number of whitespace-separated tokens across them
num_files, num_tokens = get_num_tokens(dest)
print(f'{num_files} files - {num_tokens} tokens')

In [None]:
# 1. Get the pre-trained GPT2 Tokenizer (pre-trained with an English
# corpus) from the Transformers library (Hugging Face)
from tokenizers import ByteLevelBPETokenizer

pretrained_weights = 'gpt2'
tokenizer_en = GPT2TokenizerFast.from_pretrained(pretrained_weights)
# Reuse the end-of-text token as the padding token
tokenizer_en.pad_token = tokenizer_en.eos_token

# 2. Train a Byte Level BPE (BBPE) tokenizer on the Spanish
# Wikipedia corpus by using the Tokenizers library (Hugging Face)

# 2.1 Use the same vocab size as the English GPT2 tokenizer
ByteLevelBPE_tokenizer_es_vocab_size = tokenizer_en.vocab_size

# 2.2 ByteLevelBPETokenizer Represents a Byte-level BPE
# as introduced by OpenAI with their GPT-2 model
ByteLevelBPE_tokenizer_es = ByteLevelBPETokenizer()

# 2.3 Get list of paths to corpus files
# and customize training with <|endoftext|> special GPT-2 token.
# The file name is derived from `lang` so it always matches what
# get_one_clean_file wrote.
paths = [str(path_data/f'all_texts_{lang}wiki.txt')]
ByteLevelBPE_tokenizer_es.train(files=paths,
                    vocab_size=ByteLevelBPE_tokenizer_es_vocab_size,
                    min_frequency=2,
                    special_tokens=["<|endoftext|>"])
# Get sequence length max of 1024
ByteLevelBPE_tokenizer_es.enable_truncation(max_length=1024)

# 2.4 save tokenizer
ByteLevelBPE_tokenizer_es_rep = 'ByteLevelBPE_tokenizer_es'
path_to_ByteLevelBPE_tokenizer_es_rep = path_data/ByteLevelBPE_tokenizer_es_rep
# exist_ok=True already makes this a no-op when the folder is present
path_to_ByteLevelBPE_tokenizer_es_rep.mkdir(exist_ok=True, parents=True)
ByteLevelBPE_tokenizer_es.save_model(str(path_to_ByteLevelBPE_tokenizer_es_rep))

# 3. Import the tokenizer config files in Spanish into the pre-trained GPT2 Tokenizer
# (bound to `tokenizer_es`, the name the rest of the notebook uses)
tokenizer_es = GPT2TokenizerFast.from_pretrained(
    str(path_to_ByteLevelBPE_tokenizer_es_rep),
    pad_token='<|endoftext|>')
# Get sequence length max of 1024
tokenizer_es.model_max_length = 1024

In [None]:
# 1. GPT2TokenizerFast (imported GPT-2 tokenizer) → fastai Tokenizer
class TransformersTokenizer(Transform):
    "fastai `Transform` that wraps a Hugging Face tokenizer: text to a tensor of token ids, and back."

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def encodes(self, x):
        # text -> token strings -> vocabulary ids -> tensor
        token_strs = self.tokenizer.tokenize(x)
        token_ids = self.tokenizer.convert_tokens_to_ids(token_strs)
        return tensor(token_ids)

    def decodes(self, x):
        # move the ids to the CPU as a numpy array, then decode them back to text
        ids = x.cpu().numpy()
        decoded = self.tokenizer.decode(ids)
        return TitledStr(decoded)
    
tokenizer_fastai_en = TransformersTokenizer(tokenizer_en)
tokenizer_fastai_es = TransformersTokenizer(tokenizer_es)


# 2. Change vocab embedding in the GPT-2 pre-trained model to adapt to the Spanish vocab
# Load the pre-trained English GPT-2 model whose embeddings are remapped below
model = GPT2LMHeadModel.from_pretrained(pretrained_weights)

# Get weights of the old wte
old_wgts = model.transformer.get_input_embeddings().weight.clone().detach()

# Get the mean embedding vector of the old wte
wgts_m = old_wgts.mean(0)

# Initialize vocab size and weights of the new wte
new_vocab_size = tokenizer_fastai_es.tokenizer.vocab_size
new_wgts = old_wgts.new_zeros(new_vocab_size,old_wgts.size(1))

# Get the new wte keeping the embedding vectors of tokens
# in common in the 2 vocabs
# A token present in the new vocab but not in the old one
# gets the mean embedding vector of the old wte
old_vocab = tokenizer_fastai_en.tokenizer.get_vocab()
new_vocab = tokenizer_fastai_es.tokenizer.get_vocab()
same_tokens_list = list()
different_tokens_list = list()

for w,idx_new in new_vocab.items():
    idx_old = old_vocab.get(w, -1)
    if idx_old>=0:
        new_wgts[idx_new] = old_wgts[idx_old]
        same_tokens_list.append((w,idx_new))
    else:
        new_wgts[idx_new] = wgts_m
        different_tokens_list.append((w,idx_new))

# setup in model the new wte
new_wte = nn.Embedding(new_vocab_size,old_wgts.size(1))
new_wte.weight.data = new_wgts
model.transformer.set_input_embeddings(new_wte)

# save new_wgts
torch.save(new_wgts, path_data/'new_wte_wgts.es')
# save same_tokens_list and different_tokens_list
torch.save(same_tokens_list, path_data/'same_tokens_list.es')
torch.save(different_tokens_list, path_data/'different_tokens_list.es')

# Changing lm_head weights with the new embedding matrix
# (the output projection shares the weight tensor of the input embedding)
model.lm_head.weight = model.transformer.wte.weight