In [1]:
%%time

from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path(".").glob("corpora/wikipedia.v03.txt")]

print(paths)

tokenizer= ByteLevelBPETokenizer()
tokenizer.train(files=paths, vocab_size = 64_000, min_frequency=2, special_tokens=["<p>","</p>","<s>","<pad>","</s>","<unk>","<mask>",])



#from transformers import RobertaTokenizer

#tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


['corpora/wikipedia.v03.txt']



CPU times: user 8min 21s, sys: 1min 7s, total: 9min 28s
Wall time: 49.7 s


In [2]:
import os
token_dir = '/home/jmhuerta/content/iwebBERT_v03/'
if not os.path.exists(token_dir):
    os.makedirs(token_dir)
tokenizer.save_model(token_dir)


['/home/jmhuerta/content/iwebBERT_v03/vocab.json',
 '/home/jmhuerta/content/iwebBERT_v03/merges.txt']

In [3]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
tokenizer = ByteLevelBPETokenizer('/home/jmhuerta/content/iwebBERT_v03/vocab.json', '/home/jmhuerta/content/iwebBERT_v03/merges.txt')

In [4]:
tokenizer._tokenizer.post_processor = BertProcessing(("</s>",tokenizer.token_to_id("</s>")),("<s>",tokenizer.token_to_id("<s>")))
tokenizer.enable_truncation(max_length=512)
                                                    

In [5]:
tokenizer.encode("The Critique of Pure Reason. And practical reason.")

Encoding(num_tokens=12, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [6]:
import torch

print("GPU available", torch.cuda.is_available())

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


GPU available True


In [7]:
from transformers import RobertaConfig
config = RobertaConfig(
    vocab_size=64_000,
    max_position_embedding=512,
    num_attention_heads=12, 
    num_hidden_layers=6,
    type_vocab_size=1)

In [8]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('/home/jmhuerta/content/iwebBERT_v03/',max_length=512)

In [9]:
from transformers import RobertaForMaskedLM
model = RobertaForMaskedLM(config=config).to(device)
print(model)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64000, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [10]:
print(model.num_parameters())

92730880


In [11]:
%%time
from transformers import LineByLineTextDataset
dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path ="./corpora/wikipedia.v03.txt", block_size=128)



CPU times: user 9min 9s, sys: 13.2 s, total: 9min 22s
Wall time: 9min 25s


In [12]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling (
    tokenizer= tokenizer, mlm=True, mlm_probability =0.15)

In [13]:
#@title Step 12: Initializing the Trainer
from transformers import Trainer, TrainingArguments


training_args = TrainingArguments(
    output_dir="./iwebBERT",
    overwrite_output_dir = True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)



In [14]:

%%time

trainer.train()



***** Running training *****
  Num examples = 2000000
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 62500
  Number of trainable parameters = 92730880


Step,Training Loss
500,7.6455
1000,7.0222
1500,6.8101
2000,6.6615
2500,6.5563
3000,6.3474
3500,6.1152
4000,5.8738
4500,5.6695
5000,5.5021


Saving model checkpoint to ./iwebBERT/checkpoint-10000
Configuration saved in ./iwebBERT/checkpoint-10000/config.json
Model weights saved in ./iwebBERT/checkpoint-10000/pytorch_model.bin
Deleting older checkpoint [iwebBERT/checkpoint-20000] due to args.save_total_limit
Saving model checkpoint to ./iwebBERT/checkpoint-20000
Configuration saved in ./iwebBERT/checkpoint-20000/config.json
Model weights saved in ./iwebBERT/checkpoint-20000/pytorch_model.bin
Deleting older checkpoint [iwebBERT/checkpoint-30000] due to args.save_total_limit
Saving model checkpoint to ./iwebBERT/checkpoint-30000
Configuration saved in ./iwebBERT/checkpoint-30000/config.json
Model weights saved in ./iwebBERT/checkpoint-30000/pytorch_model.bin
Deleting older checkpoint [iwebBERT/checkpoint-10000] due to args.save_total_limit
Saving model checkpoint to ./iwebBERT/checkpoint-40000
Configuration saved in ./iwebBERT/checkpoint-40000/config.json
Model weights saved in ./iwebBERT/checkpoint-40000/pytorch_model.bin
Del

CPU times: user 3h 5min 21s, sys: 6min 28s, total: 3h 11min 49s
Wall time: 14h 4min 27s


TrainOutput(global_step=62500, training_loss=4.169751466796875, metrics={'train_runtime': 50667.42, 'train_samples_per_second': 39.473, 'train_steps_per_second': 1.234, 'total_flos': 6.633070678199501e+16, 'train_loss': 4.169751466796875, 'epoch': 1.0})

In [15]:
trainer.save_model("./iwebBERT/")

Saving model checkpoint to ./iwebBERT/
Configuration saved in ./iwebBERT/config.json
Model weights saved in ./iwebBERT/pytorch_model.bin


In [16]:
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [36]:
from huggingface_hub import create_repo
#create_repo("iwebBERT")
model.push_to_hub("iwebBERT_2M")

Configuration saved in /tmp/tmpba7d_umd/config.json
Model weights saved in /tmp/tmpba7d_umd/pytorch_model.bin
Uploading the following files to jmhuerta/iwebBERT_2M: pytorch_model.bin,config.json


Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jmhuerta/iwebBERT_2M/commit/9fea52089e76864fd679807102af7b518317ee70', commit_message='Upload RobertaForMaskedLM', commit_description='', oid='9fea52089e76864fd679807102af7b518317ee70', pr_url=None, pr_revision=None, pr_num=None)

In [18]:
from transformers import pipeline
fill_mask = pipeline(
    "fill-mask",
    model="./iwebBERT",
    tokenizer=tokenizer)
fill_mask("Human thinking involves human <mask> .")

loading configuration file ./iwebBERT/config.json
Model config RobertaConfig {
  "_name_or_path": "./iwebBERT",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embedding": 512,
  "max_position_embeddings": 512,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 64000
}

loading configuration file ./iwebBERT/config.json
Model config RobertaConfig {
  "_name_or_path": "./iwebBERT",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 

[{'score': 0.01324025820940733,
  'token': 2942,
  'token_str': ' action',
  'sequence': 'Human thinking involves human action.'},
 {'score': 0.012830071151256561,
  'token': 1224,
  'token_str': ' life',
  'sequence': 'Human thinking involves human life.'},
 {'score': 0.01282454188913107,
  'token': 9054,
  'token_str': ' thinking',
  'sequence': 'Human thinking involves human thinking.'},
 {'score': 0.011552060022950172,
  'token': 5400,
  'token_str': ' behavior',
  'sequence': 'Human thinking involves human behavior.'},
 {'score': 0.010932030156254768,
  'token': 831,
  'token_str': 'ism',
  'sequence': 'Human thinking involves humanism.'}]

In [19]:
fill_mask("Autism has a strong <mask> basis ")

[{'score': 0.024219484999775887,
  'token': 4886,
  'token_str': ' alternative',
  'sequence': 'Autism has a strong alternative basis '},
 {'score': 0.017878247424960136,
  'token': 3182,
  'token_str': ' legal',
  'sequence': 'Autism has a strong legal basis '},
 {'score': 0.015577989630401134,
  'token': 1612,
  'token_str': ' political',
  'sequence': 'Autism has a strong political basis '},
 {'score': 0.012330393306910992,
  'token': 3030,
  'token_str': ' independent',
  'sequence': 'Autism has a strong independent basis '},
 {'score': 0.009066910482943058,
  'token': 3078,
  'token_str': ' related',
  'sequence': 'Autism has a strong related basis '}]

In [20]:
fill_mask("Lance Armstrong cycling career was <mask> by doping allegations.")

[{'score': 0.03342326357960701,
  'token': 2634,
  'token_str': ' replaced',
  'sequence': 'Lance Armstrong cycling career was replaced by doping allegations.'},
 {'score': 0.03025027923285961,
  'token': 2264,
  'token_str': ' followed',
  'sequence': 'Lance Armstrong cycling career was followed by doping allegations.'},
 {'score': 0.02391374297440052,
  'token': 3222,
  'token_str': ' supported',
  'sequence': 'Lance Armstrong cycling career was supported by doping allegations.'},
 {'score': 0.020141808316111565,
  'token': 6531,
  'token_str': ' criticized',
  'sequence': 'Lance Armstrong cycling career was criticized by doping allegations.'},
 {'score': 0.015271670185029507,
  'token': 4375,
  'token_str': ' influenced',
  'sequence': 'Lance Armstrong cycling career was influenced by doping allegations.'}]

In [24]:
fill_mask("The <mask> World War was started by an incident in the Balkans.")

[{'score': 0.5893740653991699,
  'token': 3477,
  'token_str': ' Second',
  'sequence': 'The Second World War was started by an incident in the Balkans.'},
 {'score': 0.23300762474536896,
  'token': 2768,
  'token_str': ' First',
  'sequence': 'The First World War was started by an incident in the Balkans.'},
 {'score': 0.01540341041982174,
  'token': 876,
  'token_str': ' American',
  'sequence': 'The American World War was started by an incident in the Balkans.'},
 {'score': 0.006034450139850378,
  'token': 1053,
  'token_str': ' early',
  'sequence': 'The early World War was started by an incident in the Balkans.'},
 {'score': 0.004087221343070269,
  'token': 2237,
  'token_str': ' Great',
  'sequence': 'The Great World War was started by an incident in the Balkans.'}]

In [30]:
fill_mask("In his Critique of Practical Reason, Aristotle argues that systematic division is a <mask> principle.")

[{'score': 0.019469674676656723,
  'token': 1314,
  'token_str': ' common',
  'sequence': 'In his Critique of Practical Reason, Aristotle argues that systematic division is a common principle.'},
 {'score': 0.018620166927576065,
  'token': 2799,
  'token_str': ' particular',
  'sequence': 'In his Critique of Practical Reason, Aristotle argues that systematic division is a particular principle.'},
 {'score': 0.0163904819637537,
  'token': 1846,
  'token_str': ' similar',
  'sequence': 'In his Critique of Practical Reason, Aristotle argues that systematic division is a similar principle.'},
 {'score': 0.013411641120910645,
  'token': 4917,
  'token_str': ' positive',
  'sequence': 'In his Critique of Practical Reason, Aristotle argues that systematic division is a positive principle.'},
 {'score': 0.01327616535127163,
  'token': 1203,
  'token_str': ' general',
  'sequence': 'In his Critique of Practical Reason, Aristotle argues that systematic division is a general principle.'}]

In [32]:
fill_mask("Godwin is generally regarded as the founder of the school of thought known as <mask> anarchism.")

[{'score': 0.04419669508934021,
  'token': 265,
  'token_str': ' a',
  'sequence': 'Godwin is generally regarded as the founder of the school of thought known as a anarchism.'},
 {'score': 0.030851706862449646,
  'token': 2265,
  'token_str': ' Christian',
  'sequence': 'Godwin is generally regarded as the founder of the school of thought known as Christian anarchism.'},
 {'score': 0.028029263019561768,
  'token': 268,
  'token_str': ' the',
  'sequence': 'Godwin is generally regarded as the founder of the school of thought known as the anarchism.'},
 {'score': 0.02710946463048458,
  'token': 427,
  'token_str': ' his',
  'sequence': 'Godwin is generally regarded as the founder of the school of thought known as his anarchism.'},
 {'score': 0.015148647129535675,
  'token': 1612,
  'token_str': ' political',
  'sequence': 'Godwin is generally regarded as the founder of the school of thought known as political anarchism.'}]

In [33]:
fill_mask("<mask> is generally regarded as the founder of the school of thought known as philosophical anarchism.")

[{'score': 0.4361608326435089,
  'token': 569,
  'token_str': ' He',
  'sequence': ' He is generally regarded as the founder of the school of thought known as philosophical anarchism.'},
 {'score': 0.10161503404378891,
  'token': 574,
  'token_str': ' It',
  'sequence': ' It is generally regarded as the founder of the school of thought known as philosophical anarchism.'},
 {'score': 0.010055812075734138,
  'token': 744,
  'token_str': ' This',
  'sequence': ' This is generally regarded as the founder of the school of thought known as philosophical anarchism.'},
 {'score': 0.005643005482852459,
  'token': 1294,
  'token_str': ' She',
  'sequence': ' She is generally regarded as the founder of the school of thought known as philosophical anarchism.'},
 {'score': 0.0032359748147428036,
  'token': 402,
  'token_str': ' it',
  'sequence': ' it is generally regarded as the founder of the school of thought known as philosophical anarchism.'}]

In [34]:
fill_mask("A Markov random field, also known as a Markov network, is a <mask> over an undirected graph.")

[{'score': 0.030602358281612396,
  'token': 8821,
  'token_str': ' vector',
  'sequence': 'A Markov random field, also known as a Markov network, is a vector over an undirected graph.'},
 {'score': 0.024418938905000687,
  'token': 1883,
  'token_str': ' field',
  'sequence': 'A Markov random field, also known as a Markov network, is a field over an undirected graph.'},
 {'score': 0.018840661272406578,
  'token': 4707,
  'token_str': ' graph',
  'sequence': 'A Markov random field, also known as a Markov network, is a graph over an undirected graph.'},
 {'score': 0.015372563153505325,
  'token': 2493,
  'token_str': ' function',
  'sequence': 'A Markov random field, also known as a Markov network, is a function over an undirected graph.'},
 {'score': 0.013797575607895851,
  'token': 900,
  'token_str': ' set',
  'sequence': 'A Markov random field, also known as a Markov network, is a set over an undirected graph.'}]

In [35]:
fill_mask("A Markov random field, also known as a Markov network, is a model over an <mask> graph.")

[{'score': 0.036914240568876266,
  'token': 12429,
  'token_str': ' infinite',
  'sequence': 'A Markov random field, also known as a Markov network, is a model over an infinite graph.'},
 {'score': 0.022134175524115562,
  'token': 14446,
  'token_str': ' arbitrary',
  'sequence': 'A Markov random field, also known as a Markov network, is a model over an arbitrary graph.'},
 {'score': 0.01123829185962677,
  'token': 10027,
  'token_str': ' integral',
  'sequence': 'A Markov random field, also known as a Markov network, is a model over an integral graph.'},
 {'score': 0.01084906142205,
  'token': 13225,
  'token_str': ' algebraic',
  'sequence': 'A Markov random field, also known as a Markov network, is a model over an algebraic graph.'},
 {'score': 0.008688466623425484,
  'token': 7019,
  'token_str': ' input',
  'sequence': 'A Markov random field, also known as a Markov network, is a model over an input graph.'}]