In [None]:
!pip install torch
!pip install tokenizers
!pip install transformers

Collecting tokenizers
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 30.7 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.10.3
Collecting transformers
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 30.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 34.0 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 6.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 47.1 MB/s 
Installing collected packages: sacremoses, pyyaml, huggingface-hub, transformers
  Attempting u

In [None]:
# Build the BERT tokenizer from a local vocabulary file.
from transformers import BertTokenizer, LineByLineTextDataset

# NOTE: despite its name, this variable holds the path of a file, not a directory.
vocab_file_dir = '/content/vocab.txt'
tokenizer = BertTokenizer(vocab_file=vocab_file_dir)

# Quick sanity check: tokenize one sample sentence and keep the result in
# `encoded_input` so it can be inspected interactively.
sentence = 'There are many spoons on the table.'
encoded_input = tokenizer.tokenize(sentence)

In [None]:
# Training split, read from a text file and tokenized with `tokenizer`.
dataset = LineByLineTextDataset(
    file_path='/content/sentences.txt',
    tokenizer=tokenizer,
    block_size=128,  # maximum sequence length
)

# Report how many examples were loaded.
print('No. of lines: ', len(dataset))



No. of lines:  4087


In [None]:
# Held-out split, built the same way as the training split (same tokenizer,
# same maximum sequence length) so that evaluation inputs match training inputs.
test_dataset = LineByLineTextDataset(
    file_path='/content/tes_sentences.txt',
    tokenizer=tokenizer,
    block_size=128,  # maximum sequence length
)

# Report how many examples were loaded.
print('No. of lines: ', len(test_dataset))



No. of lines:  1295


In [None]:
from transformers import BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling

# Architecture of the model: 6 hidden layers, 12 attention heads, hidden size 768.
config = BertConfig(
    vocab_size=50000,
    hidden_size=768,
    num_hidden_layers=6,
    num_attention_heads=12,
    max_position_embeddings=512
)

# vocab_size is hard-coded above, independently of the vocabulary file the
# tokenizer was built from. Every id the tokenizer can produce must fit inside
# the embedding table, otherwise training fails later with an out-of-range
# index error; check it here so the mismatch is reported up front.
assert len(tokenizer) <= config.vocab_size, (
    f"tokenizer has {len(tokenizer)} tokens but config.vocab_size is {config.vocab_size}"
)

# The model is built from the config only; no pretrained checkpoint is loaded here.
model = BertForMaskedLM(config)
print('No of parameters: ', model.num_parameters())


# Collator for masked language modeling with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

No of parameters:  81965648


In [None]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters for the masked-LM training run.
training_args = TrainingArguments(
    # Output location.
    output_dir='/content/working/',
    overwrite_output_dir=True,
    # Optimisation schedule.
    num_train_epochs=3,
    per_device_train_batch_size=32,
    learning_rate=2e-5,
    # Checkpointing. NOTE(review): the recorded run had 384 optimization steps
    # in total, so a 10,000-step interval is never reached in that run.
    save_steps=10_000,
    save_total_limit=2,
    # Per-epoch evaluation is left disabled:
    # evaluation_strategy="epoch",
)


In [None]:
import numpy as np
!pip install datasets
import datasets
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy of the arg-max predictions against the labels.

    Args:
        eval_pred: A pair ``(logits, labels)``. The class axis of ``logits``
            is the last one; ``labels`` has the shape of ``logits`` without
            that last axis.

    Returns:
        The dict returned by ``metric.compute`` (returned as-is; the previous
        version wrapped it in a set literal, which raises ``TypeError``
        because a dict is not hashable).
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)
    # The masked-LM collator labels positions that were not selected for
    # prediction with -100 (the index ignored by the loss). Those positions
    # carry no target, so they are excluded from the score. Indexing with the
    # boolean mask also flattens both arrays to 1-D.
    scored = labels != -100
    return metric.compute(predictions=predictions[scored], references=labels[scored])

Collecting datasets
  Downloading datasets-1.11.0-py3-none-any.whl (264 kB)
[K     |████████████████████████████████| 264 kB 29.6 MB/s 
Collecting fsspec>=2021.05.0
  Downloading fsspec-2021.8.1-py3-none-any.whl (119 kB)
[K     |████████████████████████████████| 119 kB 36.8 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 49.4 MB/s 
Installing collected packages: xxhash, fsspec, datasets
Successfully installed datasets-1.11.0 fsspec-2021.8.1 xxhash-2.0.2


Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

In [None]:
# Wire the model, the training arguments, both dataset splits and the
# masked-LM collator into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    # Left disabled:
    # compute_metrics=compute_metrics,
    # prediction_loss_only=True,
)

In [None]:
# Run the training loop configured above.
trainer.train()

# Write the trained model to disk.
trainer.save_model(output_dir='/content/working/')

***** Running training *****
  Num examples = 4087
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 384


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /content/working/
Configuration saved in /content/working/config.json
Model weights saved in /content/working/pytorch_model.bin


In [None]:
import math

# Evaluate on the eval dataset given to the Trainer, then report perplexity,
# computed here as exp(eval_loss).
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")

***** Running Evaluation *****
  Num examples = 1295
  Batch size = 8


Perplexity: 926.93
