In [None]:
#!pip install tensorflow==1.15
#!pip install transformers==2.8.0

In [None]:
import os
import json
from transformers import AutoTokenizer

In [None]:
# Work from the project root so that relative paths used by later cells
# (e.g. `electra_pytorch/...`) resolve against /bachelor_project.
os.chdir("/bachelor_project")
# Last expression of the cell: displays the working directory to confirm the switch.
os.getcwd()

In [None]:
# Notebook-wide configuration. The trailing `#@param` markers are Colab form-field
# annotations and must stay on the same line as the literal they describe.
# DATA_DIR: directory the shell cells below use for vocab.txt, pretrain_tfrecords/
# and as --data-dir for pretraining.
DATA_DIR = "/bachelor_project/models/ælæctra_uncased_32k" #@param {type: "string"}
# MODEL_NAME: passed to run_pretraining.py as --model-name.
MODEL_NAME = "ælæctra_uncased_32k" #@param {type: "string"}

In [None]:
# Save the pretrained WordPiece tokenizer to get `vocab.txt`
#tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
#tokenizer.save_pretrained(DATA_DIR)

# THE NEXT STEP TAKES AROUND 15 HOURS TO COMPLETE WITH 13GB DATA AND HAS TO BE RUN PRIOR TO ANY PRETRAINING

In [None]:
#docker exec -w /bachelor_project/electra_google gpu_0_electra python3 -u build_pretraining_dataset.py --corpus-dir /bachelor_project/data/training_data --vocab-file /bachelor_project/models/ælæctra_uncased_32k/vocab.txt --output-dir /bachelor_project/models/ælæctra_uncased_32k/pretrain_tfrecords --max-seq-length 128 --blanks-separate-docs False --do-lower-case --num-processes 6

!python3 build_pretraining_dataset.py \
  --corpus-dir $DATA_DIR \
  --vocab-file $DATA_DIR/vocab.txt \
  --output-dir $DATA_DIR/pretrain_tfrecords \
  --max-seq-length 128 \
  --blanks-separate-docs False \
  --no-lower-case \
  --num-processes 6

In [None]:
# Hyperparameter overrides written to hparams.json for run_pretraining.py (--hparams).
#
# Flags are real booleans so they serialize to JSON true/false: a non-empty
# string such as "false" is truthy in Python, so string flags would silently
# enable whatever they were meant to disable once loaded.
# num_train_steps is an int so it is not serialized as the float 1000000.0.
hparams = {
    "do_train": True,
    "do_eval": False,
    "model_size": "small",
    "do_lower_case": True,
    "vocab_size": 32000,
    "num_train_steps": 1000000,
    "save_checkpoints_steps": 100000,
    "train_batch_size": 128,
}

# Store the file next to the rest of the model data (same location as before,
# now derived from DATA_DIR instead of a duplicated literal).
with open(os.path.join(DATA_DIR, "hparams.json"), "w") as f:
    json.dump(hparams, f)

In [None]:
#docker exec -w /bachelor_project gpu_0_electra python3 -u /bachelor_project/electra_google/run_pretraining.py --data-dir /bachelor_project/models/uncased_32k_danish_data --model-name ælæctra_uncased_32k --hparams "/bachelor_project/models/ælæctra_uncased_32k/hparams.json"

In [None]:
#tensorboard --logdir=/bachelor_project/models/ælæctra_uncased_32k/models/ælæctra_uncased_32k --host localhost --port 8088

!python3 run_pretraining.py \
  --data-dir $DATA_DIR \
  --model-name $MODEL_NAME \
  --hparams "hparams.json"

!git clone https://github.com/lonePatient/electra_pytorch.git

In [None]:
# Directory passed to the converter as the TensorFlow checkpoint location and
# used afterwards to load the PyTorch model.
MODEL_DIR = "/bachelor_project/models/ælæctra_uncased_32k/models/ælæctra_uncased_32k/"

# Model architecture description, serialized to config.json inside MODEL_DIR.
config = dict(
    vocab_size=32000,
    embedding_size=128,
    hidden_size=256,
    num_hidden_layers=12,
    num_attention_heads=4,
    intermediate_size=1024,
    generator_size="0.25",
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
)

# MODEL_DIR already ends with a separator, so join() yields the same path as
# plain concatenation.
with open(os.path.join(MODEL_DIR, "config.json"), "w") as config_file:
    config_file.write(json.dumps(config))

In [None]:
!python electra_pytorch/convert_electra_tf_checkpoint_to_pytorch.py \
    --tf_checkpoint_path=$MODEL_DIR \
    --electra_config_file=$MODEL_DIR/config.json \
    --pytorch_dump_path=$MODEL_DIR/pytorch_model.bin

In [None]:
import torch
from transformers import ElectraForPreTraining, ElectraTokenizerFast

# Load the tokenizer (lower-casing input) and the discriminator from MODEL_DIR.
tokenizer = ElectraTokenizerFast.from_pretrained(
    MODEL_DIR,
    do_lower_case=True,
)
discriminator = ElectraForPreTraining.from_pretrained(MODEL_DIR)


In [None]:
# Sanity check: ask the discriminator which tokens of a corrupted sentence look
# replaced.
sentence = "Fuglene synger"  # The birds are singing
fake_sentence = "Fuglene taler"  # The birds are speaking

# Include special tokens so the token list lines up with the ids from encode(),
# which adds them by default.
fake_tokens = tokenizer.tokenize(fake_sentence, add_special_tokens=True)
fake_inputs = tokenizer.encode(fake_sentence, return_tensors="pt")

# Inference only: no need to track gradients.
with torch.no_grad():
    discriminator_outputs = discriminator(fake_inputs)

# A positive logit is thresholded to 1, a non-positive one to 0. Flatten so the
# result is one value per token whether or not the model keeps the batch
# dimension of size 1.
predictions = (discriminator_outputs[0] > 0).reshape(-1)

# Print tokens and their 0/1 predictions in aligned 7-character columns.
print("".join("%7s" % token for token in fake_tokens))
print()
print("".join("%7s" % int(prediction) for prediction in predictions.tolist()))

In [None]:
# Last expression of the cell: displays the discriminator's parameter count.
discriminator.num_parameters()