## This notebook is used to further pre-train AraBERTv2 on a domain-specific dataset; the resulting model can later be fine-tuned for a downstream task, such as a question answering system.
### The code uses TensorFlow 1.x

In [4]:
from src.pretraining.preprocess import ArabertPreprocessor
import tensorflow as tf

### Download the pretrained TensorFlow model from Hugging Face

In [4]:
# Fetch the TensorFlow 1.x AraBERTv2 checkpoint archive and unpack it under bert-base-arabertv2/.
# wget -O does not create missing parent directories, so make sure the target exists first
# (mkdir -p is a no-op when the directory is already there).
!mkdir -p bert-base-arabertv2
!wget https://huggingface.co/aubmindlab/bert-base-arabertv2/resolve/main/tf1_model.tar.gz -O bert-base-arabertv2/tf1_model.tar.gz
!tar -xvf bert-base-arabertv2/tf1_model.tar.gz -C bert-base-arabertv2/
# The archive is no longer needed once extracted.
!rm bert-base-arabertv2/tf1_model.tar.gz

tf-base-arabertv2/
tf-base-arabertv2/config.json
tf-base-arabertv2/model.ckpt.data-00000-of-00001
tf-base-arabertv2/vocab.txt
tf-base-arabertv2/model.ckpt.index
tf-base-arabertv2/checkpoint
tf-base-arabertv2/model.ckpt.meta


### Preprocess the dataset with Farasa segmentation

In [5]:
model_name = "aubmindlab/bert-base-arabertv2"
sample_text = "dataset/pretraining-dataset/iskan.txt"
sample_text_output_after_farasa_segmentation = "dataset/pretraining-dataset/iskan_farasa_segmentation.txt"

arabert_prep = ArabertPreprocessor(model_name=model_name)

# The corpus is Arabic text: pin UTF-8 explicitly instead of relying on the
# platform's default encoding, which is not UTF-8 on every system.
with open(sample_text, "r", encoding="utf-8") as f:
    # Keep one entry per input line; only the trailing newline is removed.
    data = [d.strip("\n") for d in f]

# Write the preprocessed corpus, one output line per input line.
with open(sample_text_output_after_farasa_segmentation, "w", encoding="utf-8") as f:
    for sample in data:
        f.write(arabert_prep.preprocess(sample) + "\n")

# Here is what the first sample looks like after Farasa segmentation
# (skipped when the input file is empty, rather than raising IndexError).
if data:
    print(arabert_prep.preprocess(data[0]))



يعود تاريخ وزار +ة ال+ إسكان و+ ال+ تخطيط ال+ عمراني إلى ال+ عام 1975 ، عندما أصدر صاحب ال+ سمو ال+ شيخ عيسى بن سلمان آل خليف +ة أمير دول +ة ال+ بحرين – طيب الله ثرا +ه .


### Create the pre-training data for AraBERTv2. This converts the dataset into the expected ".tfrecord" format.

In [None]:
# Run create_pretraining_data.py on the Farasa-segmented corpus and write the result
# to iskan.tfrecord, using the vocabulary shipped with the downloaded checkpoint.
# max_seq_length and max_predictions_per_seq are kept identical to the values
# passed to run_pretraining.py in the pre-training cell.
!python src/pretraining/create_pretraining_data.py \
  --vocab_file=bert-base-arabertv2/tf-base-arabertv2/vocab.txt \
  --input_file=./dataset/pretraining-dataset/iskan_farasa_segmentation.txt \
  --output_file=./dataset/pretraining-dataset/iskan.tfrecord \
  --max_seq_length=128 \
  --max_predictions_per_seq=20 \
  --masked_lm_prob=0.15 \
  --dupe_factor=5 \
  --do_lower_case=True \
  --random_seed=12345

### Pre-train/fine-tune AraBERTv2 on a specific domain

In [None]:
# Run run_pretraining.py on the generated .tfrecord, starting from the downloaded
# AraBERTv2 checkpoint; checkpoints are written to pretraining_output/.
# Both training and evaluation are enabled and read the same input file.
!python src/pretraining/run_pretraining.py \
  --bert_config_file=bert-base-arabertv2/tf-base-arabertv2/config.json \
  --init_checkpoint=bert-base-arabertv2/tf-base-arabertv2/model.ckpt \
  --input_file=dataset/pretraining-dataset/iskan.tfrecord \
  --output_dir=pretraining_output \
  --do_train=True \
  --do_eval=True \
  --max_seq_length=128 \
  --max_predictions_per_seq=20 \
  --train_batch_size=2 \
  --learning_rate=5e-5 \
  --num_train_steps=50 \
  --num_warmup_steps=10

### Convert the TensorFlow checkpoint to a PyTorch model for later use in fine-tuning the model for downstream tasks, e.g., a question answering system.

In [None]:

# Convert the checkpoint in pretraining_output/ into a PyTorch weights file
# stored next to the downloaded model.
# NOTE(review): the "-50" suffix lines up with --num_train_steps=50 in the
# pre-training cell; presumably both need to change together - confirm.
!transformers-cli convert \
  --model_type bert \
  --config bert-base-arabertv2/tf-base-arabertv2/config.json \
  --tf_checkpoint pretraining_output/model.ckpt-50 \
  --pytorch_dump_output bert-base-arabertv2/pytorch_model.bin