# Install dependencies

In [None]:
!pip install transformers
!git clone https://github.com/aub-mind/arabert
!pip install -r arabert/requirements.txt

# Imports

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

from arabert.preprocess import ArabertPreprocessor

# Initialize Model, Tokenizer and preprocessor

Available models (prefix each name with `aubmindlab/`):
```
bert-base-arabertv01
bert-base-arabert
bert-base-arabertv02
bert-base-arabertv2
bert-large-arabertv02
bert-large-arabertv2
araelectra-base-discriminator
araelectra-base-generator
aragpt2-base
aragpt2-medium
```

For `aragpt2-large` and `aragpt2-mega`, you need to use:
`from arabert.aragpt2.grover.modeling_gpt2 import GPT2LMHeadModel` instead of `AutoModel`.

In [None]:
# Name of the checkpoint to load; the tokenizer, model and preprocessor below
# are all created from this single name.
model_name = "aubmindlab/bert-base-arabertv2"

# Tokenizer and pretrained model for the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Text preprocessor configured for the same checkpoint.
arabert_prep = ArabertPreprocessor(model_name=model_name)



Preprocessing the text before passing it through the model

In [None]:
# Sample sentence to run through the preprocessor.
text = "شعرها جميل اليوم"
text_preprocessed = arabert_prep.preprocess(text)

# Print the raw text, a dashed separator, then the preprocessed text,
# each on its own line.
print(text, "---------------------", text_preprocessed, sep="\n")

شعرها جميل اليوم
---------------------
شعر +ها جميل ال+ يوم


Converting the text to tensors suitable for model input

In [None]:
# `inputs` is a dict-like object containing input_ids, attention_mask and
# token_type_ids as PyTorch tensors (return_tensors='pt').
# The tokenizer is called directly; `encode_plus` is deprecated in favour of this.
inputs = tokenizer(text_preprocessed, return_tensors='pt')

# Token ids of the first (and only) sequence in the batch.
print(inputs['input_ids'][0])

# The same ids mapped back to tokens; some tokens might be split with ## by the tokenizer.
print(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]))
# AraGPT2 output will look like gibberish because of its encoding, but don't worry about it.

tensor([  33, 2024,   10, 2243,   20,  437,   34])
['[CLS]', 'شعر', '+ها', 'جميل', 'ال+', 'يوم', '[SEP]']


Passing the input through the model

In [None]:
# Forward pass through the model. Gradients are not needed to extract
# embeddings, so autograd tracking is disabled to avoid building a graph
# and holding the extra memory it requires.
with torch.no_grad():
    outputs = model(**inputs)

In [None]:
# Per-token embeddings taken from the model's `last_hidden_state` output.
embeddings = outputs["last_hidden_state"]
# Dimensions: batch_size x seq_len x emb_dim
embeddings.shape

torch.Size([1, 7, 768])

In [None]:
# Take the first sequence in the batch and drop its first and last positions
# ([CLS] and [SEP]); only applicable to AraBERT and AraELECTRA.
embeddings_text_only = outputs["last_hidden_state"][0, 1:-1]
# Dimensions: (seq_len - 2) x emb_dim
embeddings_text_only.shape

torch.Size([5, 768])

In [None]:
# Print the embeddings tensor (batch_size x seq_len x emb_dim).
print(embeddings)

tensor([[[-0.4212,  0.0906, -0.2257,  ..., -0.2509, -0.4637, -0.2456],
         [ 0.1262, -0.1921,  0.3547,  ..., -0.0284,  0.0218,  0.8257],
         [ 0.3750, -0.5048,  0.3407,  ..., -0.5022, -0.5551, -0.6074],
         ...,
         [-0.0807, -0.1237,  0.2332,  ..., -0.2287, -0.5295, -0.5163],
         [ 0.2868, -0.4423,  0.1091,  ..., -0.0412, -0.3146, -0.1310],
         [ 0.1527, -0.0043, -0.1426,  ..., -0.2322,  0.1667,  0.1843]]],
       grad_fn=<NativeLayerNormBackward>)


In [None]:
# Sentence-level vector.
# AraBERT exposes a pooler layer, so its `pooler_output` is used directly.
# AraGPT2 and AraELECTRA do not have a pooler layer: take the embedding of the
# last token for AraGPT2, and of the first token for AraELECTRA.
if "pooler_output" in outputs:
    pooled_vector = outputs["pooler_output"]
elif "gpt2" in model_name:
    # NOTE: assumes the sequence is not padded, so the last position is a real token.
    pooled_vector = outputs["last_hidden_state"][:, -1]
else:
    pooled_vector = outputs["last_hidden_state"][:, 0]
# Dimensions: batch_size x emb_dim
pooled_vector.shape

torch.Size([1, 768])