#Fine-tune AraBERT with Fast-BERT library
https://github.com/kaushaltrivedi/fast-ber

##Get Requirements

In [1]:
!git clone https://github.com/aub-mind/arabert
!pip install PyArabic farasapy fast-bert

Cloning into 'arabert'...
remote: Enumerating objects: 65, done.[K
remote: Counting objects: 100% (65/65), done.[K
remote: Compressing objects: 100% (50/50), done.[K
remote: Total 279 (delta 34), reused 38 (delta 15), pack-reused 214[K
Receiving objects: 100% (279/279), 3.68 MiB | 19.95 MiB/s, done.
Resolving deltas: 100% (157/157), done.
Collecting farasapy
  Downloading farasapy-0.0.11-py3-none-any.whl (12 kB)
Collecting fast-bert
  Downloading fast_bert-1.9.5-py3-none-any.whl (91 kB)
[K     |████████████████████████████████| 91 kB 3.7 MB/s 
Collecting python-box
  Downloading python_box-5.2.0-py3-none-any.whl (20 kB)
Collecting pytorch-lamb
  Downloading pytorch_lamb-1.0.0-py3-none-any.whl (4.4 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 912 kB/s 
Collecting tokenizers==0.8.1.rc1
  Downloading tokenizers-0.8.1rc1-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████

##Prepare the data

In [2]:
import pandas as pd
#from py4j.java_gateway import JavaGateway
from farasa.segmenter import FarasaSegmenter
from arabert.preprocess_arabert import preprocess
from sklearn.model_selection import train_test_split

# !pkill "java"
# gateway = JavaGateway.launch_gateway(classpath='./FarasaSegmenterJar.jar')
# farasa_segmenter = gateway.jvm.com.qcri.farasa.segmenter.Farasa()

farasa_segmenter = FarasaSegmenter(interactive=True)

df_AJGT = pd.read_csv('../input/sentiment/Train.csv',header=0)

DATA_COLUMN = 'content'
LABEL_COLUMN = 'score'

df_AJGT = df_AJGT[['content', 'score']]

df_AJGT.columns = [DATA_COLUMN, LABEL_COLUMN]



df_AJGT[DATA_COLUMN] = df_AJGT[DATA_COLUMN].apply(lambda x: preprocess(x, do_farasa_tokenization=True , farasa=farasa_segmenter, use_farasapy = True))
# df_AJGT[LABEL_COLUMN] = df_AJGT[LABEL_COLUMN].apply(lambda x: label_map[x])
df_AJGT['score']=df_AJGT['score'].astype(str)
train_AJGT, test_AJGT = train_test_split(df_AJGT, test_size=0.3,random_state=42)
!mkdir data
train_AJGT.to_csv("data/train.csv",index=True,columns=train_AJGT.columns,sep=',',header=True)
test_AJGT.to_csv("data/dev.csv",index=True,columns=test_AJGT.columns,sep=',',header=True)
with open('data/labels.csv','w') as f:
    f.write("\n".join(df_AJGT['score'].unique()))

  0%|          | 0.00/241M [00:00<?, ?iB/s]



100%|██████████| 241M/241M [00:12<00:00, 14.9MiB/s]



100%|██████████| 241M/241M [00:30<00:00, 14.9MiB/s]

In [3]:
with open('data/labels.csv','w') as f:
    f.write("\n".join(df_AJGT['score'].unique()))

##Create a DataBunch Object:
see https://github.com/kaushaltrivedi/fast-bert#text-classification

In [4]:
from fast_bert.data_cls import BertDataBunch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('aubmindlab/bert-base-arabert')

databunch = BertDataBunch('./data/', './data/',
                          tokenizer=tokenizer,
                          train_file='train.csv',
                          val_file='dev.csv',
                          label_file='labels.csv',
                          text_col='content',
                          label_col='score',
                          batch_size_per_gpu=16,
                          max_seq_length=256,
                          multi_gpu=True,
                          multi_label=False,
                          model_type='bert',
                          )



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=572.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=717153.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=406.0, style=ProgressStyle(description_…




##Create the Learner Object
see https://github.com/kaushaltrivedi/fast-bert#2-create-a-learner-object

In [5]:
import logging
import torch

from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

device_cuda = torch.device("cuda")
metrics = [{'name': 'accuracy', 'function': accuracy}]

!mkdir 'output'
learner = BertLearner.from_pretrained_model(
						databunch,
						pretrained_path='aubmindlab/bert-base-arabert',
						metrics=metrics,
						device=device_cuda,
						logger=logger,
						output_dir='output',
						finetuned_wgts_path=None,
						warmup_steps=30,
						multi_gpu=False,
						is_fp16=False,
						multi_label=False,
						logging_steps=0)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=543450661.0, style=ProgressStyle(descri…




##Start Training and Validating

In [6]:
learner.fit(epochs=5,
			lr=2e-5,
			validate=True, 	# Evaluate the model after each epoch
			schedule_type="warmup_linear",
			optimizer_type="adamw")



(13000, 0.44645457174371067)

##You can see the output using tensorboard

In [7]:
#%load_ext tensorboard
#%tensorboard --logdir './output/tensorboard'

In [8]:
from fast_bert.modeling import BertForMultiLabelSequenceClassification
from fast_bert.data import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features
#from fast_bert.learner import BertLearner
from fast_bert.metrics import accuracy_multilabel, accuracy_thresh, fbeta, roc_auc

In [9]:
from functools import partial

metrics = []
#metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
metrics.append({'name': 'roc_auc', 'function': roc_auc})
metrics.append({'name': 'F1', 'function': partial(fbeta, beta=1)})
metrics.append({'name': 'accuracy_single', 'function': accuracy_multilabel})


In [10]:
#Predictions
#Call predict_batch method on the learner object that contains the trained model.

texts = [
  "مخيب عيشتكم وملا حالة هالكورونا",
  "البلاد مشات يا لطيف"
]

predictions = learner.predict_batch(texts)
from fast_bert.prediction import BertClassificationPredictor

predictor = BertClassificationPredictor('aubmindlab/bert-base-arabert','./data/')

# Single prediction
single_prediction = predictor.predict("مخيب عيشتكم وملا حالة هالكورونا")

# Batch predictions
texts = [
  "مخيب عيشتكم وملا حالة هالكورونا",
  "البلاد مشات يا لطيف"
]

multiple_predictions = predictor.predict(texts)



In [11]:
multiple_predictions

[('1', 0.3484451472759247),
 ('0', 0.33461281657218933),
 ('-1', 0.31694209575653076)]

In [12]:
single_prediction = predictor.predict("Revoyez vos prix ainsi que la lenteur ke prend lapp pr se connecter.")
single_prediction

[('1', 0.36526912450790405),
 ('-1', 0.3601974844932556),
 ('0', 0.27453339099884033)]

In [13]:

single_prediction = predictor.predict("قمة الإستفزاز و النذالة في هالبلاد إنو في برنامج في تلفزة كيما التونسية يجيبو واحد طحان متع بن علي ...")
single_prediction

[('0', 0.3606773316860199),
 ('-1', 0.3212010860443115),
 ('1', 0.31812161207199097)]