In [None]:
%%capture
!pip install transformers
!pip install datasets

### Import

In [None]:
import os
import time
import torch
import numpy as np
from tqdm import tqdm
from torch.optim import AdamW
from datasets import load_dataset
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score
from transformers import (AutoConfig, AutoModelForSequenceClassification)

### Initialization

In [None]:
model_name = 'qarib/bert-base-qarib'  # model previously pre-trained on Arabic sentences
dataset_name = 'ajgt_twitter_ar'  # data collected from Twitter, split into positive and negative
batch_size = 8
seed = 42

# Apply the seed to the random number generators so that runs are repeatable
# (weight initialisation of the new classification head, dropout, shuffling).
np.random.seed(seed)
torch.manual_seed(seed)

## Data Processing


### Load data

In [None]:
# Load the dataset identified by `dataset_name` (downloaded on first use).
dataset = load_dataset(dataset_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/4.77k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/91.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1800 [00:00<?, ? examples/s]

In [None]:
# Display the available splits, their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1800
    })
})

In [None]:
# Show a single training example (index 2) together with its sentiment label.
sample_idx = 2
train_split = dataset['train']
sample_text = train_split['text'][sample_idx]
sample_label = train_split['label'][sample_idx]

print(sample_text.strip())
if sample_label == 1:
    print('ايجابي')
else:
    print('سلبي')

كله رائع بجد ربنا يكرمك
ايجابي


### Data splitting

In [None]:
# Carve a validation split and then a test split out of the single 'train'
# split. Both splits use the configured `seed` so that changing it in the
# configuration cell affects the whole partition consistently.
# NOTE: this cell overwrites dataset['train']; re-running it without reloading
# the dataset first splits the already-reduced training set again.
train_valid_dataset = dataset['train'].train_test_split(test_size=0.1, seed=seed)
dataset['valid'] = train_valid_dataset.pop('test')
dataset['train'] = train_valid_dataset['train']

# The test split is 10% of what remains after the validation split was removed.
train_test_dataset = dataset['train'].train_test_split(test_size=0.1, seed=seed)
dataset['test'] = train_test_dataset.pop('test')
dataset['train'] = train_test_dataset['train']

### Tokenization

In [None]:
# Tokenizer that belongs to the pretrained checkpoint (the way texts are split into tokens).
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    do_lower_case=False,
    model_max_length=512,
)


def _tokenize_batch(examples):
    """Tokenize the 'text' column of a batch, truncating and padding to the maximum length."""
    return tokenizer(examples['text'], truncation=True, padding='max_length')


def _copy_label_column(examples):
    """Return the batch's 'label' values under the key 'labels'."""
    return {'labels': examples['label']}


proc_dataset = dataset.map(_tokenize_batch, batched=True)
proc_dataset = proc_dataset.map(_copy_label_column, batched=True)

config.json:   0%|          | 0.00/576 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/718k [00:00<?, ?B/s]



Map:   0%|          | 0/1458 [00:00<?, ? examples/s]

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

Map:   0%|          | 0/162 [00:00<?, ? examples/s]

Map:   0%|          | 0/1458 [00:00<?, ? examples/s]

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

Map:   0%|          | 0/162 [00:00<?, ? examples/s]

In [None]:
# Inspect how the tokenizer splits a sample sentence into tokens.
tokenizer.tokenize('اكيد الله لا يحرمنا من هالنعمه العظيمه')

['اكيد', 'الله', 'لا', 'يحرمنا', 'من', 'هالن', '##عم', '##ه', 'العظيم', '##ه']

### Preparation for Training


In [None]:
# Columns that are converted to tensors and passed to the model.
columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']

# Build the loaders in a separate dict instead of overwriting the entries of
# `proc_dataset`: the processed datasets stay available and the cell can be
# re-run safely. `with_format` returns a formatted copy rather than mutating
# the dataset in place.
loaders = {}
for split in proc_dataset:
    torch_split = proc_dataset[split].with_format(type='torch', columns=columns)
    loaders[split] = torch.utils.data.DataLoader(
        torch_split,
        batch_size=batch_size,
        shuffle=(split == 'train'),  # reshuffle training batches every epoch only
    )

# Order expected by BaseTextClassficationModel.train: train, valid, test.
datasets = [loaders['train'], loaders['valid'], loaders['test']]

### Model

In [None]:
class BaseTextClassficationModel:
    """Fine-tune a pretrained sequence-classification model with two labels.

    The checkpoint is taken from the notebook-level ``model_name`` variable.
    ``train`` runs the optimisation loop and reports metrics; ``evaluate_dataset``
    computes loss and accuracy over an iterable of batches.
    """

    def __init__(self):
        """Load the configuration and pretrained weights and pick the device."""
        self.num_labels = 2
        self.model_name = model_name
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        config = AutoConfig.from_pretrained(self.model_name, num_labels=self.num_labels)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, config=config)
        # Place the model on the device once so every method can rely on it.
        self.model.to(self.device)

    def train(self, datasets, epochs=1, lr=5e-5, save_dir='.'):
        """Fine-tune the model and evaluate the best checkpoint on the test data.

        Args:
            datasets: sequence ``(train, valid, test)`` of iterables yielding
                batches as dicts of tensors; each batch must contain a
                ``'labels'`` entry and the model inputs.
            epochs: number of passes over the training data.
            lr: learning rate for AdamW.
            save_dir: directory in which ``model.pth`` (the weights with the
                best validation accuracy) is written.

        Returns:
            ``{'accuracy': <test accuracy>}``.
        """
        train_loader, valid_loader, test_loader = datasets

        self.optimizer = AdamW(self.model.parameters(), lr=lr)
        filepath = os.path.join(save_dir, 'model.pth')
        self.model.to(self.device)

        # Start below any reachable accuracy so the first epoch is always saved,
        # even when its validation accuracy is 0.
        best_accuracy = float('-inf')
        checkpoint_saved = False
        for epoch in range(epochs):
            self.model.train()
            loss_sum = 0.0
            correct = 0
            seen = 0
            for batch in train_loader:
                batch = {k: v.to(self.device) for k, v in batch.items()}
                self.optimizer.zero_grad()
                outputs = self.model(**batch)
                batch_loss = outputs['loss']
                batch_loss.backward()
                self.optimizer.step()

                # Accumulate per-example totals as Python numbers: the batch
                # loss is not clobbered and a shorter final batch is weighted
                # correctly.
                batch_labels = batch['labels']
                n_examples = batch_labels.size(0)
                preds = outputs['logits'].argmax(-1)
                correct += (preds == batch_labels).sum().item()
                loss_sum += batch_loss.item() * n_examples
                seen += n_examples

            train_loss = loss_sum / seen if seen else 0.0
            train_accuracy = correct / seen if seen else 0.0
            print(f"Epoch {epoch} Train Loss {train_loss:.4f} Train Accuracy {train_accuracy:.4f}")

            results = self.evaluate_dataset(valid_loader)
            print(f"Epoch {epoch} Valid Loss {results['loss']:.4f} Valid Accuracy {results['accuracy']:.4f}")

            val_accuracy = results['accuracy']
            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy
                torch.save(self.model.state_dict(), filepath)
                checkpoint_saved = True

        # Restore the best weights written during this run (nothing is written
        # when epochs == 0, in which case the current weights are evaluated).
        if checkpoint_saved:
            state_dict = torch.load(filepath, map_location=self.device, weights_only=True)
            self.model.load_state_dict(state_dict)
        test_metrics = self.evaluate_dataset(test_loader)
        print(f"Test Loss {test_metrics['loss']:.4f} Test Accuracy {test_metrics['accuracy']:.4f}")
        return {'accuracy': test_metrics['accuracy']}

    def evaluate_dataset(self, dataset):
        """Compute mean loss and accuracy of the model over ``dataset``.

        Args:
            dataset: iterable of batches (dicts of tensors including
                ``'labels'``).

        Returns:
            ``{'loss': float, 'accuracy': float}``, both averaged per example.
            Both values are ``0.0`` when ``dataset`` yields no examples.
        """
        self.model.eval()
        loss_sum = 0.0
        correct = 0
        seen = 0
        # No gradients are needed for evaluation; skipping them saves memory.
        with torch.no_grad():
            for batch in dataset:
                batch = {k: v.to(self.device) for k, v in batch.items()}
                outputs = self.model(**batch)
                batch_labels = batch['labels']
                n_examples = batch_labels.size(0)
                preds = outputs['logits'].argmax(-1)
                loss_sum += outputs['loss'].item() * n_examples
                correct += (preds == batch_labels).sum().item()
                seen += n_examples
        if seen == 0:
            return {'loss': 0.0, 'accuracy': 0.0}
        return {'loss': loss_sum / seen, 'accuracy': correct / seen}

In [None]:
# Build the classifier; the constructor loads the pretrained checkpoint named by `model_name`.
bert_cls = BaseTextClassficationModel()

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at qarib/bert-base-qarib and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
results = bert_cls.train(datasets) # Train the model

Epoch 0 Train Loss 0.0502 Train Accuracy 0.9016
Epoch 0 Valid Loss 0.3354 Valid Accuracy 0.9239


  self.model.load_state_dict(torch.load(filepath))


Test Loss 0.2679 Test Accuracy 0.9286


In [None]:
def predict(text):
    """Print the sentiment that the fine-tuned model predicts for ``text``.

    Args:
        text: the sentence to classify.

    Prints the sentence and its predicted sentiment (label 1 is reported as
    positive, anything else as negative). Returns nothing.
    """
    model = bert_cls.model
    # Use whatever device the model currently lives on instead of assuming CUDA.
    device = next(model.parameters()).device

    encoded_review = tokenizer(
        text,
        truncation=True,  # keep over-long inputs within the model's maximum length
        add_special_tokens=True,
        return_tensors='pt',
    )
    inputs = {k: v.to(device) for k, v in encoded_review.items()}

    model.eval()
    with torch.no_grad():  # inference only, no gradients required
        output = model(**inputs)
    pred = torch.argmax(output['logits'], dim=-1)
    print(f"الجملة  : {text}")
    print(f"المشاعر  : {'ايجابية' if pred[0] == 1 else 'سلبية'}")

In [None]:
# Try the trained classifier on a sample Arabic sentence.
predict('ورشة فهم الصيفية كانت تجربة رائعة')

الجملة  : ورشة فهم الصيفية كانت تجربة رائعة
المشاعر  : ايجابية
