In [1]:
import pandas as pd
import numpy as np
import os.path
from arabert.arabert.preprocess import ArabertPreprocessor
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score


In [2]:
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer,AutoModel
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures

In [3]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample
import logging
import torch
import csv
import optuna

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
    !nvidia-smi

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [6]:
class Dataset:
    def __init__(
        self,
        name,
        train,
        test,
        label_list,
    ):
        self.name = name
        self.train = train
        self.test = test
        self.label_list = label_list

In [7]:
path="GAZTTweets1.xlsx"
df = pd.read_excel(path,header=0)
model_name = "bert-base-arabertv02"
arabert_prep = ArabertPreprocessor(model_name=model_name, keep_emojis=False,replace_urls_emails_mentions=True)

DATA_COLUMN = 'text'
LABEL_COLUMN = 'label'
Q_COLUMN= 'question'
df = df[['text', 'sentiment']]
df.columns = [DATA_COLUMN, LABEL_COLUMN]

label_map = {
    'negative' : 0,
    'neutral': 2,
    'positive' : 1

}
df['orginal'] = df[DATA_COLUMN]
df[DATA_COLUMN] = df[DATA_COLUMN].apply(arabert_prep.preprocess)
df[LABEL_COLUMN] = df[LABEL_COLUMN].apply(lambda x: label_map[x])


train_AJGT, test_AJGT = train_test_split(df, test_size=0.2,random_state=42)



train_df = pd.DataFrame({
    'id':range(len(train_AJGT)),
    'label':train_AJGT["label"],
    'alpha':['a']*train_AJGT.shape[0],
    'text': train_AJGT["text"].replace(r'\n', ' ', regex=True),
    'orginal': train_AJGT["orginal"].replace(r'\n', ' ', regex=True)
})

dev_df = pd.DataFrame({
    'id':range(len(test_AJGT)),
    'label':test_AJGT["label"],
    'alpha':['a']*test_AJGT.shape[0],
    'text': test_AJGT["text"].replace(r'\n', ' ', regex=True),
    'orginal': test_AJGT["orginal"].replace(r'\n', ' ', regex=True)
})

In [8]:
df.shape

(929, 3)

In [9]:
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

In [10]:
class BERTDataset(Dataset):
    def __init__(self, text, target, model_name, max_len, label_map):
      super(BERTDataset).__init__()
      self.text = text
      self.target = target
      self.tokenizer_name = model_name
      self.tokenizer = BertTokenizer.from_pretrained(model_name,from_pt=True)
      self.max_len = max_len
      self.label_map = label_map
      

    def __len__(self):
      return len(self.text)

    def __getitem__(self,item):
      text = str(self.text[item])
      text = " ".join(text.split())


        
      input_ids = self.tokenizer.encode(
          text,
          add_special_tokens=True,
          max_length=self.max_len,
          truncation='longest_first'
      )     
    
      attention_mask = [1] * len(input_ids)

      # Zero-pad up to the sequence length.
      padding_length = self.max_len - len(input_ids)
      input_ids = input_ids + ([self.tokenizer.pad_token_id] * padding_length)
      attention_mask = attention_mask + ([0] * padding_length)    
      
      return InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=self.label_map[self.target[item]])

In [11]:
model_name="bert-base-arabertv02"
arabert_prep = ArabertPreprocessor(model_name=model_name)

text = "ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري"
arabert_prep.preprocess(text)

'ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري'

In [14]:
label_map = { v:index for index, v in enumerate(df[LABEL_COLUMN].unique()) }
max_len=64
#print(label_map)
model_name='bert-large-arabertv02-twitter'

In [15]:
train_dataset = BERTDataset(train_df['text'].to_list(),train_df['label'].to_list(),model_name,max_len,label_map)
test_dataset = BERTDataset(dev_df['text'].to_list(),dev_df['label'].to_list(),model_name,max_len,label_map)

In [16]:
import pickle
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)
    
save_obj(label_map,'label_map')

In [19]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained("bert-large-arabertv02-twitter", return_dict=True, num_labels=len(label_map))

In [20]:
def compute_metrics(p): #p should be of type EvalPrediction
  preds = np.argmax(p.predictions, axis=1)
  assert len(preds) == len(p.label_ids)
  #print(classification_report(p.label_ids,preds))
  #print(confusion_matrix(p.label_ids,preds))

  macro_f1_pos_neg = f1_score(p.label_ids,preds,average='macro',labels=[0,1])
  macro_f1 = f1_score(p.label_ids,preds,average='macro')
  macro_precision = precision_score(p.label_ids,preds,average='macro')
  macro_recall = recall_score(p.label_ids,preds,average='macro')
  acc = accuracy_score(p.label_ids,preds)
  return {
      'macro_f1' : macro_f1,
      'macro_f1_pos_neg' : macro_f1_pos_neg,  
      'macro_precision': macro_precision,
      'macro_recall': macro_recall,
      'accuracy': acc
  }

In [21]:
training_args = TrainingArguments("./train")
training_args.evaluate_during_training = True
training_args.adam_epsilon = 1e-8
training_args.learning_rate = 5e-5
training_args.fp16 = True
training_args.per_device_train_batch_size = 32
training_args.per_device_eval_batch_size = 32
training_args.gradient_accumulation_steps = 2
training_args.num_train_epochs= 5


steps_per_epoch = (len(train_df)// (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps))
total_steps = steps_per_epoch * training_args.num_train_epochs
print(steps_per_epoch)
print(total_steps)
#Warmup_ratio
warmup_ratio = 0.1
training_args.warmup_steps = total_steps*warmup_ratio # or you can set the warmup steps directly 

training_args.evaluation_strategy = EvaluationStrategy.EPOCH
training_args.logging_steps = 200
training_args.save_steps = 100000 #don't want to save any model, there is probably a better way to do this :)
training_args.seed = 42
training_args.disable_tqdm = False
training_args.lr_scheduler_type = 'cosine'

11
55


In [22]:
import tensorflow as tf

In [23]:
model = AutoModelForSequenceClassification.from_pretrained('bert-large-arabertv02-twitter', return_dict=True, num_labels=len(label_map))

Some weights of the model checkpoint at bert-large-arabertv02-twitter were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-arab

In [24]:
trainer = Trainer(
    model = model_init(),
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

Some weights of the model checkpoint at bert-large-arabertv02-twitter were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-arab

In [25]:
trainer.train()



Epoch,Training Loss,Validation Loss,Macro F1,Macro F1 Pos Neg,Macro Precision,Macro Recall,Accuracy
1,No log,0.443863,0.845954,0.827754,0.872162,0.828924,0.844086
2,No log,0.245351,0.893246,0.866896,0.9,0.887413,0.887097
3,No log,0.233931,0.913924,0.891434,0.926091,0.904417,0.908602
4,No log,0.218475,0.937508,0.926809,0.94391,0.932799,0.935484
5,No log,0.219821,0.928646,0.913517,0.936448,0.922046,0.924731




TrainOutput(global_step=60, training_loss=0.3490442276000977)

In [27]:
trainer.save_model(r'C:\Users\96654\twee3\twitter_large')

trainer.compute_metrics(result)

{'macro_f1': 0.9286461974492206,
 'macro_f1_pos_neg': 0.9135172413793103,
 'macro_precision': 0.9364478114478114,
 'macro_recall': 0.9220463010489465,
 'accuracy': 0.9247311827956989}

In [26]:
result=trainer.predict(test_dataset)
final_pred=np.argmax(result.predictions, axis=1)
final_result=[list(label_map.keys())[list(label_map.values()).index(x)] for x in final_pred]
true_label=dev_df['label'].to_list()
true_text=dev_df['text'].to_list()


accuracy = accuracy_score(true_label, final_result)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print(classification_report(true_label, final_result))

Accuracy: 92.47%
              precision    recall  f1-score   support

           0       0.89      0.90      0.90        62
           1       1.00      0.92      0.96        38
           2       0.92      0.94      0.93        86

    accuracy                           0.92       186
   macro avg       0.94      0.92      0.93       186
weighted avg       0.93      0.92      0.93       186



In [28]:
sentimentResults = pd.DataFrame(
    {'text': true_text,
     'sentiment': true_label,
     'sentiment_BERT': final_result
    })
sentimentResults.to_excel('BERT_tweets_RESULTS5.xlsx')