In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
# Canonical column names used by every later cell.
DATA_COLUMN = 'text'
LABEL_COLUMN = 'label'

# Keep only the review text and its sentiment, renamed to the canonical names.
df = (
    pd.read_excel('AJGT.xlsx', header=0)[['Feed', 'Sentiment']]
    .rename(columns={'Feed': DATA_COLUMN, 'Sentiment': LABEL_COLUMN})
)


In [None]:
# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
data_train, data_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers.data.processors.utils import InputFeatures
from transformers import AutoTokenizer


class SADataset(Dataset):
  """Torch dataset that tokenizes one text per item for sequence classification.

  Every item is returned as a ``transformers`` ``InputFeatures`` carrying the
  padded/truncated token ids, the attention mask, the token type ids (when the
  tokenizer produces them) and the integer class id looked up in ``label_map``.
  """

  def __init__(self, texts, labels, model_name, max_len, label_map):
    """Store the samples and load the tokenizer named by ``model_name``.

    Args:
      texts: indexable collection of input texts.
      labels: indexable collection of raw labels, aligned with ``texts``.
      model_name: name or path handed to ``AutoTokenizer.from_pretrained``.
      max_len: length every encoded sequence is padded or truncated to.
      label_map: mapping from raw label to integer class id.

    Raises:
      ValueError: if ``texts`` and ``labels`` differ in length.
    """
    if len(texts) != len(labels):
      raise ValueError(
          f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
      )

    self.texts = texts
    self.labels = labels
    self.label_map = label_map
    self.tokenizer_name = model_name
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
    self.max_len = max_len

  def __len__(self):
    """Return the number of samples."""
    return len(self.texts)

  def __getitem__(self, item):
    """Tokenize sample ``item`` and wrap it in an ``InputFeatures``.

    Raises:
      KeyError: if the sample's label is missing from ``label_map``.
    """
    # str() guards against non-string cells (e.g. numbers read from a spreadsheet).
    text = str(self.texts[item])

    encoded = self.tokenizer(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        padding='max_length',
        truncation=True,
    )

    return InputFeatures(
        input_ids=encoded["input_ids"],
        # Some tokenizers emit no segment ids; .get() yields None instead of
        # raising KeyError (InputFeatures declares this field as optional).
        token_type_ids=encoded.get("token_type_ids"),
        attention_mask=encoded["attention_mask"],
        label=self.label_map[self.labels[item]],
    )

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
def compute_metrics(pred):
    """Turn a prediction object into a dict of classification metrics.

    ``pred.predictions`` holds per-class scores (argmax over axis 1 gives the
    predicted class) and ``pred.label_ids`` holds the reference class ids.
    Returns macro F1, macro F1 restricted to classes 0 and 1, macro precision,
    macro recall and accuracy.
    """
    y_pred = np.argmax(pred.predictions, axis=1)
    y_true = pred.label_ids
    assert len(y_pred) == len(y_true)

    macro = {'average': 'macro'}
    return {
        'macro_f1': f1_score(y_true, y_pred, **macro),
        # Restricted to class ids 0 and 1; adjust the list for other label sets.
        'macro_f1_pos_neg': f1_score(y_true, y_pred, labels=[0, 1], **macro),
        'macro_precision': precision_score(y_true, y_pred, **macro),
        'macro_recall': recall_score(y_true, y_pred, **macro),
        'accuracy': accuracy_score(y_true, y_pred),
    }

In [None]:
from arabert.preprocess import ArabertPreprocessor

# Checkpoint name shared by the preprocessor, the tokenizer and the classifier.
model_name = 'aubmindlab/bert-base-arabertv02'

# The preprocessor is configured from the checkpoint name it will feed.
arabert_prep = ArabertPreprocessor(model_name=model_name)

In [None]:
# Run the AraBERT preprocessor over the text column of both splits.
# assign() builds new frames instead of writing into the objects returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning on those frames.
data_train = data_train.assign(
    **{DATA_COLUMN: data_train[DATA_COLUMN].apply(arabert_prep.preprocess)}
)
data_test = data_test.assign(
    **{DATA_COLUMN: data_test[DATA_COLUMN].apply(arabert_prep.preprocess)}
)

In [None]:
# Build the label -> id mapping from BOTH splits: a label that only occurs in
# the training split would otherwise be missing from the map and raise KeyError
# when the dataset looks it up. Sorting makes the ids independent of row order.
label_list = sorted(set(data_train[LABEL_COLUMN]) | set(data_test[LABEL_COLUMN]))
label_map = {label: index for index, label in enumerate(label_list)}
print(label_map)

In [None]:
# Single source of truth for the padded/truncated sequence length; both
# datasets read it so changing this value actually takes effect.
max_len = 256

train_dataset = SADataset(
    texts=data_train[DATA_COLUMN].to_list(),
    labels=data_train[LABEL_COLUMN].to_list(),
    model_name=model_name,
    max_len=max_len,
    label_map=label_map,
)

test_dataset = SADataset(
    texts=data_test[DATA_COLUMN].to_list(),
    labels=data_test[LABEL_COLUMN].to_list(),
    model_name=model_name,
    max_len=max_len,
    label_map=label_map,
)

In [None]:
from transformers import AutoModelForSequenceClassification
# Load the pretrained encoder with a classification head sized to the label set.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_map),
    return_dict=True,
)

In [None]:
from transformers import Trainer, TrainingArguments


In [None]:

# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the highest macro F1 when training finishes.
training_args = TrainingArguments(
    output_dir="./train",
    adam_epsilon=1e-8,
    learning_rate=5e-5,
    # Mixed precision is only enabled when a CUDA device is present, so the
    # notebook still runs (in full precision) on a CPU-only machine instead of
    # failing while the arguments are validated.
    fp16=torch.cuda.is_available(),
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Two accumulation steps of 16 give an effective batch of 32 per device.
    gradient_accumulation_steps=2,
    num_train_epochs=8,
    do_eval=True,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='eval_macro_f1',
    greater_is_better=True,
    seed=42
)


In [None]:
# Show every resolved training argument, defaults included.
vars(training_args)

In [None]:
# NOTE(review): the test split is also the evaluation set that drives
# best-checkpoint selection, so the reported scores are likely optimistic;
# consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()

#  Saving the best model

In [None]:
# Record both directions of the label mapping in the model config so the saved
# model reports label names rather than bare class ids.
inv_label_map = {class_id: name for name, class_id in label_map.items()}
trainer.model.config.id2label = inv_label_map
trainer.model.config.label2id = label_map

In [None]:
# Save the trainer's current model into the "best_sa_model" folder. With
# load_best_model_at_end=True this should be the best checkpoint — confirm in
# the training log.
trainer.save_model("best_sa_model")
# Save the tokenizer into the same folder so it can be loaded by path alone.
test_dataset.tokenizer.save_pretrained("best_sa_model")

In [None]:
from transformers import pipeline
# Reload the saved model and tokenizer from disk as an inference pipeline.
pipe = pipeline(
        "sentiment-analysis",
        model = "best_sa_model",
        # Use the first GPU when there is one; -1 selects the CPU so the cell
        # also works on machines without CUDA.
        device=0 if torch.cuda.is_available() else -1,
        )

In [None]:
# Input, roughly: "I do not love you".
pipe("انا لا احبك")

In [None]:
# Input, roughly: "I love you".
pipe("انا احبك")

In [None]:
# Input, roughly: "Unfortunately I did not like the food" (leading spaces are part of the input).
pipe("  الاكل ما عجبني للاسف")

In [None]:
# Input, roughly: "beautiful" (single-word input with a leading space).
pipe(" جميل")

In [None]:
# Dialectal input contrasting how a service used to be with it recently
# becoming bad — rough gloss only; the first half is ambiguous, TODO confirm.
pipe(" الخدمة كانت كب شينة لكن موخرا صارت سيئة")

In [None]:
# Input, roughly: "Customer service: I called them more than once and they do not solve my problem".
pipe("خدمة العملاء أكثر من مرة اتصل عليهم ولا يحلون لي مشكلتي")

In [None]:
# Input, roughly: "Dr. Sultan has a very light (gentle) hand, mashallah" — idiomatic praise.
pipe("دكتور سلطان يده خفيفة جدا ما شاء الله")

In [None]:
# Input, roughly: "The doctor taught me at university; the pinnacle of manners and humility".
# NOTE(review): the third-to-last word looks like a typo for the word meaning
# "pinnacle"; the string is left untouched because it is the model input.
pipe("الدكتور درسني في الجامعة فمة في الخلق والتواضع ")

In [None]:
# import shutil
# from google.colab import files

# # Compress the model directory into a zip file
# shutil.make_archive('best_sa_model', 'zip', 'best_sa_model')

# # Download the zip file
# files.download('best_sa_model.zip')


In [None]:
# import pickle

# # save the fine-tuned sentiment model as a pickle file
# model_pkl_file = "AHSP_v0"

# with open(model_pkl_file, 'wb') as file:
#     pickle.dump(model, file)


In [None]:
# import joblib

# # save model with joblib
# filename = 'joblib_model.sav'
# joblib.dump(model, filename)