# ***Libraries***

In [None]:
import numpy as np
import pandas as pd
import re , string
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
import logging
from keras.models import Sequential
from sklearn import model_selection
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)
from keras.layers import Embedding, Dense, Dropout, Input#, LSTM, Bidirectional
from keras.layers import MaxPooling1D, Conv1D, Flatten, LSTM
from keras.preprocessing import sequence#, text
import tensorflow_datasets as tfds
import tensorflow as tf
from huggingface_hub import notebook_login

## Load and Explore data

In [None]:
# Read the tab-separated reviews and expose the "class" column under the name "label".
df = pd.read_csv("ar_reviews_100k.tsv", sep='\t').rename(columns={"class": "label"})

# Numeric code assigned to each textual sentiment class.
label_mapping = {"Positive": 1, "Negative": -1, "Mixed": 0}

# Replace the textual class with its numeric code.
df["label"] = df["label"].map(label_mapping)

# Data Preprocessing



In [None]:
# Count how many rows carry each mapped label value.
df["label"].value_counts()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
print(df.duplicated().sum())


In [None]:
# Remove Duplicates
#df = df.drop_duplicates()
#train = train.drop_duplicates()

#test_df = test_df.drop_duplicates()

In [None]:
# Number of missing values in each column.
print(df.isnull().sum())

## Cleaning data (punctuation, mentions, emojis, etc.)

In [None]:
# Keep only characters from the listed Arabic Unicode ranges. Every run of any
# other character (Latin letters, ASCII digits and punctuation, emoji, and hence
# links, @mentions and #hashtags) is collapsed into a single space, then the
# result is trimmed. The pattern is compiled once rather than looked up per row,
# and the range \ufd50-\ufd8f is listed a single time.
_NON_ARABIC_RE = re.compile(
    r'[^\u0600-\u06ff\u0750-\u077f\ufb50-\ufbc1\ufbd3-\ufd3f'
    r'\ufd50-\ufd8f\ufe70-\ufefc\uFDF0-\uFDFD]+'
)
df['text'] = df['text'].map(lambda text: _NON_ARABIC_RE.sub(' ', text).strip())

In [None]:
# Strip diacritics (tashkeel) from every review via pyarabic.
import pyarabic.araby as araby
df['text'] = df['text'].map(araby.strip_diacritics)

In [None]:
# Download NLTK's stop-word corpus and load its Arabic list into `stop`.
# `stop` is only referenced by the commented-out filter in the next cell.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop  = stopwords.words('arabic')

In [None]:
# Remove stop words
#train['text'] = train['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [None]:
# Preview the first rows after cleaning.
df.head()

In [None]:
# Label distribution again, now that the text has been cleaned.
df["label"].value_counts()

# Model Training

## Split Data

In [None]:
# Hold out 30% of the rows as the test partition; the seed keeps the split repeatable.
train, test = train_test_split(df, test_size=0.3, random_state=42)

# Print the label distribution of both partitions, one item per line.
print(
    "Train set: ",
    train["label"].value_counts(),
    "---------------------------",
    "Test set: ",
    test["label"].value_counts(),
    sep="\n",
)

In [None]:
# Plain Python lists of the training texts and their labels.
x, y = (train[column].to_list() for column in ('text', 'label'))

# Set aside 30% of the training rows for validation.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

## Predict using classical machine-learning models

In [None]:
# Helper functions 
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

def train_model(model, data, targets):
    """Fit a TF-IDF + classifier pipeline and return it.

    Args:
        model: Estimator placed as the final ``'clf'`` step of the pipeline.
        data: Raw text documents handed to the vectorizer.
        targets: Labels aligned with ``data``.

    Returns:
        The fitted ``Pipeline`` with steps ``'vect'`` and ``'clf'``.
    """
    # Word-level unigrams and bigrams, case left untouched.
    vectorizer = TfidfVectorizer(
        analyzer='word',
        lowercase=False,
        ngram_range=(1, 2),
        min_df=0.0001,
        max_df=0.95,
    )
    pipeline = Pipeline([('vect', vectorizer), ('clf', model)])
    pipeline.fit(data, targets)
    return pipeline
def get_accuracy(trained_model,X, y):
    """Return the fraction of predictions on ``X`` that equal ``y``.

    Args:
        trained_model: Object exposing ``predict(X)``.
        X: Inputs forwarded to ``predict``.
        y: Reference labels compared element-wise with the predictions.
    """
    hits = trained_model.predict(X) == y
    return np.mean(hits)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit on the training portion only. `x`/`y` still contain the validation rows,
# so fitting on them would leak `x_val` into the model and inflate the score.
trained_clf_multinomial_nb = train_model(MultinomialNB(), x_train, y_train)
accuracy = get_accuracy(trained_clf_multinomial_nb, x_val, y_val)
# The score is measured on the validation split, so label it as such.
print(f"Validation dataset accuracy with MultinomialNB: {accuracy:.2f}")

In [None]:
from sklearn.svm import LinearSVC
# Fit on the training portion only. `x`/`y` still contain the validation rows,
# so fitting on them would leak `x_val` into the model and inflate the score.
trained_clf_linearSVC = train_model(LinearSVC(), x_train, y_train)
accuracy = get_accuracy(trained_clf_linearSVC, x_val, y_val)
# The score is measured on the validation split, so label it as such.
print(f"Validation dataset accuracy with LinearSVC: {accuracy:.2f}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit on the training portion only. `x`/`y` still contain the validation rows,
# so fitting on them would leak `x_val` into the model and inflate the score.
trained_clf_random_forest = train_model(RandomForestClassifier(), x_train, y_train)
accuracy = get_accuracy(trained_clf_random_forest, x_val, y_val)
# The score is measured on the validation split, so label it as such.
print(f"Validation dataset accuracy with RandomForestClassifier: {accuracy:.2f}")

In [None]:
# 10-fold cross-validation of the LinearSVC pipeline over the test partition.
kfold = model_selection.KFold(n_splits=10, random_state=7, shuffle=True)
scoring = 'neg_mean_squared_error'
results = model_selection.cross_val_score(trained_clf_linearSVC, test.text.values, test.label.values, cv=kfold, scoring=scoring)
# The scorer returns the *negated* squared error (so that larger is better);
# flip the sign so the number printed under "MSE" is the actual error.
# NOTE(review): squared error treats the labels -1/0/1 as ordinal values;
# confirm that is the intended metric for this classifier.
print("MSE: %.3f (%.3f)" % (-results.mean(), results.std()))

In [None]:
# Per-class precision/recall/F1 of the MultinomialNB pipeline on the test partition.
predicted = trained_clf_multinomial_nb.predict(test.text.values)
report = classification_report(test.label.values, predicted)
print(report)

In [None]:
# Per-class precision/recall/F1 of the LinearSVC pipeline on the test partition.
predicted = trained_clf_linearSVC.predict(test.text.values)
report = classification_report(test.label.values, predicted)
print(report)

In [None]:
# Per-class precision/recall/F1 of the RandomForest pipeline on the test partition.
predicted = trained_clf_random_forest.predict(test.text.values)
report = classification_report(test.label.values, predicted)
print(report)

In [None]:
# Overall accuracy of the LinearSVC pipeline on the held-out test partition.
accuracy = get_accuracy(trained_clf_linearSVC,test.text.values, test.label.values)
print(f"Test dataset accuracy with LinearSVC: {accuracy:.2f}")

# Predict by Using GRU




## Predict using BERT (deep learning)

In [None]:
# Carve the evaluation split out of `train`, not `df`. Splitting the full frame
# would put rows of the held-out `test` partition into the fine-tuning data and
# contaminate the final test report.
train_set, evaluation_set = train_test_split(train, test_size=.1, random_state=42)


In [None]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
    !nvidia-smi

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

In [None]:
# Checkpoint name passed to every `from_pretrained` call in this notebook.
Model_Used = "UBC-NLP/MARBERT"
# Task tag; not referenced by any code visible in this notebook.
Task_Name = "classification"

class Dataset:
    """Plain container bundling a named train/test pair with its label list.

    NOTE(review): this definition rebinds the name ``Dataset`` that the
    notebook imports from ``torch.utils.data`` in its first cell; any class
    declared after this point that subclasses ``Dataset`` gets this container
    as its base, not the torch class.
    """
    def __init__(
        self,
        name,
        train,
        test,
        label_list,
    ):
        # Store the four constructor arguments unchanged.
        self.name = name
        self.train = train
        self.test = test
        self.label_list = label_list
        
class BERTModelDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item for the ``Trainer``.

    Args:
        text: Indexable sequence of raw texts.
        target: Indexable sequence of labels, parallel to ``text``.
        model_name: Checkpoint name passed to ``AutoTokenizer.from_pretrained``.
        max_len: Stored as ``self.max_len``. Tokenization itself pads and
            truncates to a fixed 512 tokens, exactly as before.
        label_map: Dict translating each raw label into its class index.
    """

    def __init__(self, text, target, model_name, max_len, label_map):
        # The base class is named explicitly because the bare name `Dataset` is
        # rebound in this notebook to a plain container class. The previous
        # `super(BERTModelDataset).__init__()` built an *unbound* super object
        # and therefore never ran any parent initializer.
        super().__init__()
        self.text = text
        self.target = target
        self.tokenizer_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.label_map = label_map

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize the ``item``-th text and wrap it in ``InputFeatures``."""
        # Collapse every run of whitespace into a single space.
        text = " ".join(str(self.text[item]).split())

        encoded_review = self.tokenizer.encode_plus(
            text,
            max_length=512,
            add_special_tokens=True,
            return_token_type_ids=False,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            truncation='longest_first',
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Tensors stay on the CPU: the Trainer moves each collated batch to its
        # own device, and CUDA tensors created per item break pinned-memory and
        # multi-worker data loading.
        return InputFeatures(
            input_ids=encoded_review['input_ids'].flatten(),
            attention_mask=encoded_review['attention_mask'].flatten(),
            label=self.label_map[self.target[item]],
        )

In [None]:
def model_init():
  """Load a sequence-classification model from the ``Model_Used`` checkpoint.

  ``num_labels`` is set to the number of entries in the global ``label_map``.
  """
  class_count = len(label_map)
  classifier = AutoModelForSequenceClassification.from_pretrained(
      Model_Used,
      num_labels=class_count,
      return_dict=True,
  )
  return classifier

def compute_metrics(p): #p should be of type EvalPrediction
  """Turn an evaluation prediction into macro-averaged metrics.

  Args:
      p: Object exposing ``predictions`` (per-class scores; the argmax over
          axis 1 is taken as the predicted class) and ``label_ids``
          (reference class indices).

  Returns:
      Dict with ``macro_f1``, ``macro_f1_pos_neg``, ``macro_precision``,
      ``macro_recall`` and ``accuracy``.
  """
  preds = np.argmax(p.predictions, axis=1)
  assert len(preds) == len(p.label_ids)
  print(classification_report(p.label_ids,preds))
  #print(confusion_matrix(p.label_ids,preds))

  # The global `label_map` assigns class indices in order of first appearance,
  # so positive (raw label 1) and negative (raw label -1) are not guaranteed
  # to be indices 1 and 2. Look the indices up instead of hard-coding them.
  # If neither raw label is present, `None` makes sklearn average over all classes.
  pos_neg_ids = [label_map[raw] for raw in (1, -1) if raw in label_map] or None

  macro_f1_pos_neg = f1_score(p.label_ids,preds,average='macro',labels=pos_neg_ids)
  macro_f1 = f1_score(p.label_ids,preds,average='macro')
  macro_precision = precision_score(p.label_ids,preds,average='macro')
  macro_recall = recall_score(p.label_ids,preds,average='macro')
  acc = accuracy_score(p.label_ids,preds)
  return {
      'macro_f1' : macro_f1,
      'macro_f1_pos_neg' : macro_f1_pos_neg,  
      'macro_precision': macro_precision,
      'macro_recall': macro_recall,
      'accuracy': acc
  }

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random generators.

    Also switches cuDNN to deterministic mode with benchmarking disabled and
    writes ``PYTHONHASHSEED`` into the process environment.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Imported locally: the cell that defines this helper runs before the cell
    # that imports `random` and `os`, so the function must not depend on them
    # being notebook globals.
    import os
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)

In [None]:
# Distinct labels in order of first appearance in the training split.
label_list = list(train_set['label'].unique())

Extra_Len = 6 # an extra padding in length , found to be useful for increasing F-score
# Longest review measured in whitespace-separated words, plus the extra margin.
Max_Len = train_set['text'].str.split().str.len().max() + Extra_Len

print(Max_Len)
print(label_list)
print(train_set['label'].value_counts())

data_set = Dataset(
    name="KAUST",
    train=train_set,
    test=evaluation_set,
    label_list=label_list,
)

# Raw label -> class index, numbered by position in `label_list`.
label_map = dict(zip(label_list, range(len(label_list))))
print(label_map)

train_dataset = BERTModelDataset(
    text=train_set['text'].to_list(),
    target=train_set['label'].to_list(),
    model_name=Model_Used,
    max_len=Max_Len,
    label_map=label_map,
)

evaluation_dataset = BERTModelDataset(
    text=evaluation_set['text'].to_list(),
    target=evaluation_set['label'].to_list(),
    model_name=Model_Used,
    max_len=Max_Len,
    label_map=label_map,
)

In [None]:
#define training arguments
# All options go through the constructor so that TrainingArguments' own
# post-init handling sees them; assigning attributes after construction
# bypasses it.
training_args = TrainingArguments(
    output_dir="./train",
    lr_scheduler_type='cosine',
    adam_epsilon=1e-8,
    learning_rate=1.78255e-05,
    # Mixed precision needs CUDA; use full precision on CPU-only machines.
    fp16=torch.cuda.is_available(),
    per_device_train_batch_size=16,  # 64
    per_device_eval_batch_size=16,  # 64
    gradient_accumulation_steps=2,
    num_train_epochs=20,
    warmup_steps=0,
    # Per-epoch evaluation is requested here. The former
    # `evaluate_during_training = True` assignment is dropped.
    # NOTE(review): that flag appears to be retired in releases that offer
    # `lr_scheduler_type` - confirm against the installed transformers version.
    evaluation_strategy=EvaluationStrategy.EPOCH,
    logging_steps=200,
    save_steps=1000,
    seed=42,
    disable_tqdm=False,
)

In [None]:
import gc
import os
import random

import torch

Rand_Seed = 42

# Turn off pinned host memory for the data loaders.
training_args.dataloader_pin_memory = False

# Collect garbage and release cached CUDA memory before the model is built.
gc.collect()
torch.cuda.empty_cache()

# Seed every random generator before the model is created.
set_seed(Rand_Seed)

trainer = Trainer(
    args=training_args,
    model=model_init(),
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=evaluation_dataset,
)

print(training_args.seed)

In [None]:
# Echo the key hyper-parameters before the run starts.
print(Max_Len)
print(training_args.learning_rate)
print(training_args.adam_epsilon)
print(training_args.warmup_steps)
# SECURITY: a Weights & Biases API key used to be written here in clear text.
# It has been removed and must be revoked/rotated. If the installed
# transformers version asks for a key, supply it through the WANDB_API_KEY
# environment variable (or `wandb login`), never in the notebook source.
trainer.train()

In [None]:
# First, define the prediction method.
def predict(text, tokenizer):
  """Classify one text with the model held by the global ``trainer``.

  Args:
      text: Raw text to classify.
      tokenizer: Tokenizer providing ``encode_plus``.

  Returns:
      The element at position 0 of the argmax over the class dimension of the
      model's first output, i.e. the predicted class index as a tensor.
  """
  encoded_review = tokenizer.encode_plus(
    text,
    max_length=512,
    add_special_tokens=True,
    return_token_type_ids=False,
    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
    padding='max_length',
    truncation='longest_first',
    return_attention_mask=True,
    return_tensors='pt'
  )

  input_ids = encoded_review['input_ids'].to(device)
  attention_mask = encoded_review['attention_mask'].to(device)

  model = trainer.model
  # Inference only: switch off dropout and skip building the autograd graph,
  # which would otherwise be allocated for every single prediction.
  model.eval()
  with torch.no_grad():
    output = model(input_ids, attention_mask=attention_mask)

  _, prediction = torch.max(output[0], dim=1)
  return prediction[0]

# Then run the model over the test partition.

tokenizer = AutoTokenizer.from_pretrained(Model_Used)

# Only relevant when `label_list` holds textual labels; any other label value
# is appended as-is. Replaces a chain of string comparisons and an unused counter.
legacy_label_codes = {'positive': 1, 'negative': -1, 'neutral': 0}

prediction_list = []
for tweet in test["text"]:
    # `predict` returns a class index; `label_list` turns it back into the raw label.
    pre_txt = label_list[predict(tweet, tokenizer)]
    if isinstance(pre_txt, str):
        pre_txt = legacy_label_codes.get(pre_txt, pre_txt)
    prediction_list.append(pre_txt)

In [None]:
#print(prediction_list)
# `test` has no "class" column any more: it was renamed to "label" when the data
# was loaded, so `test["class"]` raises KeyError. Read the ground truth from
# "label" and keep "class" as the output column header.
results = pd.DataFrame({'class' : test["label"].astype(str), 'sentiment' : prediction_list},
                       columns = ['class', 'sentiment'])
print(results)

# Write the ground-truth / prediction pairs to a comma-separated file without the index.
result_file = "sub_test3.csv"
results.to_csv(result_file, sep= ",", index = False)

In [None]:
# Show the first 50 ground-truth / prediction pairs.
results.head(50)

In [None]:

# Per-class precision/recall/F1 of the fine-tuned model's predictions on the test partition.
report = classification_report(test.label.values, prediction_list)


In [None]:
# Display the report computed in the previous cell.
print(report)