# Arabic Dialect Identification - Applying Machine Learning
## By Karim Elshetihy
- [Github](https://github.com/KarimElshetihy)
- [Linkedin](https://www.linkedin.com/in/karim-el-shetihy/)

# Arabic Dialect Identification - Applying Deep Learning
## By Karim Elshetihy
- [Github](https://github.com/KarimElshetihy)
- [Linkedin](https://www.linkedin.com/in/karim-el-shetihy/)

#### References:
- [Arabic Dialect Identification in the Wild Paper](https://arxiv.org/pdf/2005.06557.pdf)
- [Flask Tutorial](https://programminghistorian.org/en/lessons/creating-apis-with-python-and-flask)
- [Multinomial Classification](https://towardsdatascience.com/the-complete-guide-to-neural-networks-multinomial-classification-4fe88bde7839)
- [Multi-class text classification model with Keras](https://www.design-ai.de/blog-posts/multi-class-text-classification-model-with-keras)
- [Word Embedding and Text Vectorization](https://www.analyticsvidhya.com/blog/2021/06/part-5-step-by-step-guide-to-master-nlp-text-vectorization-approaches/)


#### The Dataset:
The dataset and the dialect identification problem were introduced by the Qatar Computing Research Institute, which also published a paper on them; you can find more insights in it [here](https://arxiv.org/pdf/2005.06557.pdf).

We are given a dataset which has 2 columns, **id** and **dialect**.
- The target label column is **dialect**, which has 18 classes.
- The **id** column is used to retrieve the text. To do that, send a POST request to this API:
https://recruitment.aimtechnologies.co/ai-tasks.
- The request body must be a JSON as a list of strings, and the size of the list must NOT exceed 1000.
- The API will return a dictionary where the keys are the ids, and the values are the text, here is a request and response sample.

### Importing Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sb

import torch
import torch , optuna, gc, random, os
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score

np.random.seed(1)
%matplotlib inline
sb.set_theme()

import warnings
warnings.simplefilter("ignore")

### Importing the Cleaned Dataset

In [2]:
# Set up the Google Drive file download extension
!conda install -y gdown

In [3]:
!gdown --id 1EnyzWLwO7fNF0eLAgvN9eXVOxOZgvn0k

In [4]:
# Load the cleaned dataset; clean_df is kept untouched and df is the working copy.
clean_df = pd.read_csv("./5_class_dialects.csv")
df = clean_df.copy()

In [5]:
df.head()

In [7]:
df.info()

### Make sure we are running on a GPU

In [8]:
# Select the compute device once; later cells read the module-level `device`.
if not torch.cuda.is_available():
    # No CUDA device: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU and report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    # !nvidia-smi

### Splitting the Dataset

In [9]:
# Prepare Train & Test Data
test_data_size = 0.25
# Despite its name, train_test_ratio is the row position where the test
# partition starts (the first 75% of rows go to training).
train_test_ratio = int(len(df) * (1 - test_data_size))

# Keep (at most) the last 20 columns of the frame for both partitions.
kept_columns = df.columns[-20:]
train_data = df[kept_columns].iloc[:train_test_ratio]
test_data = df[kept_columns].iloc[train_test_ratio:]

# Hold out 5% of the training partition for evaluation during fine-tuning.
train_set, evaluation_set = train_test_split(
    train_data,
    test_size=0.05,
    random_state=42,
)

### Specifying BERT Model Parameters

In [10]:
# Maximum token length; passed later as max_len to the BERT dataset and to the tokenizer in predict().
Extra_Len = 6 # extra padding length, reported to help the F-score; NOTE(review): not referenced anywhere else in the visible notebook
Max_Len = 128
print(Max_Len)

In [11]:
# Hugging Face model identifier; used to load the tokenizer and the classification model.
Model_Used = "UBC-NLP/MARBERT"
# NOTE(review): Task_Name is not referenced anywhere else in the visible notebook.
Task_Name = "classification"

class Dataset:
    """Plain container bundling a named corpus with its splits and label names.

    NOTE: defining this class rebinds the name ``Dataset``, hiding the
    ``torch.utils.data.Dataset`` imported at the top of the notebook.

    Attributes are stored exactly as passed in:
        name: identifier of the corpus.
        train: training split.
        test: held-out split.
        label_list: class names.
    """

    def __init__(self, name, train, test, label_list,):
        (self.name, self.train, self.test, self.label_list) = (
            name,
            train,
            test,
            label_list,
        )
        
# Import the PyTorch base class under an explicit alias: in this cell the bare
# name ``Dataset`` refers to the notebook's own container class, not to
# ``torch.utils.data.Dataset``.
from torch.utils.data import Dataset as TorchDataset


class BERTModelDataset(TorchDataset):
    """Map-style dataset that tokenises one text per access for a BERT classifier.

    Args:
        text: sequence of raw texts, indexable by integer position.
        target: sequence of labels aligned with ``text``.
        model_name: Hugging Face model identifier used to load the tokenizer.
        max_len: every sample is padded/truncated to exactly this many tokens.
        label_map: mapping from label value to integer class id.
    """

    def __init__(self, text, target, model_name, max_len, label_map):
        super().__init__()
        self.text = text
        self.target = target
        self.tokenizer_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.label_map = label_map

    def __len__(self):
        """Return the number of samples."""
        return len(self.text)

    def __getitem__(self, item):
        """Return the encoded sample at position ``item`` as ``InputFeatures``.

        Raises:
            KeyError: if the sample's label is missing from ``label_map``.
        """
        # Collapse every run of whitespace into a single space.
        text = " ".join(str(self.text[item]).split())

        encoded_review = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            add_special_tokens=True,
            return_token_type_ids=False,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            truncation='longest_first',
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Tensors are deliberately left on the CPU: the Trainer moves each
        # batch to its own device, and CUDA tensors created here cannot be
        # pinned or produced from DataLoader worker processes.
        return InputFeatures(
            input_ids=encoded_review['input_ids'].flatten(),
            attention_mask=encoded_review['attention_mask'].flatten(),
            label=self.label_map[self.target[item]],
        )

#### Initializing the Pretrained Model

In [12]:
def model_init():
    """Load the pretrained checkpoint with a classification head.

    The head gets one output per entry of the module-level ``label_map``,
    which must already be defined when this function is called.
    """
    num_classes = len(label_map)
    return AutoModelForSequenceClassification.from_pretrained(
        Model_Used,
        num_labels=num_classes,
        return_dict=True,
    )

def compute_metrics(p): #p should be of type EvalPrediction
    """Print a classification report and return macro-averaged metrics.

    Args:
        p: object exposing ``predictions`` (per-class scores; the arg-max over
            axis 1 is taken as the predicted class) and ``label_ids``.

    Returns:
        dict with keys ``macro_f1``, ``macro_f1_pos_neg``, ``macro_precision``,
        ``macro_recall`` and ``accuracy``.

    NOTE(review): ``macro_f1_pos_neg`` is the macro F1 restricted to class ids
    1 and 2; the name looks inherited from a sentiment task -- confirm it is
    still meaningful for dialect labels.
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)
    assert len(y_pred) == len(y_true)
    print(classification_report(y_true, y_pred))
    #print(confusion_matrix(y_true, y_pred))

    def macro(metric, **extra):
        # Every score below is macro-averaged over the same label/prediction pair.
        return metric(y_true, y_pred, average='macro', **extra)

    return {
      'macro_f1' : macro(f1_score),
      'macro_f1_pos_neg' : macro(f1_score, labels=[1,2]),
      'macro_precision': macro(precision_score),
      'macro_recall': macro(recall_score),
      'accuracy': accuracy_score(y_true, y_pred)
    }

def set_seed(seed):
    """Seed Python, NumPy and PyTorch RNGs and put cuDNN in deterministic mode.

    Args:
        seed: integer seed applied to every generator; also exported as the
            ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # All of these seeders take the seed as their single argument.
    for seeder in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

#### Specifying BERT Model Inputs

In [13]:
# Distinct dialect names found in the training split.
label_list = list(train_set['dialect'].unique())

print(label_list)
print(train_set['dialect'].value_counts())

data_set = Dataset("KAUST", train_set, evaluation_set, label_list)

# Each dialect name maps to its position in label_list (its integer class id).
label_map = dict(zip(label_list, range(len(label_list))))
print(label_map)

# Both splits are wrapped the same way; only the source frame differs.
train_dataset, evaluation_dataset = [
    BERTModelDataset(
        split['text'].to_list(),
        split['dialect'].to_list(),
        Model_Used,
        Max_Len,
        label_map,
    )
    for split in (train_set, evaluation_set)
]

#### Specifying BERT Model Training Arguments & Parameters

In [14]:
# Define training arguments.
# Switch between the learning rate tuned for the extended training data and
# the one tuned for the original data.
Use_Train_Extended_Data = True

# Every field is passed to the constructor so that TrainingArguments' own
# post-init processing applies to it; values assigned to the object after
# construction bypass that processing.
training_args = TrainingArguments(
    output_dir='./results',
    lr_scheduler_type='cosine',
    adam_epsilon=1e-8,
    learning_rate=(
        1.215e-05 if Use_Train_Extended_Data  # use this with extended data
        else 1.78255000000000001e-05  # use this with org data
    ),
    # Mixed precision needs a CUDA device, so it is only enabled when one is
    # present; this keeps the notebook's CPU fallback usable.
    fp16=torch.cuda.is_available(),
    per_device_train_batch_size=32,  # 64
    per_device_eval_batch_size=32,  # 64
    gradient_accumulation_steps=2,
    num_train_epochs=2,
    warmup_steps=0,
    # Step-based evaluation supersedes the legacy `evaluate_during_training`
    # flag; evaluation runs at the same cadence as logging.
    evaluation_strategy='steps',
    eval_steps=1000,
    logging_steps=1000,
    save_steps=8000,
    seed=42,
    disable_tqdm=False,
)

#### Initializing the Trainer

In [15]:
Rand_Seed = 42

# Release memory held by earlier cells before the model is loaded.
gc.collect()
torch.cuda.empty_cache()

# Seed all RNGs before model_init() builds the model.
set_seed(Rand_Seed)

# Pinned host memory is switched off for the Trainer's data loaders.
training_args.dataloader_pin_memory = False

trainer = Trainer(
    model=model_init(),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=evaluation_dataset,
    compute_metrics=compute_metrics,
)

In [16]:
# Echo the key hyper-parameters, one value per line, before training starts.
print(
    training_args.seed,
    Max_Len,
    training_args.learning_rate,
    training_args.adam_epsilon,
    training_args.warmup_steps,
    sep="\n",
)

### Training the Model

In [17]:
# SECURITY: a Weights & Biases API key used to be pasted in this comment and has been removed.
# Treat that key as leaked and revoke it. If wandb logging is needed (depends on the transformers
# package version), supply the key through the WANDB_API_KEY environment variable instead.
trainer.train()

### Evaluating the Model

In [18]:
# Run an evaluation pass on the trainer's eval_dataset; metrics come from compute_metrics.
trainer.evaluate()

### Saving the Model

In [26]:
# Save the fine-tuned model and the tokenizer to separate directories under ./models.
model_path = "./models/BERT_model"
trainer.model.save_pretrained(model_path)
# The tokenizer is loaded again by model name here rather than reused from the datasets.
tokenizer = AutoTokenizer.from_pretrained(Model_Used)
tokenizer.save_pretrained("./models/tokenizer/")
# tokenizer = DistilBertTokenizer.from_pretrained("./models/tokenizer/")

### Defining the Predictor(Prediction Function)

In [20]:
# First define the prediction method.
def predict(text, tokenizer):
    """Return the predicted class index for a single text.

    Uses the module-level ``trainer`` (for the model), ``device`` and
    ``Max_Len``.

    Args:
        text: raw input string.
        tokenizer: tokenizer exposing ``encode_plus`` that matches the model.

    Returns:
        A zero-dimensional integer tensor holding the index of the
        highest-scoring class; it can be used directly as a list index.
    """
    encoded_review = tokenizer.encode_plus(
        text,
        max_length=Max_Len,
        add_special_tokens=True,
        return_token_type_ids=False,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        truncation='longest_first',
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    model = trainer.model
    # Inference only: disable dropout and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        output = model(input_ids=input_ids, attention_mask=attention_mask)

    # output[0] holds the logits; keep the arg-max of the single row.
    _, prediction = torch.max(output[0], dim=1)
    return prediction[0]

# Then let's play!
tokenizer = AutoTokenizer.from_pretrained(Model_Used)

# Predict every test text in order. predict() returns a class index and
# label_list turns it back into the dialect name, so prediction_list lines up
# row-for-row with test_data.
prediction_list = [
    label_list[predict(text, tokenizer)]
    for text in test_data['text']
]
    


### Saving Prediction

In [21]:
#print(prediction_list)
# One row per test sample: its id (as text), the predicted dialect and the true dialect.
result_columns = {
    'id': test_data['id'].astype(str),
    'Prediction': prediction_list,
    'True': test_data['dialect'],
}
results = pd.DataFrame(result_columns, columns=list(result_columns))
# print(results)

In [22]:
# NOTE: '/kaggle/working' is a hard-coded Kaggle path; os.chdir raises if the directory does not exist.
os.chdir(r'/kaggle/working')
# The CSV is written relative to the new working directory, without the index column.
result_file = "5_class_results.csv"
results.to_csv(result_file, sep= ",", index = False)

In [23]:
# Text report comparing the true dialects with the predictions for the test rows.
print(classification_report(results['True'], results['Prediction']))

### Viewing some Statistics

In [24]:
# Same report as a dict (output_dict=True) so it can be loaded into a DataFrame below.
BERT_5_CR = classification_report(results['True'], results['Prediction'], output_dict=True)

In [25]:
plt.figure(figsize=(15,8));
# Heatmap of the report: the last row of the frame is dropped and the rest transposed.
# NOTE(review): presumably the dropped row is 'support', leaving only score-valued cells -- TODO confirm.
sb.heatmap(pd.DataFrame(BERT_5_CR).iloc[:-1, :].T, annot=True);

---