# Arabic_Dialect_Identification_NLP_AIM_Task
**`Mohamed Sebaie Sebaie Youssef`**

## Installing requirements

In [1]:
%%capture
!pip install transformers
!pip install pyarabic

In [2]:
%cd /content/drive/MyDrive/Aim_NLP_Task/Arabic_Dialect_Classification_Task

/content/drive/MyDrive/Aim_NLP_Task/Arabic_Dialect_Classification_Task


In [3]:
import torch
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [5]:
import gc 
import time
import aranorm
import pandas as pd
import dialectUtils
import preprocess_arabert
from transformers import AutoTokenizer, BertForSequenceClassification,BertTokenizer
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [None]:
train_tweets,train_labels = dialectUtils.read_csv("../Data/preProcessedTweets_trainF.csv",
                                                  "text_preprocessed","label")
valid_tweets,valid_labels = dialectUtils.read_csv("../Data/preProcessedTweets_validF.csv",
                                                  "text_preprocessed","label")
test_tweets,test_labels = dialectUtils.read_csv("../Data/preProcessedTweets_testF.csv",
                                                  "text_preprocessed","label")

## Bert `Base` AraBert

### Train 

In [None]:
preTrainedBertModel_base="aubmindlab/bert-base-arabertv02-twitter"

tokenizer_base= AutoTokenizer.from_pretrained(preTrainedBertModel_base)
tokenizer_obj_base = dialectUtils.Tokenizer(tokenizer_base)


train_data_base = tokenizer_obj_base.bert_tokenize_data(train_tweets,train_labels)
valid_data_base = tokenizer_obj_base.bert_tokenize_data(valid_tweets ,valid_labels)
test_data_base  = tokenizer_obj_base.bert_tokenize_data(test_tweets,test_labels)

train_loader_base,valid_loader_base=dialectUtils.create_bert_dataloader(train_data_base,
                                                                        valid_data_base,
                                                                        batch_size=32,
                                                                        split_train= False)

test_loader_base=dialectUtils.create_bert_dataloader(test_data_base,valid=None,batch_size=32,
                                                split_train=False,test=True)
BERTmodel_base=dialectUtils.get_model(preTrainedBertModel_base)
dialectUtils.printModel_Parameters(BERTmodel_base)
print("\n")
torch.cuda.empty_cache()
print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [13]:
dialectUtils.train(train_loader_base,valid_loader_base,model=BERTmodel_base,epochs=5,learning_rate=2e-5,eps=1e-8,device ="cuda")




  Batch   250  of  11,429.    Elapsed: 0:00:53.
  Batch   500  of  11,429.    Elapsed: 0:01:46.
  Batch   750  of  11,429.    Elapsed: 0:02:39.
  Batch 1,000  of  11,429.    Elapsed: 0:03:32.
  Batch 1,250  of  11,429.    Elapsed: 0:04:25.
  Batch 1,500  of  11,429.    Elapsed: 0:05:18.
  Batch 1,750  of  11,429.    Elapsed: 0:06:10.
  Batch 2,000  of  11,429.    Elapsed: 0:07:03.
  Batch 2,250  of  11,429.    Elapsed: 0:07:56.
  Batch 2,500  of  11,429.    Elapsed: 0:08:49.
  Batch 2,750  of  11,429.    Elapsed: 0:09:42.
  Batch 3,000  of  11,429.    Elapsed: 0:10:35.
  Batch 3,250  of  11,429.    Elapsed: 0:11:28.
  Batch 3,500  of  11,429.    Elapsed: 0:12:21.
  Batch 3,750  of  11,429.    Elapsed: 0:13:14.
  Batch 4,000  of  11,429.    Elapsed: 0:14:07.
  Batch 4,250  of  11,429.    Elapsed: 0:15:00.
  Batch 4,500  of  11,429.    Elapsed: 0:15:52.
  Batch 4,750  of  11,429.    Elapsed: 0:16:45.
  Batch 5,000  of  11,429.    Elapsed: 0:17:38.
  Batch 5,250  of  11,429.    Elapsed: 

### Predict on Test

In [24]:
pred_base,actual = dialectUtils.Batchpredict(BERTmodel_base,test_loader_base)

Predicting labels for test sentences...
    DONE.


In [45]:
dialectUtils.get_report(pred_base,actual)

              precision    recall  f1-score   support

          EG       0.78      0.88      0.83      5756
          PL       0.55      0.57      0.56      4368
          KW       0.57      0.60      0.58      4200
          LY       0.76      0.74      0.75      3647
          QA       0.53      0.54      0.54      3105
          JO       0.46      0.42      0.44      2784
          LB       0.72      0.68      0.70      2757
          SA       0.48      0.56      0.51      2678
          AE       0.48      0.50      0.49      2620
          BH       0.42      0.41      0.41      2612
          OM       0.49      0.49      0.49      1904
          SY       0.49      0.43      0.46      1621
          DZ       0.64      0.62      0.63      1614
          IQ       0.67      0.61      0.64      1548
          SD       0.75      0.66      0.70      1438
          MA       0.79      0.64      0.71      1153
          YE       0.40      0.31      0.35       988
          TN       0.71    

### Save Base Model

In [None]:
import os

# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = '/content/drive/MyDrive/Aim_NLP_Task/BERT_Fine_Tuning_dialect_base_0.60'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = BERTmodel_base.module if hasattr(BERTmodel_base, 'module') else BERTmodel_base  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer_obj_base.save_pretrained(output_dir)

# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))


In [47]:
path = "/content/drive/MyDrive/Aim_NLP_Task/Models_Weight/model_base_0.60.pth"
torch.save(BERTmodel_base.cpu().state_dict(), path) # saving model
BERTmodel_base.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

### Load Pretrained Model

In [None]:
output_dir_base="/content/drive/MyDrive/Aim_NLP_Task/BERT_Fine_Tuning_dialect_base_0.60"
test_tweets,test_labels = dialectUtils.read_csv("../Data/preProcessedTweets_testF.csv",
                                                  "text_preprocessed","label")

In [None]:
tokenizer_class_base = BertTokenizer.from_pretrained(output_dir_base)
load_BERTmodel_base  = BertForSequenceClassification.from_pretrained(output_dir_base)

In [None]:
load_BERTmodel_base.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64000, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1

In [None]:
test_data_base  = tokenizer_class_base.bert_tokenize_data(test_tweets,test_labels)
test_loader_base=dialectUtils.create_bert_dataloader(test_data_base,valid=None,batch_size=32,
                                                split_train=False,test=True)

In [None]:
pred_base,actual = dialectUtils.Batchpredict(load_BERTmodel_base,test_loader_base)

Predicting labels for test sentences...
    DONE.


In [None]:
dialectUtils.get_report(pred_base,actual)

              precision    recall  f1-score   support

          EG       0.86      0.92      0.89      5756
          PL       0.58      0.78      0.66      4368
          KW       0.62      0.79      0.69      4200
          LY       0.78      0.87      0.83      3647
          QA       0.74      0.55      0.63      3105
          JO       0.63      0.49      0.55      2784
          LB       0.85      0.77      0.81      2757
          SA       0.64      0.68      0.66      2678
          AE       0.65      0.61      0.63      2620
          BH       0.53      0.55      0.54      2612
          OM       0.64      0.61      0.63      1904
          SY       0.72      0.55      0.62      1621
          DZ       0.82      0.66      0.73      1614
          IQ       0.87      0.70      0.78      1548
          SD       0.92      0.74      0.82      1438
          MA       0.91      0.71      0.80      1153
          YE       0.57      0.47      0.51       988
          TN       0.73    

________________________________________________________________________

## Bert `Large` AraBert

### Train

In [None]:
preTrainedBertModel_large="aubmindlab/bert-large-arabertv02-twitter"

tokenizer_large= AutoTokenizer.from_pretrained(preTrainedBertModel_large)
tokenizer_obj_large = dialectUtils.Tokenizer(tokenizer_large)


train_data_large = tokenizer_obj_large.bert_tokenize_data(train_tweets,train_labels)
valid_data_large = tokenizer_obj_large.bert_tokenize_data(valid_tweets ,valid_labels)
test_data_large  = tokenizer_obj_large.bert_tokenize_data(test_tweets,test_labels)

train_loader_large,valid_loader_large=dialectUtils.create_bert_dataloader(train_data_large,
                                                                        valid_data_large,
                                                                        batch_size=64,
                                                                        split_train= False)

test_loader_large=dialectUtils.create_bert_dataloader(test_data_large,valid=None,batch_size=64,
                                                split_train=False,test=True)

BERTmodel_large=dialectUtils.get_model(preTrainedBertModel_large)
dialectUtils.printModel_Parameters(BERTmodel_large)
print("\n")
torch.cuda.empty_cache()
print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [None]:
dialectUtils.train(train_loader_large,valid_loader_large,model=BERTmodel_large,epochs=5,learning_rate=2e-5,eps=1e-8,device ="cuda")




  Batch    40  of  5,728.    Elapsed: 0:00:52.
  Batch    80  of  5,728.    Elapsed: 0:01:43.
  Batch   120  of  5,728.    Elapsed: 0:02:34.
  Batch   160  of  5,728.    Elapsed: 0:03:25.
  Batch   200  of  5,728.    Elapsed: 0:04:16.
  Batch   240  of  5,728.    Elapsed: 0:05:07.
  Batch   280  of  5,728.    Elapsed: 0:05:58.
  Batch   320  of  5,728.    Elapsed: 0:06:49.
  Batch   360  of  5,728.    Elapsed: 0:07:40.
  Batch   400  of  5,728.    Elapsed: 0:08:30.
  Batch   440  of  5,728.    Elapsed: 0:09:21.
  Batch   480  of  5,728.    Elapsed: 0:10:12.
  Batch   520  of  5,728.    Elapsed: 0:11:03.
  Batch   560  of  5,728.    Elapsed: 0:11:54.
  Batch   600  of  5,728.    Elapsed: 0:12:45.
  Batch   640  of  5,728.    Elapsed: 0:13:36.
  Batch   680  of  5,728.    Elapsed: 0:14:26.
  Batch   720  of  5,728.    Elapsed: 0:15:17.
  Batch   760  of  5,728.    Elapsed: 0:16:08.
  Batch   800  of  5,728.    Elapsed: 0:16:59.
  Batch   840  of  5,728.    Elapsed: 0:17:50.
  Batch   88

### Predict on Test

In [None]:
pred_large,actual = dialectUtils.Batchpredict(BERTmodel_large,test_loader_large)

Predicting labels for test sentences...
    DONE.


In [None]:
dialectUtils.get_report(pred_large,actual)

              precision    recall  f1-score   support

          EG       0.86      0.92      0.89      5756
          PL       0.58      0.78      0.66      4368
          KW       0.62      0.79      0.69      4200
          LY       0.78      0.87      0.83      3647
          QA       0.74      0.55      0.63      3105
          JO       0.63      0.49      0.55      2784
          LB       0.85      0.77      0.81      2757
          SA       0.64      0.68      0.66      2678
          AE       0.65      0.61      0.63      2620
          BH       0.53      0.55      0.54      2612
          OM       0.64      0.61      0.63      1904
          SY       0.72      0.55      0.62      1621
          DZ       0.82      0.66      0.73      1614
          IQ       0.87      0.70      0.78      1548
          SD       0.92      0.74      0.82      1438
          MA       0.91      0.71      0.80      1153
          YE       0.57      0.47      0.51       988
          TN       0.73    

### Save Large Model

In [None]:
import os

# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = '/content/drive/MyDrive/Aim_NLP_Task/BERT_Fine_Tuning_dialect_large_061'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = BERTmodel_large.module if hasattr(BERTmodel_large, 'module') else BERTmodel_large  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer_obj_large.save_pretrained(output_dir)

# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))

In [51]:
path = "/content/drive/MyDrive/Aim_NLP_Task/Models_Weight/model_large_0.61.pth"
torch.save(BERTmodel_large.cpu().state_dict(), path) # saving model
BERTmodel_large.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64000, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1

### Load Pretrained Model

In [48]:
output_dir_large="/content/drive/MyDrive/Aim_NLP_Task/BERT_Fine_Tuning_dialect_large_0.61"
test_tweets,test_labels = dialectUtils.read_csv("../Data/preProcessedTweets_testF.csv",
                                                  "text_preprocessed","label")

In [49]:
tokenizer_class_large = BertTokenizer.from_pretrained(output_dir_large)
load_BERTmodel_large = BertForSequenceClassification.from_pretrained(output_dir_large)

In [50]:
load_BERTmodel_large.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64000, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1

In [None]:
test_data_large  = tokenizer_class_large.bert_tokenize_data(test_tweets,test_labels)
test_loader_large=dialectUtils.create_bert_dataloader(test_data_large,valid=None,batch_size=64,
                                                split_train=False,test=True)

In [52]:
pred_large,actual = dialectUtils.Batchpredict(BERTmodel_large,test_loader_large)

Predicting labels for test sentences...
    DONE.


In [58]:
dialectUtils.get_report(pred_large,actual)

              precision    recall  f1-score   support

          EG       0.86      0.92      0.89      5756
          PL       0.58      0.78      0.66      4368
          KW       0.62      0.79      0.69      4200
          LY       0.78      0.87      0.83      3647
          QA       0.74      0.55      0.63      3105
          JO       0.63      0.49      0.55      2784
          LB       0.85      0.77      0.81      2757
          SA       0.64      0.68      0.66      2678
          AE       0.65      0.61      0.63      2620
          BH       0.53      0.55      0.54      2612
          OM       0.64      0.61      0.63      1904
          SY       0.72      0.55      0.62      1621
          DZ       0.82      0.66      0.73      1614
          IQ       0.87      0.70      0.78      1548
          SD       0.92      0.74      0.82      1438
          MA       0.91      0.71      0.80      1153
          YE       0.57      0.47      0.51       988
          TN       0.73    

___________________________________________________________________