In [1]:
import copy
import os
import time

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
from transformers import AutoTokenizer, BertModel, AdamW

In [2]:
# Load the preprocessed tweets; the file is tab-separated despite its .csv extension.
df = pd.read_csv(
    '../PreprocessedData/preprocessed_data.csv',
    sep='\t',
    encoding='utf-8',
)

In [3]:
# Peek at five randomly chosen rows to sanity-check the columns.
df.sample(n=5)

Unnamed: 0,id,text,dialect,preprocessed_text
450623,1164904153652617216,@_anwaralsalim اللهم اميين يارب العاالميين 🙏🏼,BH,[مستخدم] اللهم اميين يارب العاالميين
6726,1126203932722847616,@EdyCohen ولا واحد كلهم نكرات,IQ,[مستخدم] ولا واحد كلهم نكرات
93275,1135183573613449216,كنت انا و @EsraaRaie بالتاكسي السواق بقولنا ا...,PL,كنت انا و [مستخدم] بالتاكسي السواق بقولنا اخدت...
409451,537560284928307200,#..\n\n#__ آي علآقة من آلعآلم آلآفترآضي علآق...,AE,# . . # _ _ آي علآقة من آلعآلم آلآفترآضي علآقة...
270687,939985298506829824,الحمد لله على نعمه التربيه الكويسه والعين المل...,EG,الحمد لله على نعمه التربيه الكويسه والعين المل...


In [4]:
# Discard rows that have no preprocessed text.
df = df.dropna(subset=['preprocessed_text'])

In [5]:
# Checkpoint identifier; the tokenizer is loaded from it here and the same name
# is reused when the encoder is instantiated.
model_name = 'aubmindlab/bert-base-arabertv02-twitter'
arabert_tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
class bert_dataset(torch.utils.data.Dataset):
    """Map-style dataset of tokenized texts with integer-encoded class labels.

    Every text is tokenized once at construction time. Padding is applied when
    an item is fetched: each item is padded on the left up to the length of the
    longest tokenized text in the dataset, so all items have the same length.

    Args:
        text_list: Iterable of strings to tokenize.
        classes_list: Iterable of class names aligned with ``text_list``.
        tokenizer: Object exposing ``encode(text)`` (returning a list of token
            ids) and a ``pad_token_id`` attribute.

    Attributes:
        features: List of token-id lists, one per text.
        labels: Integer class ids produced by ``label_encoder``.
        max_len: Length of the longest token-id list (0 for an empty dataset).
        label_encoder: The fitted ``LabelEncoder``. Keeping it makes the
            id -> class-name mapping recoverable through
            ``label_encoder.classes_`` / ``inverse_transform`` instead of
            having to be re-typed by hand elsewhere.
    """

    def __init__(self, text_list, classes_list, tokenizer):
        self.features = []
        self.labels = []
        self.text_list = text_list
        self.classes_list = classes_list
        self.tokenizer = tokenizer
        self.max_len = 0
        self.label_encoder = None
        self.create_dataset()

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, label)`` as ``torch.long`` tensors."""
        token_ids = self.features[index]
        pad_count = self.max_len - len(token_ids)

        # Padding is prepended, so every sequence ends with its real tokens.
        # NOTE(review): presumably chosen because the recurrent head reads the
        # final time step; confirm before switching to right padding.
        input_ids = [self.tokenizer.pad_token_id] * pad_count + token_ids
        attention_mask = [0] * pad_count + [1] * len(token_ids)

        return (
            torch.tensor(input_ids, dtype=torch.long),
            torch.tensor(attention_mask, dtype=torch.long),
            torch.tensor(int(self.labels[index]), dtype=torch.long),
        )

    def __len__(self):
        """Number of tokenized texts in the dataset."""
        return len(self.features)

    def create_dataset(self):
        """Tokenize every text, record the longest length and encode the labels."""
        self.features = [self.tokenizer.encode(line) for line in self.text_list]
        # Recomputed from scratch so a repeated call cannot keep a stale maximum.
        self.max_len = max((len(ids) for ids in self.features), default=0)

        self.label_encoder = LabelEncoder()
        self.labels = self.label_encoder.fit_transform(self.classes_list)

In [7]:
# Tokenize the whole corpus once, then carve out a seeded 90/10 train/test split.
dataset = bert_dataset(df['preprocessed_text'], df['dialect'], arabert_tokenizer)

dataset_len = len(dataset)
train_len = int(dataset_len * 0.9)
test_len = dataset_len - train_len

train_set, test_set = torch.utils.data.random_split(
    dataset,
    [train_len, test_len],
    generator=torch.Generator().manual_seed(42),
)

In [8]:
# Both loaders serve shuffled mini-batches of 8 examples.
train_loader = torch.utils.data.DataLoader(train_set, batch_size=8, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=8, shuffle=True)

# The held-out split is exposed under the 'val' key.
dataloaders_dict = dict(train=train_loader, val=test_loader)

In [9]:
class BERT(torch.nn.Module):
    """Pretrained BERT encoder followed by an LSTM and a linear output layer.

    The sub-module names (``bert_model``, ``lstm``, ``out``) are part of the
    state-dict layout and are kept stable so existing checkpoints still load.

    Args:
        model_name: Checkpoint name or path given to ``BertModel.from_pretrained``.
        lstm_hidden_size: Hidden size of the LSTM head. Defaults to 128.
        num_classes: Number of output logits. Defaults to 18.
    """

    def __init__(self, model_name, lstm_hidden_size=128, num_classes=18):
        super().__init__()
        self.bert_model = BertModel.from_pretrained(model_name)
        # Read the encoder width from its config instead of hard-coding 768, so
        # checkpoints with a different hidden size work as well.
        encoder_width = self.bert_model.config.hidden_size
        self.lstm = torch.nn.LSTM(encoder_width, lstm_hidden_size, batch_first=True)
        self.out = torch.nn.Linear(lstm_hidden_size, num_classes)

    def forward(self, ids, attention_mask):
        """Return one row of ``num_classes`` logits per input sequence.

        Args:
            ids: Token ids passed to the encoder as its first argument.
            attention_mask: Mask passed to the encoder as ``attention_mask``.
        """
        outputs = self.bert_model(ids, attention_mask=attention_mask)
        # The first output is the last layer's hidden states. It is the same
        # tensor as hidden_states[-1], without asking the encoder to keep every
        # intermediate layer's activations around.
        last_hidden_state = outputs[0]
        # The LSTM runs over every time step and the classifier reads its final
        # hidden state, so padding placed after the real tokens would be
        # consumed last.
        _, (final_hidden, _) = self.lstm(last_hidden_state)
        return self.out(final_hidden[-1])
    
# Instantiate the network from the chosen checkpoint and pair it with a
# multi-class cross-entropy loss.
model = BERT(model_name=model_name)
criterion = torch.nn.CrossEntropyLoss()

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02-twitter were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['bert.pooler.dens

In [10]:
# Freeze the pretrained encoder; its weights receive no gradient updates.
model.bert_model.requires_grad_(False)

# Only parameters that still require gradients are handed to the optimizer.
params_to_update = [param for param in model.parameters() if param.requires_grad]

# Initialize Optimizer
optimizer = AdamW(params_to_update, lr=0.001, eps=1e-8)

In [11]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

BERT(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
 

In [12]:
def train_model(model, dataloaders, criterion, optimizer, num_epochs=5):
    """Train ``model`` and restore the weights with the best validation accuracy.

    Every epoch runs a 'train' phase (gradients enabled, optimizer stepped)
    followed by a 'val' phase (no gradients). Batches are moved to the device
    the model's parameters live on, so the function does not depend on any
    global device variable.

    Args:
        model: Module called as ``model(input_ids, attention_mask=input_mask)``
            whose output is reduced with ``argmax(dim=1)`` to get predictions.
        dataloaders: Dict with 'train' and 'val' DataLoaders yielding
            ``(input_ids, input_mask, labels)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer updating the trainable parameters of ``model``.
        num_epochs: Number of train + val passes. Defaults to 5.

    Returns:
        ``(model, val_acc_history)``: the same model object with the best
        validation weights loaded, and the per-epoch validation accuracies as
        0-dim tensors.
    """
    since = time.time()
    phases = ('train', 'val')

    # Follow the model's own device; fall back to CPU for a parameter-less model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    val_acc_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    # One progress tick per batch in both phases, so the total is taken from
    # the loaders themselves rather than from a hard-coded batch size.
    num_training_steps = num_epochs * sum(len(dataloaders[phase]) for phase in phases)
    progress_bar = tqdm(total=num_training_steps)

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in phases:
            is_train = phase == 'train'
            model.train(is_train)  # train(False) is equivalent to eval()

            running_loss = 0.0
            # Kept as a tensor so the accuracy below is a tensor even when the
            # loader yields no batches.
            running_corrects = torch.zeros((), dtype=torch.long, device=device)

            for input_ids, input_mask, labels in dataloaders[phase]:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                labels = labels.to(device)

                # Track history only while training.
                with torch.set_grad_enabled(is_train):
                    outputs = model(input_ids, attention_mask=input_mask)
                    loss = criterion(outputs, labels)
                    preds = outputs.argmax(dim=1)

                    if is_train:
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                progress_bar.update(1)

                # The loss is a batch mean; weight it by the batch size so the
                # epoch figure is a per-example average.
                running_loss += loss.item() * input_ids.size(0)
                running_corrects += (preds == labels).sum()

            num_examples = len(dataloaders[phase].dataset)
            epoch_loss = running_loss / num_examples
            epoch_acc = running_corrects.double() / num_examples

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            if not is_train:
                # Snapshot the weights whenever validation accuracy improves.
                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_model_wts = copy.deepcopy(model.state_dict())
                val_acc_history.append(epoch_acc)

        print()

    progress_bar.close()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, val_acc_history

In [None]:
# train_model returns the same model object with the best-validation weights
# loaded, together with the per-epoch validation accuracies.
model_ft, hist = train_model(model, dataloaders_dict, criterion, optimizer, num_epochs=6)

# Make sure the target folder exists so a finished training run is not lost to
# a missing directory at save time.
weights_path = '../models/transformer_lstm_wts.pth'
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
torch.save(model.state_dict(), weights_path)

In [13]:
# Deserialize onto the CPU so a checkpoint written during a GPU run can also be
# restored on a machine without CUDA; load_state_dict then copies the values
# into the model's existing parameters.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
model.load_state_dict(torch.load('../models/transformer_lstm_wts.pth', map_location='cpu'))

<All keys matched successfully>

In [14]:
model.to('cpu')
# Switch to evaluation mode so dropout is disabled and the prediction does not
# depend on whichever mode the model was last left in.
model.eval()

# Class id -> dialect code. The ids follow the sorted order of the codes, which
# is how LabelEncoder numbers the classes.
label_dict = {0 : 'AE', 1 : 'BH', 2 : 'DZ', 3 : 'EG', 4 : 'IQ', 5 : 'JO', 6 : 'KW', 7 : 'LB', 8 : 'LY',
              9 : 'MA', 10 : 'OM', 11 : 'PL', 12 : 'QA', 13 : 'SA', 14 : 'SD', 15 : 'SY', 16 : 'TN', 17 : 'YE'}


s = 'يخرب بيت عيونك يا صوفيا شو حلوين'

inputs = arabert_tokenizer([s])
input_ids = torch.tensor(inputs['input_ids'], dtype=torch.long)
attention_mask = torch.tensor(inputs['attention_mask'], dtype=torch.long)

# A single forward pass without gradient tracking is all inference needs.
with torch.no_grad():
    logits = model(input_ids, attention_mask)

print(label_dict[int(logits.argmax(dim=1)[0])])

SY
