In [1]:
# importing what is important
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForMaskedLM
import emoji
import unicodedata
import re
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
from transformers import BertModel
##nltk.download('stopwords')

In [2]:
# Importing model pretrained
tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-mini-arabic")

In [3]:
# tweets with labled sentiment
df = pd.read_csv("proccesed.csv")

In [4]:
## setting device "GPU/CPU"
device = torch.device('cpu')

In [5]:
punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' + string.punctuation

# Arabic stop words with nltk
stop_words = stopwords.words()

arabic_diacritics = re.compile("""
                             ّ    | # Shadda
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)

def text_preprocessing(text):
    '''
    text is an arabic string input
    
    the preprocessed text is returned
    '''
    
    #remove punctuations
    translator = str.maketrans('', '', punctuations)
    text = text.translate(translator)
    
    # remove Tashkeel
    text = re.sub(arabic_diacritics, '', text)
    
    #remove longation
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)

    text = ' '.join(word for word in text.split() if word not in stop_words)

    return text

In [6]:
def preprocessing_for_bert(data, tokinizer, text_preprocessing_fn = text_preprocessing, MAX_LEN=10):
    """Perform required preprocessing steps for pretrained BERT.
    @param    data (np.array): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []
    
    # For every sentence...
    for i,sent in enumerate(data):
        # `encode_plus` will:
        #    (1) Tokenize the sentence
        #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #    (3) Truncate/Pad sentence to max length
        #    (4) Map tokens to their IDs
        #    (5) Create attention mask
        #    (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=text_preprocessing_fn(sent),  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,                  # Max length to truncate/pad
            padding='max_length',        # Pad sentence to max length
            #return_tensors='pt',           # Return PyTorch tensor
            return_attention_mask=True,     # Return attention mask
            truncation = True 
            )
        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))
    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [7]:
def get_test_loader(data, tokinizer):
    input_id, attention_masks = preprocessing_for_bert(data, tokinizer)

In [8]:
df.reset_index(inplace=True)

In [9]:
X = df.Tweet
y = df.Sentiment.replace({"pos": 1, "neg": 0})

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_val, y_train, y_val = train_test_split(X.values,y,test_size=0.1, random_state=42)

In [12]:
train_inputs, train_masks = preprocessing_for_bert(X_train, tokenizer)
val_inputs, val_masks = preprocessing_for_bert(X_val, tokenizer)

In [13]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

train_labels = torch.tensor(y_train.values)
val_labels = torch.tensor(y_val.values)

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 16

# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [14]:
def get_test_loader(data, tokinizer):
    input_id, attention_masks = preprocessing_for_bert(data, tokinizer)
    X_test = torch.tensor(input_ids)
    test_data = TensorDataset(X_test, attention_masks)
    test_sampler = RandomSampler(test_data)

In [15]:
class BertModel(nn.Module):
    """
    BERT :) 
    """
    
    def __init__(self, freeze=False):
        """
        
        """
        super(BertModel, self).__init__()
        D_in = 32000 # bert_in ?
        H, D_out= 16000,2
        
        self.bert =  AutoModelForMaskedLM.from_pretrained("asafaya/bert-mini-arabic")
        
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )
        if freeze:
            for param in self.bert.parameters():
                param.requires_grad = False
                
    def forward(self, input_ids, attention_mask):
        
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        
        last_hidden_state_cls = outputs[0][:, 0, :]
        
        logits = self.classifier(last_hidden_state_cls)
        
        return logits

In [16]:
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.optim import SparseAdam, Adam

In [17]:
def initialize_model(epochs=4):
    
    bert_model = BertModel(freeze=False)
    bert_model = bert_model.to(device)
    
    optimizer = AdamW(params=list(bert_model.parameters()), 
                     lr=5e-5,
                     eps=1e-8)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    
    return bert_model, optimizer, scheduler

In [22]:
import random
import time
import torch
import torch.nn as nn
criterion = nn.CrossEntropyLoss()

def train(model, train_loader, val_loader, epochs=4):
    
    print("START TRAINING...")
    temp = np.Infinity
    val_accuracy = 0
    for epoch in range(epochs+1):
        total_loss, batch_loss, batch_counts = 0.,0.,0.
        
        model.train()
        
        for step,batch in enumerate(train_loader):
            if(epoch==0):
                continue
            batch_counts+=1
            inputs_ids, attention_mask, labels = tuple(t.to(device) for t in batch)
            
            model.zero_grad()
            
            logits = model(inputs_ids, attention_mask)
            train_loss = criterion(logits,labels)
            
            batch_loss += train_loss.item()
            total_loss += train_loss.item()
            train_loss.backward()
            
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            
            optimizer.step()
            scheduler.step()
            
            if (step % 100 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                val_loss, val_accuracy = evaluate(model, val_loader)
                print("epoch: {} | step: {} | train_loss: {} | val_loss {} | val_accuracy {} ".format(epoch, step, (batch_loss / batch_counts), val_loss, val_accuracy))
                if(val_loss < temp):
                    temp = val_loss
                    print("saving model...")
                    torch.save(model,"model.pt")
            

In [19]:
def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []
    
    # For each batch in our validation set...
    for step, batch in enumerate(val_dataloader):
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        loss = criterion(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)
    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy
            

In [20]:
bert_classifier, optimizer, scheduler = initialize_model(epochs=3)



In [23]:
train(bert_classifier, train_dataloader, val_dataloader, epochs=3)

START TRAINING...
epoch: 1 | step: 100 | train_loss: 0.38263419816399563 | val_loss 0.43375168352479665 | val_accuracy 81.40845070422536 
saving model...
epoch: 1 | step: 200 | train_loss: 0.35938418916987247 | val_loss 0.5150794959928788 | val_accuracy 80.66901408450704 
epoch: 1 | step: 300 | train_loss: 0.35153737629568854 | val_loss 0.4547091201892201 | val_accuracy 80.77464788732394 
epoch: 1 | step: 400 | train_loss: 0.3469836261597209 | val_loss 0.4193937877009452 | val_accuracy 82.16549295774648 
saving model...
epoch: 1 | step: 500 | train_loss: 0.3390265948207911 | val_loss 0.46432274827654935 | val_accuracy 81.91901408450704 
epoch: 1 | step: 600 | train_loss: 0.3388026504097088 | val_loss 0.4144747299207768 | val_accuracy 81.90140845070422 
saving model...
epoch: 1 | step: 700 | train_loss: 0.3332892374883535 | val_loss 0.4721359352095866 | val_accuracy 81.65492957746478 
epoch: 1 | step: 800 | train_loss: 0.33256133103954805 | val_loss 0.4327883133271211 | val_accuracy 81.

KeyboardInterrupt: 