In [16]:
import pandas as pd
import re
import nltk
from nltk.stem.isri import ISRIStemmer
from tqdm import tqdm
from arabert.preprocess import ArabertPreprocessor
from transformers import BertTokenizer, BertModel
import torch

In [2]:
# Read the datasets.
# The stance labels are shifted by +1 so that the classes are 0, 1 and 2
# (these are the values shown by the printed counts).
# NOTE(review): presumably the raw CSV labels are -1/0/1 -- confirm against the dataset.
train = pd.read_csv('dataset/train.csv')
train['stance'] += 1
print('train counts\n', train['stance'].value_counts())

dev = pd.read_csv('dataset/dev.csv')
dev['stance'] += 1
print('dev counts\n', dev['stance'].value_counts())

train counts
 2    5538
1    1012
0     438
Name: stance, dtype: int64
dev counts
 2    804
1    126
0     70
Name: stance, dtype: int64


In [3]:
# Four variants of the training set are produced:
#   1. train_1: the original dataset with its original class counts
#   2. train_2: 500 tweets per class
#   3. train_3: 1000 tweets per class
#   4. train_4: 2500 tweets per class
# Every class is drawn with replacement and with the same seed, so a class that
# is smaller than the target (class 0 has only 438 tweets) gets repeated rows.

train_1 = train.copy()
dev_1 = dev.copy()

train_2, train_3, train_4 = (
    pd.concat([
        train[train['stance'] == label].sample(per_class, random_state=42, replace=True)
        for label in (0, 1, 2)
    ])
    for per_class in (500, 1000, 2500)
)

In [4]:
# Some pytorch preparations: run on the GPU when CUDA is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('device:', device)

device: cuda


# Pipeline 1: AraBERT preprocessing (no stopword removal)

In [5]:
# The first preprocessing pipeline, using the AraBERT model.
# This pipeline produces 2 outputs for every dataset:
#   1. the tokenized text of every tweet --> so that others can use it for feature extraction
#   2. the word embeddings of every tweet --> these can be fed to a model directly

# Checkpoint name shared by the preprocessor, the tokenizer and the model below
model_name = "aubmindlab/bert-base-arabertv2"
# Text preprocessor configured for the same checkpoint
arabert_prep = ArabertPreprocessor(model_name=model_name)
# Tokenizer loaded from the same checkpoint
tokenizer = BertTokenizer.from_pretrained(model_name)
# Encoder used to produce the embeddings; moved to the selected device (GPU or CPU)
bert_model = BertModel.from_pretrained(model_name).to(device)

def preprocess_arabert(text, embedding=True):
    """
    This function preprocesses the text using arabert.
    It's essentially a full pipeline that can even return the word embeddings.

    The text goes through ``arabert_prep``, then every word containing ``+``
    and every character outside the Arabic Unicode block is removed, and the
    result is tokenized with ``tokenizer``. With ``embedding=True`` the tokens
    are additionally encoded with ``bert_model``.

    Parameters
    ----------
    text: str
        The text to be preprocessed
    embedding: bool
        Whether to return the word embeddings or not

    Returns
    -------
    output: list of str or torch.Tensor
        ``embedding=False``: the list of tokens.
        ``embedding=True``: a tensor of shape (number_of_tokens, hidden_size)
        living on ``device``, one row per token. A text that yields no tokens
        gives an empty (0, hidden_size) tensor instead of raising.
    """

    def clean_text(text):
        # remove any word with + in it
        text = re.sub(r'\S*\+\S*', '', text)
        # remove non arabic characters
        text = re.sub(r'[^\u0600-\u06FF]', ' ', text)
        # remove extra spaces (the tokenizer ignores them, this only tidies the string)
        return re.sub(r'\s+', ' ', text).strip()

    output = arabert_prep.preprocess(text)
    output = clean_text(output)
    tokenized = tokenizer.tokenize(output)

    if not embedding:
        return tokenized

    if not tokenized:
        # Nothing survived the cleaning (e.g. a tweet without Arabic text).
        # An empty id list would become a float tensor and crash the embedding
        # lookup, so return an empty result with the usual second dimension.
        return torch.empty((0, bert_model.config.hidden_size), device=device)

    # The ids are built directly from the tokens, so the model sees exactly
    # these tokens and nothing else.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized)
    tokens_tensor = torch.tensor([indexed_tokens], dtype=torch.long, device=device)
    with torch.no_grad():
        # first model output: hidden states of shape (1, number_of_tokens, hidden_size)
        encoded_layers = bert_model(tokens_tensor)[0]

    # drop the batch dimension without hard-coding the hidden size
    return encoded_layers.view(-1, encoded_layers.size(-1))

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv2 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Use the first preprocessing pipeline to preprocess the datasets
tqdm.pandas()

# Two passes over the raw text: the tokens first, then the token embeddings
for column, with_embedding in (('tokenized', False), ('embeddings', True)):
    train_1[column] = train_1['text'].progress_apply(
        lambda tweet, flag=with_embedding: preprocess_arabert(tweet, embedding=flag)
    )

# Write the dataset to a pickle file -- so that it can be used by any other file easily
train_1.to_pickle('output/train_1_arabert.pkl')

# It can later be read back with:
#     train_1 = pd.read_pickle('output/train_1_arabert.pkl')
# Every token embedding has 768 values,
# so every tweet's 'embeddings' entry has the shape (number_of_tokens, 768)

# clear the memory
del train_1

100%|██████████| 6988/6988 [00:18<00:00, 383.27it/s]
100%|██████████| 6988/6988 [01:31<00:00, 76.58it/s]


In [7]:
# AraBERT pass over the 500-per-class training set: the tokens first, then the embeddings
for column, with_embedding in (('tokenized', False), ('embeddings', True)):
    train_2[column] = train_2['text'].progress_apply(
        lambda tweet, flag=with_embedding: preprocess_arabert(tweet, embedding=flag)
    )
train_2.to_pickle('output/train_2_arabert.pkl')
del train_2

100%|██████████| 1500/1500 [00:03<00:00, 386.53it/s]
100%|██████████| 1500/1500 [00:19<00:00, 75.78it/s]


In [8]:
# AraBERT pass over the 1000-per-class training set: the tokens first, then the embeddings
for column, with_embedding in (('tokenized', False), ('embeddings', True)):
    train_3[column] = train_3['text'].progress_apply(
        lambda tweet, flag=with_embedding: preprocess_arabert(tweet, embedding=flag)
    )
train_3.to_pickle('output/train_3_arabert.pkl')
del train_3

100%|██████████| 3000/3000 [00:07<00:00, 380.97it/s]
100%|██████████| 3000/3000 [00:39<00:00, 75.74it/s]


In [9]:
# AraBERT pass over the 2500-per-class training set: the tokens first, then the embeddings
for column, with_embedding in (('tokenized', False), ('embeddings', True)):
    train_4[column] = train_4['text'].progress_apply(
        lambda tweet, flag=with_embedding: preprocess_arabert(tweet, embedding=flag)
    )
train_4.to_pickle('output/train_4_arabert.pkl')
del train_4

100%|██████████| 7500/7500 [00:19<00:00, 387.36it/s]
100%|██████████| 7500/7500 [01:37<00:00, 76.64it/s]


In [10]:
# AraBERT pass over the dev set: the tokens first, then the embeddings
for column, with_embedding in (('tokenized', False), ('embeddings', True)):
    dev_1[column] = dev_1['text'].progress_apply(
        lambda tweet, flag=with_embedding: preprocess_arabert(tweet, embedding=flag)
    )
dev_1.to_pickle('output/dev_1_arabert.pkl')
del dev_1

100%|██████████| 1000/1000 [00:02<00:00, 404.44it/s]
100%|██████████| 1000/1000 [00:12<00:00, 77.85it/s]


In [11]:
# The AraBERT pipeline is finished: drop the model, the preprocessor and the tokenizer
del bert_model, arabert_prep, tokenizer

# Pipeline 2: our own cleaning function (with stopword removal)

In [13]:
# Cleaning Arabic text takes several Unicode-level passes whose order matters,
# so the whole procedure is kept together in this one function.
def clean_arabic(text):
    """
    Reduce a string to basic Arabic letters and whitespace.

    Steps, in order:
      1. delete Arabic signs, diacritics (tashkeel) and ASCII special characters,
      2. map letter variants to their standard Arabic letter,
      3. delete the tatweel (elongation character),
      4. replace every remaining character that is neither in U+0620-U+064A
         nor whitespace with a single space.

    Reference for the code-point ranges (Arabic script in Unicode):
    https://ar.wikipedia.org/wiki/%D8%A7%D9%84%D8%AE%D8%B7_%D8%A7%D9%84%D8%B9%D8%B1%D8%A8%D9%8A_%D9%81%D9%8A_%D9%8A%D9%88%D9%86%D9%8A%D9%83%D9%88%D8%AF

    Parameters
    ----------
    text: str
        The text to be cleaned

    Returns
    -------
    text: str
        The cleaned text (whitespace is NOT collapsed here)
    """
    # Characters in these ranges are deleted outright, nothing is put in their
    # place. Spam-filter dodging like كو.ر.ونا is therefore glued back together
    # into one word before anything else happens.
    deleted_ranges = (
        r'[\u0600-\u061F]',  # Arabic signs and punctuation
        r'[\u064B-\u066D]',  # tashkeel, Arabic-Indic digits and the marks after them
        r'[\u0024-\u003F]',  # ASCII '$' .. '?' (includes the digits and '.')
        r'[\u005B-\u0060]',  # ASCII '[' .. '`'
        r'[\u007B-\u007E]',  # ASCII '{' .. '~'
    )
    for pattern in deleted_ranges:
        text = re.sub(pattern, '', text)

    # Letter variants and the standard letter each one is replaced with
    letter_variants = (
        (('چ',), 'ج'),
        (('ڤ', 'ڨ'), 'ف'),
        (('ڠ',), 'غ'),
        (('ٱ', 'ٲ', 'ٳ', 'ٴ', 'ٵ', 'آ', 'أ', 'إ'), 'ا'),
        (('ٶ', 'ٷ'), 'و'),
        (('ٸ', 'ی'), 'ي'),
        (('پ',), 'ب'),
        (('ژ',), 'ز'),
        (('ک', 'ڪ', 'ګ', 'ڬ', 'ڭ', 'ڮ', 'گ', 'ڰ', 'ڱ', 'ڲ', 'ڳ', 'ڴ'), 'ك'),
        (('ھ',), 'ه'),
    )
    for variants, standard in letter_variants:
        for variant in variants:
            text = text.replace(variant, standard)

    # remove the elongation character (shift + ت)
    text = text.replace('ـ', '')

    # whatever is still not a basic Arabic letter or whitespace becomes a space
    return re.sub(r'[^\u0620-\u064A\s]', ' ', text)

In [14]:
def clean_text(text):
    """
    Drop hashtags, keep only normalised Arabic letters, and squeeze whitespace.

    Every whitespace-delimited word containing '#' is replaced by a space, the
    rest is passed through clean_arabic(), and each run of whitespace is then
    collapsed into a single space (leading/trailing spaces are not stripped).
    """
    without_hashtags = re.sub(r'[^\s]*#[^\s]*',' ',text)
    arabic_only = clean_arabic(without_hashtags)
    return re.sub(r'\s+',' ',arabic_only)

In [22]:
def original_preprocess(dataset):
    """
    Run the custom pipeline on a dataset: clean, tokenize, remove stop words, stem.

    Parameters
    ----------
    dataset: pandas.DataFrame
        Must contain a 'text' column with the raw tweets.

    Returns
    -------
    dataset: pandas.DataFrame
        The same object that was passed in (it is modified in place), with a new
        'tokens' column holding, for every tweet, the list of ISRI-stemmed
        tokens that are not stop words.
    """
    tqdm.pandas()

    # NLTK's Arabic stop words plus hand-picked short words (two or three letters)
    raw_stopwords = nltk.corpus.stopwords.words('arabic') + [
        'ال', 'اي', 'ان', 'تم', 'بن',
        'او', 'اي', 'عم', 'ام', 'رض',
        'في', 'فى', 'رب', 'سم', 'خط',
        'ول', 'زي', 'دي', 'اذ', 'ده',
        'دى', 'انه', 'ابو', 'احد']

    # The tokens come out of clean_text() normalised (no diacritics, hamza forms
    # of alef folded into a bare alef, ...). A stop word that still carries those
    # marks can never equal a token, so the normalised spelling of every stop
    # word is added as well. A set makes the per-word lookup O(1).
    stopwords = set(raw_stopwords)
    for word in raw_stopwords:
        normalised = clean_text(word).strip()
        # skip entries that vanish or fall apart into several pieces
        if normalised and ' ' not in normalised:
            stopwords.add(normalised)

    # one stemmer for the whole dataset instead of a new one for every word
    stemmer = ISRIStemmer()

    def to_tokens(text):
        words = nltk.word_tokenize(clean_text(text))
        return [stemmer.stem(word) for word in words if word not in stopwords]

    dataset['tokens'] = dataset['text'].progress_apply(to_tokens)

    return dataset

In [20]:
# The AraBERT cells deleted their frames, so the four training variants are rebuilt:
#   1. train_1: the original dataset with its original class counts
#   2. train_2: 500 tweets per class
#   3. train_3: 1000 tweets per class
#   4. train_4: 2500 tweets per class
# Every class is drawn with replacement and the same seed; the resampled
# frames are sorted by their original index.

train_1 = train.copy()
dev_1 = dev.copy()

resampled = {}
for per_class in (500, 1000, 2500):
    class_samples = [
        train[train['stance'] == label].sample(per_class, random_state=42, replace=True)
        for label in (0, 1, 2)
    ]
    resampled[per_class] = pd.concat(class_samples).sort_index()

train_2, train_3, train_4 = resampled[500], resampled[1000], resampled[2500]

In [21]:
# Run the custom cleaning pipeline over every dataset variant
train_1, train_2, train_3, train_4, dev_1 = (
    original_preprocess(frame)
    for frame in (train_1, train_2, train_3, train_4, dev_1)
)

# save the datasets to pickle files
for frame, path in (
    (train_1, 'output/train_1_original.pkl'),
    (train_2, 'output/train_2_original.pkl'),
    (train_3, 'output/train_3_original.pkl'),
    (train_4, 'output/train_4_original.pkl'),
    (dev_1, 'output/dev_1_original.pkl'),
):
    frame.to_pickle(path)