<h1>Task 8: Word Embedding</h1>

<h4> This notebook compares different embedding methods on a simple task (sentiment analysis) <a href="https://www.kaggle.com/mksaad/arabic-sentiment-twitter-corpus">on a small dataset</a>.</h4>

<h4>Table of Contents:</h4>
<ol>
    <li>Load Dataset</li>
    <li>Normalize Dataset</li>
    <li>Tokenize Dataset</li>
    <li>Word Embedding</li>
    <li>Train RNN model</li>
    <li>Evaluate model</li>
</ol>
<h4>Embedding Methods:</h4>
<ol>
    <li>Gensim library's Word2Vec implementation (trained from scratch)</li>
    <li>Gensim library's FastText implementation (trained from scratch)</li>
    <li>AraVec pretrained embeddings</li>
    <li>BERT Arabic pretrained model</li>
</ol>

In [1]:
import tensorflow as tf
import numpy as np
import os
import time
import pandas as pd
import glob
import gensim
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from random import shuffle
from pyarabic import araby
from tensorflow.keras.layers import LSTM, GRU, Embedding, Dense, Input, InputLayer, Dropout, Bidirectional, BatchNormalization, Flatten, Reshape
from tensorflow.keras.models import Sequential
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

<h1>Load Dataset</h1>

In [2]:
def _read_tweets(path):
    """Read one tab-separated tweet file (no header row) as label/tweet columns."""
    return pd.read_csv(path, sep='\t', names=["label", "tweet"])


# One file per (split, polarity) pair.
train_pos = _read_tweets("data/train_Arabic_tweets_positive_20190413.tsv")
train_neg = _read_tweets("data/train_Arabic_tweets_negative_20190413.tsv")
test_pos = _read_tweets("data/test_Arabic_tweets_positive_20190413.tsv")
test_neg = _read_tweets("data/test_Arabic_tweets_negative_20190413.tsv")

# The training rows are shuffled with a fixed seed; the test rows keep the
# positive-then-negative file order.
train = pd.concat([train_pos, train_neg]).sample(frac=1.0, random_state=0)
test = pd.concat([test_pos, test_neg])

In [3]:
# Preview the shuffled training frame (raw labels and raw tweet text).
train

Unnamed: 0,label,tweet
15454,pos,يسلموو #آدآرتنآ مآقصرتوآ على آلدعم آلجميل تميز...
10789,pos,اللهم إن في صدري كلاما لا أستطيع ترتيبه في الد...
19949,neg,كم مره قعدت مع ناس او حتى شخص و اسولف معاهم و ...
12259,neg,اسأل الله العظيم رب العرش العظيم ان يشفيك ي يا...
8704,pos,اللهم شيئا لطيفا ، مفاجئ غير مخطط له ، يأتي من...
...,...,...
7642,neg,اذا الفيفا عارف هالشي غلط مش المفروض يتصرف ويت...
21243,pos,اللهم اجعلنا ممن تفائل بخيرك فأكرمته ، وتوكل ع...
19852,neg,صبر انظم لكم من الابتوب 😏
20806,neg,والله توحشتها ايا سيدي ياه حابين هكا 😖


In [4]:
def normalize(text):
    """Pass ``text`` through a fixed chain of pyarabic helpers and return the result.

    Each helper receives the output of the one before it: first the
    ``araby.strip_*`` functions, then the ``araby.normalize_*`` functions,
    in the order listed below. ``araby.normalize_hamza`` is not part of the
    chain (it was disabled in this pipeline).
    """
    pipeline = (
        araby.strip_harakat,
        araby.strip_tashkeel,
        araby.strip_small,
        araby.strip_tatweel,
        araby.strip_shadda,
        araby.strip_diacritics,
        araby.normalize_ligature,
        araby.normalize_teh,
        araby.normalize_alef,
    )
    for step in pipeline:
        text = step(text)
    return text

def strip_all(text):
    """Return ``text`` with every character that is not whitelisted removed.

    The whitelist holds the space, the ASCII digits, the two question marks
    ('?' and '؟'), the Arabic letters listed below and the Arabic-Indic
    digits. Any other character (punctuation, Latin letters, emoji, ...)
    is dropped; the relative order of the kept characters is unchanged.
    """
    allowed = frozenset([
        ' ',
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
        '?', '؟',
        'ء', 'ؤ', 'ئ', 'ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ',
        'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق',
        'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ي',
        '٠', '١', '٢', '٣', '٤', '٥', '٦', '٧', '٨', '٩',
    ])
    return "".join(filter(allowed.__contains__, text))

In [5]:
# Clean every tweet, keep only the whitelisted characters, then split it into
# tokens. The "tweet" column is overwritten in place, so afterwards each cell
# holds the value returned by araby.tokenize instead of the raw string.
for frame in (train, test):
    frame.tweet = frame.tweet.apply(normalize).apply(strip_all).apply(araby.tokenize)

In [6]:
# Fit the label -> integer mapping on the training labels only, then apply
# that same mapping to both splits (the "label" columns are overwritten).
le = LabelEncoder().fit(train.label)
train.label, test.label = le.transform(train.label), le.transform(test.label)


In [7]:
# Inspect the training frame after cleaning, tokenising and label encoding.
train

Unnamed: 0,label,tweet
15454,1,"[يسلموو, ادارتنا, ماقصرتوا, علا, الدعم, الجميل..."
10789,1,"[اللهم, ان, في, صدري, كلاما, لا, استطيع, ترتيب..."
19949,0,"[كم, مره, قعدت, مع, ناس, او, حتا, شخص, و, اسول..."
12259,0,"[اسال, الله, العظيم, رب, العرش, العظيم, ان, يش..."
8704,1,"[اللهم, شيئا, لطيفا, مفاجئ, غير, مخطط, له, يات..."
...,...,...
7642,0,"[اذا, الفيفا, عارف, هالشي, غلط, مش, المفروض, ي..."
21243,1,"[اللهم, اجعلنا, ممن, تفائل, بخيرك, فاكرمته, وت..."
19852,0,"[صبر, انظم, لكم, من, الابتوب]"
20806,0,"[والله, توحشتها, ايا, سيدي, ياه, حابين, هكا]"


In [8]:
def word2idx(word, word_model):
    """Return the vocabulary index stored for ``word`` in ``word_model``.

    The index is read from ``word_model.wv.key_to_index``. A word missing
    from that mapping is not handled here: the mapping's own lookup error
    propagates (``KeyError`` when it is a dict).
    """
    vocabulary = word_model.wv.key_to_index
    return vocabulary[word]
def idx2word(idx, word_model=None):
    """Return the vocabulary entry stored at position ``idx``.

    Parameters
    ----------
    idx : int
        Position in ``word_model.wv.index_to_key``.
    word_model : optional
        Object exposing ``wv.index_to_key``. When omitted, a notebook-level
        variable named ``word_model`` is used, which is what this function
        read implicitly before the parameter existed.

    Raises
    ------
    NameError
        If ``word_model`` is omitted and no global ``word_model`` exists.
    """
    if word_model is None:
        # Backward compatibility with the old one-argument call.
        try:
            word_model = globals()["word_model"]
        except KeyError:
            raise NameError(
                "idx2word() needs a word_model argument: no global 'word_model' is defined"
            ) from None
    return word_model.wv.index_to_key[idx]


In [15]:
# Split the labelled tweets 50/50 into a training and a validation part.
# `stratify` keeps the class proportions equal in both parts and
# `random_state` makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.tweet.values,
    train.label.values,
    test_size=0.5,
    random_state=0,
    stratify=train.label.values,
)


In [9]:
def token2vec(word_model, data, max_len=100):
    """Convert tokenised sentences into a fixed-width matrix of vocabulary indices.

    Parameters
    ----------
    word_model :
        Object exposing ``wv.key_to_index``, a dict mapping token -> int.
    data :
        Sequence of token sequences, one per sentence (a numpy object array
        or a plain list both work).
    max_len : int, default 100
        Number of columns of the result. Longer sentences are cut after
        ``max_len`` tokens; shorter ones are right-padded with 0.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data), max_len)`` and dtype ``int32``.

    Notes
    -----
    0 is written both for padding and for tokens missing from the
    vocabulary. If the vocabulary itself assigns index 0 to a token, the
    three cases cannot be told apart in the result.
    """
    # Resolve the mapping once instead of twice per token.
    key_to_index = word_model.wv.key_to_index
    indices = np.zeros([len(data), max_len], dtype=np.int32)
    for row, sentence in enumerate(data):
        for col, word in enumerate(sentence[:max_len]):
            indices[row, col] = key_to_index.get(word, 0)
    return indices

In [10]:
def train_model(vocab_size, embedding_size, weights):
    """Build, compile and fit the bidirectional-GRU sentiment classifier.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the embedding table (``input_dim``).
    embedding_size : int
        Width of each embedding vector (``output_dim``).
    weights :
        Initial embedding matrix handed to the ``Embedding`` layer. The
        layer stays trainable, so it is fine-tuned during fitting.

    Returns
    -------
    The fitted Keras ``Sequential`` model.

    Notes
    -----
    The training and validation data are not parameters: ``X_train``,
    ``y_train``, ``X_valid`` and ``y_valid`` are read from the notebook's
    global namespace at call time, so they must be set before calling.
    """
    model = Sequential()
    # Use the `embedding_size` argument. The body previously read a misspelled
    # global (`emdedding_size`) and only worked when a cell had defined it.
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[weights], trainable=True))
    # The first recurrent layer returns the full sequence so the second one
    # can consume it; the second returns only its final state.
    model.add(Bidirectional(GRU(units=32, return_sequences=True)))
    model.add(Bidirectional(GRU(units=32, return_sequences=False)))
    model.add(Dense(16, activation='relu'))
    model.add(Dropout(0.3))
    # Two output units + sparse categorical cross-entropy: labels are the
    # integer class ids, not one-hot vectors.
    model.add(Dense(2, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    # Cut the learning rate by 10x when val_loss has not improved for 2 epochs.
    callbacks = [tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, min_delta=0.0001, min_lr=0.0001)]
    model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=5, batch_size=128, shuffle=True, callbacks=callbacks)
    return model

In [11]:
# Corpus for the embeddings trained from scratch: the tokenised tweets of the
# training frame followed by those of the test frame.
sentences = np.concatenate([train.tweet.values, test.tweet.values])

In [12]:
# CBOW Word2Vec trained from scratch on the tweet corpus.
# The model is created without `sentences`: passing them to the constructor
# already builds the vocabulary and runs a training pass, and the explicit
# build_vocab() below would then re-initialise the weights and discard it.
word2vec_model_cbow = gensim.models.Word2Vec(vector_size=100, window=5, min_count=1, workers=4, seed=0)
word2vec_model_cbow.build_vocab(sentences)
word2vec_model_cbow.train(sentences, total_examples=word2vec_model_cbow.corpus_count, epochs=15)
# Take the learned word vectors (row i belongs to wv.index_to_key[i]).
# `syn1neg` holds the output-layer weights used by negative sampling, which
# are not the vectors that wv.most_similar() and friends operate on.
word2vec_weights_cbow = word2vec_model_cbow.wv.vectors
# The misspelled name `emdedding_size` is kept because other cells read it.
vocab_size, emdedding_size = word2vec_weights_cbow.shape


In [26]:
# Sanity check: nearest neighbours of one word in the from-scratch CBOW model.
word2vec_model_cbow.wv.most_similar("لغه")

[('الحقيقيه', 0.6934918165206909),
 ('احيانا', 0.6874623894691467),
 ('حكايه', 0.6851267218589783),
 ('الكتمان', 0.676027774810791),
 ('بكرامه', 0.6758304238319397),
 ('الموجعه', 0.6753607988357544),
 ('لانا', 0.6732767820358276),
 ('ندا', 0.6690214276313782),
 ('ارواح', 0.6672318577766418),
 ('يحسون', 0.6625495553016663)]

In [27]:
from pretrained.AraVec import AraVec
# Load a pretrained AraVec model from a local path and query the same word as
# in the previous cell, to compare its neighbours with the from-scratch model.
aravec = AraVec()
model = aravec.load_model("full_grams_cbow_100_twitter/full_grams_cbow_100_twitter.mdl")
model.wv.most_similar("لغه")

[('مفردات', 0.7585998177528381),
 ('لغات', 0.755275309085846),
 ('ولغه', 0.7323259711265564),
 ('ابجديه', 0.7260634899139404),
 ('مصطلحات', 0.7094709873199463),
 ('اللغه', 0.7051507234573364),
 ('ابجديات', 0.6975376605987549),
 ('بلغه', 0.6946367025375366),
 ('لغتها', 0.6932260394096375),
 ('ثقافه', 0.6907159090042114)]

In [18]:
# --- Word2Vec (CBOW) trained from scratch -> Embedding initialisation ---
# No `sentences` in the constructor: that would build the vocabulary and train
# once, only for build_vocab() below to re-initialise the weights again.
word2vec_model_cbow = gensim.models.Word2Vec(vector_size=100, window=5, min_count=1, workers=4, seed=0)
word2vec_model_cbow.build_vocab(sentences)
word2vec_model_cbow.train(sentences, total_examples=word2vec_model_cbow.corpus_count, epochs=15)
# Learned word vectors, row-aligned with wv.key_to_index (the indices that
# token2vec writes). `syn1neg` is the negative-sampling output layer instead.
word2vec_weights_cbow = word2vec_model_cbow.wv.vectors
# The misspelled name `emdedding_size` is kept because other cells read it.
vocab_size, emdedding_size = word2vec_weights_cbow.shape

# Same stratified 50/50 split as for the other embedding methods, then map
# tokens to this model's vocabulary indices.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.tweet.values, train.label.values, test_size=0.5, random_state=0, stratify=train.label.values
)
X_train = token2vec(word2vec_model_cbow, X_train)
X_valid = token2vec(word2vec_model_cbow, X_valid)

# train_model reads X_train / y_train / X_valid / y_valid from the globals set above.
trained_word2vec_cbow = train_model(vocab_size, emdedding_size, word2vec_weights_cbow)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
# --- Word2Vec (skip-gram, sg=1) trained from scratch -> Embedding initialisation ---
# No `sentences` in the constructor: that would build the vocabulary and train
# once, only for build_vocab() below to re-initialise the weights again.
word2vec_model_sg = gensim.models.Word2Vec(vector_size=100, window=5, min_count=1, workers=4, seed=0, sg=1)
word2vec_model_sg.build_vocab(sentences)
word2vec_model_sg.train(sentences, total_examples=word2vec_model_sg.corpus_count, epochs=15)
# Learned word vectors, row-aligned with wv.key_to_index (the indices that
# token2vec writes). `syn1neg` is the negative-sampling output layer instead.
word2vec_weights_sg = word2vec_model_sg.wv.vectors
# The misspelled name `emdedding_size` is kept because other cells read it.
vocab_size, emdedding_size = word2vec_weights_sg.shape

# Same stratified 50/50 split as for the other embedding methods, then map
# tokens to this model's vocabulary indices.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.tweet.values, train.label.values, test_size=0.5, random_state=0, stratify=train.label.values
)
X_train = token2vec(word2vec_model_sg, X_train)
X_valid = token2vec(word2vec_model_sg, X_valid)

# train_model reads X_train / y_train / X_valid / y_valid from the globals set above.
trained_word2vec_sg = train_model(vocab_size, emdedding_size, word2vec_weights_sg)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
# --- FastText trained from scratch -> Embedding initialisation ---
# No `sentences` in the constructor: that would build the vocabulary and train
# once, only for build_vocab() below to re-initialise the weights again.
# seed=0 matches the Word2Vec cells so all from-scratch models are seeded alike.
fasttext_model = gensim.models.FastText(vector_size=100, window=5, min_count=1, workers=4, seed=0)
fasttext_model.build_vocab(sentences)
fasttext_model.train(sentences, total_examples=fasttext_model.corpus_count, epochs=15)
# Learned word vectors, row-aligned with wv.key_to_index (the indices that
# token2vec writes). `syn1neg` is the negative-sampling output layer, which
# carries none of FastText's subword information.
fasttext_weights = fasttext_model.wv.vectors
# The misspelled name `emdedding_size` is kept because other cells read it.
vocab_size, emdedding_size = fasttext_weights.shape

# Same stratified 50/50 split as for the other embedding methods, then map
# tokens to this model's vocabulary indices.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.tweet.values, train.label.values, test_size=0.5, random_state=0, stratify=train.label.values
)
X_train = token2vec(fasttext_model, X_train)
X_valid = token2vec(fasttext_model, X_valid)

# train_model reads X_train / y_train / X_valid / y_valid from the globals set above.
trained_fasttext = train_model(vocab_size, emdedding_size, fasttext_weights)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [9]:
from keras.preprocessing.sequence import pad_sequences

# Vocabulary = sorted unique tokens of the training tweets. A token's id is its
# position in that array, so ids start at 0.
vocab = np.unique(np.array([token for tokens in train.tweet.values for token in tokens]))
word_index = dict(zip(vocab, range(len(vocab))))

# Replace every token by its id (0 as the fallback for a missing token), then
# pad / cut each tweet at the end to exactly 100 ids.
seq_list = [[word_index.get(w, 0) for w in words] for words in train.tweet.values]
train_padded = pad_sequences(seq_list, padding="post", truncating="post", maxlen=100)

In [10]:
from pretrained.AraVec import AraVec
# --- AraVec pretrained CBOW vectors -> Embedding initialisation ---
aravec = AraVec()
# NOTE(review): get_model presumably fetches and unpacks the named model and
# returns its local path — confirm against pretrained/AraVec.
model_path = aravec.get_model("Twitter_CBOW_100", unzip=True)
model = aravec.load_model(model_path)

# NOTE(review): the two helper calls below look like "extract word -> vector
# lookup" and "build a (vocab_size, emdedding_size) matrix ordered by
# word_index" — confirm against pretrained/AraVec.
embeddings_index = aravec.get_embedding_matrix(model)
# The vocabulary comes from word_index (training tweets); width is fixed at 100.
vocab_size, emdedding_size = len(word_index),100
embeddings_matrix = aravec.load_embedding_matrix(vocab_size, emdedding_size, word_index, embeddings_index)
# Stratified 50/50 split of the padded id sequences; train_model reads these
# four globals at call time.
X_train, X_valid, y_train, y_valid = train_test_split(train_padded, train.label.values, test_size=0.5,random_state=0, stratify=train.label.values)

trained_cbow = train_model(vocab_size, emdedding_size, embeddings_matrix)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [11]:
from pretrained.AraVec import AraVec
# --- AraVec pretrained skip-gram vectors -> Embedding initialisation ---
aravec = AraVec()
# NOTE(review): get_model presumably fetches and unpacks the named model and
# returns its local path — confirm against pretrained/AraVec.
model_path = aravec.get_model("Twitter_SkipGram_100", unzip=True)
model = aravec.load_model(model_path)

# NOTE(review): the two helper calls below look like "extract word -> vector
# lookup" and "build a (vocab_size, emdedding_size) matrix ordered by
# word_index" — confirm against pretrained/AraVec.
embeddings_index = aravec.get_embedding_matrix(model)
# The vocabulary comes from word_index (training tweets); width is fixed at 100.
vocab_size, emdedding_size = len(word_index),100
embeddings_matrix = aravec.load_embedding_matrix(vocab_size, emdedding_size, word_index, embeddings_index)
# Stratified 50/50 split of the padded id sequences; train_model reads these
# four globals at call time.
X_train, X_valid, y_train, y_valid = train_test_split(train_padded, train.label.values, test_size=0.5,random_state=0, stratify=train.label.values)

trained_skipgram = train_model(vocab_size, emdedding_size, embeddings_matrix)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
!pip install transformers datasets

Successfully installed datasets-1.7.0 fsspec-2021.5.0 xxhash-2.0.2


In [9]:
from transformers import AutoTokenizer

# Rebuild both frames from the loaded files: the BERT tokenizer works on plain
# strings, so tweets are cleaned but not split with araby.tokenize here.
train = pd.concat([train_pos, train_neg])
test = pd.concat([test_pos, test_neg])
for frame in (train, test):
    frame.tweet = frame.tweet.apply(normalize).apply(strip_all)
    # Reuse the already fitted encoder so class ids match the earlier sections.
    frame.label = le.transform(frame.label)

# Stratified 50/50 split with the same seed as the other experiments.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.tweet,
    train.label,
    test_size=0.5,
    random_state=0,
    stratify=train.label.values,
)

tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-medium-arabic")

# Every tweet is truncated or padded to exactly 100 positions.
encode_kwargs = dict(truncation=True, padding='max_length', max_length=100)
train_encodings = tokenizer(X_train.tolist(), **encode_kwargs)
val_encodings = tokenizer(X_valid.tolist(), **encode_kwargs)

In [10]:
import torch

class MeterDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to a per-example sequence;
    ``labels`` is a sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its label under 'labels'."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap the tokenizer output and the matching labels (converted to plain lists)
# for the training and validation halves.
train_dataset = MeterDataset(train_encodings, y_train.tolist())
val_dataset = MeterDataset(val_encodings, y_valid.tolist())


In [11]:
from transformers import TrainingArguments
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, AutoModelForSequenceClassification

# Pretrained Arabic BERT with a sequence-classification head for 2 classes.
# The recorded output below reports that some classifier weights are newly
# initialised, i.e. the head is trained from scratch during fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained("asafaya/bert-medium-arabic", num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=4,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=32,              # number of warmup steps for learning rate scheduler
    weight_decay=0.01,           # strength of weight decay
    learning_rate= 5e-5,             # learning rate passed to the Trainer's optimizer
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log every 10 steps
    evaluation_strategy = 'epoch',   # evaluate on eval_dataset after every epoch
)
from datasets import load_metric
from transformers import Trainer

# Accuracy metric object; compute_metrics() reads this notebook-level name.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of
    each example is the argmax over the last axis of ``logits``; the return
    value is whatever ``metric.compute`` returns for those predictions
    against ``labels``.
    """
    logits, labels = eval_pred
    return metric.compute(predictions=np.argmax(logits, axis=-1), references=labels)


# Fine-tune the classifier: train on train_dataset, evaluate on val_dataset
# (per the evaluation strategy in training_args) and report the values
# returned by compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

Some weights of the model checkpoint at asafaya/bert-medium-arabic were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at asafaya/bert-mediu

Epoch,Training Loss,Validation Loss,Accuracy
1,0.5406,0.499445,0.745251
2,0.402,0.486501,0.77096
3,0.268,0.570267,0.766543
4,0.2003,0.751087,0.767073


TrainOutput(global_step=2832, training_loss=0.33862685039639473, metrics={'train_runtime': 1008.9977, 'train_samples_per_second': 2.807, 'total_flos': 268079209398000.0, 'epoch': 4.0, 'init_mem_cpu_alloc_delta': 1894203392, 'init_mem_gpu_alloc_delta': 168528384, 'init_mem_cpu_peaked_delta': 64856064, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 20869120, 'train_mem_gpu_alloc_delta': 506130432, 'train_mem_cpu_peaked_delta': 130834432, 'train_mem_gpu_peaked_delta': 1205711360})