# Setup

In [1]:
%cd ../../../..

/home/magedsaeed/MyProjects/DotlessArabic


In [2]:
import os
import random
import shutil
from pathlib import Path
from collections import defaultdict

import wandb
from tqdm.auto import tqdm

import torch
import torchmetrics
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader

from pytorch_lightning import seed_everything
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning import LightningModule,Trainer
from pytorch_lightning.callbacks import EarlyStopping,LearningRateMonitor,ModelCheckpoint


from sklearn.model_selection import train_test_split

from dotless_arabic.processing import undot,process
from dotless_arabic.tokenizers import CharacterTokenizer,WordTokenizer
from dotless_arabic.experiments.nlms.src import constants


from dotless_arabic.datasets.labr.collect import collect_train_dataset_for_sentiment_analysis,collect_test_dataset_for_sentiment_analysis

In [3]:
random.seed(42)
os.environ['WANDB_MODE']='disabled'
os.environ['CUDA_LAUNCH_BLOCKING']='1' # to see CUDA errors
torch.cuda.empty_cache() # to free gpu memory
seed_everything(42, workers=True)
# other options: https://stackoverflow.com/questions/15197286/how-can-i-flush-gpu-memory-using-cuda-physical-reset-is-unavailable

Global seed set to 42


42

In [4]:
wandb.login()

True

# Prepare the dataset

In [5]:
train_dataset = collect_train_dataset_for_sentiment_analysis()
test_dataset = collect_test_dataset_for_sentiment_analysis()

Found cached dataset labr (/home/magedsaeed/.cache/huggingface/datasets/labr/plain_text/1.0.0/f015cadcb8824981cb60dcebcac35b833f35fc094de760aab348c03f4816296e)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/magedsaeed/.cache/huggingface/datasets/labr/plain_text/1.0.0/f015cadcb8824981cb60dcebcac35b833f35fc094de760aab348c03f4816296e/cache-4a2ac4621d22324b.arrow
Loading cached processed dataset at /home/magedsaeed/.cache/huggingface/datasets/labr/plain_text/1.0.0/f015cadcb8824981cb60dcebcac35b833f35fc094de760aab348c03f4816296e/cache-3cda0d8daa43dcb9.arrow


####################################################################################################
Number of samples in train split: 11760
####################################################################################################
####################################################################################################
Dropping out the neutral class (label=2)
####################################################################################################
####################################################################################################
Number of train samples after dropping the neutral samples: 9408
####################################################################################################
####################################################################################################
Converting dataset to a dictionary of text:label
##################################################################################################

Found cached dataset labr (/home/magedsaeed/.cache/huggingface/datasets/labr/plain_text/1.0.0/f015cadcb8824981cb60dcebcac35b833f35fc094de760aab348c03f4816296e)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/magedsaeed/.cache/huggingface/datasets/labr/plain_text/1.0.0/f015cadcb8824981cb60dcebcac35b833f35fc094de760aab348c03f4816296e/cache-3e414338e0923490.arrow
Loading cached processed dataset at /home/magedsaeed/.cache/huggingface/datasets/labr/plain_text/1.0.0/f015cadcb8824981cb60dcebcac35b833f35fc094de760aab348c03f4816296e/cache-5cf4c4aab57f0c27.arrow


####################################################################################################
Number of samples in test split: 2935
####################################################################################################
####################################################################################################
Dropping out the neutral class (label=2)
####################################################################################################
####################################################################################################
Number of test samples after dropping the neutral samples: 2348
####################################################################################################
####################################################################################################
Converting dataset to a dictionary of text:label
####################################################################################################


In [6]:
len(train_dataset),len(test_dataset)

(9225, 2338)

In [7]:
def shuffle_items(items):
    random.shuffle(items)
    return items

In [8]:
train_dataset = {
    process(text):label
    for text,label in shuffle_items(list(train_dataset.items()))
    if len(process(text).split()) > 10
}
test_dataset = {
    process(text):label
    for text,label in test_dataset.items()
    if len(process(text).split()) > 10
}

In [9]:
len(train_dataset),len(test_dataset)

(7312, 1823)

In [10]:
x_train,x_val,y_train,y_val = train_test_split(list(train_dataset.keys()),list(train_dataset.values()),test_size=0.1)
x_test = list(test_dataset.keys())
y_test = list(test_dataset.values())

In [11]:
len(x_train),len(y_train),len(x_val),len(y_val),len(x_test),len(y_test)

(6580, 6580, 732, 732, 1823, 1823)

In [12]:
number_of_classes = len(set(train_dataset.values()))
number_of_classes

2

In [13]:
max_review_length = len(max(train_dataset,key=str.split).split())
max_review_length

243

In [14]:
# set max review to 200
max_review_length = 200

In [15]:
class ReviewsDataset(Dataset):
    def __init__(
        self,
        X,
        y,
        tokenizer,
        use_tqdm=True,
        undot_text=False,
    ):
        super().__init__()
        max_length = max_review_length
        X = tqdm(X) if use_tqdm else X
        self.encoded_dataset = []
        self.X = X
        self.y = y
        for review in X:
            if not review:
                raise
            if undot_text:
                review = undot(review)
            tokenized_review = tokenizer.tokenize_from_splits(review)
            encoded_review = []
            for token in tokenized_review:
                encoded_review.append(tokenizer.token_to_id(token))
            encoded_review = tokenizer.pad(encoded_review,length=max_length)
            encoded_review = encoded_review[:max_length]
            self.encoded_dataset.append(encoded_review)

    def __getitem__(self, index):
        inputs = torch.LongTensor(self.encoded_dataset[index])
        outputs = torch.tensor([self.y[index]],dtype=torch.float32)
        return inputs, outputs

    def __len__(self):
        assert len(self.X) == len(self.y)
        return len(self.encoded_dataset)

In [16]:
with open("tmp_dataset3.txt", "w") as f:
    f.write("\n".join(item for item in x_train))
#tokenizer = WordTokenizer(vocab_size=10**6) # put very high threshold for vocab size
tokenizer = WordTokenizer(vocab_size=10**4) # put very high threshold for vocab size
tokenizer.train('tmp_dataset3.txt')

Training WordTokenizer ...


In [17]:
tokenizer.vocab_size

10000

In [18]:
# randomly see some samples in the dataset
ReviewsDataset(
            X=x_train,
            y=y_train,
            tokenizer=tokenizer,
        )[:5]


  0%|          | 0/6580 [00:00<?, ?it/s]

(tensor([[  28, 4094, 1540,  131,   86,   31,    0,  194, 3009,  444,    2, 3834,
          3835,    0, 3836,    0,  904,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1, 

In [19]:
list(x_train)[150]

'عبر شخصية راغب دميان الدكتور مصطفى محمود يسرد خيالاته العلمية وتطلعاته الروحية يمزج الخيال بالعلم بالفلسفة في سلسلة احداث مشوقة تدعوك لمتابعتها بحيث لا تبدا معه في العيادة حين يستقبل المهندس راغب دميان ويحار في امر خلايا دماغه ثم يقرر ان يتتبع غموضه ويكشف قصته فلا تستطيع ان تغادر الرواية حينها الا بعد ان تكشف معه عن السر الدكتور مصطفى محمود استخدم اطلاعه العلمي بشكل راىع في توصيف الدور الذي يلعبه الفص الصنوبري في ادمغتنا وعما اذا كان هذا الجزء المعطل عند اغلب الناس هو ما يطلق عليه فعلا العين الثالثة والتي يمكن اذا استطعنا استخدامه ان نرى ليس من خلال المكان فقط نخترق حجب الزمان والاكوان رواية شيقة توسس لوعي علمي وتشريحي جيد وتمتعك بالسرد الرواىي للاحداث الغامضة'

# Sentiment Analysis Model

In [20]:
class LitSentimentAnalysisModel(LightningModule):
    def __init__(
        self,
        vocab_size,
        num_layers=2,
        gru_hiddens=256,
        gru_dropout=0.5,
        dropout_prob=0.333,
        embedding_size=128,
        learning_rate=0.001,
    ):
        super().__init__()
        self.save_hyperparameters()

        self.vocab_size = vocab_size
        self.num_layers = num_layers
        self.gru_hiddens = gru_hiddens
        self.dropout_prob = dropout_prob
        self.learning_rate = learning_rate
        self.embedding_size = embedding_size

        self.train_accuracy = torchmetrics.Accuracy(task="binary")
        self.val_accuracy = torchmetrics.Accuracy(task="binary")
        self.test_accuracy = torchmetrics.Accuracy(task="binary")

        self.embedding_layer = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.embedding_size,
        )
        self.gru_layer = nn.GRU(
            input_size=self.embedding_size,
            hidden_size=self.gru_hiddens,
            num_layers=self.num_layers,
            dropout=gru_dropout,
            batch_first=True,
            bidirectional=False,
        )
        # self.first_dense_layer = nn.Linear(
        #     in_features=self.gru_hiddens,
        #     out_features=128,
        # )
        # self.dropout_layer = nn.Dropout(p=self.dropout_prob)
        self.relu = nn.ReLU()
        self.second_dense_layer = nn.Linear(
            # in_features=128,
            in_features=256,
            out_features=1,
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hiddens=None):
        outputs = self.embedding_layer(x)
        outputs, hiddens = self.gru_layer(outputs)
        # outputs = self.first_dense_layer(outputs)
        # outputs = self.dropout_layer(outputs)
        outputs = self.relu(outputs)
        outputs = self.second_dense_layer(outputs)
        outputs = self.sigmoid(outputs)
        return outputs

    def step(self, inputs, labels):
        outputs = self(inputs)
        outputs = outputs[:, -1, :]  # take the results at the last time-step
        return outputs

    def training_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self.step(inputs, labels)
        loss = F.binary_cross_entropy(outputs, labels)
        train_accuracy = self.train_accuracy(outputs, labels)
        self.log(
            "loss",
            loss,
            on_step=True,
            on_epoch=False,
        )
        self.log(
            "train_acc",
            train_accuracy,
            on_step=True,
            on_epoch=False,
            prog_bar=True,
            logger=True,
        )
        return loss

    def validation_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self.step(inputs, labels)
        loss = F.binary_cross_entropy(outputs, labels)
        val_accuracy = self.val_accuracy(outputs, labels)
        self.log("val_loss", loss, prog_bar=True)
        self.log(
            "val_acc",
            val_accuracy,
            on_step=True,
            on_epoch=False,
            prog_bar=True,
            logger=True,
        )
        return {"val_loss": loss}

    def test_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self.step(inputs, labels)
        loss = F.binary_cross_entropy(outputs, labels)
        test_accuracy = self.test_accuracy(outputs, labels)
        metrics = {"test_acc": test_accuracy, "test_loss": loss}
        self.log_dict(metrics, prog_bar=True)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(
            self.parameters(),
            lr=self.learning_rate,
        )
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer=optimizer,
            factor=0.1,
            patience=2,
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": scheduler,
            "monitor": "val_loss",
        }


In [21]:
lr_monitor = LearningRateMonitor(
    logging_interval="step",
    log_momentum=True,
)

In [22]:
def get_best_checkpoint(text_type, tokenizer_class=WordTokenizer, checkpoints_base_path="SentimentAnalysis"):
    checkpoints_path = (
        f"{checkpoints_base_path}/{text_type}/{tokenizer_class.__name__}/checkpoints"
    )
    for file_name in os.listdir(checkpoints_path):
        if file_name.startswith("epoch"):
            return f"{checkpoints_path}/{file_name}"

In [23]:
batch_size = 128
epochs = 100

# Dotted Experiment

In [24]:
with open("tmp_dataset3.txt", "w") as f:
    f.write("\n".join(item for item in x_train))
dotted_tokenizer = WordTokenizer(vocab_size=10**4)
dotted_tokenizer.train('tmp_dataset3.txt')

Training WordTokenizer ...


In [25]:
dotted_tokenizer.vocab_size

10000

In [26]:
train_dataset = ReviewsDataset(X=x_train,y=y_train,tokenizer=dotted_tokenizer)

  0%|          | 0/6580 [00:00<?, ?it/s]

In [27]:
val_dataset = ReviewsDataset(X=x_val,y=y_val,tokenizer=dotted_tokenizer)

  0%|          | 0/732 [00:00<?, ?it/s]

In [28]:
test_dataset = ReviewsDataset(X=x_test,y=y_test,tokenizer=dotted_tokenizer)

  0%|          | 0/1823 [00:00<?, ?it/s]

In [29]:
train_dataloader = DataLoader(
    shuffle=True,
    dataset=train_dataset,
    num_workers=1,
    drop_last=True,
    batch_size=batch_size,
)

In [30]:
val_dataloader = DataLoader(
    shuffle=False,
    dataset=val_dataset,
    num_workers=1,
    drop_last=True,
    batch_size=batch_size,
)

In [31]:
test_dataloader = DataLoader(
    shuffle=False,
    dataset=test_dataset,
    num_workers=1,
    drop_last=True,
    batch_size=batch_size,
)

In [32]:
model = LitSentimentAnalysisModel(vocab_size=dotted_tokenizer.vocab_size)
model

LitSentimentAnalysisModel(
  (train_accuracy): BinaryAccuracy()
  (val_accuracy): BinaryAccuracy()
  (test_accuracy): BinaryAccuracy()
  (embedding_layer): Embedding(10000, 128)
  (gru_layer): GRU(128, 256, num_layers=2, batch_first=True, dropout=0.5)
  (relu): ReLU()
  (second_dense_layer): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [33]:
shutil.rmtree(
    Path(
        f"SentimentAnalysis/dotted/{dotted_tokenizer.__class__.__name__}/checkpoints"
    ),
    ignore_errors=True,
)
checkpoint_callback = ModelCheckpoint(
    mode="min",
    save_top_k=1,
    verbose=False,
    save_last=False,
    monitor="val_loss",
    save_weights_only=False,
    auto_insert_metric_name=True,
    save_on_train_epoch_end=False,
    filename="{epoch}-{val_loss:.3f}-{step}",
    dirpath=f"SentimentAnalysis/dotted/{dotted_tokenizer.__class__.__name__}/checkpoints",
)

In [34]:
wandb_logger = WandbLogger(
    project="MC",
    name=f"dotted_{dotted_tokenizer.__class__.__name__}",
    reinit=True,
)
wandb_logger.watch(model, log="all")

In [35]:
early_stopping_callback = EarlyStopping(
    monitor="val_loss",
    min_delta=0.05,
    patience=20,
    check_finite=True,
)

In [36]:
trainer = Trainer(
    max_epochs=epochs,
    devices=[0],
    deterministic=True,
    logger=wandb_logger,
    val_check_interval=0.5,
    accelerator="cuda",
    log_every_n_steps=max(len(train_dataloader) // 25, 1),
    # default_root_dir=f"LMsModels/{previous_hiddens}",
    callbacks=[
        checkpoint_callback,
        early_stopping_callback,
        lr_monitor,
    ],
)

  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [37]:
trainer.validate(
    model=model,
    dataloaders=val_dataloader,
)

  rank_zero_warn(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Validation: 0it [00:00, ?it/s]

[{'val_loss': 0.6936768293380737}]

In [38]:
trainer.fit(model,train_dataloader,val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type           | Params
------------------------------------------------------
0 | train_accuracy     | BinaryAccuracy | 0     
1 | val_accuracy       | BinaryAccuracy | 0     
2 | test_accuracy      | BinaryAccuracy | 0     
3 | embedding_layer    | Embedding      | 1.3 M 
4 | gru_layer          | GRU            | 691 K 
5 | relu               | ReLU           | 0     
6 | second_dense_layer | Linear         | 257   
7 | sigmoid            | Sigmoid        | 0     
------------------------------------------------------
2.0 M     Trainable params
0         Non-trainable params
2.0 M     Total params
7.886     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [39]:
model = LitSentimentAnalysisModel.load_from_checkpoint(
    get_best_checkpoint(text_type="dotted")
)
model

LitSentimentAnalysisModel(
  (train_accuracy): BinaryAccuracy()
  (val_accuracy): BinaryAccuracy()
  (test_accuracy): BinaryAccuracy()
  (embedding_layer): Embedding(10000, 128)
  (gru_layer): GRU(128, 256, num_layers=2, batch_first=True, dropout=0.5)
  (relu): ReLU()
  (second_dense_layer): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [40]:
trainer.test(model,test_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

[{'test_acc': 0.7003348469734192, 'test_loss': 0.6057667136192322}]