# Setup

In [1]:
%cd ../../../..

/home/majed_alshaibani/Experiments/DotlessArabic


In [2]:
import os
import re
import random
import shutil
from pathlib import Path
from collections import defaultdict

import wandb
from tqdm.auto import tqdm

import nltk
from farasa.stemmer import FarasaStemmer

import torch
import torchmetrics
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader

from pytorch_lightning import seed_everything
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning import LightningModule,Trainer
from pytorch_lightning.callbacks import EarlyStopping,LearningRateMonitor,ModelCheckpoint


from sklearn.model_selection import train_test_split

from dotless_arabic.processing import undot,process
from dotless_arabic.tokenizers import CharacterTokenizer,WordTokenizer
from dotless_arabic.experiments.nlms.src import constants


from dotless_arabic.datasets.labr.collect import collect_train_dataset_for_sentiment_analysis,collect_test_dataset_for_sentiment_analysis

In [3]:
random.seed(42)
nltk.download('stopwords')
os.environ['WANDB_MODE']='disabled'
os.environ['CUDA_LAUNCH_BLOCKING']='1' # to see CUDA errors
torch.cuda.empty_cache() # to free gpu memory
seed_everything(42, workers=True)
# other options: https://stackoverflow.com/questions/15197286/how-can-i-flush-gpu-memory-using-cuda-physical-reset-is-unavailable

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/majed_alshaibani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Global seed set to 42


42

In [4]:
wandb.login()

True

# Prepare the dataset

In [5]:
train_dataset = collect_train_dataset_for_sentiment_analysis()
test_dataset = collect_test_dataset_for_sentiment_analysis()

Found cached dataset labr (/home/majed_alshaibani/.cache/huggingface/datasets/labr/plain_text/1.0.0/f015cadcb8824981cb60dcebcac35b833f35fc094de760aab348c03f4816296e)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/majed_alshaibani/.cache/huggingface/datasets/labr/plain_text/1.0.0/f015cadcb8824981cb60dcebcac35b833f35fc094de760aab348c03f4816296e/cache-4a2ac4621d22324b.arrow
Loading cached processed dataset at /home/majed_alshaibani/.cache/huggingface/datasets/labr/plain_text/1.0.0/f015cadcb8824981cb60dcebcac35b833f35fc094de760aab348c03f4816296e/cache-3cda0d8daa43dcb9.arrow


####################################################################################################
Number of samples in train split: 11760
####################################################################################################
####################################################################################################
Dropping out the neutral class (label=2)
####################################################################################################
####################################################################################################
Number of train samples after dropping the neutral samples: 9408
####################################################################################################
####################################################################################################
Converting dataset to a dictionary of text:label
##################################################################################################

Found cached dataset labr (/home/majed_alshaibani/.cache/huggingface/datasets/labr/plain_text/1.0.0/f015cadcb8824981cb60dcebcac35b833f35fc094de760aab348c03f4816296e)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/majed_alshaibani/.cache/huggingface/datasets/labr/plain_text/1.0.0/f015cadcb8824981cb60dcebcac35b833f35fc094de760aab348c03f4816296e/cache-3e414338e0923490.arrow
Loading cached processed dataset at /home/majed_alshaibani/.cache/huggingface/datasets/labr/plain_text/1.0.0/f015cadcb8824981cb60dcebcac35b833f35fc094de760aab348c03f4816296e/cache-5cf4c4aab57f0c27.arrow


####################################################################################################
Number of samples in test split: 2935
####################################################################################################
####################################################################################################
Dropping out the neutral class (label=2)
####################################################################################################
####################################################################################################
Number of test samples after dropping the neutral samples: 2348
####################################################################################################
####################################################################################################
Converting dataset to a dictionary of text:label
####################################################################################################


In [6]:
len(train_dataset),len(test_dataset)

(9225, 2338)

In [7]:
def shuffle_items(items):
    random.shuffle(items)
    return items

### processing

In [8]:
train_dataset = {
    process(text):label
    for text,label in tqdm(shuffle_items(list(train_dataset.items())))
    if len(process(text).split()) > 0
}
test_dataset = {
    process(text):label
    for text,label in tqdm(test_dataset.items())
    if len(process(text).split()) > 0
}

  0%|          | 0/9225 [00:00<?, ?it/s]

  0%|          | 0/2338 [00:00<?, ?it/s]

In [9]:
len(train_dataset),len(test_dataset)

(9198, 2330)

### removing stopwords

In [10]:
stopwords = list(map(process,nltk.corpus.stopwords.words('arabic')))
len(stopwords)

754

In [11]:
train_dataset = {
    " ".join(word for word in text.split() if word not in stopwords): label
    for text, label in tqdm(train_dataset.items())
    if len(" ".join(word for word in text.split() if word not in stopwords).split()) > 0
}
test_dataset = {
    " ".join(word for word in text.split() if word not in stopwords): label
    for text, label in tqdm(test_dataset.items())
    if len(" ".join(word for word in text.split() if word not in stopwords).split()) > 0
}

  0%|          | 0/9198 [00:00<?, ?it/s]

  0%|          | 0/2330 [00:00<?, ?it/s]

In [12]:
len(train_dataset),len(test_dataset)

(9182, 2329)

### drop repeated letters

for instance 'جدددا'=>'جدا'

Note that this may include valid words but the effect of these words is assumed to be minor.

This method did not improve the results, it is commented. In theory, it should work, maybe the dataset size here is so small?

In [13]:
# drop_consequent_letters = lambda text: re.sub(r'(.)\1*', r'\1', text)

In [14]:
# train_dataset = {drop_consequent_letters(text): label for text, label in tqdm(train_dataset.items())}
# test_dataset = {drop_consequent_letters(text): label for text, label in tqdm(test_dataset.items())}

In [15]:
# list(train_dataset.keys())[:3],list(test_dataset.keys())[:3]

### stemming

In [16]:
stemmer = FarasaStemmer(interactive=True)



In [17]:
train_dataset = {stemmer.stem(text): label for text, label in tqdm(train_dataset.items())}
test_dataset = {stemmer.stem(text): label for text, label in tqdm(test_dataset.items())}

  0%|          | 0/9182 [00:00<?, ?it/s]

  0%|          | 0/2329 [00:00<?, ?it/s]

In [18]:
list(train_dataset.keys())[:3],list(test_dataset.keys())[:3]

(['كتاب وضح حياة بشرية وجب قام علي اساس اعتقاد منهج تربوي واجه خواء روحي عانى شاب جيل اقتباس دولة متخلف اقصد متفدم ممكن وصل الي مرتبة دولة متقدم وجود فساد إداري قمع جهد شعب ذلك حفاظ علي تبعية غربي أخذ أطعم حقن دماى مذهب مادي معتقد تزخرف أسلوب ادبي أعطى شكل جمالي فكر منهج سخر عنصر أخلاقي تصور اعتقاد دين ذلك وضع خط فاصل دين ممارسة دين دين الله خادم أحد ليس أسياد يوجهونه أراد دين إسلامي مجموعة ملامح نستوفي سنة أصبح مسلم تصور رسخ قاع ذهن روح فتتطبع سنة افعال أصبح علاقة راسي ديني عبد رب علاقة افقي حكم علاقة فرد واقع محيط',
  'ممتع انتهى سريع',
  'كتاب قمة روعة كأن خاطب قد استمتع درج تمنى زاد صفحة'],
 ['مقال كتاب مستوى متوقع انخدع عدد طبعة وصل إلى كتاب كتاب عبارة وجبة خفيف جد تقرا كتاب دسم تخفيف معلومة ثقيل عنى اقرا مجرد مرح فقط مقال أفهم مايرمي إلى كاتب سخرية أعجب مقطع مجاملة ناس قطاع معين صراحة حرية كتابة مهمة جد زمن خاصتا كتاب موجه شريحة شاب اغلب لدى وعي ادارك حدث حول ليس مثلما كان جيل سابق عنى عامي لاتستعبطنا شكر',
  'اختلاف فكر نقل ذكر كاتب شخصي افكر احلل عالم مافي عقل يطرز ورقة صفحة اع

### sptlis

In [19]:
x_train,x_val,y_train,y_val = train_test_split(list(train_dataset.keys()),list(train_dataset.values()),test_size=0.1)
x_test = list(test_dataset.keys())
y_test = list(test_dataset.values())

In [20]:
len(x_train),len(y_train),len(x_val),len(y_val),len(x_test),len(y_test)

(8246, 8246, 917, 917, 2323, 2323)

In [21]:
number_of_classes = len(set(train_dataset.values()))
number_of_classes

2

In [22]:
max_review_length = len(max(train_dataset,key=str.split).split())
max_review_length

198

In [23]:
# set max review to 200
max_review_length = 200

### wrapping in a pytorch dataset class

In [24]:
class ReviewsDataset(Dataset):
    def __init__(
        self,
        X,
        y,
        tokenizer,
        use_tqdm=True,
        undot_text=False,
    ):
        super().__init__()
        max_length = max_review_length
        X = tqdm(X) if use_tqdm else X
        self.encoded_dataset = []
        self.X = X
        self.y = y
        for review in X:
            if not review:
                raise
            if undot_text:
                review = undot(review)
            tokenized_review = tokenizer.tokenize_from_splits(review)
            encoded_review = []
            for token in tokenized_review:
                encoded_review.append(tokenizer.token_to_id(token))
            encoded_review = tokenizer.pad(encoded_review,length=max_length)
            encoded_review = encoded_review[:max_length]
            self.encoded_dataset.append(encoded_review)

    def __getitem__(self, index):
        inputs = torch.LongTensor(self.encoded_dataset[index])
        outputs = torch.tensor([self.y[index]],dtype=torch.float32)
        return inputs, outputs

    def __len__(self):
        assert len(self.X) == len(self.y)
        return len(self.encoded_dataset)

In [25]:
tokenizer = WordTokenizer(vocab_size=10**4) # put very high threshold for vocab size
tokenizer.train(text="\n".join(item for item in x_train))

Training WordTokenizer ...


In [26]:
tokenizer.vocab_size

10000

In [27]:
list(tokenizer.vocab)[5:10]

['في', 'كاتب', 'جد', 'ان', 'قراءة']

In [28]:
# randomly see some samples in the dataset
ReviewsDataset(
            X=x_train,
            y=y_train,
            tokenizer=tokenizer,
        )[:2]


  0%|          | 0/8246 [00:00<?, ?it/s]

(tensor([[  12, 2223, 5141,  106,    7,  670,   19, 2223, 5141,   64,  305, 1893,
           271, 2224, 1502,  244,   70, 8219, 1736, 2815,  140,   64,   80,  255,
          9999,   26,  207,   40, 2147,   14,   43,  222,  517,   70,    9,    3,
          2223, 5141,   61,  130,  404,  225,  116,   14, 2300,   80,   26,  105,
           137,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1, 

In [29]:
list(x_train)[150]

'كتاب جميل أسلوب ساخر خفيف رغم عمق قضية اللي ناقش عجبني جيمس بوند عند شاطر إملاء أصفر جر ده معدي اوووووووي حريقاااااااه شى نجس ملعب رفث الي شعب مشكلة هيد'

# Sentiment Analysis Model

In [30]:
class LitSentimentAnalysisModel(LightningModule):
    def __init__(
        self,
        vocab_size,
        num_layers=2,
        gru_hiddens=128,
        gru_dropout=0.25,
        dropout_prob=0.333,
        embedding_size=128,
        learning_rate=0.001,
    ):
        super().__init__()
        self.save_hyperparameters()

        self.vocab_size = vocab_size
        self.num_layers = num_layers
        self.gru_hiddens = gru_hiddens
        self.dropout_prob = dropout_prob
        self.learning_rate = learning_rate
        self.embedding_size = embedding_size

        self.train_accuracy = torchmetrics.Accuracy(task="binary")
        self.val_accuracy = torchmetrics.Accuracy(task="binary")
        self.test_accuracy = torchmetrics.Accuracy(task="binary")

        self.embedding_layer = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.embedding_size,
        )
        self.gru_layer = nn.GRU(
            input_size=self.embedding_size,
            hidden_size=self.gru_hiddens,
            num_layers=self.num_layers,
            dropout=gru_dropout,
            batch_first=True,
            bidirectional=False,
        )
        # self.first_dense_layer = nn.Linear(
        #     in_features=self.gru_hiddens,
        #     out_features=128,
        # )
        # self.dropout_layer = nn.Dropout(p=self.dropout_prob)
        self.relu = nn.ReLU()
        self.second_dense_layer = nn.Linear(
            # in_features=128,
            in_features=self.gru_hiddens,
            out_features=1,
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hiddens=None):
        outputs = self.embedding_layer(x)
        outputs, hiddens = self.gru_layer(outputs)
        # outputs = self.first_dense_layer(outputs)
        # outputs = self.dropout_layer(outputs)
        outputs = self.relu(outputs)
        outputs = self.second_dense_layer(outputs)
        outputs = self.sigmoid(outputs)
        return outputs

    def step(self, inputs, labels):
        outputs = self(inputs)
        outputs = outputs[:, -1, :]  # take the results at the last time-step
        return outputs

    def training_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self.step(inputs, labels)
        loss = F.binary_cross_entropy(outputs, labels)
        train_accuracy = self.train_accuracy(outputs, labels)
        self.log(
            "loss",
            loss,
            on_step=True,
            on_epoch=False,
        )
        self.log(
            "train_acc",
            train_accuracy,
            on_step=True,
            on_epoch=False,
            prog_bar=True,
            logger=True,
        )
        return loss

    def validation_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self.step(inputs, labels)
        loss = F.binary_cross_entropy(outputs, labels)
        val_accuracy = self.val_accuracy(outputs, labels)
        self.log("val_loss", loss, prog_bar=True)
        self.log(
            "val_acc",
            val_accuracy,
            on_step=True,
            on_epoch=False,
            prog_bar=True,
            logger=True,
        )
        return {"val_loss": loss}

    def test_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self.step(inputs, labels)
        loss = F.binary_cross_entropy(outputs, labels)
        test_accuracy = self.test_accuracy(outputs, labels)
        metrics = {"test_acc": test_accuracy, "test_loss": loss}
        self.log_dict(metrics, prog_bar=True)

    def configure_optimizers(self):
        # optimizer = torch.optim.Adam(
        #     self.parameters(),
        #     lr=self.learning_rate,
        # )
        optimizer = torch.optim.RMSprop(
            self.parameters(),
            lr=self.learning_rate,
        )
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer=optimizer,
            factor=0.1,
            patience=2,
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": scheduler,
            "monitor": "val_loss",
        }


In [31]:
lr_monitor = LearningRateMonitor(
    logging_interval="step",
    log_momentum=True,
)

In [32]:
def get_best_checkpoint(text_type, tokenizer_class=WordTokenizer, checkpoints_base_path="SentimentAnalysis"):
    checkpoints_path = (
        f"{checkpoints_base_path}/{text_type}/{tokenizer_class.__name__}/checkpoints"
    )
    for file_name in os.listdir(checkpoints_path):
        if file_name.startswith("epoch"):
            return f"{checkpoints_path}/{file_name}"

In [33]:
batch_size = 64
epochs = 100

# Dotted Experiment

In [34]:
# let us see how many vocab we have:
dotted_tokenizer = WordTokenizer(vocab_size=10**6)
dotted_tokenizer.train(text="\n".join(item for item in x_train))
dotted_tokenizer.vocab_size

Training WordTokenizer ...


25387

In [35]:
vocab_size = len([voc for voc,occurances in dotted_tokenizer.vocab.items() if occurances>1])
vocab_size

13192

In [36]:
dotted_tokenizer = WordTokenizer(vocab_size=vocab_size)
dotted_tokenizer.train(text="\n".join(item for item in x_train))

Training WordTokenizer ...


In [37]:
dotted_tokenizer.vocab_size

13192

In [38]:
train_dataset = ReviewsDataset(X=x_train,y=y_train,tokenizer=dotted_tokenizer)

  0%|          | 0/8246 [00:00<?, ?it/s]

In [39]:
val_dataset = ReviewsDataset(X=x_val,y=y_val,tokenizer=dotted_tokenizer)

  0%|          | 0/917 [00:00<?, ?it/s]

In [40]:
test_dataset = ReviewsDataset(X=x_test,y=y_test,tokenizer=dotted_tokenizer)

  0%|          | 0/2323 [00:00<?, ?it/s]

In [41]:
train_dataloader = DataLoader(
    shuffle=True,
    dataset=train_dataset,
    num_workers=1,
    drop_last=True,
    batch_size=batch_size,
)

In [42]:
val_dataloader = DataLoader(
    shuffle=False,
    dataset=val_dataset,
    num_workers=1,
    drop_last=True,
    batch_size=batch_size,
)

In [43]:
test_dataloader = DataLoader(
    shuffle=False,
    dataset=test_dataset,
    num_workers=1,
    drop_last=True,
    batch_size=batch_size,
)

In [44]:
model = LitSentimentAnalysisModel(vocab_size=dotted_tokenizer.vocab_size)
model

LitSentimentAnalysisModel(
  (train_accuracy): BinaryAccuracy()
  (val_accuracy): BinaryAccuracy()
  (test_accuracy): BinaryAccuracy()
  (embedding_layer): Embedding(13192, 128)
  (gru_layer): GRU(128, 128, num_layers=2, batch_first=True, dropout=0.25)
  (relu): ReLU()
  (second_dense_layer): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [45]:
shutil.rmtree(
    Path(
        f"SentimentAnalysis/dotted/{dotted_tokenizer.__class__.__name__}/checkpoints"
    ),
    ignore_errors=True,
)
checkpoint_callback = ModelCheckpoint(
    mode="min",
    save_top_k=1,
    verbose=False,
    save_last=False,
    monitor="val_loss",
    save_weights_only=False,
    auto_insert_metric_name=True,
    save_on_train_epoch_end=False,
    filename="{epoch}-{val_loss:.3f}-{step}",
    dirpath=f"SentimentAnalysis/dotted/{dotted_tokenizer.__class__.__name__}/checkpoints",
)

In [46]:
wandb_logger = WandbLogger(
    project="SA",
    name=f"dotted_{dotted_tokenizer.__class__.__name__}",
    reinit=True,
)
wandb_logger.watch(model, log="all")

In [47]:
early_stopping_callback = EarlyStopping(
    monitor="val_loss",
    min_delta=0.01,
    patience=20,
    check_finite=True,
)

In [48]:
trainer = Trainer(
    max_epochs=epochs,
    devices=[0],
    deterministic=True,
    logger=wandb_logger,
    val_check_interval=0.5,
    accelerator="cuda",
    log_every_n_steps=max(len(train_dataloader) // 25, 1),
    # default_root_dir=f"LMsModels/{previous_hiddens}",
    callbacks=[
        checkpoint_callback,
        early_stopping_callback,
        lr_monitor,
    ],
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [49]:
trainer.validate(
    model=model,
    dataloaders=val_dataloader,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
  rank_zero_warn(


Validation: 0it [00:00, ?it/s]

[{'val_loss': 0.6939523816108704}]

In [50]:
trainer.fit(model,train_dataloader,val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name               | Type           | Params
------------------------------------------------------
0 | train_accuracy     | BinaryAccuracy | 0     
1 | val_accuracy       | BinaryAccuracy | 0     
2 | test_accuracy      | BinaryAccuracy | 0     
3 | embedding_layer    | Embedding      | 1.7 M 
4 | gru_layer          | GRU            | 198 K 
5 | relu               | ReLU           | 0     
6 | second_dense_layer | Linear         | 129   
7 | sigmoid            | Sigmoid        | 0     
------------------------------------------------------
1.9 M     Trainable params
0         Non-trainable params
1.9 M     Total params
7.547     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [51]:
model = LitSentimentAnalysisModel.load_from_checkpoint(
    get_best_checkpoint(text_type="dotted")
)
model

LitSentimentAnalysisModel(
  (train_accuracy): BinaryAccuracy()
  (val_accuracy): BinaryAccuracy()
  (test_accuracy): BinaryAccuracy()
  (embedding_layer): Embedding(13192, 128)
  (gru_layer): GRU(128, 128, num_layers=2, batch_first=True, dropout=0.25)
  (relu): ReLU()
  (second_dense_layer): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [52]:
trainer.test(model,test_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

[{'test_acc': 0.7261284589767456, 'test_loss': 0.5572012066841125}]

# Undotted Experiment

In [53]:
undotted_x_train = list(map(undot,x_train))
undotted_x_val = list(map(undot,x_val))
undotted_x_test = list(map(undot,x_test))

In [54]:
# let us see how many vocab do we have
dotless_tokenizer = WordTokenizer(vocab_size=10**6)
dotless_tokenizer.train(text="\n".join(item for item in undotted_x_train))
dotless_tokenizer.vocab_size

Training WordTokenizer ...


19912

In [55]:
vocab_size = len([voc for voc,occurances in dotless_tokenizer.vocab.items() if occurances>1])
vocab_size

10848

In [56]:
dotless_tokenizer = WordTokenizer(vocab_size=vocab_size)
dotless_tokenizer.train(text="\n".join(item for item in undotted_x_train))

Training WordTokenizer ...


In [57]:
dotless_tokenizer.vocab_size

10848

In [58]:
train_dataset = ReviewsDataset(X=undotted_x_train,y=y_train,tokenizer=dotless_tokenizer,undot_text=True)

  0%|          | 0/8246 [00:00<?, ?it/s]

In [59]:
val_dataset = ReviewsDataset(X=undotted_x_val,y=y_val,tokenizer=dotless_tokenizer,undot_text=True)

  0%|          | 0/917 [00:00<?, ?it/s]

In [60]:
test_dataset = ReviewsDataset(X=undotted_x_test,y=y_test,tokenizer=dotless_tokenizer,undot_text=True)

  0%|          | 0/2323 [00:00<?, ?it/s]

In [61]:
train_dataloader = DataLoader(
    shuffle=True,
    dataset=train_dataset,
    num_workers=1,
    drop_last=True,
    batch_size=batch_size,
)

In [62]:
val_dataloader = DataLoader(
    shuffle=False,
    dataset=val_dataset,
    num_workers=1,
    drop_last=True,
    batch_size=batch_size,
)

In [63]:
test_dataloader = DataLoader(
    shuffle=False,
    dataset=test_dataset,
    num_workers=1,
    drop_last=True,
    batch_size=batch_size,
)

In [64]:
model = LitSentimentAnalysisModel(vocab_size=dotless_tokenizer.vocab_size)
model

LitSentimentAnalysisModel(
  (train_accuracy): BinaryAccuracy()
  (val_accuracy): BinaryAccuracy()
  (test_accuracy): BinaryAccuracy()
  (embedding_layer): Embedding(10848, 128)
  (gru_layer): GRU(128, 128, num_layers=2, batch_first=True, dropout=0.25)
  (relu): ReLU()
  (second_dense_layer): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [65]:
shutil.rmtree(
    Path(
        f"SentimentAnalysis/dotless/{dotless_tokenizer.__class__.__name__}/checkpoints"
    ),
    ignore_errors=True,
)
checkpoint_callback = ModelCheckpoint(
    mode="min",
    save_top_k=1,
    verbose=False,
    save_last=False,
    monitor="val_loss",
    save_weights_only=False,
    auto_insert_metric_name=True,
    save_on_train_epoch_end=False,
    filename="{epoch}-{val_loss:.3f}-{step}",
    dirpath=f"SentimentAnalysis/dotless/{dotless_tokenizer.__class__.__name__}/checkpoints",
)

In [66]:
wandb_logger = WandbLogger(
    project="SA",
    name=f"dot;ess_{dotless_tokenizer.__class__.__name__}",
    reinit=True,
)
wandb_logger.watch(model, log="all")

  rank_zero_warn(


In [67]:
early_stopping_callback = EarlyStopping(
    monitor="val_loss",
    min_delta=0.01,
    patience=20,
    check_finite=True,
)

In [68]:
trainer = Trainer(
    max_epochs=epochs,
    devices=[0],
    deterministic=True,
    logger=wandb_logger,
    val_check_interval=0.5,
    accelerator="cuda",
    log_every_n_steps=max(len(train_dataloader) // 25, 1),
    # default_root_dir=f"LMsModels/{previous_hiddens}",
    callbacks=[
        checkpoint_callback,
        early_stopping_callback,
        lr_monitor,
    ],
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [69]:
trainer.validate(
    model=model,
    dataloaders=val_dataloader,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Validation: 0it [00:00, ?it/s]

[{'val_loss': 0.6933263540267944}]

In [70]:
trainer.fit(model,train_dataloader,val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name               | Type           | Params
------------------------------------------------------
0 | train_accuracy     | BinaryAccuracy | 0     
1 | val_accuracy       | BinaryAccuracy | 0     
2 | test_accuracy      | BinaryAccuracy | 0     
3 | embedding_layer    | Embedding      | 1.4 M 
4 | gru_layer          | GRU            | 198 K 
5 | relu               | ReLU           | 0     
6 | second_dense_layer | Linear         | 129   
7 | sigmoid            | Sigmoid        | 0     
------------------------------------------------------
1.6 M     Trainable params
0         Non-trainable params
1.6 M     Total params
6.347     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [71]:
model = LitSentimentAnalysisModel.load_from_checkpoint(
    get_best_checkpoint(text_type="dotless")
)
model

LitSentimentAnalysisModel(
  (train_accuracy): BinaryAccuracy()
  (val_accuracy): BinaryAccuracy()
  (test_accuracy): BinaryAccuracy()
  (embedding_layer): Embedding(10848, 128)
  (gru_layer): GRU(128, 128, num_layers=2, batch_first=True, dropout=0.25)
  (relu): ReLU()
  (second_dense_layer): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [72]:
trainer.test(model,test_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Testing: 0it [00:00, ?it/s]

[{'test_acc': 0.7282986044883728, 'test_loss': 0.54630446434021}]