<a href="https://colab.research.google.com/github/KrishPro/sentiment-analysis/blob/main/trainned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install version-pinned torch-family packages. The additional pins (tf-estimator-nightly,
# earthengine-api, folium) are presumably there to silence dependency conflicts with
# packages preinstalled on Colab - TODO confirm.
!pip install -q torchtext==0.10 torchaudio==0.9.0 torchvision==0.10.0 tf-estimator-nightly==2.8.0.dev2021122109 earthengine-api==0.1.238 folium==0.2.1
# TPU support: cloud-tpu-client plus the torch_xla 1.9 wheel built for CPython 3.7 (linux x86_64).
!pip install -q cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
# NOTE(review): these three packages are not pinned, so a fresh run may install versions
# different from the ones this notebook was written against.
!pip install -q pytorch_lightning transformers pyngrok

In [2]:
# Mount Google Drive at /content/drive; later cells read and write paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from typing import Optional
from pytorch_lightning import LightningDataModule

import torch.utils.data as data
import pandas as pd
import shutil
import kaggle
import os

from transformers.models.distilbert.tokenization_distilbert_fast import DistilBertTokenizerFast

from tensorboard import program
from pyngrok import ngrok
from pytorch_lightning import Trainer
import torch.nn as nn
import torch.optim as optim
from transformers import get_linear_schedule_with_warmup

from transformers import DistilBertConfig
from transformers.models.distilbert.modeling_distilbert import DistilBertModel, BaseModelOutput
from pytorch_lightning import LightningModule
import torch



In [4]:
# Directory TensorBoard reads from (passed to it as --logdir below).
tracking_address = "/content/drive/MyDrive/Models/sentiment-analysis/fine-tuning/lightning_logs" # directory holding the Lightning logs, not a single file

if __name__ == "__main__":
    # Start a TensorBoard server programmatically; launch() returns the URL it is served on.
    tb = program.TensorBoard()
    tb.configure(argv=[None, '--logdir', tracking_address])
    url = tb.launch()
    # The server started here is TensorBoard, so the message names it correctly.
    print(f"TensorBoard listening on {url}")


NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784



Tensorflow listening on http://localhost:6006/


In [16]:
# One-time ngrok CLI setup, kept for reference only. Never commit a real auth token to the
# notebook: substitute your own token at run time (e.g. from an environment variable or secret).
# !wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.tgz
# !tar zxvf ngrok-stable-linux-amd64.tgz
# !./ngrok authtoken <YOUR_NGROK_AUTHTOKEN>

# Open an ngrok tunnel to local port 6006, the port the TensorBoard cell reported listening on.
ngrok.connect(6006)

<NgrokTunnel: "http://4538-34-122-203-194.ngrok.io" -> "http://localhost:6006">

In [6]:
class Dataset(data.Dataset):
    """Map-style dataset yielding tokenized reviews and float labels read from ``Data/{split}.csv``.

    Every CSV row must unpack into exactly two values, ``(review, label)``.
    """

    def __init__(self, split: str = "Train"):
        """Load the DistilBERT tokenizer and read the whole CSV into memory.

        Args:
            split: File stem of the CSV inside ``Data/`` (e.g. "Train", "Valid", "Test").
        """
        self.tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
        csv: pd.DataFrame = pd.read_csv(f"Data/{split}.csv")
        self.data = csv.to_numpy()

    def __getitem__(self, idx):
        """Return ``((input_ids, attention_mask), label)`` for row ``idx``.

        The tokenizer is asked for PyTorch tensors with a leading batch axis of size 1;
        that axis is squeezed away so the DataLoader can add the real batch axis.
        """
        review, label = tuple(self.data[idx])
        label = torch.tensor(label, dtype=torch.float)

        encoded = self.tokenizer(review, padding="max_length", truncation=True, return_tensors="pt")

        # Select the two fields by name instead of unpacking ``.values()``: that depended on
        # the key order and failed as soon as the tokenizer returned any additional field.
        input_ids: torch.Tensor = encoded["input_ids"]
        attention_mask: torch.Tensor = encoded["attention_mask"]

        return (input_ids.squeeze(0), attention_mask.squeeze(0)), label

    def __len__(self):
        """Number of rows read from the CSV."""
        return len(self.data)

In [7]:
def download_and_unzip_data(dataset_name: str, output_dir: str):
    """
    Fetch a Kaggle dataset into ``output_dir`` unless that directory already holds entries.

    When ``output_dir`` is missing or empty, it is removed (if present) and the dataset
    ``dataset_name`` is downloaded and unzipped there through the Kaggle API.
    Nothing is returned.
    """

    # Nothing to do when the target directory exists and already contains something.
    already_populated = os.path.exists(output_dir) and len(os.listdir(output_dir)) != 0
    if already_populated:
        return

    # Drop an empty leftover directory; a directory that does not exist is fine too.
    try:
        shutil.rmtree(output_dir)
    except FileNotFoundError:
        pass

    kaggle.api.authenticate()
    kaggle.api.dataset_download_files(dataset_name, path=output_dir, unzip=True)

In [8]:
class IMDBDataModule(LightningDataModule):
    """Lightning data module serving the IMDB sentiment CSV splits stored under ``Data/``."""

    def __init__(self, batch_size = 16):
        """
        Args:
            batch_size: Batch size used by every dataloader this module builds.
        """
        # The base class keeps state of its own; it has to be initialised before ours.
        super().__init__()
        self.batch_size = batch_size

    def prepare_data(self) -> None:
        """
        Download the CSV files into ``Data/`` if they are not there yet.

        Dataset is available at Kaggle
        https://www.kaggle.com/columbine/imdb-dataset-sentiment-analysis-in-csv-format
        """

        download_and_unzip_data("columbine/imdb-dataset-sentiment-analysis-in-csv-format", output_dir = "Data/")

    def setup(self, stage: Optional[str] = None):
        """Build the datasets needed for ``stage``; ``None`` builds all of them."""
        if stage == "fit" or stage is None:
            self.train_dataset = Dataset("Train")
            self.val_dataset = Dataset("Valid")

        if stage == "test" or stage is None:
            self.test_dataset = Dataset("Test")

    def train_dataloader(self):
        """Dataloader over the training split, reshuffled every epoch."""
        # Training batches should not be served in file order.
        return data.DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Dataloader over the validation split, in file order."""
        return data.DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        """Dataloader over the test split, in file order."""
        return data.DataLoader(self.test_dataset, batch_size=self.batch_size)

In [9]:
class Model(LightningModule):
    """DistilBERT encoder followed by a small MLP head that emits one sigmoid score per input."""

    def __init__(self):
        super().__init__()

        self.config = DistilBertConfig.from_pretrained("distilbert-base-uncased")
        self.model = DistilBertModel.from_pretrained("distilbert-base-uncased")

        # Hidden width and dropout probability are taken from the DistilBERT config.
        self.dim: int = self.config.dim
        self.dropout: float = self.config.dropout

        # Two identical hidden stages, then a projection to a single sigmoid-activated unit.
        head_layers = []
        for _ in range(2):
            head_layers.extend([
                nn.Linear(self.dim, self.dim),
                nn.GELU(),
                nn.Dropout(self.dropout),
            ])
        head_layers.extend([nn.Linear(self.dim, 1), nn.Sigmoid()])
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor):
        """Encode the inputs, classify the hidden state at sequence position 0 and
        return the scores with axis 1 squeezed away."""
        encoder_out = self.model(input_ids, attention_mask)

        # Keep only the first position of every sequence.
        cls_token = encoder_out.last_hidden_state[:, 0, :]

        shape_ok = (cls_token.dim() == 2) and (cls_token.size(1) == self.dim)
        assert shape_ok, f"cls_token shape must be ({input_ids.size(0)}, {self.dim}), Got {cls_token.shape}"

        scores = self.classifier(cls_token)
        return scores.squeeze(1)

In [10]:
class TrainModel(Model):
    """``Model`` plus the loss, optimizer, LR schedule and step hooks used by a Lightning ``Trainer``."""

    def __init__(self, learning_rate: float, ultimate_batch_size: int, epochs: int, train_size: int = 45_000):
        """
        Args:
            learning_rate: Peak learning rate handed to Adam.
            ultimate_batch_size: Samples consumed per optimizer step, used to size the schedule
                (the notebook passes per-device batch size times the number of TPU cores).
            epochs: Number of epochs the linear schedule is stretched over.
            train_size: Number of training samples used to size the schedule. Defaults to the
                previously hard-coded 45_000 - NOTE(review): confirm it matches the training CSV.
        """
        super(TrainModel, self).__init__()
        self.learning_rate = learning_rate
        self.criterion = nn.BCELoss()
        self.total_steps = (train_size // ultimate_batch_size) * epochs
        # The first 10% of the schedule is spent warming up.
        self.warmup_steps = int(0.1 * self.total_steps)

    def configure_optimizers(self):
        """Adam with a linear warmup / linear decay schedule that is stepped after every batch."""
        optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=self.total_steps,
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]

    def _compute_loss(self, batch: tuple) -> torch.Tensor:
        """Run the forward pass on one ``((input_ids, attention_mask), label)`` batch and return its BCE loss."""
        (input_ids, attention_mask), label = batch
        preds = self(input_ids, attention_mask)
        return self.criterion(preds, label)

    def training_step(self, batch: tuple, batch_idx: int):
        """Return the training loss; logs the current learning rate and the loss."""
        loss = self._compute_loss(batch)
        self.log("lr", self.optimizers().optimizer.param_groups[0]['lr'], prog_bar=True)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch: tuple, batch_idx: int):
        """Return the validation loss and log it as ``val_loss``."""
        loss = self._compute_loss(batch)
        # Log the tensor itself (no .item() host round-trip) and reduce it across devices.
        self.log("val_loss", loss, prog_bar=True, sync_dist=True)
        return loss

    def test_step(self, batch: tuple, batch_idx: int):
        """Return the test loss and log it as ``test_loss`` so a test run reports a metric."""
        loss = self._compute_loss(batch)
        self.log("test_loss", loss, sync_dist=True)
        return loss

In [11]:
# Training hyper-parameters, named once so the Trainer, the data module and the LR schedule
# cannot drift apart.
TPU_CORES = 8
BATCH_SIZE = 16  # per-core batch size; the schedule is sized with BATCH_SIZE * TPU_CORES
EPOCHS = 100

# max_epochs ties the Trainer's stopping point to the number of epochs the LR schedule is
# built for; without it training keeps running after the schedule has ended.
trainer = Trainer(tpu_cores=TPU_CORES, max_epochs=EPOCHS, default_root_dir="/content/drive/MyDrive/Models/sentiment-analysis/fine-tuning/")
datamodule = IMDBDataModule(batch_size=BATCH_SIZE)
model = TrainModel(learning_rate=2e-5, ultimate_batch_size=BATCH_SIZE * TPU_CORES, epochs=EPOCHS)

trainer.fit(model, datamodule)

GPU available: False, used: False
TPU available: True, using: 8 TPU cores
IPU available: False, using: 0 IPUs
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

  | Name       | Type            | Params
-----------------------------------------------
0 | model      | DistilBertModel | 66.4 M
1 | classifier | Sequentia

Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [15]:
!zip logs.zip /content/drive/MyDrive/Models/sentiment-analysis/fine-tuning/lightning_logs

  adding: content/drive/MyDrive/Models/sentiment-analysis/fine-tuning/lightning_logs/ (stored 0%)


In [17]:
# Extract logs.zip into the current working directory. Entries are stored without the
# leading "/", so they land under ./content/drive/... relative to where this runs.
!unzip logs.zip

Archive:  logs.zip
   creating: content/drive/MyDrive/Models/sentiment-analysis/fine-tuning/lightning_logs/


In [22]:
# Smoke test: score one hand-written review. The tokenizer output is unpacked into
# forward()'s keyword arguments.
# NOTE(review): this call neither switches the model to eval mode nor disables gradient
# tracking (the printed tensor carries a grad_fn).
model(**datamodule.train_dataset.tokenizer("this movie is very very good", return_tensors="pt"))

tensor([0.5099], grad_fn=<SqueezeBackward1>)

In [29]:
# Put the model in training mode; as the last expression of the cell, the returned module is echoed.
model.train()

TrainModel(
  (model): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_feature

In [31]:
# Move the model to the CPU and rebind the name to the returned module.
model = model.cpu()

In [None]:
# Forward one training batch by hand. A batch is ((input_ids, attention_mask), label),
# so [0] picks the input pair and * spreads it over forward()'s two positional arguments.
model(*next(iter(datamodule.train_dataloader()))[0])

In [26]:
# Run the validation loop, but feed it the *training* dataloader.
# NOTE(review): the reported val_loss (~0.6935) is about ln 2, the binary cross-entropy of
# always predicting 0.5.
trainer.validate(model, datamodule.train_dataloader())

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_loss': 0.693497896194458}
--------------------------------------------------------------------------------


  rank_zero_warn("cleaning up tpu spawn environment...")


[{'val_loss': 0.693497896194458}]

In [28]:
# Repeat the same check with a fresh single-core Trainer, again on the training dataloader.
new_trainer = Trainer(tpu_cores=1)
new_trainer.validate(model, datamodule.train_dataloader())

GPU available: False, used: False
TPU available: True, using: 1 TPU cores
IPU available: False, using: 0 IPUs
Missing logger folder: /content/lightning_logs


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_loss': 0.6934975981712341}
--------------------------------------------------------------------------------


  rank_zero_warn("cleaning up tpu spawn environment...")


[{'val_loss': 0.6934975981712341}]