In [1]:
!pip install -qq transformers
!pip install -qq torchtext

[K     |████████████████████████████████| 5.8 MB 12.7 MB/s 
[K     |████████████████████████████████| 7.6 MB 10.8 MB/s 
[K     |████████████████████████████████| 182 kB 63.3 MB/s 
[?25h

In [2]:
!pip install -qq pytorch-lightning

[K     |████████████████████████████████| 798 kB 15.0 MB/s 
[K     |████████████████████████████████| 125 kB 80.8 MB/s 
[K     |████████████████████████████████| 512 kB 67.6 MB/s 
[K     |████████████████████████████████| 87 kB 7.5 MB/s 
[?25h  Building wheel for fire (setup.py) ... [?25l[?25hdone


In [25]:
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import pandas as pd
from torch.utils.data import Dataset
import torch
from sklearn.preprocessing import LabelEncoder
import torch.nn as nn
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification
from dataclasses import dataclass
from typing import Optional
import pytorch_lightning as pl
from torchmetrics import Accuracy
from pytorch_lightning.loggers import TensorBoardLogger

In [26]:
class TextDataset(Dataset):
    """Map-style dataset that serves each example in two tokenised forms.

    Every item is a dict with:
      * ``"left"``   - a ``max_len``-long tensor of ids from a word-level
        vocabulary built over this dataset's own texts,
      * ``"right"``  - the ``bert-base-uncased`` tokenizer output for the same
        text, padded/truncated to ``max_len`` (tensors carry a leading
        dimension of 1 because ``return_tensors="pt"`` is used),
      * ``"labels"`` - the integer class id taken from ``scoreMap``.

    Args:
        path: Path of a tab-separated file with the 14 columns listed in
            ``_processData``.
        max_len: Fixed token length used for both tokenisations.
    """

    def __init__(self, path, max_len):
        super().__init__()
        self.max_len = max_len
        # Textual rating -> class id.
        self.scoreMap = {
        'true': 0,
        'mostly-true': 1,
        'half-true': 2,
        'barely-true': 3,
        'false': 4,
        'pants-fire': 5
        }
        self.fdata, self.labels = self._processData(path)
        self.tokenizer = get_tokenizer("basic_english")
        self.bertTokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.vocab = build_vocab_from_iterator(self._buildVocab(self.fdata), min_freq=1, specials=["<UNK>",])
        # Out-of-vocabulary words resolve to the <UNK> id instead of raising.
        self.vocab.set_default_index(self.vocab["<UNK>"])

    def _buildVocab(self, data):
        """Yield the word-level token list of each text, for vocabulary building."""
        for text in data:
            yield self.tokenizer(text)

    def _processData(self, path):
        """Read the TSV and return ``(texts, labels)``.

        Each text is ``statement``, ``context`` and ``subjects`` joined by
        single spaces; each label is the ``scoreMap`` id of the row's rating.

        Raises:
            ValueError: if a row carries a rating that is not in ``scoreMap``.
        """
        # Column names are assigned below, so the file is read as headerless;
        # letting pandas infer a header would silently consume the first
        # example. dtype=str keeps every field textual (no bool/number
        # inference on the label or text columns).
        df = pd.read_csv(path, sep="\t", header=None, dtype=str)
        df.columns = [
        'id',                # Column 1: the ID of the statement ([ID].json).
        'label',             # Column 2: the label.
        'statement',         # Column 3: the statement.
        'subjects',          # Column 4: the subject(s).
        'speaker',           # Column 5: the speaker.
        'speaker_job_title', # Column 6: the speaker's job title.
        'state_info',        # Column 7: the state info.
        'party_affiliation', # Column 8: the party affiliation.

        # Column 9-13: the total credit history count, including the current statement.
        'count_1', # barely true counts.
        'count_2', # false counts.
        'count_3', # half true counts.
        'count_4', # mostly true counts.
        'count_5', # pants on fire counts.

        'context' # Column 14: the context (venue / location of the speech or statement).
        ]

        # Only discard rows that miss a field we actually consume; a missing
        # job title or state must not cost us the example.
        used_columns = ['label', 'statement', 'subjects', 'context']
        df = df.dropna(subset=used_columns)

        # Fail loudly on unexpected ratings (this also catches a file that
        # does ship a header line) rather than producing NaN labels.
        unknown = sorted(set(df['label']) - set(self.scoreMap))
        if unknown:
            raise ValueError(f"Unknown label(s) in {path}: {unknown}")

        # Join with spaces so the last word of one field is not fused with the
        # first word of the next.
        fdata = (df['statement'] + " " + df['context'] + " " + df['subjects']).tolist()
        labels = df['label'].map(self.scoreMap).astype(int).tolist()
        return fdata, labels

    def __len__(self):
        """Number of usable examples."""
        return len(self.fdata)

    def __getitem__(self, index):
        """Return the ``{"right", "left", "labels"}`` dict for one example."""
        text = self.fdata[index]
        label = self.labels[index]
        # Truncate to max_len, then right-pad with id 0.
        # NOTE(review): id 0 is also the <UNK> id (the only special token), so
        # padding and unknown words share one embedding row.
        tokens = self.vocab(self.tokenizer(text))[:self.max_len]
        tokens = tokens + [0] * (self.max_len - len(tokens))
        bertInputs = self.bertTokenizer(text, max_length = self.max_len, padding = "max_length", truncation = True, return_tensors = "pt")
        return {"right" : bertInputs, "left" : torch.tensor(tokens), "labels" : torch.tensor(label)}


 

In [48]:
class leftBranch(torch.nn.Module):
    """Embedding -> Conv1d -> MaxPool1d -> 2-layer BiLSTM -> Linear branch.

    Takes word-id sequences of length ``max_len`` and produces a 25-dim
    feature vector per example.

    Args:
        embedding_len: Size of each word embedding (conv input channels).
        vocab_len: Number of rows in the embedding table.
        num_classes: Accepted for interface compatibility; not used here.
        max_len: Token length of every input sequence.
        batch_size: Default batch size used by ``init_hidden`` /
            ``init_hidden2`` when none is given. ``forward`` itself works
            with any batch size.
    """

    def __init__(self, embedding_len, vocab_len, num_classes, max_len, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.embedding_layer = nn.Embedding(num_embeddings=vocab_len, embedding_dim=embedding_len)
        self.conv1 = nn.Conv1d(embedding_len, 32, kernel_size=7, padding="same")
        self.max_pool = nn.MaxPool1d(kernel_size = 5)
        # The pooled tensor is (batch, 32, max_len // 5) and is fed to the
        # batch_first LSTM as-is, so the LSTM feature size is max_len // 5
        # (25 for the notebook's max_len of 128).
        self.biLstm = nn.LSTM(input_size = max_len // 5, num_layers = 2, bidirectional = True, hidden_size = 50, batch_first = True)
        self.linear = nn.Linear(100, 25)
        self.max_len = max_len
        self.embedding_len = embedding_len
        # No hidden state is cached on the module: a tensor pinned to one
        # device and one batch size breaks on CPU and on any other batch size.

    def init_hidden2(self):
        """Return learnable zero-initialised ``(h0, c0)`` as a ParameterList."""
        return nn.ParameterList((
            nn.Parameter(torch.zeros(4, self.batch_size, 50)),
            nn.Parameter(torch.zeros(4, self.batch_size, 50)),
        ))

    def init_hidden(self, batch_size=None, device=None):
        """Return zero ``(h0, c0)`` of shape (layers * directions = 4, batch, 50).

        Args:
            batch_size: Batch dimension; defaults to ``self.batch_size``.
            device: Target device; defaults to the device of the embedding
                weights, so the state follows the module.
        """
        if batch_size is None:
            batch_size = self.batch_size
        if device is None:
            device = self.embedding_layer.weight.device
        shape = (4, batch_size, 50)
        return (torch.zeros(shape, device=device), torch.zeros(shape, device=device))

    def forward(self, X_batch):
        """Map ``(batch, max_len)`` word ids to ``(batch, 25)`` features."""
        x = self.embedding_layer(X_batch)           # (batch, max_len, embedding_len)
        # Conv1d wants channels second. transpose swaps the axes; a reshape
        # would keep memory order and mix values of different tokens.
        x = x.transpose(1, 2)                       # (batch, embedding_len, max_len)

        x = self.conv1(x)                           # (batch, 32, max_len)
        x = self.max_pool(x)                        # (batch, 32, max_len // 5)
        # With no explicit state nn.LSTM starts from zeros, which is what the
        # fixed zero state supplied before, but for any batch size / device.
        x , _ = self.biLstm(x)                      # (batch, 32, 100)
        x = x[:, -1, :]                             # output at the last step
        return self.linear(x)

In [49]:
class rightBranch(nn.Module):
    """BERT last hidden state -> Conv1d -> MaxPool1d -> Linear branch.

    Produces a 25-dim feature vector per example from tokenizer output.

    Args:
        seqLen: Token length of every input; used as the number of Conv1d
            input channels because the hidden states are convolved along the
            hidden dimension.
        bs: Nominal batch size. Stored for compatibility; ``forward`` reads
            the batch size from its input instead.
    """

    def __init__(self, seqLen, bs):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained("bert-base-uncased")
        conv_channels = seqLen // 2
        kernel_size = 5
        self.conv1D = nn.Conv1d(seqLen, conv_channels, kernel_size)
        self.maxPool1D = nn.MaxPool1d(conv_channels)
        # Derive the flattened feature count rather than hard-coding 64 * 11:
        #   conv output width = hidden_size - kernel_size + 1
        #   pooled width      = conv width // pool kernel (stride == kernel)
        # For seqLen=128 and hidden_size=768 this is 64 * 11, as before.
        conv_width = self.bert.config.hidden_size - kernel_size + 1
        pooled_width = conv_width // conv_channels
        self.linear = nn.Linear(conv_channels * pooled_width, 25)
        self.bs = bs

    def forward(self, tokenizeInputs):
        """Return ``(batch, 25)`` features.

        Args:
            tokenizeInputs: Mapping with ``"input_ids"`` and
                ``"attention_mask"``. Both are squeezed on dim 1, so a
                ``(batch, 1, seqLen)`` layout is accepted.
        """
        input_ids = tokenizeInputs["input_ids"].squeeze(1)
        attention_masks = tokenizeInputs["attention_mask"].squeeze(1)
        op = self.bert(input_ids=input_ids, attention_mask=attention_masks, output_hidden_states = True)
        # Taking last hidden state as input to Conv1d
        convOutput = self.conv1D(op.hidden_states[-1])
        op = self.maxPool1D(convOutput)
        # Flatten per example; taking the batch size from the tensor keeps a
        # partial final batch from failing.
        op = op.flatten(start_dim=1)
        return self.linear(op)


In [50]:
class finalModel(nn.Module):
    """Two-branch classifier: concatenates both branch outputs and classifies.

    Each branch yields 25 features; the 50 concatenated features go through a
    linear layer with 6 outputs.

    Args:
        config: Object exposing ``embedding_len``, ``vocab_len``,
            ``num_classes``, ``max_len`` and ``bs``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.leftBranch = leftBranch(embedding_len = config.embedding_len, vocab_len = config.vocab_len, num_classes = config.num_classes, max_len = config.max_len, batch_size = config.bs)
        self.rightBranch = rightBranch(seqLen = config.max_len, bs = config.bs)
        self.classification_layer = nn.Linear(50, 6)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        """Run both branches on one batch.

        Args:
            x: Mapping with ``"left"`` and ``"right"`` inputs and, optionally,
                ``"labels"``.

        Returns:
            ``(loss, logits)``. ``loss`` is the cross-entropy against
            ``x["labels"]``, or ``None`` when the batch has no labels
            (e.g. at inference time).
        """
        leftOp = self.leftBranch(x["left"])
        rightOp = self.rightBranch(x["right"])
        logits = self.classification_layer(torch.cat((leftOp, rightOp), dim=1))
        labels = x["labels"] if "labels" in x else None
        loss = self.criterion(logits, labels) if labels is not None else None
        return loss, logits


In [51]:
class PLTrainer(pl.LightningModule):
    """LightningModule wiring the datasets, the hybrid model and the optimiser.

    Args:
        config: Object exposing ``max_len`` and ``bs`` (plus whatever
            ``finalModel`` reads). ``config.vocab_len`` is overwritten with
            the size of the training vocabulary.
    """

    def __init__(self, config):
        super().__init__()
        # Sequence length and batch size come from the config so they cannot
        # drift from the values the model was built with.
        self._max_len = config.max_len
        self._batch_size = config.bs
        self.train_ds = TextDataset(path = "/content/train.tsv", max_len = self._max_len)
        config.vocab_len = len(self.train_ds.vocab)
        self.model = finalModel(config)
        self.accuracy = Accuracy(task = "multiclass", num_classes = 6)

    def training_step(self, batch, batch_idx):
        """Return the training loss for one batch and log it as ``train_loss``."""
        loss, _ = self.model(batch)
        self.log("train_loss", loss, on_epoch = True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` and ``val_acc`` for one validation batch."""
        loss, logits = self.model(batch)
        preds = torch.argmax(logits, dim=1)
        # Update the metric, then log the metric object itself so Lightning
        # aggregates and resets it per epoch. The loss used to be logged
        # under "val_acc" by mistake.
        self.accuracy(preds, batch["labels"])
        self.log("val_loss", loss, on_epoch = True, prog_bar=True)
        self.log("val_acc", self.accuracy, on_epoch = True, prog_bar = True)
        return loss

    def train_dataloader(self):
        """DataLoader over the training set (incomplete last batch dropped)."""
        return torch.utils.data.DataLoader(self.train_ds, batch_size = self._batch_size, drop_last = True)

    def val_dataloader(self):
        """DataLoader over the validation set, encoded with the training vocabulary."""
        val_ds = TextDataset(path = "/content/valid.tsv", max_len = self._max_len)
        # The embedding table is indexed by training-vocabulary ids; a
        # vocabulary rebuilt from the validation text would assign different
        # ids to the same words.
        val_ds.vocab = self.train_ds.vocab
        # drop_last is kept because the model is configured for a fixed batch
        # size; the trailing partial batch is therefore not evaluated.
        return torch.utils.data.DataLoader(val_ds, batch_size = self._batch_size, drop_last = True)

    def configure_optimizers(self):
        """Adam over all model parameters."""
        # NOTE(review): lr=0.02 is applied to the pretrained BERT weights as
        # well; that looks very high for fine-tuning - confirm it is intended.
        return torch.optim.Adam(self.model.parameters(), lr=0.02)


In [52]:
@dataclass
class Config:
    """Hyper-parameters shared by the datasets and the model.

    Declared as a dataclass so individual values can be overridden at
    construction time, e.g. ``Config(bs=8)``.
    """

    embedding_len : int = 64          # word-embedding size of the left branch
    num_classes : int = 6             # one class per rating in TextDataset.scoreMap
    max_len : int = 128               # token length of every example
    bs : int = 16                     # batch size
    vocab_len : Optional[int] = None  # filled in once the training vocabulary is built

In [None]:
# Seed the Python, NumPy and PyTorch RNGs so weight initialisation is
# repeatable across "Restart & Run All".
pl.seed_everything(42, workers=True)

config = Config()
model = PLTrainer(config)
logger = TensorBoardLogger("tb_logs", name="hybrid_model")
# "auto" lets Lightning pick the available accelerator (a GPU when one is
# visible) instead of this call hard-requiring one.
trainer = pl.Trainer(accelerator = "auto", max_epochs = 5, logger = logger)
trainer.fit(model)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]