In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import Tokenizer
import warnings
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import wandb
import math
from tqdm import tqdm
from transformers import BertTokenizer
from scipy.stats import spearmanr

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
warnings.filterwarnings('ignore')

In [4]:
def casual_mask(size):
    mask = torch.triu(torch.ones(1, size, size), diagonal=1).type(torch.int)
    return mask == 0

In [5]:
class HallucinationDataset(Dataset):
    def __init__(self, dataset, tokenizer, seq_len=512) -> None:
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.seq_len = seq_len
#         self.pad_token = torch.tensor([tokenizer.token_to_id("[PAD]")], dtype=torch.int64)
        self.label_mapping = {'Not Hallucination': 1, 'Hallucination': 0}
    
    def __len__(self) -> int:
        return len(self.dataset)
    
    def __getitem__(self, idx):
        task = self.dataset.iloc[idx]['task']
        hypothesis = self.dataset.iloc[idx]['hyp']
        context = self.dataset.iloc[idx]['src'] if task == 'PG' else self.dataset.iloc[idx]['tgt']
        label = self.dataset.iloc[idx]['label']
        
#         hypothesis_encoded = self.tokenizer.encode(hypothesis).ids
#         context_encoded = self.tokenizer.encode(context).ids
        hypothesis_encoding = self.tokenizer(hypothesis, truncation=True, padding='max_length', max_length=self.seq_len)
        context_encoding = self.tokenizer(context, truncation=True, padding='max_length', max_length=self.seq_len)
        
#         for _ in range(len(hypothesis_encoded), self.seq_len): hypothesis_encoded.append(self.pad_token)
#         for _ in range(len(context_encoded), self.seq_len): context_encoded.append(self.pad_token)
        
        if isinstance(label, str):
            label_encoded = self.label_mapping[label]
        else: label_encoded = label
        
#         hypothesis_encoded = torch.tensor(hypothesis_encoded)
#         context_encoded = torch.tensor(context_encoded)
        
        return {
#             "hypothesis_encoded": hypothesis_encoded,
#             "context_encoded": context_encoded,
            "hypothesis_encoded": torch.tensor(hypothesis_encoding['input_ids']),
            "context_encoded": torch.tensor(context_encoding['input_ids']),
#             "hypothesis_mask": (hypothesis_encoded != self.pad_token).unsqueeze(0).unsqueeze(0).int(),
#             "context_mask": (context_encoded != self.pad_token).unsqueeze(0).unsqueeze(0).int() & casual_mask(self.seq_len),
            "hypothesis_mask": torch.tensor(hypothesis_encoding['attention_mask']).unsqueeze(0).unsqueeze(0),
            "context_mask": torch.tensor(context_encoding['attention_mask']).unsqueeze(0).unsqueeze(0),
            "label": torch.tensor(label_encoded),
            "hypothesis": hypothesis,
            "context": context
        }

In [6]:
train_data = pd.read_csv('data/SHROOM_unlabeled-training-data-v2/train.model-agnostic_labeled.csv')
train_data.fillna('', inplace=True)

val_data = pd.read_json("data/SHROOM_dev-v2/val.model-agnostic.json")
val_data.fillna('', inplace=True)

test_data = pd.read_json("data/SHROOM_test-labeled/test.model-agnostic.json")
test_data.fillna('', inplace=True)

In [7]:
def get_all_sentences(ds):
    for i in range(len(ds)):
        x = ds.iloc[i]['ref']
        yield ds.iloc[i]['hyp'] + ' ' + ds.iloc[i][x if x != 'either' else 'tgt']

In [8]:
def build_tokenizer(ds):
    tokenizer = Tokenizer(WordLevel(unk_token='[UNK]'))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2)
    tokenizer.train_from_iterator(get_all_sentences(ds), trainer=trainer)
    return tokenizer

In [9]:
# tokenizer = build_tokenizer(train_data)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [10]:
train_dataset = HallucinationDataset(train_data, tokenizer)
val_dataset = HallucinationDataset(val_data, tokenizer)
test_dataset = HallucinationDataset(test_data, tokenizer)

In [11]:
class LayerNormalization(nn.Module):

    def __init__(self, features: int, eps: float = 10 ** -6) -> None:
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(features))  # alpha is a learnable parameter
        self.bias = nn.Parameter(torch.zeros(features))  # bias is a learnable parameter

    def forward(self, x):
        # x: (batch, seq_len, hidden_size)
        # Keep the dimension for broadcasting
        mean = x.mean(dim=-1, keepdim=True)  # (batch, seq_len, 1)
        # Keep the dimension for broadcasting
        std = x.std(dim=-1, keepdim=True)  # (batch, seq_len, 1)
        # eps is to prevent dividing by zero or when std is very small
        return self.alpha * (x - mean) / (std + self.eps) + self.bias

In [12]:
class FeedForwardBlock(nn.Module):

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)  # w1 and b1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)  # w2 and b2

    def forward(self, x):
        # (batch, seq_len, d_model) --> (batch, seq_len, d_ff) --> (batch, seq_len, d_model)
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

In [13]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)
        # Create a matrix of shape (seq_len, d_model)
        pe = torch.zeros(seq_len, d_model)
        # Create a vector of shape (seq_len)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)  # (seq_len, 1)
        # Create a vector of shape (d_model)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))  # (d_model / 2)
        # Apply sine to even indices
        pe[:, 0::2] = torch.sin(position * div_term)  # sin(position * (10000 ** (2i / d_model))
        # Apply cosine to odd indices
        pe[:, 1::2] = torch.cos(position * div_term)  # cos(position * (10000 ** (2i / d_model))
        # Add a batch dimension to the positional encoding
        pe = pe.unsqueeze(0)  # (1, seq_len, d_model)
        # Register the positional encoding as a buffer
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False)  # (batch, seq_len, d_model)
        return self.dropout(x)

In [14]:
class InputEmbeddings(nn.Module):

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        # (batch, seq_len) --> (batch, seq_len, d_model)
        # Multiply by sqrt(d_model) to scale the embeddings according to the paper
        return self.embedding(x) * math.sqrt(self.d_model)

In [15]:
class ResidualConnection(nn.Module):

    def __init__(self, features: int, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization(features)

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))

In [16]:
class MultiQueryAttentionBlock(nn.Module):

    def __init__(self, d_model: int, h: int, groups: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model  # Embedding vector size
        self.h = h  # Number of heads
        self.groups = groups
        # Make sure d_model is divisible by h
        assert d_model % h == 0, "d_model is not divisible by h"
        # Make sure that number of groups is divisible by h
        # assert h % groups == 0, "h should be divisible by groups"

        self.d_k = d_model // h  # Dimension of vector seen by each head
        self.w_q = nn.Linear(d_model, d_model, bias=False)  # Wq
        self.w_k = nn.Linear(d_model, d_model // self.groups, bias=False)  # Wk
        self.w_v = nn.Linear(d_model, d_model // self.groups, bias=False)  # Wv
        self.w_o = nn.Linear(d_model, d_model, bias=False)  # Wo
        
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        d_k = query.shape[-1]
        # Just apply the formula from the paper
        # (batch, h, seq_len, d_k) --> (batch, h, seq_len, seq_len)
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            # Write a very low value (indicating -inf) to the positions where mask == 0
            attention_scores.masked_fill_(mask == 0, -1e9)
        attention_scores = attention_scores.softmax(dim=-1)  # (batch, h, seq_len, seq_len) # Apply softmax
        if dropout is not None:
            attention_scores = dropout(attention_scores)
        # (batch, h, seq_len, seq_len) --> (batch, h, seq_len, d_k)
        # return attention scores which can be used for visualization
        return (attention_scores @ value), attention_scores

    def forward(self, q, k, v, mask):
        query = self.w_q(q)  # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        key = self.w_k(k)  # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        value = self.w_v(v)  # (batch, seq_len, d_model) --> (batch, seq_len, d_model)

        # (batch, seq_len, d_model) --> (batch, seq_len, h, d_k) --> (batch, h, seq_len, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h // self.groups, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h // self.groups, self.d_k).transpose(1, 2)
        
        queries = torch.split(query, split_size_or_sections=self.h // self.groups, dim=1)
        
        lst_x, lst_attn = [], []
        
        for query in queries:
            x, attn = MultiQueryAttentionBlock.attention(query, key, value, mask, self.dropout)
            lst_x.append(x)
            lst_attn.append(attn)
        
        x, attention_scores = torch.cat(lst_x, dim=1), torch.cat(lst_attn, dim=1)

        # # Calculate attention
        # x, self.attention_scores = MultiQueryAttentionBlock.attention(query, key, value, mask, self.dropout)

        # Combine all the heads together
        # (batch, h, seq_len, d_k) --> (batch, seq_len, h, d_k) --> (batch, seq_len, d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        # Multiply by Wo
        # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        return self.w_o(x)

In [17]:
class EncoderBlock(nn.Module):

    def __init__(self, features: int, self_attention_block: MultiQueryAttentionBlock,
                 feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, src_mask))
        x = self.residual_connections[1](x, self.feed_forward_block)
        return x

In [18]:
class MixEncoderBlock(nn.Module):
    
    def __init__(self, features: int, self_attention_block: MultiQueryAttentionBlock,
                 feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(2)])

    def forward(self, x, x_mask, y, y_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(y, x, x, y_mask))
        x = self.residual_connections[1](x, self.feed_forward_block)
        return x

In [19]:
class Encoder(nn.Module):

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

In [20]:
class MixEncoder(nn.Module):
    
    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, hypothesis, hypothesis_mask, context, context_mask):
        for layer in self.layers:
            hypothesis = layer(hypothesis, hypothesis_mask, context, context_mask)
        return self.norm(hypothesis)

In [21]:
class Transformer(nn.Module):

    def __init__(self, 
                 d_model: int,
                 dropout: float,
                 hypothesis_encoder: Encoder, 
                 context_encoder: Encoder, 
                 mix_encoder: MixEncoder, 
                 hypothesis_embed: InputEmbeddings, 
                 context_embed: InputEmbeddings,
                 hypothesis_pos: PositionalEncoding, 
                 context_pos: PositionalEncoding) -> None:
        super().__init__()
        self.hypothesis_encoder = hypothesis_encoder
        self.context_encoder = context_encoder
        self.mix_encoder = mix_encoder
        self.hypothesis_embed = hypothesis_embed
        self.context_embed = context_embed
        self.hypothesis_pos = hypothesis_pos
        self.context_pos = context_pos
        self.conv_layers = nn.ModuleList([
            nn.Conv1d(in_channels=512, out_channels=64, kernel_size=3, padding=1),
            nn.Conv1d(in_channels=64, out_channels=32, kernel_size=3, padding=1),
            nn.Conv1d(in_channels=32, out_channels=16, kernel_size=3, padding=1)
        ])
        
        self.fc = nn.Linear(16, 2)
        self.activation = nn.Sigmoid()
        self.hypothesis_norm = LayerNormalization(d_model)
        self.context_norm = LayerNormalization(d_model)
        self.dropout = nn.Dropout(dropout)
    
    def hypothesis_encode(self, src, src_mask):
        # (batch, seq_len, d_model)
        src = self.hypothesis_embed(src)
        src = self.hypothesis_pos(src)
        return self.hypothesis_encoder(src, src_mask)
    
    def context_encode(self, src, src_mask):
        # (batch, seq_len, d_model)
        src = self.context_embed(src)
        src = self.context_pos(src)
        return self.context_encoder(src, src_mask)
    
    def hypothesis_context_mix(self, hypothesis_output, hypothesis_mask, context_output, context_mask):
        # (batch, seq_len, d_model)
        output = self.mix_encoder(hypothesis_output, hypothesis_mask, context_output, context_mask)
    
        output += self.dropout(self.hypothesis_norm(hypothesis_output))
        output += self.dropout(self.context_norm(context_output))
        
        return output
    
    def project(self, outputs):
        outputs = outputs.permute(0, 2, 1)
        for conv_layer in self.conv_layers:
            outputs = F.relu(conv_layer(outputs))
        pooled_output = F.max_pool1d(outputs, outputs.size(2)).squeeze(2)
        return self.activation(self.fc(pooled_output))

In [22]:
def build_model(vocab_size: int, seq_len: int, d_model: int = 512, num_groups: int = 1, num_layers: int = 12, h: int = 8, dropout: float = 0.1, d_ff: int = 2048) -> Transformer:
    hypothesis_embed = InputEmbeddings(d_model, vocab_size)
    context_embed = InputEmbeddings(d_model, vocab_size)

    # Create the positional encoding layers
    hypothesis_pos = PositionalEncoding(d_model, seq_len, dropout)
    context_pos = PositionalEncoding(d_model, seq_len, dropout)

    # Create the encoder blocks
    hypothesis_encoder_blocks = []
    for _ in range(num_layers):
        encoder_self_attention_block = MultiQueryAttentionBlock(d_model, h, num_groups, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        encoder_block = EncoderBlock(d_model, encoder_self_attention_block, feed_forward_block, dropout)
        hypothesis_encoder_blocks.append(encoder_block)
    
    context_encoder_blocks = []
    for _ in range(num_layers):
        encoder_self_attention_block = MultiQueryAttentionBlock(d_model, h, num_groups, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        encoder_block = EncoderBlock(d_model, encoder_self_attention_block, feed_forward_block, dropout)
        context_encoder_blocks.append(encoder_block)
    
    mix_encoder_blocks = []
    for _ in range(num_layers):
        mix_cross_attention_block = MultiQueryAttentionBlock(d_model, h, num_groups, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        encoder_block = MixEncoderBlock(d_model, mix_cross_attention_block, feed_forward_block, dropout)
        mix_encoder_blocks.append(encoder_block)
    
    hypothesis_encoder = Encoder(d_model, nn.ModuleList(hypothesis_encoder_blocks))
    context_encoder = Encoder(d_model, nn.ModuleList(context_encoder_blocks))
    mix_encoder = MixEncoder(d_model, nn.ModuleList(mix_encoder_blocks))
    
    transformer = Transformer(d_model, dropout, hypothesis_encoder, context_encoder, mix_encoder, hypothesis_embed, context_embed, hypothesis_pos, context_pos)

    # Initialize the parameters
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    return transformer
    

In [23]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [24]:
# model = build_model(vocab_size=tokenizer.get_vocab_size(), seq_len=512)
model = build_model(vocab_size=tokenizer.vocab_size, seq_len=512)

model.to(device)

Transformer(
  (hypothesis_encoder): Encoder(
    (layers): ModuleList(
      (0-11): 12 x EncoderBlock(
        (self_attention_block): MultiQueryAttentionBlock(
          (w_q): Linear(in_features=512, out_features=512, bias=False)
          (w_k): Linear(in_features=512, out_features=512, bias=False)
          (w_v): Linear(in_features=512, out_features=512, bias=False)
          (w_o): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward_block): FeedForwardBlock(
          (linear_1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (residual_connections): ModuleList(
          (0-1): 2 x ResidualConnection(
            (dropout): Dropout(p=0.1, inplace=False)
            (norm): LayerNormalization()
          )
        )
      )
    )
    (norm): Laye

In [25]:
# wandb.init(project="Hallucination Detection", entity="subhanshu20135")

In [26]:
optimizer = torch.optim.AdamW(model.parameters(), lr=10e-4, weight_decay=10e-6)

In [27]:
# criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.token_to_id('[PAD]'), label_smoothing=0.1).to(device)
criterion = nn.CrossEntropyLoss().to(device)

In [28]:
for epoch in range(10):
    model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0
    train_iterator = tqdm(train_loader, desc=f'Processing Epoch {epoch:02d}')
    for batch in train_iterator:
        hypothesis_encoded = batch["hypothesis_encoded"].to(device)
        context_encoded = batch["context_encoded"].to(device)
        
        hypothesis_mask = batch["hypothesis_mask"].to(device)
        context_mask = batch["context_mask"].to(device)
        
        labels = batch["label"].to(device)
        
        optimizer.zero_grad()
        
        hypothesis_outputs = model.hypothesis_encode(hypothesis_encoded, hypothesis_mask)
        
        context_outputs = model.context_encode(context_encoded, context_mask)
        
        mix_outputs = model.hypothesis_context_mix(hypothesis_outputs, hypothesis_mask, context_outputs, context_mask)
        
        outputs = model.project(mix_outputs)
        
        predictions = torch.max(outputs, dim=1).values
        
        loss = criterion(predictions.float(), labels.float())

        train_correct += (np.round(predictions.detach().cpu().numpy()) == labels.cpu().numpy()).sum().item()
        
        train_loss += loss.item()
        
        optimizer.step()
        
        loss.backward()
        
        train_total += labels.size(0)
    
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    val_spearmanr = 0.0
    with torch.no_grad():
        val_iterator = tqdm(val_loader, desc=f'Processing Epoch {epoch:02d}')
        for batch in val_iterator:
            hypothesis_encoded = batch["hypothesis_encoded"].to(device)
            context_encoded = batch["context_encoded"].to(device)

            hypothesis_mask = batch["hypothesis_mask"].to(device)
            context_mask = batch["context_mask"].to(device)

            labels = batch["label"].to(device)

            hypothesis_outputs = model.hypothesis_encode(hypothesis_encoded, hypothesis_mask)

            context_outputs = model.context_encode(context_encoded, context_mask)

            mix_outputs = model.hypothesis_context_mix(hypothesis_outputs, hypothesis_mask, context_outputs, context_mask)

            outputs = model.project(mix_outputs)

            predictions = torch.max(outputs, dim=1).values

            loss = criterion(predictions.float(), labels.float())

            val_correct +=  (np.round(predictions.detach().cpu().numpy()) == labels.cpu().numpy()).sum().item()

            val_loss += loss.item()

            val_total += labels.size(0)
    
    
    train_loss /= len(train_loader)
    val_loss /= len(val_loader)
    
    train_accuracy = train_correct / train_total
    val_accuracy = val_correct / val_total
    
    # wandb.log({
    #     "Train Loss": train_loss,
    #     "Validation Loss": val_loss,
    #     "Train Accuracy": train_accuracy,
    #     "Validation Accuracy": val_accuracy
    # })
    
    torch.save(model.state_dict(), f"weights/MQA_Model_{epoch}.pt")
    
    print("Epoch:", epoch, "Train Loss:", train_loss, "Validation Loss:", val_loss, "Train Accuracy:", train_accuracy, "Validation Accuracy:", val_accuracy)
 

Processing Epoch 00: 100%|██████████████████████████████████████████████████████████████████| 125/125 [00:55<00:00,  2.27it/s]
Processing Epoch 00: 100%|████████████████████████████████████████████████████████████████████| 63/63 [00:09<00:00,  6.57it/s]


Epoch: 0 Train Loss: 7.340923507690429 Validation Loss: 9.251575912748065 Train Accuracy: 0.441 Validation Accuracy: 0.56312625250501


Processing Epoch 01: 100%|██████████████████████████████████████████████████████████████████| 125/125 [00:51<00:00,  2.41it/s]
Processing Epoch 01: 100%|████████████████████████████████████████████████████████████████████| 63/63 [00:09<00:00,  6.67it/s]


Epoch: 1 Train Loss: 7.337360883712768 Validation Loss: 9.25240833797152 Train Accuracy: 0.441 Validation Accuracy: 0.56312625250501


Processing Epoch 02: 100%|██████████████████████████████████████████████████████████████████| 125/125 [00:52<00:00,  2.40it/s]
Processing Epoch 02: 100%|████████████████████████████████████████████████████████████████████| 63/63 [00:09<00:00,  6.72it/s]


Epoch: 2 Train Loss: 7.339178365707397 Validation Loss: 9.25240965495034 Train Accuracy: 0.441 Validation Accuracy: 0.56312625250501


Processing Epoch 03: 100%|██████████████████████████████████████████████████████████████████| 125/125 [00:52<00:00,  2.40it/s]
Processing Epoch 03: 100%|████████████████████████████████████████████████████████████████████| 63/63 [00:09<00:00,  6.70it/s]


Epoch: 3 Train Loss: 7.337629810333252 Validation Loss: 9.236242619771806 Train Accuracy: 0.441 Validation Accuracy: 0.56312625250501


Processing Epoch 04: 100%|██████████████████████████████████████████████████████████████████| 125/125 [00:51<00:00,  2.41it/s]
Processing Epoch 04: 100%|████████████████████████████████████████████████████████████████████| 63/63 [00:09<00:00,  6.71it/s]


Epoch: 4 Train Loss: 7.339720308303833 Validation Loss: 9.250493772446163 Train Accuracy: 0.441 Validation Accuracy: 0.56312625250501


Processing Epoch 05: 100%|██████████████████████████████████████████████████████████████████| 125/125 [00:51<00:00,  2.42it/s]
Processing Epoch 05: 100%|████████████████████████████████████████████████████████████████████| 63/63 [00:09<00:00,  6.71it/s]


Epoch: 5 Train Loss: 7.3369019470214845 Validation Loss: 9.281496592930385 Train Accuracy: 0.441 Validation Accuracy: 0.56312625250501


Processing Epoch 06: 100%|██████████████████████████████████████████████████████████████████| 125/125 [00:51<00:00,  2.41it/s]
Processing Epoch 06: 100%|████████████████████████████████████████████████████████████████████| 63/63 [00:09<00:00,  6.72it/s]


Epoch: 6 Train Loss: 7.338115064620972 Validation Loss: 9.234732907915872 Train Accuracy: 0.441 Validation Accuracy: 0.56312625250501


Processing Epoch 07: 100%|██████████████████████████████████████████████████████████████████| 125/125 [00:51<00:00,  2.42it/s]
Processing Epoch 07: 100%|████████████████████████████████████████████████████████████████████| 63/63 [00:09<00:00,  6.69it/s]


Epoch: 7 Train Loss: 7.340805337905884 Validation Loss: 9.250510862895421 Train Accuracy: 0.441 Validation Accuracy: 0.56312625250501


Processing Epoch 08: 100%|██████████████████████████████████████████████████████████████████| 125/125 [00:51<00:00,  2.40it/s]
Processing Epoch 08: 100%|████████████████████████████████████████████████████████████████████| 63/63 [00:09<00:00,  6.72it/s]


Epoch: 8 Train Loss: 7.3395347442626955 Validation Loss: 9.267896769538758 Train Accuracy: 0.441 Validation Accuracy: 0.56312625250501


Processing Epoch 09: 100%|██████████████████████████████████████████████████████████████████| 125/125 [00:51<00:00,  2.43it/s]
Processing Epoch 09: 100%|████████████████████████████████████████████████████████████████████| 63/63 [00:09<00:00,  6.72it/s]


Epoch: 9 Train Loss: 7.343775835037231 Validation Loss: 9.249551190270317 Train Accuracy: 0.441 Validation Accuracy: 0.56312625250501


In [29]:
model.eval()
test_loss = 0.0
test_correct = 0
test_total = 0
with torch.no_grad():
    test_iterator = tqdm(test_loader, desc=f'Processing: ')
    for batch in test_iterator:
        hypothesis_encoded = batch["hypothesis_encoded"].to(device)
        context_encoded = batch["context_encoded"].to(device)

        hypothesis_mask = batch["hypothesis_mask"].to(device)
        context_mask = batch["context_mask"].to(device)

        labels = batch["label"].to(device)

        hypothesis_outputs = model.hypothesis_encode(hypothesis_encoded, hypothesis_mask)

        context_outputs = model.context_encode(context_encoded, context_mask)

        mix_outputs = model.hypothesis_context_mix(hypothesis_outputs, hypothesis_mask, context_outputs, context_mask)

        outputs = model.project(mix_outputs)

        predictions = torch.max(outputs, dim=1).values
        
        loss = criterion(predictions.float(), labels.float())

        test_correct += (np.round(predictions.cpu().numpy()) == labels.cpu().numpy()).sum().item()
        
        test_loss += loss.item()

        test_total += labels.size(0)
    
    test_loss /= len(test_loader)
    
    test_accuracy = test_correct / test_total

    print("Test Loss:", test_loss,"Test Accuracy:", test_accuracy)
 

Processing: 100%|███████████████████████████████████████████████████████████████████████████| 188/188 [00:28<00:00,  6.67it/s]

Test Loss: 9.830484556390884 Test Accuracy: 0.5926666666666667



