# 1. Data preprocessing

In [1]:
# %pip install matplotlib scikit-learn pandas

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
import re

In [3]:
# Read data
df1 = pd.read_json('domain1_train_data.json', lines=True)
df2 = pd.read_json('domain2_train_data.json', lines=True)
# Get domains
df1['domain'],df2['domain'] = 0, 1

# Train dev split
df1_train, df1_dev = train_test_split(df1, stratify=df1['label'], random_state=42)
df2_train, df2_dev = train_test_split(df2, stratify=df2['label'], random_state=42)

# Join data in both domains for trating them jointly (Augmentation)
df_train = pd.concat([df1_train, df2_train]).reset_index(drop=True)
df_dev = pd.concat([df1_dev, df2_dev]).reset_index(drop=True)

# Shuffle datasets
df_train = df_train.sample(frac=1).reset_index(drop=True)
df_dev = df_dev.sample(frac=1).reset_index(drop=True)

# 3 DL Models (BiLSTM)

In [4]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(42)

<torch._C.Generator at 0x7f2189bda550>

In [5]:
# Weights function
def weights(df):
    w = list(len(df['label'])/df['label'].value_counts())
    sample_weights = [0] * df.shape[0]
    for idx, label in enumerate(df['label']):
        sample_weights[idx] = w[label]
    return sample_weights

# Prepare pytorch dataset
class Dataset(Dataset):
    def __init__(self, text, labels, domain):
        self.text = text
        self.labels = labels
        self.domain = domain
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        text = torch.tensor(self.text[idx])
        label = torch.tensor(self.labels[idx])
        domain = torch.tensor(self.domain[idx])
        return text, label, domain
    
# Define collate (pre_process) function
def collate_batch(batch):  
    texts, labels, domain = zip(*batch)
    text_len = [len(txt) for txt in texts]
    text = nn.utils.rnn.pad_sequence(texts, batch_first=True).to(device)
    labels = torch.tensor(labels, dtype=torch.float32).to(device).reshape(-1,1)
    domain = torch.tensor(domain, dtype=torch.float32).to(device).reshape(-1,1)
    return text, labels, text_len, domain

# Create datasets
train_DS = Dataset(df_train['text'], df_train['label'], df_train['domain'])
dev_DS = Dataset(df_dev['text'], df_dev['label'], df_train['domain'])

# Sample train data
sampler_tr = torch.utils.data.WeightedRandomSampler(weights(df_train), num_samples=len(train_DS), replacement=True)
sampler_ts = torch.utils.data.WeightedRandomSampler(weights(df_dev), num_samples=len(dev_DS), replacement=True)

# Create dataloaders
bs = 32
x_tr_dl = DataLoader(train_DS, batch_size=bs, collate_fn=collate_batch, sampler=sampler_tr)
x_dev_dl = DataLoader(dev_DS, batch_size=bs, collate_fn=collate_batch, sampler=sampler_ts)

In [20]:
# Source: https://github.com/fungtion/DANN/blob/master/models/functions.py
from torch.autograd import Function
# Reverse layer for discriminator model
class ReverseLayerF(Function):
    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha

        return x.view_as(x)
    @staticmethod
    def backward(ctx, grad_output):
        output = grad_output.neg() * ctx.alpha
        return output, None

def reverse_gradient(x, alpha=3):
    return ReverseLayerF.apply(x, alpha)

# Bidirectional LSTM model
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers):
        super(BiLSTM, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # Embeding Layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # BiLSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, num_layers=n_layers, batch_first=True, dropout = 0.5)
        # Linear layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        # Dropout layer
        self.dropout = nn.Dropout(0.5)
        # Sigmoid layer
        self.Sigmoid = nn.Sigmoid()

    def forward(self, text, text_lengths):
        # initial hidden and cell states
        h0 = torch.zeros(self.n_layers * 2, text.size(0), self.hidden_dim).to(device)
        c0 = torch.zeros(self.n_layers * 2, text.size(0), self.hidden_dim).to(device)
        # Embeding 
        embedded = self.dropout(self.embedding(text))
        # Batch packing
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted=False)
        # BiLSTM pass
        output, (hidden, cell_state) = self.lstm(packed_embedded, (h0, c0))
        # Concatenate hidden states in the BiLSTM
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        # Dropout layer
        hidden = self.dropout(hidden)
        # Return the classifier's output
        linear = self.fc(hidden)
        # Sigmoid
        return self.Sigmoid(linear.squeeze(0))

# Discriminator model
# Source: https://github.com/NaJaeMin92/pytorch-DANN/blob/master/model.py
class Discriminator(nn.Module):
    def __init__(self, hidden_dim):
        super(Discriminator, self).__init__()
        self.discriminator = nn.Sequential(
                nn.Linear(hidden_dim*2, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, 1),
                nn.Sigmoid()
            )

    def forward(self, input_features):
        reversed_input = reverse_gradient(input_features)
        print(reversed_input.shape)
        x = self.discriminator(reversed_input)
        return x

In [21]:
# Instantiate the model
model = BiLSTM(vocab_size=90000, embedding_dim=128*8, hidden_dim=256*2, output_dim=1, n_layers=2).to(device)
discriminator = Discriminator(hidden_dim=256*2).to(device)
print(model)
print(discriminator)

BiLSTM(
  (embedding): Embedding(90000, 1024, padding_idx=0)
  (lstm): LSTM(1024, 512, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=1024, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (Sigmoid): Sigmoid()
)
Discriminator(
  (discriminator): Sequential(
    (0): Linear(in_features=1024, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=1, bias=True)
    (3): Sigmoid()
  )
)


In [22]:
# Loss fn
loss_fn = nn.BCELoss()
# Optimizer
optimizer = torch.optim.Adam(list(model.parameters()) + list(discriminator.parameters()))

In [23]:
# Binary accuracy function
def binary_accuracy(predictions, y):
    rounded_preds = torch.round(torch.sigmoid(predictions)).squeeze()  # Ensure it's a 1D tensor
    correct = (rounded_preds == y).float()
    return correct.sum() / len(correct)

def compute_f1(predictions, labels):
    # Convert predictions to binary
    preds_binary = torch.round(torch.sigmoid(predictions))
    preds_binary = preds_binary.detach().cpu().numpy()
    labels = labels.detach().cpu().numpy()

    return f1_score(labels, preds_binary)

In [24]:
from tqdm import tqdm
from sklearn.metrics import f1_score
# Train
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()

    # Instanciate variables
    train_loss, total = 0, 0
    class_acc, c_real, c_preds = 0, [], []
    domain_acc, d_preds, c_preds = 0, [], []

    # Iterate dataloader
    for X, y, text_len, domain in tqdm(dataloader):
        # BilSTM
        class_pred = model(X, text_len)   # Forward pass
        class_loss = loss_fn(class_pred, y)     # Compute loss 
        class_acc += torch.sum((class_pred>=0.5).float() == y)
        c_real.extend(y.int().detach().cpu().numpy().reshape(-1,1))
        c_preds.extend((class_pred>=0.5).int().detach().cpu().numpy().reshape(-1,1))
        
        # Discriminator
        domain_pred = discriminator(class_pred)
        domain_loss = loss_fn(domain_pred, domain)
        domain_acc += torch.sum((domain_pred>=0.5).float() == domain)
        d_real.extend(domain.int().detach().cpu().numpy().reshape(-1,1))
        d_preds.extend((domain_pred>=0.5).int().detach().cpu().numpy().reshape(-1,1))

        # DANN loss
        loss = class_loss + domain_loss
        loss.backward()             # Backpropagation
        optimizer.step()            # Update parameters
        optimizer.zero_grad()       # Reset gradient
    
        # Metrics
        train_loss += loss.item()
        total += len(y)
        
    # General metrics
    train_loss /= total
    return train_loss, class_acc/size, f1_score(c_real, c_preds), domain_acc/size, f1_score(d_real, d_preds)

In [25]:
# Test
def test(dataloader, model, loss_fn, n_epoch):
    size = len(dataloader.dataset)
    model.eval()

    # Instanciate variables
    test_loss, total = 0, 0
    class_acc, c_real, c_preds = 0, [], []
    domain_acc, d_preds, c_preds = 0, [], []

    # Iterate dataloader
    with torch.no_grad():       # Specify no gradient
        for X, y, text_len, domain in tqdm(dataloader):
            # BilSTM
            class_pred = model(X, text_len)   # Forward pass
            class_loss = loss_fn(class_pred, y)     # Compute loss 
            class_acc += torch.sum((class_pred>=0.5).float() == y)
            c_real.extend(y.int().detach().cpu().numpy().reshape(-1,1))
            c_preds.extend((class_pred>=0.5).int().detach().cpu().numpy().reshape(-1,1))
            
            # Discriminator
            domain_pred = discriminator(class_pred)
            domain_loss = loss_fn(domain_pred, domain)
            domain_acc += torch.sum((domain_pred>=0.5).float() == domain)
            d_real.extend(domain.int().detach().cpu().numpy().reshape(-1,1))
            d_preds.extend((domain_pred>=0.5).int().detach().cpu().numpy().reshape(-1,1))

            # DANN loss
            loss = class_loss + domain_loss

            # Metrics
            test_loss += loss.item()                # Compute loss  
            total += len(y)                         # Total observations

    # General metrics
    test_loss /= total
    return test_loss, acc/size, f1_score(real, preds)

In [26]:
from tqdm import tqdm, tqdm_notebook # show progress bar

# Epochs
epochs = 15
train_loss, valid_loss = [], []
print("Training BiLSTM network model!")
for t in range(epochs):
    tl, train_acc, f1_tr = train(x_tr_dl, model, loss_fn, optimizer)
    vl, valid_acc, f1_ts = test(x_dev_dl, model, loss_fn)
    # Losses to dict
    train_loss.append(tl)
    valid_loss.append(vl)

    # Print results
    tqdm.write(
        f'epoch #{t}\ttrain_acc: {train_acc:.3f}\tvalid_acc: {valid_acc:.3f}',
    )
    tqdm.write(
        f'epoch #{t}\tf1_tr: {f1_tr:.3f}\tf1_ts: {f1_ts:.3f}',
    )

Training BiLSTM network model!


  0%|          | 0/422 [00:00<?, ?it/s]

torch.Size([32, 1])





RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x1 and 1024x512)

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
plt.figure(figsize=(10,5))
epoch_ticks = range(1, epochs + 1)
plt.plot(epoch_ticks, train_loss)
plt.plot(epoch_ticks, valid_loss)
plt.legend(['Train Loss', 'Valid Loss'])
plt.title('Losses') 
plt.xlabel('Epoch #')
plt.ylabel('Loss')
plt.xticks(epoch_ticks)
plt.show()

In [None]:
def predict(dl, ln):
    with torch.no_grad():  # Disable gradient computation for efficiency
        preds, acc, test_acc = [], 0, 0
        for X, y, text_len in dl:
            # Dev data
            pred = model(X, text_len)
            test_acc += torch.sum((pred>=0.5).float() == y)
    return (test_acc/ln).detach().cpu().numpy().item()

In [None]:
predict(x_tr_dl, df_train.shape[0])

In [None]:
predict(x_dev_dl, df_dev.shape[0])

# Predict on sample data

In [None]:
test = pd.read_json('test_data.json', lines=True)['text']
# train = [re.sub(',', '',', '.join([str(x) for x in tok])) for tok in df_train['text']]
test = [[t if t != 0 else 1 for t in ls] for ls in test]

In [None]:
d = [i.split() for i in training_set]
l_ = []
for l in d:
    l_.extend(l)
corpus = Counter(l_)
corpus

In [None]:
preds = []
for line in test:
    text_tensor = torch.tensor(line).unsqueeze(0).to(device)
    text_length = torch.tensor([len(line)])
    # Pass the sequence and its length to the model
    prediction = model(text_tensor, text_length)
    preds.extend((prediction>=0.5).int().detach().cpu().numpy().reshape(-1))

In [None]:
test_df = pd.DataFrame(columns = ['id', 'value'])
for idx, v in enumerate(preds):
    test_df.loc[idx] = [idx, preds[idx]]
test_df.to_csv('predictions.csv', index=False)

In [None]:
test_df

In [None]:
#  Export model
model_scripted = torch.jit.script(model) # Export to TorchScript
model_scripted.save('model_scripted.pt') # Save

In [None]:
# Load Model
model = torch.jit.load('model_scripted.pt')
model.eval()