# 1. Data preprocessing

In [1]:
# %pip install matplotlib scikit-learn pandas

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
import re
from imblearn.over_sampling import RandomOverSampler

In [3]:
# Read data
df1 = pd.read_json('../Data/domain1_train_data.json', lines=True)
df2 = pd.read_json('../Data/domain2_train_data.json', lines=True)
# df2, _ = RandomOverSampler(random_state=42).fit_resample(df2, df2['label'])
# df2 = pd.concat([df2[df2['label'] == 0].sample(1500), df2[df2['label'] == 1]]).sample(frac=1).reset_index(drop=True)
# Get domains
df1['domain'],df2['domain'] = 0, 1

# # Train dev split
df1_train, df1_dev = train_test_split(df1, random_state=42)
df2_train, df2_dev = train_test_split(df2, random_state=42)

# # Join data in both domains for trating them jointly (Augmentation)
# df_train = pd.concat([df1_train, df2_train]).reset_index(drop=True)

# # Reset index
# df1_dev = df1_dev.reset_index(drop=True)
# df2_dev = df2_dev.reset_index(drop=True)

# # Shuffle datasets
# df_train = df_train.sample(frac=1).reset_index(drop=True)

# 3 DL Models (BiLSTM + DANN)

In [4]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(42)

<torch._C.Generator at 0x7ff04bde5d30>

In [12]:
# Prepare pytorch dataset
class Dataset(Dataset):
    def __init__(self, text, labels, domain):
        self.text = text
        self.labels = labels
        self.domain = domain
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        text = torch.tensor(self.text[idx])
        label = torch.tensor(self.labels[idx]).reshape(-1,1)
        domain = torch.tensor(self.domain[idx])
        return text, label, domain
    
# Define collate (pre_process) function
def collate_batch(batch):  
    texts, labels, domain = zip(*batch)
    text_len = [len(txt) for txt in texts]
    text = nn.utils.rnn.pad_sequence(texts, batch_first=True).to(device)
    labels = torch.tensor(labels, dtype=torch.float32).to(device).reshape(-1,1)
    domain = torch.tensor(domain, dtype=torch.float32).to(device).reshape(-1,1)
    return text, labels, text_len, domain

# Rest indexes 
df1_train.reset_index(drop=True, inplace=True)
df1_dev.reset_index(drop=True, inplace=True)
df2_train.reset_index(drop=True, inplace=True)
df2_dev.reset_index(drop=True, inplace=True)

# Create datasets
train_DS1 = Dataset(df1_train['text'], df1_train['label'], df1_train['domain'])
train_DS2 = Dataset(df2_train['text'], df2_train['label'], df2_train['domain'])
dev_DS1 = Dataset(df1_dev['text'], df1_dev['label'], df1_dev['domain'])
dev_DS2 = Dataset(df2_dev['text'], df2_dev['label'], df2_dev['domain'])

# Create dataloaders
bs = 32
x_tr1 = DataLoader(train_DS1, batch_size=bs, collate_fn=collate_batch)
x_tr2 = DataLoader(train_DS2, batch_size=bs, collate_fn=collate_batch)
x_dev1 = DataLoader(dev_DS1, batch_size=bs, collate_fn=collate_batch)
x_dev2 = DataLoader(dev_DS2, batch_size=bs, collate_fn=collate_batch)

In [13]:
from torch.autograd import Function
class ReverseLayerF(Function):
    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha
        return x.view_as(x)
        
    @staticmethod
    def backward(ctx, grad_output):
        output = grad_output.neg() * ctx.alpha
        return output, None

def reverse_gradient(x, alpha=7):
    return ReverseLayerF.apply(x, alpha)

# Bidirectional LSTM model
class DANN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers):
        super(DANN, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # Feature extraction Layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, num_layers=n_layers, batch_first=True,dropout = 0.5)

        # Classiffier layer
        self.class_classifier = nn.Sequential()
        self.class_classifier.add_module('fc1', nn.Linear(hidden_dim*2, 1))
        self.class_classifier.add_module('dropout', nn.Dropout(0.2))
        self.class_classifier.add_module('Sigmoid', nn.Sigmoid())

        # Domain classifier Layer
        self.domain_classifier = nn.Sequential()
        self.domain_classifier.add_module('fc1', nn.Linear(hidden_dim*2, hidden_dim))
        self.domain_classifier.add_module('relu', nn.ReLU())
        self.domain_classifier.add_module('fc2', nn.Linear(hidden_dim, 1))
        self.domain_classifier.add_module('sigmoid', nn.Sigmoid())

    def forward(self, text, text_lengths, alpha):
        embedded = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted=False)
        output, (hidden, cell_state) = self.lstm(packed_embedded)
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        reverse_feature = ReverseLayerF.apply(hidden, alpha)
        class_output = self.class_classifier(hidden)
        domain_output = self.domain_classifier(reverse_feature)
        
        return class_output, domain_output

In [23]:
# Instantiate the model
h_dim, e_dim = 256*4, 128*2
model = DANN(vocab_size=90000, embedding_dim=e_dim, hidden_dim=h_dim, n_layers=2).to(device)

for p in model.parameters():
    p.requires_grad = True
    
print(model)

DANN(
  (embedding): Embedding(90000, 256, padding_idx=0)
  (lstm): LSTM(256, 1024, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (class_classifier): Sequential(
    (fc1): Linear(in_features=2048, out_features=1, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
    (Sigmoid): Sigmoid()
  )
  (domain_classifier): Sequential(
    (fc1): Linear(in_features=2048, out_features=1024, bias=True)
    (relu): ReLU()
    (fc2): Linear(in_features=1024, out_features=1, bias=True)
    (sigmoid): Sigmoid()
  )
)


In [24]:
# Optimizer
optimizer = torch.optim.Adam(model.parameters())
# lr_sched = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.005, patience=5,)

# Weights function
def weights_class(y, c):
    y = pd.Series(y.int().numpy(force=True).reshape(-1))
    w = len(y)/y.value_counts()
    if c == 0:
        try:
            return w[0]
        except:
            return 1
    else:
        try:
            return w[1]
        except:
            return 1

In [25]:
# ____________________________________________________________________________________________________________
# Function to get models' metrics
def model_metrics(dataloader_iter, alpha):
    # training model using source data
    X, y, text_len, domain = next(dataloader_iter)
    
    # Predict
    class_output, domain_output = model(X, text_len, alpha)
    
    # Loss fn
    loss_fn_cl = nn.BCELoss(weight = torch.tensor(weights_class(y, 0)).to(device))
    loss_fn_d = nn.BCELoss(weight = torch.tensor(weights_class(domain, 1)).to(device))

    # Classifier metrics
    class_loss = loss_fn_cl(class_output, y) 
    class_acc = torch.sum((class_output>=0.5).float() == y)
    
    # Discriminator metrics
    domain_loss = loss_fn_d(domain_output, domain)
    domain_acc = torch.sum((domain_output>=0.5).float() == domain)
    total = y.size()[0]
    
    return class_acc, domain_acc, total, class_loss, domain_loss
# ____________________________________________________________________________________________________________
# ____________________________________________________________________________________________________________
# Helper function to return training metrics
def train_model():
    # Instanciate metric's variables
    train_loss, total = 0, 0
    class_acc1, class_acc2, tot1, tot2 = 0, 0, 0, 0
    domain_acc1, domain_acc2= 0, 0

    # Train parameters
    len_dataloader = min(len(x_tr1), len(x_tr2))
    data_source_iter = iter(x_tr1)
    data_target_iter = iter(x_tr2)
    
    # Iterate dataloader
    for i in tqdm(range(len_dataloader)):
        model.train()
        # Calculate Alpha
        p = float(i + epoch * len_dataloader) / epochs / len_dataloader
        alpha = 2. / (1. + np.exp(-10 * p)) - 1
        
        # Reset gradient
        optimizer.zero_grad()
        
        # Run model
        cl_a1, d_a1, t1, cl1, dl1 = model_metrics(data_source_iter, alpha)
        cl_a2, d_a2, t2, cl2, dl2 = model_metrics(data_target_iter, alpha)

        # Metrics
        class_acc1 += cl_a1.cpu().numpy()
        class_acc2 += cl_a2.cpu().numpy()
        domain_acc1 += d_a1.cpu().numpy()
        domain_acc2 += d_a2.cpu().numpy()
        tot1 += t1
        tot2 += t2
        loss = cl1 + dl1 + dl2 + cl2
        # Metrics
        train_loss += loss.item()
        loss.backward()             # Backpropagation
        optimizer.step()            # Update parameters
        # lr_sched.step(train_loss)

    # Print results
    d_acc = (domain_acc1 + domain_acc2)/(tot1 + tot2)
    cl1_acc = class_acc1/ tot1
    cl2_acc = class_acc2/tot2
    loss = train_loss/len_dataloader
    
    tqdm.write(
        f'Domain_Acc: {d_acc:.3f}\
        Class1_Acc: {cl1_acc:.3f}\
        Class2_Acc: {cl2_acc:.3f}\
        Loss: {loss:.3f}',
    )
    # ____________________________________________________________________________________________________________


In [26]:
# ____________________________________________________________________________________________________________
# ____________________________________________________________________________________________________________
# Helper function to return test metrics
def test_model():
    # Instanciate metric's variables
    train_loss, total = 0, 0
    class_acc1, class_acc2, tot1, tot2 = 0, 0, 0, 0
    domain_acc1, domain_acc2= 0, 0

    # Test parameters
    len_dataloader = min(len(x_dev1), len(x_dev2))
    data_source_iter = iter(x_dev1)
    data_target_iter = iter(x_dev2)
    
    # Iterate dataloader
    for i in tqdm(range(len_dataloader)):
        model.eval()
        # Calculate Alpha
        p = float(i + epoch * len_dataloader) / epochs / len_dataloader
        alpha = 2. / (1. + np.exp(-10 * p)) - 1
        
        # Reset gradient
        optimizer.zero_grad()
        
        # Run model
        cl_a1, d_a1, t1, cl1, dl1 = model_metrics(data_source_iter, alpha)
        cl_a2, d_a2, t2, cl2, dl2 = model_metrics(data_target_iter, alpha)

        # Metrics
        class_acc1 += cl_a1.cpu().numpy()
        class_acc2 += cl_a2.cpu().numpy()
        domain_acc1 += d_a1.cpu().numpy()
        domain_acc2 += d_a2.cpu().numpy()
        tot1 += t1
        tot2 += t2
        loss = cl1 + dl1 + cl2 + dl2
    
        # Metrics
        train_loss += loss.item()

    # Print results
    d_acc = (domain_acc1 + domain_acc2)/(tot1 + tot2)
    cl1_acc = class_acc1/ tot1
    cl2_acc = class_acc2/tot2
    loss = train_loss/len_dataloader
    
    tqdm.write(
        f'Domain_Acc: {d_acc:.3f}\
        Class1_Acc: {cl1_acc:.3f}\
        Class2_Acc: {cl2_acc:.3f}\
        Loss: {loss:.3f}',
    )
    # ____________________________________________________________________________________________________________


In [27]:
from tqdm import tqdm
from sklearn.metrics import f1_score
from tqdm import tqdm, tqdm_notebook # show progress bar

# Epochs
epochs = 25
train_loss, valid_loss1, valid_loss2 = [], [], []
print("Training BiLSTM network model!")

for epoch in range(epochs):
    print('epoch: %d'% (epoch))
    train_model()
    test_model()

Training BiLSTM network model!
epoch: 0


  1%|          | 1/118 [00:01<02:17,  1.18s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1014.00 MiB. GPU 0 has a total capacity of 7.79 GiB of which 574.69 MiB is free. Process 1156307 has 2.77 GiB memory in use. Process 1386554 has 4.43 GiB memory in use. Of the allocated memory 3.25 GiB is allocated by PyTorch, and 1020.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
plt.figure(figsize=(10,5))
epoch_ticks = range(1, epochs + 1)
plt.plot(epoch_ticks, train_loss)
plt.plot(epoch_ticks, valid_loss)
plt.legend(['Train Loss', 'Valid Loss'])
plt.title('Losses') 
plt.xlabel('Epoch #')
plt.ylabel('Loss')
plt.xticks(epoch_ticks)
plt.show()

In [None]:
def predict(dl, ln):
    with torch.no_grad():  # Disable gradient computation for efficiency
        preds, acc, test_acc = [], 0, 0
        for X, y, text_len, domain in dl:
            # Dev data
            pred, domain_output = model(X, text_len, alpha)
            test_acc += torch.sum((pred>=0.5).float() == y)
    return (test_acc/ln).detach().cpu().numpy().item()

In [None]:
predict(x_tr_dl, df_train.shape[0])

In [None]:
predict(x_dev_dl, df_dev.shape[0])

# Predict on sample data

In [None]:
test = pd.read_json('../Data/test_data.json', lines=True)['text']
# train = [re.sub(',', '',', '.join([str(x) for x in tok])) for tok in df_train['text']]
# test = [[t if t != 0 else 1 for t in ls] for ls in test]

In [None]:
# Unmasked data
import json
with open('../Data/test_Data_unmasked.json', 'r') as f:
    test = json.load(f)

In [None]:
preds = []
for line in test:
    text_tensor = torch.tensor(line).unsqueeze(0).to(device)
    text_length = torch.tensor([len(line)])
    # Pass the sequence and its length to the model
    pred, domain_output = model(text_tensor, text_length, 1)
    preds.extend((pred>=0.5).int().detach().cpu().numpy().reshape(-1))

In [None]:
test_df = pd.DataFrame(columns = ['id', 'value'])
for idx, v in enumerate(preds):
    test_df.loc[idx] = [idx, preds[idx]]
test_df.to_csv('../Data/predictions.csv', index=False)

In [None]:
test_df['value'].value_counts()

In [None]:
#  Export model
model_scripted = torch.jit.script(model) # Export to TorchScript
model_scripted.save('model_scripted.pt') # Save

In [None]:
# Load Model
model = torch.jit.load('model_scripted.pt')
model.eval()