# 1. Data preprocessing

In [1]:
# %pip install matplotlib scikit-learn pandas

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
import re

In [2]:
# Read data
df1 = pd.read_json('../Data/domain1_train_data.json', lines=True)
df2 = pd.read_json('../Data/domain2_train_data.json', lines=True)

# Define Domains
df1['domain'], df2['domain'] = 0, 1

# Split set 1
df1_train, df1_dev = train_test_split(df1, stratify=df1['label'], random_state=0, test_size=0.2)
# Split set 2
x2_1 = df2[df2['label'] == 1].sample(500, random_state=0)
x2_0 = df2[df2['label'] == 0].sample(500, random_state=0)
df2_train = df2[[i not in list(pd.concat([x2_1, x2_0]).reset_index()['index']) for i in df2.index]].reset_index(drop=True)
df2_dev = pd.concat([x2_1,x2_0]).reset_index(drop=True)

# Print classes proportion
print(round(df2_train['label'].value_counts()/len(df2_train['label']),2))

label
0    0.92
1    0.08
Name: count, dtype: float64


# 3 DL Models (BiLSTM + DANN)

In [3]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(42)

<torch._C.Generator at 0x7fc70674cf90>

In [11]:
# Weights function
def weights(df):
    w = list(len(df['label'])/df['label'].value_counts())
    sample_weights = [0] * df.shape[0]
    for idx, label in enumerate(df['label']):
        sample_weights[idx] = w[label]
    return sample_weights

# Prepare pytorch dataset
class Dataset(Dataset):
    def __init__(self, text, labels, domain):
        self.text = text
        self.labels = labels
        self.domain = domain
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        text = torch.tensor(self.text[idx])
        label = torch.tensor(self.labels[idx]).reshape(-1,1)
        domain = torch.tensor(self.domain[idx])
        return text, label, domain
    
# Define collate (pre_process) function
def collate_batch(batch):  
    texts, labels, domain = zip(*batch)
    text_len = [len(txt) for txt in texts]
    text = nn.utils.rnn.pad_sequence(texts, batch_first=True).to(device)
    labels = torch.tensor(labels, dtype=torch.float32).to(device).reshape(-1,1)
    domain = torch.tensor(domain, dtype=torch.float32).to(device).reshape(-1,1)
    return text, labels, text_len, domain

# Reset indexes
df1_train.reset_index(drop=True, inplace=True)
df2_train.reset_index(drop=True, inplace=True)
df1_dev.reset_index(drop=True, inplace=True)
df2_dev.reset_index(drop=True, inplace=True)

# Create datasets
train_DS1 = Dataset(df1_train['text'], df1_train['label'], df1_train['domain'])
train_DS2 = Dataset(df2_train['text'], df2_train['label'], df2_train['domain'])
dev_DS1 = Dataset(df1_dev['text'], df1_dev['label'], df1_dev['domain'])
dev_DS2 = Dataset(df2_dev['text'], df2_dev['label'], df2_dev['domain'])

# Sample train data
# sampler_tr1 = torch.utils.data.WeightedRandomSampler(weights(df1_train), num_samples=len(train_DS1), replacement=True)
sampler_tr2 = torch.utils.data.WeightedRandomSampler(weights(df2_train), num_samples=len(train_DS2), replacement=True)

# Create dataloaders
bs = 32
x_tr1 = DataLoader(train_DS1, batch_size=bs, collate_fn=collate_batch)
x_tr2 = DataLoader(train_DS2, batch_size=bs, collate_fn=collate_batch, sampler=sampler_tr2)
x_dev1 = DataLoader(dev_DS1, batch_size=bs, collate_fn=collate_batch)
x_dev2 = DataLoader(dev_DS2, batch_size=bs, collate_fn=collate_batch)

In [12]:
from torch.autograd import Function
class ReverseLayerF(Function):
    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha
        return x.view_as(x)
        
    @staticmethod
    def backward(ctx, grad_output):
        output = grad_output.neg() * ctx.alpha
        return output, None

def reverse_gradient(x, alpha=7):
    return ReverseLayerF.apply(x, alpha)

# Bidirectional LSTM model
class DANN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers):
        super(DANN, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # Feature extraction Layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, num_layers=n_layers, batch_first=True,dropout = 0.5)

        # Classiffier layer
        self.class_classifier = nn.Sequential()
        self.class_classifier.add_module('fc1', nn.Linear(hidden_dim*2, 1))
        self.class_classifier.add_module('dropout', nn.Dropout(0.2))
        self.class_classifier.add_module('Sigmoid', nn.Sigmoid())

        # Domain classifier Layer
        self.domain_classifier = nn.Sequential()
        self.domain_classifier.add_module('fc1', nn.Linear(hidden_dim*2, hidden_dim))
        self.domain_classifier.add_module('relu', nn.ReLU())
        self.domain_classifier.add_module('fc2', nn.Linear(hidden_dim, 1))
        self.domain_classifier.add_module('sigmoid', nn.Sigmoid())

    def forward(self, text, text_lengths, alpha):
        embedded = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted=False)
        output, (hidden, cell_state) = self.lstm(packed_embedded)
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        reverse_feature = ReverseLayerF.apply(hidden, alpha)
        class_output = self.class_classifier(hidden)
        domain_output = self.domain_classifier(reverse_feature)
        
        return class_output, domain_output

In [13]:
# Instantiate the model
h_dim, e_dim = 256, 128
model = DANN(vocab_size=90000, embedding_dim=e_dim, hidden_dim=h_dim, n_layers=2).to(device)

for p in model.parameters():
    p.requires_grad = True
    
print(model)

DANN(
  (embedding): Embedding(90000, 128, padding_idx=0)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (class_classifier): Sequential(
    (fc1): Linear(in_features=512, out_features=1, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
    (Sigmoid): Sigmoid()
  )
  (domain_classifier): Sequential(
    (fc1): Linear(in_features=512, out_features=256, bias=True)
    (relu): ReLU()
    (fc2): Linear(in_features=256, out_features=1, bias=True)
    (sigmoid): Sigmoid()
  )
)


In [14]:
# Optimizer
optimizer = torch.optim.Adam(model.parameters())
# lr_sched = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.005, patience=5,)

# Weights function
def weights_class(y, c):
    y = pd.Series(y.int().numpy(force=True).reshape(-1))
    w = len(y)/y.value_counts()
    if c == 0:
        try:
            return w[0]
        except:
            return 1
    else:
        try:
            return w[1]
        except:
            return 1

In [15]:
# ____________________________________________________________________________________________________________
# Function to get models' metrics
def model_metrics(dataloader_iter, alpha):
    # training model using source data
    X, y, text_len, domain = next(dataloader_iter)
    # Predict
    class_output, domain_output = model(X, text_len, alpha)
    # Loss fn
    loss_fn_cl = nn.BCELoss(weight = torch.tensor(weights_class(y, 0)).to(device))
    loss_fn_d = nn.BCELoss(weight = torch.tensor(weights_class(domain, 1)).to(device))
    # Classifier metrics
    class_loss = loss_fn_cl(class_output, y) 
    class_acc = torch.sum((class_output>=0.5).float() == y)
    # Discriminator metrics
    domain_loss = loss_fn_d(domain_output, domain)
    domain_acc = torch.sum((domain_output>=0.5).float() == domain)
    total = y.size()[0]
    return class_acc, domain_acc, total, class_loss, domain_loss
# ____________________________________________________________________________________________________________
# ____________________________________________________________________________________________________________
# Helper function to return training metrics
def train_model():
    # Instanciate metric's variables
    train_loss, total = 0, 0
    class_acc1, class_acc2, tot1, tot2 = 0, 0, 0, 0
    domain_acc1, domain_acc2= 0, 0
    # Train parameters
    len_dataloader = min(len(x_tr1), len(x_tr2))
    data_source_iter = iter(x_tr1)
    data_target_iter = iter(x_tr2)
    # Iterate dataloader
    for i in tqdm(range(len_dataloader)):
        model.train()
        # Calculate Alpha
        p = float(i + epoch * len_dataloader) / epochs / len_dataloader
        alpha = 2. / (1. + np.exp(-10 * p)) - 1
        
        # Reset gradient
        optimizer.zero_grad()
        # Run model
        cl_a1, d_a1, t1, cl1, dl1 = model_metrics(data_source_iter, alpha)
        cl_a2, d_a2, t2, cl2, dl2 = model_metrics(data_target_iter, alpha)
        # Metrics
        class_acc1 += cl_a1.cpu().numpy()
        class_acc2 += cl_a2.cpu().numpy()
        domain_acc1 += d_a1.cpu().numpy()
        domain_acc2 += d_a2.cpu().numpy()
        tot1 += t1
        tot2 += t2
        loss = cl1 + cl2
        # Metrics
        train_loss += loss.item()
        loss.backward()             # Backpropagation
        optimizer.step()            # Update parameters
        # lr_sched.step(train_loss)
    # Print results
    d_acc = (domain_acc1 + domain_acc2)/(tot1 + tot2)
    cl1_acc = class_acc1/ tot1
    cl2_acc = class_acc2/tot2
    loss = train_loss/len_dataloader
    
    tqdm.write(
        f'Domain_Acc: {d_acc:.3f}\
        Class1_Acc: {cl1_acc:.3f}\
        Class2_Acc: {cl2_acc:.3f}\
        Loss: {loss:.3f}',
    )
    # ____________________________________________________________________________________________________________


In [16]:
# ____________________________________________________________________________________________________________
# ____________________________________________________________________________________________________________
# Helper function to return test metrics
def test_model():
    # Instanciate metric's variables
    test_loss, total = 0, 0
    class_acc1, class_acc2, tot1, tot2 = 0, 0, 0, 0
    domain_acc1, domain_acc2= 0, 0

    # Test parameters
    len_dataloader = min(len(x_dev1), len(x_dev2))
    data_source_iter = iter(x_dev1)
    data_target_iter = iter(x_dev2)
    
    # Iterate dataloader
    for i in tqdm(range(len_dataloader)):
        model.eval()
        # Calculate Alpha
        p = float(i + epoch * len_dataloader) / epochs / len_dataloader
        alpha = 2. / (1. + np.exp(-10 * p)) - 1
        
        # Reset gradient
        optimizer.zero_grad()
        
        # Run model
        cl_a1, d_a1, t1, cl1, dl1 = model_metrics(data_source_iter, alpha)
        cl_a2, d_a2, t2, cl2, dl2 = model_metrics(data_target_iter, alpha)

        # Metrics
        class_acc1 += cl_a1.cpu().numpy()
        class_acc2 += cl_a2.cpu().numpy()
        domain_acc1 += d_a1.cpu().numpy()
        domain_acc2 += d_a2.cpu().numpy()
        tot1 += t1
        tot2 += t2
        loss = cl1 + dl2
    
        # Metrics
        test_loss += loss.item()

    # Print results
    d_acc = (domain_acc1 + domain_acc2)/(tot1 + tot2)
    cl1_acc = class_acc1/ tot1
    cl2_acc = class_acc2/tot2
    loss = test_loss/len_dataloader
    
    tqdm.write(
        f'Domain_Acc: {d_acc:.3f}\
        Class1_Acc: {cl1_acc:.3f}\
        Class2_Acc: {cl2_acc:.3f}\
        Loss: {loss:.3f}',
    )
    # ____________________________________________________________________________________________________________


In [40]:
from tqdm import tqdm
from sklearn.metrics import f1_score
from tqdm import tqdm, tqdm_notebook # show progress bar

# Epochs
epochs = 5
train_loss, valid_loss1, valid_loss2 = [], [], []
print("Training BiLSTM network model!")

for epoch in range(epochs):
    print('epoch: %d'% (epoch))
    train_model()
    test_model()

Training BiLSTM network model!
epoch: 0


100%|██████████| 125/125 [00:26<00:00,  4.68it/s]


Domain_Acc: 0.539        Class1_Acc: 0.862        Class2_Acc: 0.863        Loss: 1.048


100%|██████████| 32/32 [00:02<00:00, 11.22it/s]


Domain_Acc: 0.585        Class1_Acc: 0.761        Class2_Acc: 0.688        Loss: 2.029
epoch: 1


100%|██████████| 125/125 [00:26<00:00,  4.69it/s]


Domain_Acc: 0.527        Class1_Acc: 0.878        Class2_Acc: 0.880        Loss: 0.831


100%|██████████| 32/32 [00:02<00:00, 11.11it/s]


Domain_Acc: 0.579        Class1_Acc: 0.770        Class2_Acc: 0.689        Loss: 2.143
epoch: 2


100%|██████████| 125/125 [00:26<00:00,  4.65it/s]


Domain_Acc: 0.536        Class1_Acc: 0.886        Class2_Acc: 0.884        Loss: 0.736


100%|██████████| 32/32 [00:02<00:00, 11.35it/s]


Domain_Acc: 0.622        Class1_Acc: 0.766        Class2_Acc: 0.678        Loss: 2.304
epoch: 3


100%|██████████| 125/125 [00:26<00:00,  4.75it/s]


Domain_Acc: 0.541        Class1_Acc: 0.889        Class2_Acc: 0.882        Loss: 0.717


100%|██████████| 32/32 [00:02<00:00, 11.73it/s]


Domain_Acc: 0.589        Class1_Acc: 0.740        Class2_Acc: 0.679        Loss: 2.438
epoch: 4


100%|██████████| 125/125 [00:26<00:00,  4.69it/s]


Domain_Acc: 0.546        Class1_Acc: 0.893        Class2_Acc: 0.888        Loss: 0.693


100%|██████████| 32/32 [00:02<00:00, 11.34it/s]

Domain_Acc: 0.605        Class1_Acc: 0.771        Class2_Acc: 0.688        Loss: 2.511





In [63]:
def predict(dl, ln):
    with torch.no_grad():  # Disable gradient computation for efficiency
        preds, acc, test_acc = [], 0, 0
        for X, y, text_len, domain in dl:
            # Dev data
            pred, domain_output = model(X, text_len, 1)
            preds.extend((pred>=0.5).int().detach().cpu().numpy().reshape(-1))
    return preds

In [64]:
# Join data in both domains for trating them jointly (Augmentation)
dev_set = pd.concat([df1_dev, df2_dev]).reset_index(drop=True)
train_set = pd.concat([df1_train, df2_train]).reset_index(drop=True)

# Train and dev sets
x_tr, y_train = train_set['text'], train_set['label']
x_dev, y_dev = dev_set['text'], dev_set['label']

# Print classes proportion
print(round(train_set['label'].value_counts()/len(train_set['label']),2))

label
0    0.81
1    0.19
Name: count, dtype: float64


In [65]:
# Create datasets
train_DS = Dataset(train_set['text'], train_set['label'], train_set['domain'])
dev_DS = Dataset(dev_set['text'], dev_set['label'], dev_set['domain'])

# Sample train data
sampler_tr = torch.utils.data.WeightedRandomSampler(weights(train_set), num_samples=len(train_DS), replacement=True)

# Create dataloaders
bs = 32
x_tr_ = DataLoader(train_DS, batch_size=bs, collate_fn=collate_batch, sampler=sampler_tr)
x_ts_ = DataLoader(dev_DS, batch_size=bs, collate_fn=collate_batch)

In [73]:
tr_pr = predict(x_tr_, train_set.shape[0])
sum(tr_pr == y_train)/len(y_train)

0.497125

In [74]:
dev_pr = predict(x_ts_, dev_set.shape[0])

In [77]:
from sklearn.metrics import roc_auc_score
print(f'Accuracy: {sum(dev_pr == y_dev)/len(y_dev):.3f}\
        F1 Score: {f1_score(y_dev, dev_pr):.3f}\
        ROC_AUC: {roc_auc_score(y_dev, dev_pr):.3f}')

Accuracy: 0.730        F1 Score: 0.674        ROC_AUC: 0.730


# Predict on sample data

In [48]:
test = pd.read_json('../Data/test_data.json', lines=True)['text']
test = [[t if t != 0 else 1 for t in ls] for ls in test]

In [59]:
test_DS = Dataset(test, [1]*len(test), [1]*len(test))
x_tst = DataLoader(test_DS, batch_size=bs, collate_fn=collate_batch)
preds, acc, test_acc = [], 0, 0
for X, y, text_len, domain in x_tst:
    # Dev data
    pred, domain_output = model(X, text_len, 1)
    preds.extend((pred>=0.5).int().detach().cpu().numpy().reshape(-1))

In [61]:
test_df = pd.DataFrame(columns = ['id', 'value'])
for idx, v in enumerate(preds):
    test_df.loc[idx] = [idx, preds[idx]]
test_df.to_csv('../Data/predictions.csv', index=False)

In [62]:
test_df['value'].value_counts()

value
0    2605
1    1395
Name: count, dtype: int64