In [1]:
import json
import os
import random

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.autograd as autograd
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score

import torchtext
from torchtext.data import get_tokenizer
from torchtext import data, datasets

from tqdm import tqdm, trange

from POS_utils import *

In [2]:
# Dataset path
TrustPilot_processed_dataset_path = "..//dataset//TrustPilot_processed//"

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Checkpoint directory, ending in the platform's path separator so that
# `mode_save_path + model_name` stays a valid path on every OS
# (this evaluates to 'Models\\' on Windows, 'Models/' elsewhere).
mode_save_path = os.path.join('Models', '')
# torch.save does not create missing directories, so make sure it exists up front.
os.makedirs(mode_save_path, exist_ok=True)
# Checkpoint file-name template; the placeholder is filled with the run number.
model_name = 'baseline_both_{}.pt'

In [3]:
####################################
#         Hyper-parameters         #
####################################
BATCH_SIZE = 64          # examples per mini-batch
LEARNING_RATE = 1e-3     # step size handed to the optimizer
EMBEDDING_DIM = 100      # must match the width of the pretrained vectors
HIDDEN_DIM = 256         # LSTM hidden size
N_LAYERS = 2             # stacked LSTM layers
BIDIRECTIONAL = True
# NOTE(review): this constant used to be spelled `DROUPOUT` with the value 0.5
# and was never read anywhere. The model is actually constructed with
# DROPOUT = 0.25 (assigned again in the parameters cell), so the name is
# corrected and the value aligned with what is really used.
DROPOUT = 0.25
NUM_EPOCHS = 50          # full passes over the training split

In [4]:
####################################
#          Preparing Data          #
####################################
SEED = 25042020

# SEED was previously defined but never applied. Seed every random number
# generator imported by this notebook so a Restart & Run All repeats the
# same run (e.g. the `unk_init = torch.Tensor.normal_` draws used when the
# text vocabulary is built, and the model's weight initialisation).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN for deterministic kernels; has no effect on CPU-only runs.
torch.backends.cudnn.deterministic = True


# 1. data.Field()
# TEXT holds the lower-cased token sequence, TAG_LABEL the per-token tag
# sequence (no unknown-token entry), and the two LabelFields hold one
# age / gender label per example.
TEXT = data.Field(lower = True)
TAG_LABEL = data.Field(unk_token = None)
AGE_LABEL = data.LabelField()
GENDER_LABEL = data.LabelField()

In [5]:
# Map every JSON key to an (attribute name, Field) pair. The attribute name
# on each example is kept identical to the JSON key.
fields = {
    key: (key, field)
    for key, field in (
        ('text', TEXT),
        ('tag_label', TAG_LABEL),
        ('age_label', AGE_LABEL),
        ('gender_label', GENDER_LABEL),
    )
}

In [6]:
# train, val, test
# 2. data.TabularDataset
# Read the three JSON-lines split files from the processed dataset folder,
# parsing every record according to the `fields` mapping.
train_data, valid_data, test_data = data.TabularDataset.splits(
    path=TrustPilot_processed_dataset_path,
    format="json",
    fields=fields,
    train="train.jsonl",
    validation="valid.jsonl",
    test="test.jsonl",
)

In [7]:
# Report the number of examples in each split, one per line. The last
# message keeps its own trailing newline so the output ends with a blank line.
print(
    "Number of train_data = {}".format(len(train_data)),
    "Number of valid_data = {}".format(len(valid_data)),
    "Number of test_data = {}\n".format(len(test_data)),
    sep="\n",
)

Number of train_data = 486
Number of valid_data = 54
Number of test_data = 60



In [8]:
# 4. data.BucketIterator
# One iterator per split, all sharing the same batch size and device.
# The sort key is the number of tokens in an example's text.
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key=lambda example: len(example.text),
    batch_size=BATCH_SIZE,
    device=device,
)

# 5. Build vocab
# Label vocabularies are built from the training split only; the text
# vocabulary is built separately in the next cell.
TAG_LABEL.build_vocab(train_data)
GENDER_LABEL.build_vocab(train_data)
AGE_LABEL.build_vocab(train_data)


In [9]:
# Minimum-frequency threshold passed to build_vocab for the text vocabulary.
MIN_FREQ = 2

# Build the text vocabulary from the training split and attach the
# 100-dimensional GloVe vectors (same width as EMBEDDING_DIM). Vocabulary
# entries without a pretrained vector are initialised by torch.Tensor.normal_.
TEXT.build_vocab(
    train_data,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
    min_freq=MIN_FREQ,
)

In [10]:
# Parameters
# Sizes derived from the vocabularies built in the previous cells.
INPUT_DIM = len(TEXT.vocab)
TAG_OUTPUT_DIM = len(TAG_LABEL.vocab)
# NOTE(review): the age / gender output sizes are not read by any cell
# visible in this notebook.
AGE_OUTPUT_DIM = len(AGE_LABEL.vocab)
GENDER_OUTPUT_DIM = len(GENDER_LABEL.vocab)
# N_LAYERS and BIDIRECTIONAL repeat the values assigned in the
# hyper-parameter cell.
N_LAYERS = 2
BIDIRECTIONAL = True
# Dropout value handed to the model constructor in the next cell.
DROPOUT = 0.25
# Index of the text field's padding token in the text vocabulary.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

In [11]:
# Instantiate the tagger (class provided by POS_utils). Arguments are
# positional, in the order: vocabulary size, embedding width, hidden size,
# number of tags, LSTM depth, bidirectionality, dropout, text padding index.
model = BiLSTMPOSTagger(
    INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, TAG_OUTPUT_DIM,
    N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX,
)

# Run init_weights (from POS_utils) over every sub-module; apply() returns
# the model, so the cell displays its architecture.
model.apply(init_weights)

BiLSTMPOSTagger(
  (embedding): Embedding(766, 100, padding_idx=1)
  (lstm): LSTM(100, 256, num_layers=2, dropout=0.25, bidirectional=True)
  (fc): Linear(in_features=512, out_features=13, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)

In [12]:
# Report the value returned by count_parameters (from POS_utils) for the
# model, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,393,413 trainable parameters


In [13]:
# Vectors attached to the text vocabulary by build_vocab(vectors=...):
# one row per vocabulary entry.
pretrained_embeddings = TEXT.vocab.vectors

# Sanity check: the shape should be (len(TEXT.vocab), EMBEDDING_DIM) so it
# can be copied into the embedding layer.
print(pretrained_embeddings.shape)

torch.Size([766, 100])


In [14]:
# Overwrite the embedding layer's weights in place with the pretrained
# vectors; copy_ returns the updated tensor, which the cell displays.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 1.3783, -0.7497, -2.1660,  ...,  0.6147, -0.0183,  0.0452],
        [ 0.6506, -1.2298,  0.1797,  ..., -0.8687, -0.2222,  0.3094],
        [-0.3398,  0.2094,  0.4635,  ..., -0.2339,  0.4730, -0.0288],
        ...,
        [-0.1194,  0.5419,  0.6217,  ..., -0.3805, -0.0422,  0.3516],
        [ 0.3576,  0.0981, -0.1268,  ...,  0.2066, -0.1067, -0.4127],
        [-0.0263,  0.0179, -0.5016,  ..., -0.8688,  0.9409, -0.2882]])

In [15]:

# The copy above also replaced the padding row with a non-zero vector;
# reset that row to all zeros.
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

# Show the weights to confirm the row at PAD_IDX is now zero.
print(model.embedding.weight.data)

tensor([[ 1.3783, -0.7497, -2.1660,  ...,  0.6147, -0.0183,  0.0452],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.3398,  0.2094,  0.4635,  ..., -0.2339,  0.4730, -0.0288],
        ...,
        [-0.1194,  0.5419,  0.6217,  ..., -0.3805, -0.0422,  0.3516],
        [ 0.3576,  0.0981, -0.1268,  ...,  0.2066, -0.1067, -0.4127],
        [-0.0263,  0.0179, -0.5016,  ..., -0.8688,  0.9409, -0.2882]])


In [16]:
# Move the model to its target device BEFORE building the optimizer, as the
# torch.optim documentation recommends, so the optimizer is constructed over
# parameters that already live where they will be trained. The later
# `model.to(device)` call then has nothing left to move.
model = model.to(device)

# Adam over every model parameter, using the configured learning rate.
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [17]:
# Index of the tag field's padding token in the tag vocabulary.
TAG_PAD_IDX = TAG_LABEL.vocab.stoi[TAG_LABEL.pad_token]

# Cross-entropy over tag classes; positions whose target is the padding
# index are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

In [18]:
# Place both the network and the loss module on the selected device.
model, criterion = model.to(device), criterion.to(device)

In [19]:
def train(model, iterator, optimizer, criterion, tag_pad_idx):
    """Run one optimisation pass over ``iterator``.

    Args:
        model: network called as ``model(batch.text)``; put into training mode.
        iterator: sized iterable of batches exposing ``text`` and ``tag_label``.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(predictions, tags)`` on the
            flattened tensors.
        tag_pad_idx: padding tag index forwarded to ``categorical_accuracy``.

    Returns:
        ``(mean_loss, mean_accuracy)``: per-batch values averaged over the
        number of batches, ``len(iterator)``.

    Raises:
        ZeroDivisionError: if ``len(iterator)`` is 0.
    """
    model.train()

    running_loss = 0
    running_acc = 0

    for batch in iterator:
        optimizer.zero_grad()

        # batch.text  = [sent len, batch size]
        # raw_output  = [sent len, batch size, output dim]
        raw_output = model(batch.text)

        # Merge the sentence and batch axes so each row is one token:
        # flat_output = [sent len * batch size, output dim]
        # flat_tags   = [sent len * batch size]
        flat_output = raw_output.view(-1, raw_output.shape[-1])
        flat_tags = batch.tag_label.view(-1)

        loss = criterion(flat_output, flat_tags)
        acc = categorical_accuracy(flat_output, flat_tags, tag_pad_idx)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [20]:

def evaluate(model, iterator, criterion, tag_pad_idx):
    """Score ``model`` on ``iterator`` without updating any weights.

    Args:
        model: network called as ``model(batch.text)``; put into eval mode.
        iterator: sized iterable of batches exposing ``text`` and ``tag_label``.
        criterion: loss called as ``criterion(predictions, tags)`` on the
            flattened tensors.
        tag_pad_idx: padding tag index forwarded to ``categorical_accuracy``.

    Returns:
        ``(mean_loss, mean_accuracy)``: per-batch values averaged over the
        number of batches, ``len(iterator)``.

    Raises:
        ZeroDivisionError: if ``len(iterator)`` is 0.
    """
    model.eval()

    running_loss = 0
    running_acc = 0

    # Gradient tracking is switched off for the whole pass.
    with torch.no_grad():
        for batch in iterator:
            raw_output = model(batch.text)

            # One row per token: merge the sentence and batch axes.
            flat_output = raw_output.view(-1, raw_output.shape[-1])
            flat_tags = batch.tag_label.view(-1)

            loss = criterion(flat_output, flat_tags)
            acc = categorical_accuracy(flat_output, flat_tags, tag_pad_idx)

            running_loss += loss.item()
            running_acc += acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [22]:
# Index of the current run; only used to number the checkpoint file.
repeated_time = 0

# Track the lowest validation loss seen so far and the (0-based) epoch
# that produced it.
best_valid_loss = float('inf')
best_epoch = -1

saved_model = mode_save_path+model_name.format(repeated_time+1)

for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train(model, train_iter, optimizer, criterion, TAG_PAD_IDX)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion, TAG_PAD_IDX)
    # Checkpoint only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        best_epoch = epoch
        torch.save(model.state_dict(), saved_model)

# If no epoch ever improved (e.g. every validation loss was NaN, or
# NUM_EPOCHS is 0) nothing was saved during this run; loading the path
# anyway would either fail or silently pick up a stale file from an
# earlier run.
if best_epoch < 0:
    raise RuntimeError(
        "No checkpoint was written during this run: the validation loss never improved."
    )

# Restore the best weights before measuring test-set performance.
model.load_state_dict(torch.load(saved_model))
# best_epoch is an integer, so it is formatted with `d`; the previous `{:3f}`
# rendered it as a float ("20.000000") and the label was misspelled "Bset".
print("Best epoch: {:3d}".format(best_epoch))
# Kept as the last expression so the cell still displays its return value.
evaluate_bias(model, test_iter, TAG_PAD_IDX)

Bset epoch: 20.000000
Overall
Accuracy : 0.886364
F Score : 0.866469
Age group
Accuracy Under35: 0.869565 V.S. Over45: 0.911290
F Score Under35: 0.853551 V.S. Over45: 0.888278
Gender group
Accuracy Female: 0.880266 V.S. Male: 0.889885
F Score Female: 0.865868 V.S. Male: 0.858521


0