In [1]:
# time
from datetime import datetime
notebook_start = datetime.now()

In [2]:
import pandas as pd
import re
import copy
import nltk
import random
import numpy as np
import torch

In [3]:
seed = 111

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

## Read Data

In [4]:
train_original = pd.read_csv("data/train.csv")

train_original = train_original.drop(["url_legal", "license"], axis=1)
train_original = train_original[train_original["standard_error"] != 0]

# print(len(train_original))
# train_original.head()

In [5]:
test_original = pd.read_csv("data/test.csv")

test_original = test_original.drop(["url_legal", "license"], axis=1)

# print(len(test_original))
# test_original.head()

## Preprocessing

In [6]:
# text preprocessing
def preprocessing(x):
    # lower
    x = x.lower()
    
    # tab, line space -> white space
    x = x.replace("\t", " ")
    x = x.replace("\n", " ")
    
    # special characters
    x = re.sub(r"[^a-z0-9!@$%*-_+=&,'.?\"]", " ", x)
    x = x.replace("!", " !")
    x = x.replace(",", " ,")
    x = x.replace(".", " .")
    x = x.replace("?", " ?")
    x = re.sub(" +", " ", x)
    x = x.strip()
    
    return x

In [7]:
train_pp = copy.deepcopy(train_original)
train_pp["excerpt"] = train_pp["excerpt"].apply(preprocessing)

# train_pp.loc[0]["excerpt"]

## Augementation

In [8]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/koo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
data_to_augementation = copy.deepcopy(train_pp)
data_to_augementation["target_"] = train_pp["id"].map(str) + " " \
    + train_pp["target"].map(str) + " " \
    + train_pp["standard_error"].map(str)
data_to_augementation = data_to_augementation[["target_", "excerpt"]]

# data_to_augementation.head()

In [10]:
# save for augmentation
data_to_augementation.to_csv(
    "train_pp_for_augmentation.csv", 
    index=False, header=None, 
    sep="\t",
    encoding="utf-8"
)

In [11]:
# text augmentation using eda-nlp
!python eda/code/augment.py \
    --input=train_pp_for_augmentation.csv \
    --output=output_for_augmentation.csv \
    --num_aug=8 \
    --alpha_sr=0.1 --alpha_rd=0.1 --alpha_ri=0.1 --alpha_rs=0.1

generated augmented sentences with eda for train_pp_for_augmentation.csv to output_for_augmentation.csv with num_aug=8


In [12]:
# read augmented text data & label augmentation
train_aug = []

with open("output_for_augmentation.csv", "r", encoding="utf-8") as f:
    recent_id = None
    
    for line in f:
        label, excerpt = line.split("\t")
        id_, target, standard_error = label.split(" ")
        target_ = float(target)
        standard_error = float(standard_error)

        if recent_id != id_:
            original = train_pp[train_pp["id"] == id_]
            train_aug.append([
                original["id"].item(),
                original["excerpt"].item(),
                original["target"].item(),
                original["standard_error"].item()
            ])
            
            recent_id = id_
        
        min_ = target_ - 1.96*standard_error
        max_ = target_ + 1.96*standard_error
        target = random.uniform(min_, max_)
        
        train_aug.append([id_, excerpt, target, standard_error])

In [13]:
# print(len(train_original))
# print(len(train_aug))

In [14]:
train_aug = pd.DataFrame(
    train_aug, 
    columns=["id", "excerpt", "target", "standard_error"]
)

## Train&Test Split

In [15]:
# split section by ratio of test(0.2)
section = np.linspace(
    min(train_pp["target"]), 
    max(train_pp["target"]),
    int(len(train_pp)*0.2)
)

# print(len(section))

In [16]:
# choose a sample per section
test_id = []

for i in range(len(section)-1):
    start = section[i]
    end = section[i + 1]
    
    candidates = train_pp[train_pp["target"]>=start]
    candidates = train_pp[train_pp["target"]<end]
    
    random_index = random.randrange(0, len(candidates))
    test_id.append(candidates.iloc[random_index]["id"])
    
# len(test_id)

In [17]:
# train, test split
train = train_aug[~train_aug["id"].isin(test_id)]
test = train_aug[train_aug["id"].isin(test_id)]

# print(len(train))
# print(len(test))

In [18]:
# shuffle
train_shuffled = train.sample(frac=1).reset_index(drop=True)
test_shuffled = test.sample(frac=1).reset_index(drop=True)

# print(len(train_shuffled))
# print(len(test_shuffled))

In [19]:
# preparing
train_id = np.array(train_shuffled["id"])
train_X = np.array(train_shuffled["excerpt"])
train_y = np.array(train_shuffled["target"])

test_id = np.array(test_shuffled["id"])
test_X = np.array(test_shuffled["excerpt"])
test_y = np.array(test_shuffled["target"])

## Training 
- Loss Function : RMSE
- Need target normalization
- fine-tuning only
- using Huggingface's transformers library & pytorch
- using pretrained model(bert-base-uncased)

In [20]:
# tokenizer
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [21]:
# print(tokenizer.tokenize(train_X[0]))
# print(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train_X[0])))
# print(tokenizer.encode(train_X[0]))

In [22]:
# encoding

def encoding(excerpts):
    input_ids = []
    attention_mask = []

    for excerpt in excerpts:
        result = tokenizer.encode_plus(
            excerpt,
            add_special_tokens=True,
            max_length=512,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids.append(result["input_ids"])
        attention_mask.append(result["attention_mask"])
        
    input_ids = torch.cat(input_ids, dim=0)
    attention_mask = torch.cat(attention_mask, dim=0)

    return input_ids, attention_mask

In [23]:
# tensor

train_input_ids, train_attention_mask = encoding(train_X)
train_label = torch.reshape(torch.tensor(train_y, dtype=torch.float32), (len(train_y), 1))

test_input_ids, test_attention_mask = encoding(test_X)
test_label = torch.reshape(torch.tensor(test_y, dtype=torch.float32), (len(test_y), 1))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [24]:
# dataset
from torch.utils.data import TensorDataset

train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_label)
test_dataset = TensorDataset(test_input_ids, test_attention_mask, test_label)

In [25]:
# data loader
from torch.utils.data import DataLoader

batch_size = 4
num_workers = 6 # roughly half of cpu cores

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, num_workers=num_workers)

In [26]:
# model
import torch.nn as nn
from transformers import BertModel

class ReadebilityRegressionModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.regression = nn.Sequential(
            nn.Linear(768, 1),
            nn.Dropout(0.2), 
            nn.ReLU()
        )
        
    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state_cls = output.last_hidden_state[:, 0]
        logits = self.regression(last_hidden_state_cls)
        return logits

In [27]:
# initialize model, optimizer, scheduler
from transformers import AdamW, get_linear_schedule_with_warmup

model = ReadebilityRegressionModel()
model.to("cuda")

optimizer = AdamW(model.parameters())

epochs = 5
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=total_steps*0.05, num_training_steps=total_steps)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [28]:
# loss function

class RMSELoss(torch.nn.Module):
    def __init__(self):
        super(RMSELoss,self).__init__()

    def forward(self,x,y):
        mse_loss = nn.MSELoss(reduction='sum')
        eps = 1e-6
        loss = torch.sqrt(mse_loss(x, y) + eps)
        return loss

In [29]:
loss_fn = RMSELoss()

In [30]:
# training

def train(model, train_dataloader, val_dataloader=None, epochs=5, verbose_step=10):
    for i in range(epochs):
        start_datetime = datetime.now()
        print("Start : ", str(start_datetime))
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Time':^9}")
        print("-"*70)
        
        total_loss, batch_loss, batch_counts = 0, 0, 0
        
        model.train()
        
        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            input_ids_, attention_mask_, label_ = tuple(element.to("cuda") for element in batch)
            optimizer.zero_grad()
            
            logits = model(input_ids_, attention_mask_).float()
            
            loss = loss_fn(logits, label_)
            batch_loss += loss.item()
            total_loss += loss.item()
            
            loss.backward()
            
            # gradient clipping
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            
            optimizer.step()
            scheduler.step()
            
            if step == 0 or step % verbose_step == 0 or step == len(train_dataloader) -1:
                middle_datetime = datetime.now()
                print(f"{i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {middle_datetime}")
                
                batch_loss, batch_count = 0, 0
                
        if val_dataloader:
            middle_datetime = datetime.now()
            avg_train_loss = total_loss / len(train_dataloader)
            val_loss, val_accuracy = evaluate(model, val_dataloader)
            
            print(f"{i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f}  {val_loss:^10.6f} | {val_accuracy:^9.2f} | {middle_datetime}")
            print("-"*70)
            print("\n")
    
    end = datetime.now()
    print("Done : ", str(end))
    print("Total time : ", str(end-start))
    
def evaluate(model, val_dataloader):
    model.eval()
    
    val_accuracy = []
    val_loss = []
    
    for batch in val_dataloader:
        input_ids_, attention_mask_, label_ = tuple(element.to("cuda") for element in batch)
        
        with torch.no_grad():
            logits = model(input_ids_, attention_mask_)
            
        loss = loss_fn(logits, label_)
        val_loss.append(loss.item())
        
        preds = torch.argmax(logits, dim=1).flatten()
        accuracy = np.sqrt(np.mean(np.power((preds - label_).cpu().numpy(), 2)))
        val_accuracy.append(accuracy)
        
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)
    
    return val_loss, val_accuracy

In [None]:
train(model, train_dataloader, test_dataloader, epochs=10, verbose_step=100)

Start :  2021-06-24 16:45:28.435873
 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |   Time   
----------------------------------------------------------------------
   1    |    0    |   2.054930   |     -      |     -     | 2021-06-24 16:45:29.078307
   1    |   100   |   2.489528   |     -      |     -     | 2021-06-24 16:45:44.114672
   1    |   200   |   1.299267   |     -      |     -     | 2021-06-24 16:45:59.125547
   1    |   300   |   0.841167   |     -      |     -     | 2021-06-24 16:46:14.129171
   1    |   400   |   0.658022   |     -      |     -     | 2021-06-24 16:46:29.134374
   1    |   500   |   0.491364   |     -      |     -     | 2021-06-24 16:46:44.133756
   1    |   600   |   0.433857   |     -      |     -     | 2021-06-24 16:46:59.154979
   1    |   700   |   0.353234   |     -      |     -     | 2021-06-24 16:47:14.206933
   1    |   800   |   0.329005   |     -      |     -     | 2021-06-24 16:47:29.265986
   1    |   900   |   0.281975   |     -

## Experiment Point
- Is that necessary target normalization?
- HiperParameter & Model tuning
    - augmentation : num_aug, alpha_sr, alpha_rd, alpha_ri, alpha_rs
    - DataLoader : batch_size, num_workers
    - Model : nn.Sequential(...)
    - Initialization : epochs, total_steps, num_warmup_steps

In [None]:
# end
notebook_end = datetime.now()
print("Total Notebook Time : ", str(notebook_end - notebook_start))