# Import requirements

In [None]:
# Install the third-party packages imported below, then authenticate the wandb CLI.
# The recorded run of this cell resolved: transformers 4.20.1, nlpaug 1.1.11,
# sacremoses 0.0.53, wandb 0.12.21, sentencepiece 0.1.96.
! pip install transformers nlpaug sacremoses wandb sentencepiece
! wandb login

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 15.5 MB/s 
[?25hCollecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[K     |████████████████████████████████| 410 kB 86.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 83.1 MB/s 
[?25hCollecting wandb
  Downloading wandb-0.12.21-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 62.1 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 88.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 

In [None]:
import os
from google.colab import drive
# Mount Google Drive inside the Colab VM.
drive.mount('/content/drive')
# Make the project folder the working directory; later cells open data files by bare name.
# NOTE(review): this path matches one specific Drive layout — adjust it before running elsewhere.
os.chdir(os.path.join('/', 'content', 'drive', 'My Drive', 'goorm K-Digital', '자연어처리', 'Project1'))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pdb
import argparse
import random
from dataclasses import dataclass, field
from typing import Optional
from collections import defaultdict

import torch
from torch.nn.utils.rnn import pad_sequence

import numpy as np
import pandas as pd
from tqdm import tqdm, trange

from transformers import (
    DebertaV2ForSequenceClassification,
    DebertaV2TokenizerFast,
    AdamW,
    get_linear_schedule_with_warmup
)

import nlpaug.augmenter.word as naw

import wandb

# One fixed seed shared by the Python, NumPy and PyTorch random number generators.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Kept as the final expression so the cell still displays the returned generator.
torch.manual_seed(random_seed)

  _resample_loop_p(x, t_out, interp_win, interp_delta, num_table, scale, y)


<torch._C.Generator at 0x7f5323fc37b0>

In [None]:
class ProjectDataLoader(object):
    """Load line-based text datasets and optionally de-duplicate or augment them.

    Every path in ``file_list`` is read as UTF-8 with one sentence per line.
    File names are expected to end in ``<data_type>.<label>`` (for example
    ``sentiment.train.0``); :meth:`get_info_from_name` parses those two
    trailing components.

    Attributes:
        file_list: The paths handed to the constructor.
        dataset: Maps each path to its list of stripped sentences.
        augmenter: List of nlpaug augmenters, or ``None`` when ``aug`` is falsy.
    """

    def __init__(self, file_list, aug=None):
        """Read all files immediately; load the augmenters only when ``aug`` is truthy."""
        self.file_list = file_list
        self.dataset = {}
        self.load()
        if aug:
            print(f'Wait for loading augmenter...')
            self.augmenter = self.load_augmenter()
        else:
            self.augmenter = None

    def load(self):
        """Populate ``self.dataset`` with the stripped lines of every listed file.

        Raises:
            ValueError: If a file name's trailing label component is not an integer.
        """
        print(f'Load datasets from {self.file_list}')
        for file_path in self.file_list:
            _, label = self.get_info_from_name(file_path)
            # Fail fast on a file name whose label suffix is not an integer.
            int(label)
            with open(file_path, 'r', encoding='utf-8') as f:
                self.dataset[file_path] = [line.strip() for line in f]

    def load_augmenter(self):
        """Build the list of nlpaug word-level augmenters used by :meth:`augment`.

        The model-backed augmenters run on the GPU when CUDA is available and
        on the CPU otherwise, instead of unconditionally requiring CUDA.
        """
        aug_device = 'cuda' if torch.cuda.is_available() else 'cpu'
        augmenter = [
            naw.BackTranslationAug(
                from_model_name='facebook/wmt19-en-de',
                to_model_name='facebook/wmt19-de-en',
                device=aug_device
            ),
            naw.ContextualWordEmbsAug(
                model_path='bert-base-uncased', # 'distilbert-base-uncased' or 'roberta-base' are alternatives
                action='insert',
                device=aug_device
            ),
            naw.ContextualWordEmbsAug(
                model_path='bert-base-uncased',
                action='substitute',
                device=aug_device
            ),
            naw.SynonymAug(aug_src='wordnet'),
            naw.RandomWordAug(action='swap')
        ]
        return augmenter

    def get_info_from_name(self, name):
        """Return ``(data_type, label)`` taken from the last two dot-separated parts of ``name``."""
        data_type, label = name.split('.')[-2:]
        return data_type, label

    # Drop duplicated data.
    def drop_duplicated(self, verbose=False):
        """Remove duplicate sentences from every dataset, keeping first-seen order.

        For a dataset whose name contains ``'dev'``, sentences that also occur
        in the matching train dataset (same name with ``'dev'`` replaced by
        ``'train'``) are removed first. If that train dataset was not loaded,
        the overlap check is skipped with a message instead of raising
        ``KeyError``.

        Order is preserved (rather than rebuilt from a ``set``) so the result
        is identical from run to run, which keeps seeded sampling reproducible.

        Args:
            verbose: Currently unused; kept for interface compatibility.
        """
        print('-'*30)
        print('> Drop duplicated data')
        # Iterate over a snapshot because values are replaced while looping.
        for name, dataset in list(self.dataset.items()):
            print('-'*30)
            print(f'{name}')
            if 'dev' in name:
                train_name = name.replace('dev', 'train')
                train_sentences = self.dataset.get(train_name)
                if train_sentences is None:
                    print(f'{train_name} is not loaded; skip train/dev overlap check.')
                else:
                    train_set = set(train_sentences)
                    orig_len = len(dataset)
                    # dict.fromkeys de-duplicates while keeping insertion order.
                    dataset = list(dict.fromkeys(
                        sentence for sentence in dataset if sentence not in train_set))
                    drop_len = len(dataset)
                    self.dataset[name] = dataset
                    print(f'drop duplicated with train: {orig_len:,} -> {drop_len:,}')

            orig_len = len(dataset)
            unique_sentences = list(dict.fromkeys(dataset))
            drop_len = len(unique_sentences)
            num_duplicated = orig_len - drop_len

            print(f'duplicated : total / {num_duplicated:,} : {orig_len:,}')
            self.dataset[name] = unique_sentences
            print(f'{len(self.dataset[name]):,} sentences exist in {name}.')

    # Data augmentation
    def augment(self, ratio, test=False):
        """Append augmented sentences to every ``train`` dataset.

        ``ratio`` times the dataset size is split evenly between the loaded
        augmenters; each augmenter receives its own random sample of sentences.

        Args:
            ratio: Fraction of each train dataset to augment.
            test: When true, also return ``(augmenter_index, originals, augmented)``
                tuples for inspection.

        Returns:
            The list of inspection tuples when ``test`` is true, otherwise ``None``.
            ``None`` is also returned when no augmenter is loaded.
        """
        if not self.augmenter:
            print(f'Augmenter is not exist.')
            return

        result = []
        num_augmenters = len(self.augmenter)
        for name, dataset in self.dataset.items():
            orig_len = len(dataset)
            data_type, _ = self.get_info_from_name(name)
            if data_type != 'train':
                continue
            print('-'*30)
            print(f'{name}')

            num_aug_per = int(orig_len * ratio / num_augmenters)
            print(f'{num_aug_per * num_augmenters:,} augmentated data will be added.')
            if num_aug_per <= 0:
                # Nothing to sample; list_chunk cannot build chunks of size zero.
                continue

            sampled_sentences = random.sample(dataset, num_aug_per * num_augmenters)
            # Exactly one chunk per augmenter.
            sampled_sentences = self.list_chunk(sampled_sentences, num_aug_per)
            aug_sentences = []
            for idx, sentences in tqdm(enumerate(sampled_sentences), total=len(sampled_sentences), desc='augmentation'):
                aug_sentence = self.augmenter[idx].augment(sentences)
                aug_sentences += aug_sentence
                if test:
                    result.append((idx, sentences, aug_sentence))

            self.dataset[name] += aug_sentences
            print(f'total: {orig_len:,} -> {len(self.dataset[name]):,}')
        if test:
            return result

    def make_id_file(self, name, tokenizer):
        """Tokenize the dataset stored under ``name``.

        Each sentence is lower-cased, passed to ``tokenizer.encode`` and the
        resulting ids are joined into one space-separated string.

        Returns:
            A list with one id string per sentence.
        """
        print(f'tokenizing {name}')
        id_file_data = [tokenizer.encode(line.lower()) for line in self.dataset[name]]
        return [' '.join(str(k) for k in item) for item in id_file_data]

    def list_chunk(self, arr, n):
        """Split ``arr`` into consecutive chunks of length ``n`` (the last may be shorter)."""
        return [arr[i: i + n] for i in range(0, len(arr), n)]

    def __getitem__(self, idx):
        """Return the sentence list stored under dataset name ``idx``."""
        return self.dataset[idx]

    def summary(self):
        """Print the sentence count, dataset type and label of every loaded dataset."""
        print('-'*30)
        print('> Smmary')
        for name, dataset in self.dataset.items():
            data_type, label = self.get_info_from_name(name)
            num_of_sentences = len(dataset)
            print('-'*30)
            print(f'[{name}]')
            print(f'number of sentences: {num_of_sentences:,}')
            print(f'dataset type: {data_type}')
            print(f'label: {label}')

class SentimentDataset(object):
    """Indexable dataset of token-id sequences with binary sentiment labels.

    ``pos`` and ``neg`` are iterables of strings holding whitespace-separated
    integer token ids. Positive samples are stored first with label ``[1]``,
    followed by negative samples with label ``[0]``.
    """

    def __init__(self, tokenizer, pos, neg):
        self.tokenizer = tokenizer
        self.data = []
        self.label = []

        # Positives first, then negatives, each paired with its class tag.
        for sentences, tag in ((pos, 1), (neg, 0)):
            for sentence in sentences:
                self.data.append(self._cast_to_int(sentence.strip().split()))
                self.label.append([tag])

    def _cast_to_int(self, sample):
        """Convert a sequence of id strings into a list of ints."""
        return list(map(int, sample))

    def __len__(self):
        """Number of stored samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for ``index`` as NumPy arrays."""
        token_ids = np.array(self.data[index])
        label = np.array(self.label[index])
        return token_ids, label

def compute_acc(predictions, target_labels):
    """Return the fraction of positions where ``predictions`` equals ``target_labels``."""
    predicted = np.array(predictions)
    expected = np.array(target_labels)
    hits = predicted == expected
    return hits.mean()

class WeightedFocalLoss(torch.nn.Module):
    """Alpha-weighted binary focal loss computed from raw logits.

    Args:
        alpha: Weight applied to samples with target 0; samples with target 1
            are weighted by ``1 - alpha``.
        gamma: Focusing exponent applied to ``(1 - pt)``.
    """

    def __init__(self, alpha=.25, gamma=2):
        super(WeightedFocalLoss, self).__init__()
        # Registered as a non-persistent buffer: it follows the module across
        # devices without requiring CUDA at construction time and without
        # adding a key to the state_dict.
        self.register_buffer('alpha', torch.tensor([alpha, 1-alpha]), persistent=False)
        self.gamma = gamma

    def forward(self, inputs, targets):
        """Return the mean focal loss.

        Args:
            inputs: Raw logits.
            targets: Float tensor of 0/1 targets with the same shape as ``inputs``.
        """
        BCE_loss = torch.nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        class_index = targets.type(torch.long).reshape(-1)
        # Follow the inputs' device so the loss also works when the module
        # itself was never moved with .to(device).
        alpha = self.alpha.to(inputs.device)
        # Reshape the per-sample weights to the loss shape; a flat weight vector
        # would broadcast (N,) against (N, 1) into an (N, N) matrix.
        at = alpha.gather(0, class_index).reshape(BCE_loss.shape)
        pt = torch.exp(-BCE_loss)
        F_loss = at*(1-pt)**self.gamma * BCE_loss
        return F_loss.mean()

In [None]:
# Dataset files named <prefix>.<data_type>.<label>, opened relative to the working directory.
file_list = ['sentiment.train.0', 
             'sentiment.train.1', 
             'sentiment.dev.0', 
             'sentiment.dev.1']

# Pre-processing switches; they are also recorded in the wandb run config in a later cell.
duplication_check = True
data_augmentation = True
aug_ratio = 0.4

# aug=False skips loading the nlpaug models, so no augmenter is available on this loader.
datasets = ProjectDataLoader(file_list, aug=False)
datasets.summary()
if duplication_check:
    datasets.drop_duplicated()
# On-the-fly augmentation is disabled here (kept for reference):
# if data_augmentation:
#     aug_sentences = datasets.augment(aug_ratio, True)  

Load datasets from ['sentiment.train.0', 'sentiment.train.1', 'sentiment.dev.0', 'sentiment.dev.1']
------------------------------
> Smmary
------------------------------
[sentiment.train.0]
number of sentences: 177,218
dataset type: train
label: 0
------------------------------
[sentiment.train.1]
number of sentences: 266,041
dataset type: train
label: 1
------------------------------
[sentiment.dev.0]
number of sentences: 2,000
dataset type: dev
label: 0
------------------------------
[sentiment.dev.1]
number of sentences: 2,000
dataset type: dev
label: 1
------------------------------
> Drop duplicated data
------------------------------
sentiment.train.0
duplicated : total / 19,437 : 177,218
157,781 sentences exist in sentiment.train.0.
------------------------------
sentiment.train.1
duplicated : total / 43,167 : 266,041
222,874 sentences exist in sentiment.train.1.
------------------------------
sentiment.dev.0
drop duplicated with train: 2,000 -> 1,726
duplicated : total / 0 : 1

In [None]:
# Load the pretrained DeBERTa-v3-base checkpoint with a single-output classification head.
# NOTE(review): with num_labels=1 and float targets the head presumably behaves as a regressor — confirm.
model = DebertaV2ForSequenceClassification.from_pretrained('microsoft/deberta-v3-base', num_labels=1)
model.to(device)
# Put the model in training mode.
model.train()

# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = DebertaV2TokenizerFast.from_pretrained('microsoft/deberta-v3-base', do_lower_case=True)

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

In [None]:
# Optimizer hyperparameters.
learning_rate = 5e-5
# NOTE(review): the hyperparameter notes elsewhere in this notebook list b2=0.98 — confirm 0.99 is intended.
betas = [0.9, 0.99]
eps = 1e-6
apply_scheduler = True
weight_decay = 0.01 # According to the paper, the appropriate weight_decay is 0.01
optimizer = AdamW(model.parameters(), betas=betas, eps=eps, weight_decay=weight_decay, lr=learning_rate) 
train_epoch = 3
train_batch_size = 32
eval_batch_size = 32
# Total optimizer steps = batches per epoch * number of epochs (np.ceil returns a float).
# NOTE(review): the count comes from `datasets`, but a later cell replaces train_pos / train_neg
# with pickled data; if those sizes differ, the schedule length will not match the real number
# of training steps — confirm.
total_train_step = np.ceil((len(datasets['sentiment.train.0']) + len(datasets['sentiment.train.1'])) / train_batch_size) * train_epoch
if apply_scheduler:
    # Linear schedule with 1000 warmup steps over total_train_step steps.
    scheduler = get_linear_schedule_with_warmup(optimizer, 1000, total_train_step)

# Run name that encodes the main settings; also used for the checkpoint file names.
training_name = f'dc_{duplication_check}-lr{learning_rate}-sc{apply_scheduler}-wd{weight_decay}-bs{train_batch_size}-da{aug_ratio}'
# Start a Weights & Biases run and record the configuration.
wandb.init(
    entity='team_koowater',
    project='DeBERTaV3',
    name=training_name,
    config={
        'model': model.__class__.__name__,
        'learning_rate': learning_rate,
        'optimizer': optimizer.__class__.__name__,
        'betas': betas,
        'eps': eps,
        'weight_decay': weight_decay,
        'scheduler': apply_scheduler,
        'duplicated_check': duplication_check,
        'data_augmentation': data_augmentation,
        'aug_ratio': aug_ratio,
        'train_epoch': train_epoch,
        'train_batch_size': train_batch_size,
        'val_batch_size': eval_batch_size
})

[34m[1mwandb[0m: Currently logged in as: [33mkoowater[0m ([33mteam_koowater[0m). Use [1m`wandb login --relogin`[0m to force relogin


# 1. Preprocess

## 고려하지 않는 augmentation

- character-level 

## 적용 예정 augmentation

- BackTranslationAug
- `WordEmbsAug`: word2vec에 따라, 문장 내 선택된 특정 단어와 유사한 단어를 선택해 교체한다.
- `TfIdfAug`: TF-IDF 유사도에 따라 단어를 삽입한다.
- `ContextualWordEmbsAug`: (BERT, DistilBERT, RoBERTa, XLNet) 등 LM의 contextual word embeddings에 따라 단어를 삽입 또는 교체한다. 
- `SynonymAug`: (WordNet, PPDB)의 동의어에 따라 단어를 교체한다.

## 고려해야 할 augmentation

- `RandomWordAug(swap)`: 문장 내 두 단어의 순서를 교체한다.
- `RandomWordAug(delete)`: 문장 내 단어들을 무작위로 삭제한다.
- `RandomWordAug(crop)`: 문장 내 단어 덩어리를 무작위로 삭제한다.


In [None]:
# Tokenize every split into space-separated token-id strings (one string per sentence).
# NOTE(review): train_pos / train_neg are reassigned from pickle files in a later cell.
train_pos = datasets.make_id_file('sentiment.train.1', tokenizer)
train_neg = datasets.make_id_file('sentiment.train.0', tokenizer)
val_pos = datasets.make_id_file('sentiment.dev.1', tokenizer)
val_neg = datasets.make_id_file('sentiment.dev.0', tokenizer)

tokenizing sentiment.train.1
tokenizing sentiment.train.0
tokenizing sentiment.dev.1
tokenizing sentiment.dev.0


In [None]:
import pickle

In [None]:
# with open("sentiment_aug_4.train.1", "wb") as fw:
#     pickle.dump(train_pos, fw)
# with open("sentiment_aug_4.train.0", "wb") as fw:
#     pickle.dump(train_neg, fw)
# with open("sentiment_aug.dev.1", "wb") as fw:
#     pickle.dump(val_pos, fw)
# with open("sentiment_aug.dev.0", "wb") as fw:
#     pickle.dump(val_neg, fw)

In [None]:
# Replace the freshly tokenized training splits with previously pickled ones.
# NOTE(review): pickle.load can execute arbitrary code — only load files produced by this project.
# NOTE(review): judging by the file names these hold augmented data — confirm their contents.
with open("sentiment_aug_4.train.1", "rb") as fr:
    train_pos = pickle.load(fr)
with open("sentiment_aug_4.train.0", "rb") as fr:
    train_neg = pickle.load(fr)
# The validation splits are not reloaded; val_pos / val_neg from the tokenization cell are kept.
# with open("sentiment.dev.1", "rb") as fr:
#     val_pos = pickle.load(fr)
# with open("sentiment.dev.0", "rb") as fr:
#     val_neg = pickle.load(fr)

In [None]:
# Wrap the id strings as datasets; positive samples get label 1 and negative samples label 0.
train_dataset = SentimentDataset(tokenizer, train_pos, train_neg)
val_dataset = SentimentDataset(tokenizer, val_pos, val_neg)

In [None]:
def collate_fn_style(samples):
    """Collate ``(token_ids, label)`` samples into padded batch tensors.

    Samples are reordered from longest to shortest sequence. Sequences are
    right-padded with 0 to the longest length in the batch.

    Args:
        samples: Sequence of ``(token_ids, label)`` pairs.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, position_ids, labels)``.
        The attention mask is 1 on real tokens and 0 on padding; token type ids
        are all zeros; every row of position ids counts ``0 .. padded_len - 1``.
    """
    sequences, targets = zip(*samples)
    lengths = [len(sequence) for sequence in sequences]
    longest = max(lengths)
    # Longest-first ordering of the batch.
    order = np.argsort(lengths)[::-1]

    attention_mask = torch.tensor(
        [[1] * lengths[i] + [0] * (longest - lengths[i]) for i in order])
    input_ids = pad_sequence([torch.tensor(sequences[i]) for i in order],
                             batch_first=True)
    # Width of the padded batch; token type and position ids cover the full width.
    padded_len = input_ids.shape[1]
    token_type_ids = torch.tensor([[0] * padded_len for _ in order])
    position_ids = torch.tensor([list(range(padded_len)) for _ in order])
    labels = torch.tensor(np.stack(targets, axis=0)[order])

    return input_ids, attention_mask, token_type_ids, position_ids, labels

### Original paper hyperparameters

- learning rate: 6e-4
- optimizer: AdamW with weight decay, eps=1e-6, b1=0.9, b2=0.98
- batch size: 8k
- Weight decay: 0.01
- Warmup step: 10k
- Learning Rate Decay: Linear

In [None]:
# Training batches are reshuffled every epoch; validation batches keep dataset order.
# Both loaders pad and sort each batch through collate_fn_style.
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=train_batch_size,
                                           shuffle=True, collate_fn=collate_fn_style,
                                           pin_memory=True, num_workers=2)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=eval_batch_size,
                                         shuffle=False, collate_fn=collate_fn_style,
                                         num_workers=2)

In [None]:
# Best validation metrics seen so far; a checkpoint is written whenever one improves.
lowest_valid_loss = 9999.
highest_acc = 0.

# Validate about 20 times per epoch. Clamped to at least 1 so a loader with
# fewer than 20 batches does not trigger a modulo-by-zero.
eval_interval = max(1, len(train_loader) // 20)

for epoch in range(train_epoch):
    with tqdm(train_loader, unit="batch") as tepoch:
        losses = []
        for iteration, (input_ids, attention_mask, token_type_ids, position_ids, labels) in enumerate(tepoch):
            tepoch.set_description(f"Epoch {epoch}")
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            position_ids = position_ids.to(device)
            # Flatten (batch, 1) -> (batch,). Unlike squeeze(), this keeps a 1-D
            # tensor even when the batch holds a single sample.
            labels = labels.reshape(-1)
            labels = labels.to(device, dtype=torch.float)
            optimizer.zero_grad()
            output = model(input_ids=input_ids,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids,
                           position_ids=position_ids,
                           labels=labels)
            logits = output.logits
            loss = output.loss
            losses.append(loss.item())
            loss.backward()

            optimizer.step()
            if apply_scheduler:
                scheduler.step()

            # Show the mean of the losses accumulated since the last validation
            # pass (or the epoch start), not only the latest mini-batch loss.
            tepoch.set_postfix(loss=np.mean(losses))
            if iteration != 0 and iteration % eval_interval == 0:
                # Periodic validation pass over the whole validation loader.
                with torch.no_grad():
                    model.eval()
                    valid_losses = []
                    predictions = []
                    target_labels = []
                    for val_input_ids, val_attention_mask, val_token_type_ids, val_position_ids, val_labels in tqdm(val_loader,
                                                                                                                    desc='Eval',
                                                                                                                    position=1,
                                                                                                                    leave=True):
                        val_input_ids = val_input_ids.to(device)
                        val_attention_mask = val_attention_mask.to(device)
                        val_token_type_ids = val_token_type_ids.to(device)
                        val_position_ids = val_position_ids.to(device)
                        # Same flattening as in training; a 0-d tensor could not be
                        # iterated when collecting the target labels below.
                        val_labels = val_labels.reshape(-1)
                        val_labels = val_labels.to(device, dtype=torch.float)

                        val_output = model(input_ids=val_input_ids,
                                           attention_mask=val_attention_mask,
                                           token_type_ids=val_token_type_ids,
                                           position_ids=val_position_ids,
                                           labels=val_labels)

                        val_logits = val_output.logits
                        valid_losses.append(val_output.loss.item())

                        # The raw model output is thresholded at 0.5.
                        # NOTE(review): presumably valid because the single-output head is
                        # trained against float 0/1 targets — confirm.
                        batch_predictions = [0 if logit < 0.5 else 1 for logit in val_logits]
                        batch_labels = [int(example) for example in val_labels]

                        predictions += batch_predictions
                        target_labels += batch_labels

                acc = compute_acc(predictions, target_labels)
                valid_loss = sum(valid_losses) / len(valid_losses)
                wandb.log({
                    'loss': np.mean(losses),
                    'val_loss': np.mean(valid_losses),
                    'acc': acc,
                    'lr': optimizer.param_groups[0]["lr"]

                })

                # Start a fresh training-loss window and return to training mode.
                losses = []
                model.train()

                if lowest_valid_loss > valid_loss or highest_acc < acc:

                    print(f'Model saved - val_loss: {valid_loss}, acc: {acc}')
                    if lowest_valid_loss > valid_loss:
                        lowest_valid_loss = valid_loss
                        torch.save(model.state_dict(), f"./lowest_val_loss_{training_name}.bin")
                    if highest_acc < acc:
                        highest_acc = acc
                        torch.save(model.state_dict(), f"./highest_acc_{training_name}.bin")

In [None]:
def make_id_file_test(tokenizer, test_dataset):
    """Encode test sentences into space-separated token-id strings.

    Args:
        tokenizer: Object exposing ``encode(text)`` that returns an iterable of ids.
        test_dataset: Iterable of sentence strings; each one is lower-cased
            before encoding.

    Returns:
        A list with one id string per input sentence.
    """
    encoded = [tokenizer.encode(sentence.lower()) for sentence in test_dataset]
    return [' '.join(map(str, token_ids)) for token_ids in encoded]

## Test and save csv file

In [None]:
# Restore fine-tuned weights. map_location remaps the stored tensors onto the
# active device, so a checkpoint saved on a GPU also loads on a CPU-only runtime.
# NOTE(review): this file name is hard-coded and does not follow the
# `training_name` pattern used when checkpoints are saved in this notebook —
# confirm it is the checkpoint you intend to evaluate.
model.load_state_dict(
    torch.load('highest_acc_dc_False-lr5e-05-scFalse-wd0.0-bs128.bin', map_location=device)
    )
# Read the unlabeled test set; the text to classify is taken from the 'Id' column.
test_df = pd.read_csv('test_no_label.csv')
test_dataset = test_df['Id']
test = make_id_file_test(tokenizer, test_dataset)

In [None]:
class SentimentTestDataset(object):
    """Indexable, unlabeled dataset of token-id sequences.

    ``test`` is an iterable of strings holding whitespace-separated integer
    token ids; each string becomes one sample.
    """

    def __init__(self, tokenizer, test):
        self.tokenizer = tokenizer
        self.data = []

        for sentence in test:
            self.data.append(self._cast_to_int(sentence.strip().split()))

    def _cast_to_int(self, sample):
        """Convert a sequence of id strings into a list of ints."""
        return list(map(int, sample))

    def __len__(self):
        """Number of stored samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the token ids at ``index`` as a NumPy array."""
        return np.array(self.data[index])

In [None]:
# Parse the tokenized test strings into integer id sequences.
# Note: this rebinds `test_dataset`, which previously held the raw 'Id' column.
test_dataset = SentimentTestDataset(tokenizer, test)

In [None]:
def collate_fn_style_test(samples):
    """Collate unlabeled token-id sequences into padded batch tensors.

    The incoming sample order is kept (no length sorting), and sequences are
    right-padded with 0 to the longest length in the batch.

    Args:
        samples: List of token-id sequences.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, position_ids)``.
        The attention mask is 1 on real tokens and 0 on padding; token type ids
        are all zeros; every row of position ids counts ``0 .. padded_len - 1``.
    """
    lengths = [len(sequence) for sequence in samples]
    longest = max(lengths)
    attention_mask = torch.tensor(
        [[1] * length + [0] * (longest - length) for length in lengths])
    input_ids = pad_sequence([torch.tensor(sequence) for sequence in samples],
                             batch_first=True)
    # Width of the padded batch; token type and position ids cover the full width.
    padded_len = input_ids.shape[1]
    token_type_ids = torch.tensor([[0] * padded_len for _ in lengths])
    position_ids = torch.tensor([list(range(padded_len)) for _ in lengths])

    return input_ids, attention_mask, token_type_ids, position_ids

In [None]:
test_batch_size = 32
# shuffle=False and the non-sorting collate function keep predictions in the same order as the test rows.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size,
                                          shuffle=False, collate_fn=collate_fn_style_test,
                                          num_workers=2, sampler=None)

In [None]:
# Run inference over the test loader without tracking gradients.
with torch.no_grad():
    model.eval()
    sentences = []  # decoded text of every input row (padding positions included)
    predictions = []  # one 0/1 prediction per test sample, in loader order
    for input_ids, attention_mask, token_type_ids, position_ids in tqdm(test_loader,
                                                                        desc='Val',
                                                                        position=0,
                                                                        leave=True):

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        position_ids = position_ids.to(device)
        output = model(input_ids=input_ids,
                       attention_mask=attention_mask,
                       token_type_ids=token_type_ids,
                       position_ids=position_ids)

        logits = output.logits
        # Threshold the raw model output at 0.5, as in the validation loop.
        batch_predictions = [0 if logit < 0.5 else 1 for logit in logits]
        predictions += batch_predictions

        # Decode the padded id rows back to text.
        for input_id in input_ids:
            sentences.append(tokenizer.decode(input_id))

Val: 100%|██████████| 32/32 [00:02<00:00, 10.82it/s]


In [None]:
# Attach the predictions as the 'Category' column and write the submission file without the index.
test_df['Category'] = predictions
test_df.to_csv('submission.csv', index=False)