In [1]:
from __future__ import print_function

import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.distributed as dist

import numpy as np
import pandas as pd
import re
import operator

import os
import random

from datasets import load_dataset

# Index of the physical GPU this notebook is pinned to. Kept as a string
# because it is written to the environment and appended to checkpoint paths.
Idx = "7"

# Set before any CUDA call is made so that the selection takes effect:
# enumerate GPUs in PCI bus order and expose only the one chosen above.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": Idx,
})

from tqdm.notebook import tqdm
import utils
# Device configuration: run on the GPU when CUDA is usable, else on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [2]:
from transformers import AutoModel, AutoTokenizer

# Benchmark to run; the dataset class recognises "TREC", "SST" and "IMDB".
DataName = "TREC"

# Mini-batch size used by the data loaders and copied into the
# hyperparameter table.
BATCH_SIZE = 32

# Tokenizer for the same checkpoint name the encoder is loaded from.
Tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [3]:
class DatasetForClassification(Dataset):
    """Text-classification dataset that tokenizes one example per lookup.

    One split of the selected corpus is loaded with ``load_dataset``, shuffled
    once with a fixed seed, and stored as parallel lists of raw strings and
    integer labels. Tokenization happens lazily in ``__getitem__``.

    Args:
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) invoked
            on a single string with ``return_tensors="pt"``.
        max_length: Length every example is padded to and, when not ``None``,
            truncated to. With ``None`` truncation is disabled and the padding
            length is left to the tokenizer.
        data_split: Split name to load (``'train'``, ``'test'``, ...).
        data_name: One of the keys of ``_CORPORA``. ``None`` (the default)
            falls back to the notebook-level ``DataName`` variable, which is
            what this class always used before the parameter existed.

    Raises:
        ValueError: If ``data_name`` is not a supported corpus. Previously an
            unknown name silently produced an empty dataset.
    """

    # Per supported corpus: (dataset id for load_dataset, text field, label field).
    _CORPORA = {
        "TREC": ("trec", "text", "coarse_label"),
        "SST": ("sst2", "sentence", "label"),
        "IMDB": ("imdb", "text", "label"),
    }

    def __init__(self, tokenizer, max_length=None, data_split='train', data_name=None):
        if data_name is None:
            # Backward-compatible default: use the notebook-wide selection.
            data_name = DataName

        examples = self._load_examples(data_name, data_split)

        # Deterministic one-off shuffle. Note that this reseeds Python's
        # global ``random`` generator as a side effect (kept for compatibility).
        random.seed(42)
        random.shuffle(examples)

        # A progress bar is only shown on rank 0 of an initialised
        # torch.distributed job; in a plain notebook run none is created.
        show_progress = dist.is_initialized() and dist.get_rank() == 0
        pbar = tqdm(total=len(examples)) if show_progress else None

        inputs, targets = [], []
        for example in examples:
            inputs.append(example["input_text"].strip())
            targets.append(example["output_text"])
            if pbar is not None:
                pbar.update(1)
        if pbar is not None:
            pbar.close()

        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    @staticmethod
    def _load_examples(data_name, data_split):
        """Return ``[{"input_text": str, "output_text": label}, ...]`` for one split."""
        try:
            hub_id, text_key, label_key = DatasetForClassification._CORPORA[data_name]
        except KeyError:
            supported = ", ".join(sorted(DatasetForClassification._CORPORA))
            raise ValueError(
                "Unknown dataset name {!r}; expected one of: {}".format(data_name, supported)
            ) from None

        if data_name == "SST" and data_split == "test":
            # SST is evaluated on its validation split.
            # NOTE(review): presumably because the hub test split carries no
            # usable labels - confirm.
            data_split = "validation"

        dataset = load_dataset(hub_id)
        return [
            {"input_text": record[text_key], "output_text": record[label_key]}
            for record in dataset[data_split]
        ]

    def __len__(self):
        """Number of examples in the loaded split."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Tokenize example ``index`` and return its tensors plus ``"labels"``.

        Returns:
            dict: every field produced by the tokenizer, reduced to its first
            row, together with the integer label under ``"labels"``.
        """
        tokenized_input = self.tokenizer(
            self.inputs[index],
            add_special_tokens=True,
            return_attention_mask=True,
            # Truncate only when an explicit length limit was given.
            truncation=self.max_length is not None,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )

        # The tokenizer is called on a single string; keep row 0 of each field
        # so that the DataLoader can stack examples into a batch itself.
        tokenized_input = {k: v[0] for k, v in tokenized_input.items()}
        return {
            **tokenized_input,
            "labels": self.targets[index],
        }

In [4]:
# Build both splits first, then wrap each one in a sequential (unshuffled)
# loader. The "test" split serves as the validation set in this notebook.
train_data = DatasetForClassification(tokenizer=Tokenizer, max_length=512, data_split="train")
valid_data = DatasetForClassification(tokenizer=Tokenizer, max_length=512, data_split="test")

train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE)
valid_loader = DataLoader(dataset=valid_data, batch_size=BATCH_SIZE)

Found cached dataset trec (/home/jovyan/.cache/huggingface/datasets/trec/default/2.0.0/f2469cab1b5fceec7249fda55360dfdbd92a7a5b545e91ea0f78ad108ffac1c2)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset trec (/home/jovyan/.cache/huggingface/datasets/trec/default/2.0.0/f2469cab1b5fceec7249fda55360dfdbd92a7a5b545e91ea0f78ad108ffac1c2)


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
### Hyperparameters
# Shared settings table read by the classifier and the training loop.
Hyperparams = dict(
    NumClfEpoch=100,                              # upper bound on training epochs
    EmbeddingSize=768,                            # 300, 768
    BatchSize=BATCH_SIZE,
    LearningRate_EMB=0,                           # Freeze
    LearningRate_CLF=1e-3,
    MaxSeqLen=512,
    NumClass=(6 if DataName == "TREC" else 2),    # TREC uses 6 coarse labels, the others 2
)

# train_dataset = dataloader.ClassifyDataset(x=x_train_emb, y=y_train)
# valid_dataset = dataloader.ClassifyDataset(x=x_valid_emb, y=y_valid)
# test_dataset = dataloader.ClassifyDataset(x=x_test_emb, y=y_test)
# train_loader = DataLoader(dataset=train_dataset, batch_size=Hyperparams["BatchSize"], shuffle=True, num_workers=0)
# valid_loader = DataLoader(dataset=valid_dataset, batch_size=Hyperparams["BatchSize"], shuffle=True, num_workers=0)
# test_loader = DataLoader(dataset=test_dataset, batch_size=Hyperparams["BatchSize"], shuffle=True, num_workers=0)

In [6]:
class LinearClassifier(nn.Module):
    """Single linear layer that maps an embedding vector to class logits.

    Args:
        Hyperparams: Mapping providing "EmbeddingSize", "LearningRate_CLF",
            "BatchSize", "MaxSeqLen" and "NumClass". All five are copied onto
            the instance; only "EmbeddingSize" and "NumClass" shape the layer.
    """

    def __init__(self, Hyperparams):
        super().__init__()

        # (attribute on self, key in Hyperparams), copied in this order.
        copied_settings = (
            ("EmbeddingSize", "EmbeddingSize"),
            ("LearningRate", "LearningRate_CLF"),
            ("BatchSize", "BatchSize"),
            ("MaxSeqLen", "MaxSeqLen"),
            ("NumClass", "NumClass"),
        )
        for attr_name, key in copied_settings:
            setattr(self, attr_name, Hyperparams[key])

        # Kept inside nn.Sequential so parameter names stay "fc.0.*"
        # (a dropout layer in front of it is currently disabled).
        projection = nn.Linear(self.EmbeddingSize, self.NumClass)
        self.fc = nn.Sequential(projection)

    def forward(self, x):
        """Return logits of shape ``(..., NumClass)`` for inputs ``(..., EmbeddingSize)``."""
        return self.fc(x)

In [None]:
def _evaluate_accuracy(embedding, classifier, loader):
    """Return the fraction of examples in ``loader`` that are classified correctly.

    Both modules are switched to eval mode and the whole pass runs without
    gradient tracking. Returns 0.0 for an empty loader instead of dividing
    by zero.
    """
    embedding.eval()
    classifier.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            labels = batch.pop('labels')
            emb = embedding(**batch, output_hidden_states=True)
            # Index 2 is expected to be the tuple of per-layer hidden states
            # requested above; feed the last layer's first-token vector.
            outputs = classifier(emb[2][-1][:, 0, :])
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct / total if total else 0.0


# Checkpoints are written below ./save; make sure the directory exists.
os.makedirs("./save", exist_ok=True)

# Five independent runs: frozen encoder + freshly initialised linear classifier.
for Iter in range(5):
    Hyperparams["LearningRate_EMB"] = 0
    Hyperparams["LearningRate_CLF"] = 1e-3

    Embedding = AutoModel.from_pretrained('bert-base-uncased')
    # Alternative encoders tried by the author:
    #     Embedding = AutoModel.from_pretrained('./save/WD+DE+WW+WE_best/')
    #     Embedding = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
    #     Embedding = AutoModel.from_pretrained('kanishka/GlossBERT')
    #     Embedding = AutoModel.from_pretrained('./defbert')
    #     Embedding = AutoModel.from_pretrained('/home/jovyan/temp/defbert1')

    Embedding = nn.DataParallel(Embedding).to(device)
    Classifier = LinearClassifier(Hyperparams)
    Classifier = nn.DataParallel(Classifier).to(device)

    criterion = nn.CrossEntropyLoss()

    # Only the classifier is optimised. The encoder is used under
    # torch.no_grad() below, so no optimizer is created for it.
    optimizer_clf = torch.optim.Adam(
        filter(lambda p: p.requires_grad, Classifier.parameters()),
        lr=Hyperparams["LearningRate_CLF"],
    )

    maxacc_val = 0
    EarlyStopCnt = 5  # remaining non-improving epochs tolerated before stopping
    pbar1 = tqdm(total=Hyperparams["NumClfEpoch"], leave=False, desc="Epoch")
    for epoch in range(Hyperparams["NumClfEpoch"]):
        pbar1.update(1)

        # Validation leaves both modules in eval mode, so the classifier has to
        # be put back into train mode at the start of *every* epoch. The encoder
        # is a frozen feature extractor and stays in eval mode throughout.
        Embedding.eval()
        Classifier.train()

        pbar2 = tqdm(total=len(train_loader), leave=False, desc="Training")
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            labels = batch.pop('labels')
            with torch.no_grad():
                emb = Embedding(**batch, output_hidden_states=True)
            outputs = Classifier(emb[2][-1][:, 0, :])
            loss = criterion(outputs, labels)
            optimizer_clf.zero_grad()
            loss.backward()
            optimizer_clf.step()
            pbar2.update(1)
        pbar2.close()

        # Validate and keep the best checkpoint (ties count as an improvement).
        val_acc = _evaluate_accuracy(Embedding, Classifier, valid_loader)
        if val_acc >= maxacc_val:
            maxacc_val = val_acc
            torch.save(Embedding.state_dict(), "./save/Embedding" + Idx)
            torch.save(Classifier.state_dict(), "./save/Classifier" + Idx)
            EarlyStopCnt = 5
        else:
            EarlyStopCnt = EarlyStopCnt - 1
            if EarlyStopCnt == 0:
                break

    ### Test Acc.
    # NOTE(review): this re-scores the best checkpoint on valid_loader, the
    # same loader used for checkpoint selection, so "TestAcc" necessarily
    # equals the best validation accuracy. A held-out split would be needed
    # for an unbiased number.
    Embedding.load_state_dict(torch.load("./save/Embedding" + Idx))
    Classifier.load_state_dict(torch.load("./save/Classifier" + Idx))

    test_acc = _evaluate_accuracy(Embedding, Classifier, valid_loader)
    print('==> TestAcc: {:.4f} % , ValidAcc: {:.4f}'.format(100 * test_acc, maxacc_val))

    pbar1.close()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

==> TestAcc: 92.0000 % , ValidAcc: 0.9200


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

Training:   0%|          | 0/171 [00:00<?, ?it/s]

In [None]:
# Note: bert-base-uncased (checkpoint name; kept as a comment so that Run All does not raise NameError)