In [1]:
import transformers
from transformers import MobileBertModel, MobileBertConfig, MobileBertTokenizer, MobileBertForPreTraining, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn
from torch.utils.data import Dataset, DataLoader

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
torch.cuda.get_device_name()

'NVIDIA GeForce RTX 3060 Laptop GPU'

In [3]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Data Exploration & Preparation

In [4]:
print(torch.__version__)
print(transformers.__version__)

1.8.2+cu111
4.16.2


1. bert-base-uncased
- 12-layer, 768-hidden, 12-heads, 110M parameters.
- Trained on lower-cased English text.

2. bert-large-uncased
- 24-layer, 1024-hidden, 16-heads, 340M parameters.
- Trained on lower-cased English text.

3. bert-base-cased
- 12-layer, 768-hidden, 12-heads, 110M parameters.
- Trained on cased English text.

4. bert-large-cased
- 24-layer, 1024-hidden, 16-heads, 340M parameters.
- Trained on cased English text.

In [5]:
# 다른 pretrained model을 사용하고 싶다면 위에서 원하는 이름을 선택해서 아래 변수명을 변경한다.
PRE_TRAINED_MODEL_NAME = "google/mobilebert-uncased"
tokenizer = MobileBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
## 딕셔너리 처리. mlflow에서 처리 가능. 키는 모델이름. 모델의 메타데이터 정리. model meta 데이터를 넣어놓고, 모델에 필요한 값을 넣음. 키워드 argument를 딕셔너리로 사용해서, mobilebert만 쓰고 모델 이름만 넣고 나머지는 nested되어 있는 dict로 사용. 
## 모델 params를 사용해서 dict관리를 하고 mlflow
BATCH_SIZE = 4
tag = 'sentiment'

In [6]:
def load_dataset(tag):
    if tag == 'irrelevant':
        train_df = pd.read_csv('data/irrelevant_train.tsv', sep='\t')
        valid_df = pd.read_csv('data/irrelevant_test.tsv', sep='\t')
    elif tag == 'sentiment':
        train_df = pd.read_csv('data/sentiment_train.tsv', sep='\t')
        valid_df = pd.read_csv('data/sentiment_test.tsv', sep='\t')
    else:
        train_df = None
        valid_df = None
    
    return train_df, valid_df

In [7]:
class SentimentDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, item):
        sentence = str(self.sentences[item])
        label = self.labels[item]
        
        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation = True
        )
        
        return {
            'sentence': sentence,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [8]:
train_df, valid_df = load_dataset(tag)
df_train, df_test = train_test_split(train_df, test_size=0.2, random_state=1234)
print(df_train.shape, df_test.shape)

(32887, 4) (8222, 4)


In [9]:
train_dataset = SentimentDataset(df_train.sentence.values, df_train.sentiment.values, tokenizer, max_len=512)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, num_workers=0)

test_dataset = SentimentDataset(df_test.sentence.values, df_test.sentiment.values, tokenizer, max_len=512)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, num_workers=0)

In [10]:
data = next(iter(train_dataloader))
data.keys()
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['labels'].shape)

torch.Size([4, 512])
torch.Size([4, 512])
torch.Size([4])


### Training

In [11]:
bert_model = MobileBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=False) 

Some weights of the model checkpoint at google/mobilebert-uncased were not used when initializing MobileBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing MobileBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MobileBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = MobileBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=False) 
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
        
    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        output = self.drop(pooled_output)
        return self.out(output)

In [13]:
model = SentimentClassifier(2)
model = model.to(device)

Some weights of the model checkpoint at google/mobilebert-uncased were not used when initializing MobileBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing MobileBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MobileBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [16]:
EPOCHS = 50 ## 바꿔야할 파라미터
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=10,
    num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)

In [17]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["labels"].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)

In [18]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["labels"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)

In [19]:
history = defaultdict(list)
best_accuracy = 0
for epoch in range(EPOCHS):

    train_acc, train_loss = train_epoch(
        model,
        train_dataloader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
    )
    
    val_acc, val_loss = eval_model(
        model,
        test_dataloader,
        loss_fn,
        device,
        len(df_test)
    )
    
    print(f'Epoch [{epoch + 1}/{EPOCHS}] Train loss: {train_loss} acc: {train_acc} | Val loss: {val_loss} acc: {val_acc}')

    print()
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'model/'+tag+'_BERT_model.bin')
        be4st_accuracy = val_acc

Epoch [1/50] Train loss: 11829.01041102961 acc: 0.6788396630887584 | Val loss: 0.6079193522976095 acc: 0.7017757236682073

Epoch [2/50] Train loss: 0.6588206771726405 acc: 0.7394715237023748 | Val loss: 0.732547078240873 acc: 0.7037217222087083

Epoch [3/50] Train loss: 0.6432582089496628 acc: 0.7922279320096086 | Val loss: 1.0482096718936789 acc: 0.6971539771345171

Epoch [4/50] Train loss: 0.7028352655753197 acc: 0.841943625140633 | Val loss: 1.3233018156992868 acc: 0.6931403551447336

Epoch [5/50] Train loss: 0.5985469914003259 acc: 0.8768510353635175 | Val loss: 1.5624019861467224 acc: 0.681464363901727

Epoch [6/50] Train loss: 0.7295174352360444 acc: 0.9012375710767172 | Val loss: 1.7712680763927924 acc: 0.6802481148139139

Epoch [7/50] Train loss: 0.42886713599984344 acc: 0.9181135403046796 | Val loss: 1.869744169604592 acc: 0.6831671126246656

Epoch [8/50] Train loss: 0.3347808203101577 acc: 0.9305196582236142 | Val loss: 2.1603773286860344 acc: 0.684869861347604

Epoch [9/50] 

### Evaluation

In [None]:
valid_dataset = SentimentDataset(valid_df.sentence.values, valid_df.sentiment.values, tokenizer, max_len=512)
valid_dataloader = DataLoader(valid_dataset, batch_size=1, num_workers=0)

In [None]:
result = eval_model(model, valid_dataloader, loss_fn, device, len(valid_dataset))
print(result)

In [None]:
torch.save(quantized_model.state_dict(), 'quantized_mobilebert_model.pt')

### Load model and Get probabilities

In [None]:
import torch.quantization

In [8]:
model = SentimentClassifier(2)
model.load_state_dict(torch.load('model/mobileBert_67.pt', map_location ="cpu"))
model = model.to('cpu')

In [None]:

quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

In [8]:
#quantize model
model.qconfig = torch.quantization.get_default_qconfig('qnnpack')
torch.quantization.prepare(model, inplace=True)
# Calibrate
print(torch.backends.quantized.supported_engines)
torch.quantization.convert(model, inplace=True)
#optimize
script_subnet = torch.jit.script(model)
script_subnet_optimized = optimize_for_model(script_model)

['none']


RuntimeError: Didn't find engine for operation quantized::linear_prepack NoQEngine (operator () at ..\aten\src\ATen\native\quantized\cpu\qlinear_prepack.cpp:202)
(no backtrace available)

In [None]:
# set quantization config for server (x86)
model.qconfig = torch.quantization.get_default_config('fbgemm')

# insert observers
torch.quantization.prepare(model, inplace=True)
# Calibrate the model and collect statistics

# convert to quantized version
torch.quantization.convert(model, inplace=True)

In [13]:
model = SentimentClassifier(2)
model.load_state_dict(torch.load('model/mobilebert_model.pt', map_location='cpu'), strict = False)

<All keys matched successfully>

In [9]:
def inference(input_text, model):
            
    encoded_review = tokenizer.encode_plus(
        review_text,
        max_length=512,
        add_special_tokens=True,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoded_review['input_ids'].to("cpu")
    attention_mask = encoded_review['attention_mask'].to("cpu")
    
    logits = model(input_ids, attention_mask)
    softmax_prob = torch.nn.functional.softmax(logits, dim=1)
    _, prediction = torch.max(softmax_prob, dim=1)
    
    return softmax_prob, prediction
        

In [10]:
# Example code
review_text = "He also signalled Russia’s interest in developing trade and economic as well as investment and security cooperation both with Moscow’s European partners and the US."
class_prob, pred = inference(review_text, model.eval())
print(class_prob.detach().cpu().numpy()[0])
print(pred.detach().cpu().numpy()[0])

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.


[0.00211425 0.99788576]
1


In [3]:
import html
print(html.unescape('에&#50068;'))

에쎔


In [None]:
train_df

In [None]:
from tqdm import tqdm
def save_results(df, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        for idx, row in tqdm(df.iterrows()):
            news_id = row['id']
            text = row['sentence'].replace('\t',' ')
            sentiment = row['sentiment']
            class_prob, pred = inference(text, model)

            class_prob = [str(x) for x in class_prob.detach().cpu().numpy()[0]]
            pred = pred.detach().cpu().numpy()[0]

            result = str(news_id).replace('\t','')+'\t'+text+'\t'+'\t'.join(class_prob)+'\t'+str(pred)+'\t'+str(int(sentiment)).replace('\t','')
            
            f.write(result+'\n')

In [None]:
save_results(train_df, 'data/'+tag+'_bert_prediction_train.csv')

In [None]:
save_results(valid_df, 'data/'+tag+'_bert_prediction_test.csv')