In [2]:
import transformers
from transformers import AlbertModel, AlbertConfig, AlbertTokenizer, AlbertForPreTraining, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn
from torch.utils.data import Dataset, DataLoader

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [3]:
torch.cuda.get_device_name()

'NVIDIA GeForce RTX 3060 Laptop GPU'

In [4]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Data Exploration & Preparation

In [5]:
print(torch.version.cuda)

10.1


1. bert-base-uncased
- 12-layer, 768-hidden, 12-heads, 110M parameters.
- Trained on lower-cased English text.

2. bert-large-uncased
- 24-layer, 1024-hidden, 16-heads, 340M parameters.
- Trained on lower-cased English text.

3. bert-base-cased
- 12-layer, 768-hidden, 12-heads, 110M parameters.
- Trained on cased English text.

4. bert-large-cased
- 24-layer, 1024-hidden, 16-heads, 340M parameters.
- Trained on cased English text.

In [6]:
# 다른 pretrained model을 사용하고 싶다면 위에서 원하는 이름을 선택해서 아래 변수명을 변경한다.
PRE_TRAINED_MODEL_NAME = 'albert-base-v2'
tokenizer = AlbertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME,return_dict=False)
BATCH_SIZE = 2
tag = 'sentiment'

In [7]:
def load_dataset(tag):
    if tag == 'irrelevant':
        train_df = pd.read_csv('data/irrelevant_train.tsv', sep='\t')
        valid_df = pd.read_csv('data/irrelevant_test.tsv', sep='\t')
    elif tag == 'sentiment':
        train_df = pd.read_csv('data/sentiment_train.tsv', sep='\t')
        valid_df = pd.read_csv('data/sentiment_test.tsv', sep='\t')
    else:
        train_df = None
        valid_df = None
    
    return train_df, valid_df

In [8]:
class SentimentDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, item):
        sentence = str(self.sentences[item])
        label = self.labels[item]
        
        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation = True
        )
        
        return {
            'sentence': sentence,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [9]:
train_df, valid_df = load_dataset(tag)
df_train, df_test = train_test_split(train_df, test_size=0.2, random_state=1234)
print(df_train.shape, df_test.shape)

(32887, 4) (8222, 4)


In [10]:
train_dataset = SentimentDataset(df_train.sentence.values, df_train.sentiment.values, tokenizer, max_len=512)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, num_workers=0)

test_dataset = SentimentDataset(df_test.sentence.values, df_test.sentiment.values, tokenizer, max_len=512)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, num_workers=0)

In [11]:
data = next(iter(train_dataloader))
data.keys()
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['labels'].shape)

torch.Size([2, 512])
torch.Size([2, 512])
torch.Size([2])


### Training

In [12]:
bert_model = AlbertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [13]:
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = AlbertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
        
    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        output = self.drop(pooled_output)
        return self.out(output)

In [14]:
model = SentimentClassifier(2)
model = model.to(device)

In [15]:
EPOCHS = 10 ## 바꿔야할 파라미터
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)

In [16]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["labels"].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)

In [17]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["labels"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)

In [18]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [19]:
history = defaultdict(list)
best_accuracy = 0
for epoch in range(EPOCHS):

    train_acc, train_loss = train_epoch(
        model,
        train_dataloader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
    )
    
    val_acc, val_loss = eval_model(
        model,
        test_dataloader,
        loss_fn,
        device,
        len(df_test)
    )
    
    print(f'Epoch [{epoch + 1}/{EPOCHS}] Train loss: {train_loss} acc: {train_acc} | Val loss: {val_loss} acc: {val_acc}')

    print()
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'model/'+tag+'_BERT_model.bin')
        be4st_accuracy = val_acc

Epoch [1/10] Train loss: 0.6774538520970967 acc: 0.6066834919573083 | Val loss: 0.6746532628858678 acc: 0.5985161761128679



### Evaluation

In [None]:
valid_dataset = SentimentDataset(valid_df.sentence.values, valid_df.sentiment.values, tokenizer, max_len=512)
valid_dataloader = DataLoader(valid_dataset, batch_size=1, num_workers=4)

In [None]:
result = eval_model(model, valid_dataloader, loss_fn, device, len(valid_dataset))
print(result)

(tensor(0.6641, device='cuda:0', dtype=torch.float64), 3.2149424813576415)


### Load model and Get probabilities

In [18]:
model = SentimentClassifier(2)
model.load_state_dict(torch.load('model/'+tag+'_BERT_model.bin'))
model = model.to(device)

In [19]:
def inference(input_text, model):
    encoded_review = tokenizer.encode_plus(
        review_text,
        max_length=512,
        add_special_tokens=True,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)
    
    logits = model(input_ids, attention_mask)
    softmax_prob = torch.nn.functional.softmax(logits, dim=1)
    _, prediction = torch.max(softmax_prob, dim=1)
    
    return softmax_prob, prediction
        

In [20]:
# Example code
review_text = "any line of news to test code"
class_prob, pred = inference(review_text, model)
print(class_prob.detach().cpu().numpy()[0])
print(pred.detach().cpu().numpy()[0])

[0.50530934 0.49469066]
0


In [21]:
from tqdm import tqdm
def save_results(df, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        for idx, row in tqdm(df.iterrows()):
            news_id = row['id']
            text = row['sentence'].replace('\t',' ')
            sentiment = row['sentiment']
            class_prob, pred = inference(text, model)

            class_prob = [str(x) for x in class_prob.detach().cpu().numpy()[0]]
            pred = pred.detach().cpu().numpy()[0]

            result = str(news_id).replace('\t','')+'\t'+text+'\t'+'\t'.join(class_prob)+'\t'+str(pred)+'\t'+str(int(sentiment)).replace('\t','')
            
            f.write(result+'\n')

In [22]:
save_results(train_df, 'data/'+tag+'_bert_prediction_train.csv')

41109it [11:06, 61.68it/s]


In [23]:
save_results(valid_df, 'data/'+tag+'_bert_prediction_test.csv')

17628it [04:45, 61.64it/s]
