이번 과제는 Bert Model을 사용하여 BBC 뉴스 기사의 category를 분류해보는 과제입니다. clone coding을 하시되, 코드 주석을 line by line으로 꼼꼼하게 달아보시며 공부해보세요!

## 데이터 로드 및 탐색

In [1]:
%%capture
!pip install transformers

In [2]:
import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.optim import Adam
from tqdm import tqdm

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [36]:
df = pd.read_csv('/content/drive/MyDrive/24s_basic_NLP/bbc-text.csv')

In [37]:
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [38]:
print(len(df))

2225


In [39]:
df.groupby('category').count()

Unnamed: 0_level_0,text
category,Unnamed: 1_level_1
business,510
entertainment,386
politics,417
sport,511
tech,401


## BertTokenizer (base)

토크나이저로 pretrain된 BERT의 BertTokenizer를 갖고 옵니다. 여러 종류를 시도해보세요.

- bert-base-uncased : 108MB param, all lowercase
- bert-large-cased : 340MB param, both upper and lower
- bert-base-cased : 108MB param, multi language, both upper and lower


In [54]:
# 토크나이저 선택

# tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# tokenizer = BertTokenizer.from_pretrained('bert-large-cased')


labels = {'business':0,
          'entertainment':1,
          'sport':2,
          'tech':3,
          'politics':4
          }

## Dataset

In [55]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):
        self.labels = [labels[label] for label in df['category']] # 카테고리 이름을 숫자 label로 변환
        self.texts = [tokenizer(text,
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text']] # text 컬럼의 텍스트를 BERT 토크나이저로 토큰화하여 저장

    def classes(self):
        return self.labels # 레이블 목록 반환

    def __len__(self):
        return len(self.labels) # 데이터 샘플 개수 반환

    def get_batch_labels(self, idx):
        return np.array(self.labels[idx]) # 인덱스에 해당하는 라벨 반환

    def get_batch_texts(self, idx):
        return self.texts[idx] # 인덱스에 해당하는 토큰화된 텍스트 반환

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y # 인덱스에 해당하는 텍스트와 라벨 모두 반환

## Train & Evaluate BertClassifier

pretrain된 BertModel을 불러옵니다. 다른 간단한 층들도 같이 쌓아줍니다.

- bert-base-cased: 12-layer, 768-hidden, 12-self attention heads, 110M parameters. Trained on cased English text.


다른 종류들의 pretrianed model은 아래 링크에서 확인할 수 있습니다.

https://huggingface.co/transformers/v2.9.1/pretrained_models.html

In [56]:
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        # self.bert = BertModel.from_pretrained('bert-base-cased')
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # self.bert = BertModel.from_pretrained('bert-large-cased')

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 5) # input size: BERT의 출력 크기(768), output size: category 개수 (5)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False) # pooled_output: 문장 수준의 출력
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [57]:
def train(model, train_data, val_data, learning_rate, epochs):
    train, val = Dataset(train_data), Dataset(val_data)
    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss() # multi-classification!
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].to(device)

            if len(input_id.shape) > 2:
                input_id = input_id.squeeze(1)
            if len(mask.shape) > 2:
                mask = mask.squeeze(1)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label.long())
            total_loss_train += batch_loss.item()

            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].to(device)

                if len(input_id.shape) > 2:
                    input_id = input_id.squeeze(1)
                if len(mask.shape) > 2:
                    mask = mask.squeeze(1)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label.long())
                total_loss_val += batch_loss.item()

                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc

        print(f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} | Train Accuracy: {total_acc_train / len(train_data): .3f} | Val Loss: {total_loss_val / len(val_data): .3f} | Val Accuracy: {total_acc_val / len(val_data): .3f}')



In [58]:
def evaluate(model, test_data):
    test = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].to(device)

            if len(input_id.shape) > 2:
                input_id = input_id.squeeze(1)
            if len(mask.shape) > 2:
                mask = mask.squeeze(1)

            output = model(input_id, mask)

            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc

    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

In [59]:
np.random.seed(112)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                                     [int(.8*len(df)), int(.9*len(df))])

print(len(df_train),len(df_val), len(df_test))

1780 222 223


In [18]:
EPOCHS = 2 #EPOCH 수 늘려보기!
model = BertClassifier()
LR = 1e-6

train(model, df_train, df_val, LR, EPOCHS)

100%|██████████| 890/890 [03:13<00:00,  4.61it/s]


Epochs: 1 | Train Loss:  0.715 | Train Accuracy:  0.393 | Val Loss:  0.531 | Val Accuracy:  0.644


100%|██████████| 890/890 [03:12<00:00,  4.62it/s]


Epochs: 2 | Train Loss:  0.336 | Train Accuracy:  0.883 | Val Loss:  0.195 | Val Accuracy:  0.973


In [19]:
evaluate(model, df_test)

Test Accuracy:  0.978


### bert-base-cased (epoch 5) + inference

In [21]:
EPOCHS = 5 #EPOCH 5회
model = BertClassifier()
LR = 1e-6

train(model, df_train, df_val, LR, EPOCHS)

100%|██████████| 890/890 [03:13<00:00,  4.61it/s]


Epochs: 1 | Train Loss:  0.770 | Train Accuracy:  0.323 | Val Loss:  0.671 | Val Accuracy:  0.568


100%|██████████| 890/890 [03:14<00:00,  4.58it/s]


Epochs: 2 | Train Loss:  0.489 | Train Accuracy:  0.675 | Val Loss:  0.366 | Val Accuracy:  0.766


100%|██████████| 890/890 [03:13<00:00,  4.60it/s]


Epochs: 3 | Train Loss:  0.279 | Train Accuracy:  0.805 | Val Loss:  0.254 | Val Accuracy:  0.779


100%|██████████| 890/890 [03:12<00:00,  4.62it/s]


Epochs: 4 | Train Loss:  0.212 | Train Accuracy:  0.814 | Val Loss:  0.230 | Val Accuracy:  0.784


100%|██████████| 890/890 [03:11<00:00,  4.65it/s]


Epochs: 5 | Train Loss:  0.186 | Train Accuracy:  0.814 | Val Loss:  0.214 | Val Accuracy:  0.770


In [22]:
evaluate(model, df_test) #??

Test Accuracy:  0.803


In [34]:
# inference
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
labels = {0: 'business', 1: 'entertainment', 2: 'sport', 3: 'tech', 4: 'politics'}

def predict(text, model, device):
    model.eval()
    inputs = tokenizer(text, padding='max_length', max_length=512, truncation=True, return_tensors="pt")
    input_id = inputs['input_ids'].to(device)
    mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        output = model(input_id, mask)
        prediction = torch.argmax(output, dim=1).item()
        predicted_label = labels[prediction]

    return predicted_label

In [35]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 샘플 텍스트로 인퍼런스
sample_texts = df_test['text'].sample(5).tolist()  # 테스트 데이터에서 임의로 5개 샘플 선택

for text in sample_texts:
    predicted_label = predict(text, model, device)
    print(f"Text: {text[:100]}...")  # 텍스트의 처음 100자만 출력
    print(f"Predicted Label: {predicted_label}")
    print("-" * 50)


Text: bryan twins keep us hopes alive the united states kept the davis cup final alive with victory in sat...
Predicted Label: sport
--------------------------------------------------
Text: halo fans  hope for sequel xbox video game halo 2 has been released in the us on 9 november  with a ...
Predicted Label: tech
--------------------------------------------------
Text: will the budget bring out smiling voters  as tory spokesman oliver letwin said - any chancellor woul...
Predicted Label: business
--------------------------------------------------
Text: microsoft debuts security tools microsoft is releasing tools that clean up pcs harbouring viruses an...
Predicted Label: tech
--------------------------------------------------
Text: barclays profits hit record level barclays  the uk s third-biggest bank  has seen annual pre-tax pro...
Predicted Label: business
--------------------------------------------------


준수한 성능!

---


에폭 5로 했을 때 왜 성능이 떨어지는 지 잘 모르겠음 -> 과적합 예상

### bert-base-uncased (epoch 2) + inference

In [46]:
EPOCHS = 2 #EPOCH 2회
model = BertClassifier()
LR = 1e-6

train(model, df_train, df_val, LR, EPOCHS)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

100%|██████████| 890/890 [03:13<00:00,  4.61it/s]


Epochs: 1 | Train Loss:  0.763 | Train Accuracy:  0.335 | Val Loss:  0.627 | Val Accuracy:  0.644


100%|██████████| 890/890 [03:12<00:00,  4.62it/s]


Epochs: 2 | Train Loss:  0.470 | Train Accuracy:  0.841 | Val Loss:  0.326 | Val Accuracy:  0.973


In [51]:
labels = {'business':0,
          'entertainment':1,
          'sport':2,
          'tech':3,
          'politics':4
          }

evaluate(model, df_test)

Test Accuracy:  0.978


In [52]:
# inference
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
labels = {0: 'business', 1: 'entertainment', 2: 'sport', 3: 'tech', 4: 'politics'}

def predict(text, model, device):
    model.eval()
    inputs = tokenizer(text, padding='max_length', max_length=512, truncation=True, return_tensors="pt")
    input_id = inputs['input_ids'].to(device)
    mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        output = model(input_id, mask)
        prediction = torch.argmax(output, dim=1).item()
        predicted_label = labels[prediction]

    return predicted_label

In [53]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 샘플 텍스트로 인퍼런스
sample_texts = df_test['text'].sample(5).tolist()  # 테스트 데이터에서 임의로 5개 샘플 선택

for text in sample_texts:
    predicted_label = predict(text, model, device)
    print(f"Text: {text[:200]}...")  # 텍스트의 처음 100자만 출력
    print(f"Predicted Label: {predicted_label}")
    print("-" * 50)


Text: juninho demand for o neill talks juninho s agent has confirmed that the player is hoping for talks with martin o neill as the brazilian midfielder comes closer to departing celtic.  brian hassell says...
Predicted Label: sport
--------------------------------------------------
Text: briton wins short film oscar three of the five nominees in the live-action short film category at this year s oscars were british. for andrea arnold  who won the category  ashvin kumar and gary mckend...
Predicted Label: entertainment
--------------------------------------------------
Text: clarke to unveil immigration plan new controls on economic migrants and tighter border patrols will be part of government plans unveiled on monday.  home secretary charles clarke wants to introduce a ...
Predicted Label: politics
--------------------------------------------------
Text: jarvis sells tube stake to spain shares in engineering group jarvis have soared more than 16% on news that it is offloading its sta

test accuracy도 좋으나 실제로 inference는 business랑 politics에서 조금 헷갈려 하는 것 같음


---
- sports나 entertainment에 비해서 통용되는 단어들이 많아서일 것으로 예상.
- entertainment <-> business 간에도 유사성이 높을 것으로 예상.


### bert-base-uncased (epoch 5) + inference

In [60]:
EPOCHS = 5 #EPOCH 5회
model = BertClassifier()
LR = 1e-6

train(model, df_train, df_val, LR, EPOCHS)

100%|██████████| 890/890 [03:13<00:00,  4.59it/s]


Epochs: 1 | Train Loss:  0.748 | Train Accuracy:  0.374 | Val Loss:  0.636 | Val Accuracy:  0.581


100%|██████████| 890/890 [03:12<00:00,  4.61it/s]


Epochs: 2 | Train Loss:  0.499 | Train Accuracy:  0.806 | Val Loss:  0.357 | Val Accuracy:  0.941


100%|██████████| 890/890 [03:12<00:00,  4.62it/s]


Epochs: 3 | Train Loss:  0.275 | Train Accuracy:  0.958 | Val Loss:  0.195 | Val Accuracy:  0.991


100%|██████████| 890/890 [03:13<00:00,  4.60it/s]


Epochs: 4 | Train Loss:  0.154 | Train Accuracy:  0.986 | Val Loss:  0.111 | Val Accuracy:  0.982


100%|██████████| 890/890 [03:12<00:00,  4.61it/s]


Epochs: 5 | Train Loss:  0.094 | Train Accuracy:  0.988 | Val Loss:  0.073 | Val Accuracy:  0.991


In [61]:
evaluate(model, df_test)

Test Accuracy:  0.991


In [62]:
# inference
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
labels = {0: 'business', 1: 'entertainment', 2: 'sport', 3: 'tech', 4: 'politics'}

def predict(text, model, device):
    model.eval()
    inputs = tokenizer(text, padding='max_length', max_length=512, truncation=True, return_tensors="pt")
    input_id = inputs['input_ids'].to(device)
    mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        output = model(input_id, mask)
        prediction = torch.argmax(output, dim=1).item()
        predicted_label = labels[prediction]

    return predicted_label

In [63]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 샘플 텍스트로 인퍼런스
sample_texts = df_test['text'].sample(5).tolist()  # 테스트 데이터에서 임의로 5개 샘플 선택

for text in sample_texts:
    predicted_label = predict(text, model, device)
    print(f"Text: {text[:100]}...")  # 텍스트의 처음 100자만 출력
    print(f"Predicted Label: {predicted_label}")
    print("-" * 50)


Text: muslim police stops  more likely  uk muslims should accept that people of islamic appearance are mor...
Predicted Label: politics
--------------------------------------------------
Text: macy s owner buys rival for $11bn us retail giant federated department stores is to buy rival may de...
Predicted Label: business
--------------------------------------------------
Text: dent continues adelaide progress american taylor dent reached the final of the australian hardcourt ...
Predicted Label: sport
--------------------------------------------------
Text: holmes starts 2005 with gb events kelly holmes will start 2005 with a series of races in britain.  h...
Predicted Label: sport
--------------------------------------------------
Text: online games play with politics after bubbling under for some time  online games broke through onto ...
Predicted Label: tech
--------------------------------------------------


에폭 5만으로도 좋은 성능!