In [1]:
# !pip install pytorch-transformers
# !git clone https://github.com/HyunjoonCho/CS492I-IntroToDL-project.git
# import os
# os.chdir('CS492I-IntroToDL-project')

Collecting pytorch-transformers
  Downloading pytorch_transformers-1.2.0-py3-none-any.whl (176 kB)
[K     |████████████████████████████████| 176 kB 5.3 MB/s 
[?25hCollecting boto3
  Downloading boto3-1.20.23-py3-none-any.whl (131 kB)
[K     |████████████████████████████████| 131 kB 39.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 37.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 31.7 MB/s 
Collecting botocore<1.24.0,>=1.23.23
  Downloading botocore-1.23.23-py3-none-any.whl (8.4 MB)
[K     |████████████████████████████████| 8.4 MB 52.4 MB/s 
[?25hCollecting jmespath<1.0.0,>=0.7.1
  Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)
Collecting s3transfer<0.6.0,>=0.5.0
  Downloading s3transfer-0.5.0-py3-none-any.whl (79 kB)
[K     |█████████████

In [2]:
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from pytorch_transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
from pathlib import Path

import os

In [3]:
# from google.colab import drive
# drive.mount('/gdrive')

# drive_root = '/gdrive/My Drive/CS492I/project-pretrain'
# print(os.listdir(Path(drive_root)))

Mounted at /gdrive
['BertForSequenceClassification_Category_best.ckpt', 'BertForSequenceClassification_Category_last.ckpt']


In [4]:
from easydict import EasyDict as edict

args = edict()
args.gpu = True
args.batch_size = 4
args.num_epochs = 15
args.learning_rate = 5e-5

device = 'cuda' if torch.cuda.is_available() and args.gpu else 'cpu'

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
bert_category_clf = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=8)

100%|██████████| 995526/995526 [00:00<00:00, 5559179.56B/s]
100%|██████████| 625/625 [00:00<00:00, 266758.93B/s]
100%|██████████| 714314041/714314041 [00:20<00:00, 34590185.27B/s]


In [6]:
dataset_train = []
dataset_val = []
dataset_test = []

root = Path('dataset/category')
list = os.listdir(root)
for cat in list:
    files = os.listdir(root / cat)
    for i,f in enumerate(files):
        fname = root / cat / f
        with open(fname, "r", encoding="utf-8") as file:
            strings = file.read()
            if i < 150:
                dataset_train.append([strings, cat])
            elif i < 175:
                dataset_val.append([strings, cat])
            else:
                dataset_test.append([strings,cat])

print(len(dataset_train), len(dataset_val), len(dataset_test))

1200 200 200


In [7]:
print(dataset_train[0][0][:64]) #sentence
print(dataset_train[0][1]) #label

[단독]조양호, 500억대 상속세 탈루 의혹…수사 착수	조양호 한진그룹 회장 부부 소식입니다. 

조양호 회장이 수
1


In [8]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx):
        # 현재 i[sent_idx] 가 본문
        self.sentences = [i[sent_idx][:64] for i in dataset]
        self.labels = [i[label_idx] for i in dataset]

    def __getitem__(self, i):
        return self.sentences[i], self.labels[i]

    def __len__(self):
        return (len(self.labels))

In [9]:
data_train = BERTDataset(dataset_train, 0, 1)
data_val = BERTDataset(dataset_val, 0, 1)
data_test = BERTDataset(dataset_test, 0, 1)

In [10]:
train_dataloader = DataLoader(data_train, batch_size=args.batch_size, num_workers=5, shuffle=True)
val_dataloader = DataLoader(data_val, batch_size=args.batch_size, num_workers=5, shuffle=True)
test_dataloader = DataLoader(data_test, batch_size=args.batch_size, num_workers=5, shuffle=True)

  cpuset_checked))


In [11]:
bert_category_clf.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [12]:
def save_model(model, mode='last'):
    torch.save(model.state_dict(),  Path('pretrained_models') / f'{type(model).__name__}_Category_{mode}.ckpt')
    # torch.save(model.state_dict(), Path(drive_root) / f'{type(model).__name__}_Category_{mode}.ckpt')

In [13]:
#training step
optimizer = optim.AdamW(bert_category_clf.parameters(), lr=1e-6)
best_val_loss = float('inf')

for epoch in range(args.num_epochs):
    train_loss = 0
    total_len = 0
    total_correct = 0
    bert_category_clf.train()
    for sentence, label in train_dataloader:
        optimizer.zero_grad()
        
        encoded_list = [tokenizer.encode(t,add_special_tokens=True) for t in sentence]
        padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
        
        sample = torch.tensor(padded_list)
        label = tuple((int(x[0])) for x in label)
        label = torch.tensor(label)
        sample = sample.to(device)
        label = label.to(device)
        
        labels = torch.tensor(label)
        loss, logits = bert_category_clf(sample, labels=labels)

        pred = torch.argmax(F.softmax(logits), dim=1)        
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)
        train_loss += loss.item()
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        #scheduler.step()        
    print('[Epoch {}/{}] -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, args.num_epochs, train_loss, total_correct/total_len))

    with torch.no_grad():
        bert_category_clf.eval()
        val_loss = 0
        v_total_correct = 0
        v_total_len = 0
        for sentence, label in val_dataloader:
            encoded_list = [tokenizer.encode(t,add_special_tokens=True) for t in sentence]
            padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
            
            sample = torch.tensor(padded_list)
            label = tuple((int(x[0])) for x in label)
            label = torch.tensor(label)
            sample = sample.to(device)
            label = label.to(device)
            
            labels = torch.tensor(label)
            loss, logits = bert_category_clf(sample, labels=labels)
            
            pred = torch.argmax(F.softmax(logits), dim=1)        
            correct = pred.eq(labels)
            val_loss += loss.item()
            v_total_correct += correct.sum().item()
            v_total_len += len(labels)
        print('[Epoch {}/{}] -> Validation Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch + 1, args.num_epochs, val_loss, v_total_correct / v_total_len))
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        save_model(bert_category_clf, 'best')
    save_model(bert_category_clf)

  cpuset_checked))


[Epoch 1/15] -> Train Loss: 625.7171, Accuracy: 0.128




[Epoch 1/15] -> Validation Loss: 103.4392, Accuracy: 0.125
[Epoch 2/15] -> Train Loss: 610.3944, Accuracy: 0.174
[Epoch 2/15] -> Validation Loss: 102.7240, Accuracy: 0.180
[Epoch 3/15] -> Train Loss: 560.1364, Accuracy: 0.388
[Epoch 3/15] -> Validation Loss: 84.9601, Accuracy: 0.535
[Epoch 4/15] -> Train Loss: 483.4604, Accuracy: 0.551
[Epoch 4/15] -> Validation Loss: 74.5309, Accuracy: 0.580
[Epoch 5/15] -> Train Loss: 410.7262, Accuracy: 0.641
[Epoch 5/15] -> Validation Loss: 65.1820, Accuracy: 0.645
[Epoch 6/15] -> Train Loss: 356.0026, Accuracy: 0.703
[Epoch 6/15] -> Validation Loss: 57.6310, Accuracy: 0.685
[Epoch 7/15] -> Train Loss: 315.2721, Accuracy: 0.733
[Epoch 7/15] -> Validation Loss: 54.2302, Accuracy: 0.685
[Epoch 8/15] -> Train Loss: 281.6179, Accuracy: 0.767
[Epoch 8/15] -> Validation Loss: 48.8490, Accuracy: 0.705
[Epoch 9/15] -> Train Loss: 251.8812, Accuracy: 0.777
[Epoch 9/15] -> Validation Loss: 44.9982, Accuracy: 0.725
[Epoch 10/15] -> Train Loss: 228.4027, Accur

In [14]:
bert_category_clf.eval()

t_total_len = 0
t_total_correct = 0
with torch.no_grad():
    for sentence, label in test_dataloader:
        encoded_list = [tokenizer.encode(t,add_special_tokens=True) for t in sentence]
        padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
        sample = torch.tensor(padded_list)
        label = tuple((int(x[0])) for x in label)
        label = torch.tensor(label)
        sample = sample.to(device)
        label = label.to(device)
            
        labels = torch.tensor(label)
        _, logits = bert_category_clf(sample, labels=labels)

        pred = torch.argmax(F.softmax(logits), dim=1)
        correct = pred.eq(labels)
        t_total_correct += correct.sum().item()
        t_total_len += len(labels)

print('Test accuracy: ', t_total_correct / t_total_len)

  cpuset_checked))
  from ipykernel import kernelapp as app


Test accuracy:  0.8


In [15]:
def test_model(model, seq):
    cate = ["정치","경제","사회", "생활/문화","세계","기술/IT", "연예", "스포츠"]
    tmp = [seq]
    encoded_list = [tokenizer.encode(t,add_special_tokens=True) for t in tmp]
    padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
    sample = torch.tensor(padded_list)

    labels = torch.tensor([1]).unsqueeze(0)
    sample = sample.to(device)
    labels = labels.to(device)
    _, logits = model(sample, labels=labels)

    pred = torch.argmax(F.softmax(logits), dim=1)

    print("뉴스의 카테고리는:", cate[pred])
    print("신뢰도는:", "{:.2f}%".format(F.softmax(logits).max().item() * 100))

In [16]:
test_model(bert_category_clf, "SK텔레콤 분사 의결... '2025년 순자산 75조, 연 매출 22조 목표'")

뉴스의 카테고리는: 기술/IT
신뢰도는: 86.46%


  del sys.path[0]
  app.launch_new_instance()
