In [None]:
# !pip install pytorch-transformers
# !git clone https://github.com/HyunjoonCho/CS492I-IntroToDL-project.git
# import os
# os.chdir('CS492I-IntroToDL-project')

In [2]:
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from pytorch_transformers import BertTokenizer, BertForSequenceClassification
import os
import random
from pathlib import Path

In [None]:
# from google.colab import drive
# drive.mount('/gdrive')

# drive_root = '/gdrive/My Drive/CS492I/project-pretrain'
# print(os.listdir(Path(drive_root)))

In [None]:
from easydict import EasyDict as edict

# Run configuration: every value a reader might want to tune lives on `args`.
args = edict()
args.gpu = True  # use CUDA when it is available, otherwise fall back to CPU
args.batch_size = 4
args.num_epochs = 15
args.learning_rate = 1e-6
args.seed = 42

# Seed the RNGs this notebook draws from (random.sample for the data split,
# torch for shuffling and weight initialisation) so that a fresh
# "Restart & Run All" is repeatable, provided the directory listing order is stable.
random.seed(args.seed)
torch.manual_seed(args.seed)

device = 'cuda' if torch.cuda.is_available() and args.gpu else 'cpu'

In [None]:
# The tokenizer and the classifier are both built from the same pretrained checkpoint.
PRETRAINED_NAME = 'bert-base-multilingual-cased'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
bert_fluctuation_clf = BertForSequenceClassification.from_pretrained(
    PRETRAINED_NAME,
    num_labels=4,  # four fluctuation classes
)

In [7]:
# Each split is a list of [text, category] records; the category is the name
# of the directory the file was read from.
dataset_train, dataset_val, dataset_test = [], [], []

root = Path('dataset/fluctuation')

for year in os.listdir(root):
    year_dir = root / year
    categories = os.listdir(year_dir)
    # Every category contributes as many files as the smallest category of that year has.
    min_count = min(len(os.listdir(year_dir / cat)) for cat in categories)
    train_end = int(min_count * 0.75)
    val_end = train_end + min_count // 8
    for cat in categories:
        cat_dir = year_dir / cat
        texts = []
        for name in random.sample(os.listdir(cat_dir), min_count):
            with open(cat_dir / name, 'r', encoding='utf-8') as handle:
                texts.append(handle.read())
        # First 75% -> train, next 1/8 -> validation, remainder -> test.
        dataset_train.extend([text, cat] for text in texts[:train_end])
        dataset_val.extend([text, cat] for text in texts[train_end:val_end])
        dataset_test.extend([text, cat] for text in texts[val_end:])

print(len(dataset_train), len(dataset_val), len(dataset_test))

3604 596 612


In [6]:
# Peek at the first training record: the leading 64 characters of its text, then its label.
first_text, first_label = dataset_train[0]
print(first_text[:64])  # sentence
print(first_label)  # label

[솔루션 구축 우수 사례] SK하이닉스, 컨테이너 앱 플랫폼
0


In [None]:
class BERTDataset(Dataset):
    """Dataset of (truncated text, label) pairs built from a list of records.

    Args:
        dataset: sequence of indexable records, e.g. ``[text, label]`` lists.
        sent_idx: position of the text inside each record.
        label_idx: position of the label inside each record.
        max_chars: number of leading characters of each text to keep.
            Defaults to 64 (the previously hard-coded value); ``None`` keeps
            the whole text.
    """

    def __init__(self, dataset, sent_idx, label_idx, max_chars=64):
        # record[sent_idx] is the body text; only its first `max_chars` characters are kept.
        self.sentences = [record[sent_idx][:max_chars] for record in dataset]
        # Labels are stored exactly as given (no conversion).
        self.labels = [record[label_idx] for record in dataset]

    def __getitem__(self, i):
        """Return the ``(text, label)`` pair at position ``i``."""
        return self.sentences[i], self.labels[i]

    def __len__(self):
        """Return the number of records."""
        return len(self.labels)

In [None]:
# Every record is [text, label]: index 0 is the text and index 1 is the label.
data_train, data_val, data_test = (
    BERTDataset(split, 0, 1)
    for split in (dataset_train, dataset_val, dataset_test)
)

In [None]:
# Only the training loader is shuffled. The validation and test passes just
# accumulate loss and accuracy over the whole split, so shuffling them adds
# nothing and makes batch contents differ from run to run.
train_dataloader = DataLoader(data_train, batch_size=args.batch_size, num_workers=5, shuffle=True)
val_dataloader = DataLoader(data_val, batch_size=args.batch_size, num_workers=5, shuffle=False)
test_dataloader = DataLoader(data_test, batch_size=args.batch_size, num_workers=5, shuffle=False)

In [None]:
# Move the classifier to the selected device ('cuda' or 'cpu'). Because this is
# the cell's last expression, the returned module's repr is displayed as output.
bert_fluctuation_clf.to(device)

In [None]:
def save_model(model, mode='last'):
    """Save ``model.state_dict()`` into the ``pretrained_models`` directory.

    The checkpoint is written to
    ``pretrained_models/<ModelClassName>_Fluctuation_<mode>.ckpt``. The
    directory is created first if it is missing, so the call no longer fails
    on a fresh checkout.

    Args:
        model: module whose ``state_dict()`` is serialized.
        mode: tag placed in the file name (the notebook uses 'last' and 'best').
    """
    ckpt_dir = Path('pretrained_models')
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    torch.save(model.state_dict(), ckpt_dir / f'{type(model).__name__}_Fluctuation_{mode}.ckpt')
    # Alternative target when running with Google Drive mounted:
    # torch.save(model.state_dict(), Path(drive_root) / f'{type(model).__name__}_Category_{mode}.ckpt')

In [None]:
optimizer = optim.AdamW(bert_fluctuation_clf.parameters(), lr=args.learning_rate)
best_val_loss = float('inf')

for epoch in range(args.num_epochs):
    
    bert_fluctuation_clf.train()
    train_loss = 0
    total_len = 0
    total_correct = 0
    for sentence, label in train_dataloader:
        optimizer.zero_grad()
        
        encoded_list = [tokenizer.encode(t,add_special_tokens=True) for t in sentence]
        padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
        
        sample = torch.tensor(padded_list)
        label = tuple((int(x[0])) for x in label)
        label = torch.tensor(label)
        sample = sample.to(device)
        label = label.to(device)
        
        labels = torch.tensor(label)
        loss, logits = bert_fluctuation_clf(sample, labels=labels)

        pred = torch.argmax(F.softmax(logits), dim=1)        
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
    print('[Epoch {}/{}] -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch + 1, args.num_epochs, train_loss, total_correct / total_len))
    
    with torch.no_grad():
        bert_fluctuation_clf.eval()
        val_loss = 0
        v_total_correct = 0
        v_total_len = 0
        for sentence, label in val_dataloader:
            encoded_list = [tokenizer.encode(t,add_special_tokens=True) for t in sentence]
            padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
            
            sample = torch.tensor(padded_list)
            label = tuple((int(x[0])) for x in label)
            label = torch.tensor(label)
            sample = sample.to(device)
            label = label.to(device)
            
            labels = torch.tensor(label)
            loss, logits = bert_fluctuation_clf(sample, labels=labels)
            
            pred = torch.argmax(F.softmax(logits), dim=1)        
            correct = pred.eq(labels)
            val_loss += loss.item()
            v_total_correct += correct.sum().item()
            v_total_len += len(labels)
        print('[Epoch {}/{}] -> Validation Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch + 1, args.num_epochs, val_loss, v_total_correct / v_total_len))
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        save_model(bert_fluctuation_clf, 'best')
    save_model(bert_fluctuation_clf)

In [None]:
bert_fluctuation_clf.eval()

total_loss = 0
total_len = 0
total_correct = 0
with torch.no_grad():
    for sentence, label in test_dataloader:
        encoded_list = [tokenizer.encode(t,add_special_tokens=True) for t in sentence]
        padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
        sample = torch.tensor(padded_list)
        label = tuple((int(x[0])) for x in label)
        label = torch.tensor(label)
        sample = sample.to(device)
        label = label.to(device)
            
        labels = torch.tensor(label)
        _, logits = bert_fluctuation_clf(sample, labels=labels)

        pred = torch.argmax(F.softmax(logits), dim=1)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)

print('Test accuracy: ', total_correct / total_len)

In [None]:
def test_model(model, seq):
    cate = ["대폭 하락","소폭 하락","소폭 상승", "대폭 상승"]
    tmp = [seq]
    encoded_list = [tokenizer.encode(t,add_special_tokens=True) for t in tmp]
    padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
    sample = torch.tensor(padded_list)

    labels = torch.tensor([1]).unsqueeze(0)
    sample = sample.to(device)
    labels = labels.to(device)
    _, logits = model(sample, labels=labels)

    pred = torch.argmax(F.softmax(logits), dim=1)

    print("주가 변동은:", cate[pred])
    print("신뢰도는:", "{:.2f}%".format(F.softmax(logits).max().item() * 100))

test_model(bert_fluctuation_clf, ""갤Z폴드·플립3 대단하네"…삼성, 3분기 폴더블폰 시장 '싹쓸이'")