In [1]:
# 모듈 불러오기
import dill
import time
import random
import numpy as np
from sklearn.metrics import roc_curve, auc

import nltk

nltk.download("punkt")
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn

from torchtext.legacy.data import Field
from torchtext.legacy.data import TabularDataset
from torchtext.legacy.data import BucketIterator
from torchtext.legacy.data import Iterator

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amole\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# 환경 세팅
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

DATA_PATH = "data/processed/"

In [5]:
#데이터 불러오기

# torchtext.Field를 이용해 필드를 정의

# 문장 필드 정의
TEXT = Field(
    sequential=True, # sequential=True로 설정하여 이 필드에 문장이 들어옴을 알려준다,
    use_vocab=True, # 단어를 숫자로 변환시켜주는 단어장을 만들기 위해 사용
    tokenize=word_tokenize, # 불러온 문장을 토크나이징 할 함수 설정
    lower=True, # 대소문자를 구분할지 설정, 여기서는 모두 소문자로 변경
    batch_first=True, # 데이터의 형태를 (배치, 문장)꼴로 설정
)

# 라벨 필드 정의
LABEL = Field(
    sequential=False,
    use_vocab=False,
    batch_first=True,
)

In [10]:
# 전처리를 끝낸 수능 데이터(tsv) 불러오기
sat_train_data, sat_valid_data, sat_test_data = TabularDataset.splits(
    path=DATA_PATH,
    train="sat_train.tsv",
    validation="sat_valid.tsv",
    test="sat_test.tsv",
    format="tsv",
    fields=[("text", TEXT), ("label", LABEL)], # (1)
    skip_header=1, # (2)
)

# Data Loader 정의
train_loader, valid_loader, test_loader = BucketIterator.splits(
    (sat_train_data, sat_valid_data, sat_test_data), # (3)
    batch_size=8,
    device=None,
    sort=False,
)

TEXT.build_vocab(sat_train_data, min_freq=2) # (4)

'''
(1) 앞에서 정의한 Field를 입력해주는 부분, 입력할 때 실제 데이터의 컬럼 순서로 입력해주어야 한다.
그리고 Field의 이름과 그 Field를 묶는다.

(2) 데이터의 첫 번째 열에는 원래의 컬럼명이 들어있다.
데이터로 사용되지 않으므로 생략한다.

(3) 앞에서 불러온 데이터를 묶어서 입력

(4) 마지막으로 불러온 데이터 중 훈련 데이터를 이용해 TEXT의 단어장을 생성
그중 2번 이상 나온 단어말을 단어장에 시용


'''



'\n(1) 앞에서 정의한 Field를 입력해주는 부분, 입력할 때 실제 데이터의 컬럼 순서로 입력해주어야 한다.\n그리고 Field의 이름과 그 Field를 묶는다.\n\n(2) 데이터의 첫 번째 열에는 원래의 컬럼명이 들어있다.\n데이터로 사용되지 않으므로 생략한다.\n\n(3) 앞에서 불러온 데이터를 묶어서 입력\n\n(4) 마지막으로 불러온 데이터 중 훈련 데이터를 이용해 TEXT의 단어장을 생성\n그중 2번 이상 나온 단어말을 단어장에 시용\n\n\n'

In [11]:
# 모델 클래스 정의
class LSTMClassifier(nn.Module):
    def __init__(
        self, 
        num_embeddings, 
        embedding_dim, 
        hidden_size, 
        num_layers, 
        pad_idx
        ):   
        super().__init__()
        self.embed_layer = nn.Embedding(
            # 생성할 Embedding Layer의 크기
            num_embeddings = num_embeddings,
            embedding_dim = embedding_dim,
            #배치별로 문장의 크기를 맞추기 위해 짧은 문장에 Padding을 붙여서 길이를 맞춘다.
            padding_idx=pad_idx
        )
        self.lstm_layer = nn.LSTM(
            input_size = embedding_dim,
            hidden_size = hidden_size,
            num_layers = num_layers,
            bidirectional = True,
            dropout = 0.5
        )
        self.last_layer = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.Dropout(0.5),
            nn.LeakyReLU(),                 
            nn.Linear(hidden_size, 1), # (1)
            nn.Sigmoid(),
        )
        
        """
        (1) output의 크기를 1로 두어 각각의 문장이 문법적으로 맞을 점수를 계산
        만일 output의 크기가 2라면 각각의 문장이 문법적으로 맞는지 틀렸는지에 대한
        점수 2개를 구할 수 있다.
        우리는 각각의 문장이 맞았는지 점수를 확인하여 가장 점수가 낮은 문장을 정답으로 고르기 위해
        output의 값을 1로 둔다.
         
        (2) 우리가 구한 점수를 확률로 변환하기 위해 sigmoid를 마지막 층으로 둔다. 
        """  
            
    #모델의 파이프라인 정의    
    def forward(self, x):
        # 숫자로 이루어진 토큰을 input으로 받아 이 값들을 Embedding값으로 변환
        embed_x = self.embed_layer(x)
        ''' 
        LSTM은 output, (Hidden State, Celss State)을 반환한다.
        이 중 state 값들은 사용하지 않으므로 반환하지 않는다. 
        '''
        output, (_, _) = self.lstm_layer(embed_x)
        '''
        LSTM의 output은 batch size, 문장 길이, Output size라는
        size를 가지고 있다. 우리는 가장 마지막 단어의 결과값을 사용한다.
        '''
        last_output = output[:, -1, :]
        ''' 
        문장의 마지막 단어의 output을 Fully Connected Layer에 통과시켜 확률값을 계산한다.
        '''
        last_output = self.last_layer(last_output)
        return last_output

In [16]:
# train, evaluate, test 정의

# 모델의 학습 함수 정의
def train(model, train_loader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0

    for batch in train_loader: # 입력받은 Data Loader을 호출해 Batch를 부름
        optimizer.zero_grad() 
        text = batch.text # Batch는 2개의 Attribute를 가지고 있는데 여기서 text는 batch의 문장 label은 batch의 정답을 의미
        if text.shape[0] > 1:
            label = batch.label.type(torch.FloatTensor)
            text = text.to(device)
            label = label.to(device)
            output = model(text).flatten()
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

    return epoch_loss / len(train_loader)

# 모델의 펴가 함수 정의
def evaluate(model, valid_loader, criterion, device):
    model.eval()
    ''' 
    Dropout과 같이 훈련과 평가의 동작이 다른 모듈들은 각 목적에 맞게 변화를 주어야 한다.
    여기서는 평가를 하기 위해 model.eval()을 먼저 선언하였다.
    '''
    epoch_loss = 0

    with torch.no_grad():
        ''' 
        torch에서는 기본적으로 Forward를 할 때 자동으로 Gradient를 계산한다
        하지만 평가를 진행하므로 Gradient를 계산할 필요가 없다.
        '''
        
        for _, batch in enumerate(valid_loader):
            text = batch.text
            label = batch.label.type(torch.FloatTensor)
            text = text.to(device)
            label = label.to(device)
            output = model(text).flatten()
            loss = criterion(output, label)
            epoch_loss += loss.item()

    return epoch_loss / len(valid_loader)

# 테스트 함수 정의
def test(model, test_loader, device):
    model.eval()
    with torch.no_grad():
        y_real = []
        y_pred = []
        
        for batch in test_loader:
            text = batch.text
            label = batch.label.type(torch.FloatTensor)
            text = text.to(device)
            
            output = model(text).flatten().cpu()
            
            y_real += [label]
            y_pred += [output]
            
        y_real = torch.cat(y_real)
        y_pred = torch.cat(y_pred)

    fpr, tpr, _ = roc_curve(y_real, y_pred)
    auroc = auc(fpr, tpr)

    return auroc


def epoch_time(start_time: int, end_time: int):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [18]:
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] # (1)
N_EPOCHS = 20

lstm_classifier = LSTMClassifier(   # (2)
    num_embeddings=len(TEXT.vocab),
    embedding_dim=100,
    hidden_size=200,
    num_layers=4,
    pad_idx=PAD_IDX,
)
if torch.cuda.is_available(): # (3)
    device = "cuda:0"
else:
    device = "cpu"
_ = lstm_classifier.to(device)

optimizer = torch.optim.Adam(lstm_classifier.parameters()) # (4)
bce_loss_fn = nn.BCELoss() # (5)

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(lstm_classifier, train_loader, optimizer, bce_loss_fn, device)
    valid_loss = evaluate(lstm_classifier, valid_loader, bce_loss_fn, device)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f"Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.5f}")
    print(f"\t Val. Loss: {valid_loss:.5f}")
    
    
    ''' 
    (1) Embedding Layer에 사용한 Padding Index를 가져온다. 
    TEXT.vocab.stoi는 앞서 만든 단어장에서 단어를 토큰으로 만들어주는 Dic이다
    반대로 TEXT.vocab.itos를 통해 토큰을 단어로 바꿀 수 있다.
    
    (2) 학습시킬 모델을 정의
    
    (3) GPU를 사용할 수 있다면 사용한다.
    
    (4) Optimizer는 Adam
    
    (5) 손실함수는 Binary Cross Entropy
    '''

Epoch: 01 | Time: 0m 1s
	Train Loss: 0.54992
	 Val. Loss: 0.55605
Epoch: 02 | Time: 0m 1s
	Train Loss: 0.44309
	 Val. Loss: 0.52379
Epoch: 03 | Time: 0m 1s
	Train Loss: 0.40979
	 Val. Loss: 0.52337
Epoch: 04 | Time: 0m 1s
	Train Loss: 0.45662
	 Val. Loss: 0.51289
Epoch: 05 | Time: 0m 1s
	Train Loss: 0.42997
	 Val. Loss: 0.50872
Epoch: 06 | Time: 0m 1s
	Train Loss: 0.41866
	 Val. Loss: 0.50453
Epoch: 07 | Time: 0m 1s
	Train Loss: 0.42126
	 Val. Loss: 0.50724
Epoch: 08 | Time: 0m 1s
	Train Loss: 0.44132
	 Val. Loss: 0.50123
Epoch: 09 | Time: 0m 1s
	Train Loss: 0.41032
	 Val. Loss: 0.50295
Epoch: 10 | Time: 0m 1s
	Train Loss: 0.45836
	 Val. Loss: 0.50634
Epoch: 11 | Time: 0m 1s
	Train Loss: 0.42594
	 Val. Loss: 0.50948
Epoch: 12 | Time: 0m 1s
	Train Loss: 0.43285
	 Val. Loss: 0.51192
Epoch: 13 | Time: 0m 1s
	Train Loss: 0.44799
	 Val. Loss: 0.51244
Epoch: 14 | Time: 0m 1s
	Train Loss: 0.41737
	 Val. Loss: 0.51356
Epoch: 15 | Time: 0m 1s
	Train Loss: 0.43713
	 Val. Loss: 0.52342
Epoch: 16 

In [19]:
# 테스트룰 통한 모델의 성능 확인

_ = lstm_classifier.cpu()
test_auroc = test(lstm_classifier, sat_test_iterator, "cpu")

print(f"SAT Dataset Test AUROC: {test_auroc:.5f}")

SAT Dataset Test AUROC: 0.84615


In [20]:
# 다른 곳에 쓰기위해 dill 적용
with open("baseline_model.dill", "wb") as f:
    model = {
        "TEXT": TEXT,
        "LABEL": LABEL,
        "classifier": lstm_classifier
    }
    dill.dump(model, f)

In [22]:
# 모델의 성능 높이기
# 사전학습 모델 사용

# 사전 학습 데이터 불러오기(CoLA)
TEXT = Field(
    sequential=True,
    use_vocab=True,
    tokenize=word_tokenize,
    lower=True,
    batch_first=True,
)
LABEL = Field(
    sequential=False,
    use_vocab=False,
    batch_first=True,
)


cola_train_data, cola_valid_data, cola_test_data = TabularDataset.splits(
    path=DATA_PATH,
    train="cola_train.tsv",
    validation="cola_valid.tsv",
    test="cola_test.tsv",
    format="tsv",
    fields=[("text", TEXT), ("label", LABEL)],
    skip_header=1
)

TEXT.build_vocab(cola_train_data, min_freq=2) # (1)
cola_train_iterator, cola_valid_iterator, cola_test_iterator = BucketIterator.splits(
    (cola_train_data, cola_valid_data, cola_test_data), 
    batch_size=32, 
    device=None,
    sort=False,
)

''' 
(1) 사전 학습을 할 때 중요한 점은 사전학습 때 이용한 모델의 단어장을 유지하는것이다.
예를 들어 A모델의 Hi라는 단어는 1번 토큰인데 B 모델에서는 2번 토큰이라면 같은 단어지만
토큰 값이 달라지기에 모델의 성능이 보장되지 않는다. 따라서 CoLA데이터로 만든 단어장을 계속 사용한다.
'''


# 추가 학습 데이터 불러오기 (수능)
sat_train_data, sat_valid_data, sat_test_data = TabularDataset.splits(
    path=DATA_PATH,
    train="sat_train.tsv",
    validation="sat_valid.tsv",
    test="sat_test.tsv",
    format="tsv",
    fields=[("text", TEXT), ("label", LABEL)],
    skip_header=1
)

train_loader, valid_loader, test_loader = BucketIterator.splits(
    (sat_train_data, sat_valid_data, sat_test_data),
    batch_size=8,
    device=None,
    sort=False,
)


In [24]:
# 데이터 사전 학습(CoLA)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
N_EPOCHS = 20

lstm_pool_classifier = LSTMClassifier(
    num_embeddings=len(TEXT.vocab),
    embedding_dim=100,
    hidden_size=200,
    num_layers=4,
    pad_idx=PAD_IDX,
)

if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
_ = lstm_pool_classifier.to(device)

optimizer = torch.optim.Adam(lstm_pool_classifier.parameters())
bce_loss_fn = nn.BCELoss()

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(
        lstm_pool_classifier, 
        cola_train_iterator, # CoLA를 활용하여 사전학습
        optimizer, 
        bce_loss_fn, 
        device
    )
    valid_loss = evaluate(
        lstm_pool_classifier, 
        cola_valid_iterator, 
        bce_loss_fn, 
        device
    )

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.5f}')
    print(f'\t Val. Loss: {valid_loss:.5f}')

Epoch: 01 | Time: 0m 54s
	Train Loss: 0.61252
	 Val. Loss: 0.61964
Epoch: 02 | Time: 0m 52s
	Train Loss: 0.61073
	 Val. Loss: 0.61661
Epoch: 03 | Time: 0m 52s
	Train Loss: 0.61001
	 Val. Loss: 0.61787
Epoch: 04 | Time: 0m 52s
	Train Loss: 0.60996
	 Val. Loss: 0.61968
Epoch: 05 | Time: 0m 53s
	Train Loss: 0.60980
	 Val. Loss: 0.61847
Epoch: 06 | Time: 0m 53s
	Train Loss: 0.61029
	 Val. Loss: 0.61846
Epoch: 07 | Time: 0m 53s
	Train Loss: 0.60923
	 Val. Loss: 0.61790
Epoch: 08 | Time: 0m 54s
	Train Loss: 0.60791
	 Val. Loss: 0.61851
Epoch: 09 | Time: 0m 54s
	Train Loss: 0.60865
	 Val. Loss: 0.61844
Epoch: 10 | Time: 0m 54s
	Train Loss: 0.60943
	 Val. Loss: 0.61852
Epoch: 11 | Time: 0m 53s
	Train Loss: 0.60824
	 Val. Loss: 0.61832
Epoch: 12 | Time: 0m 50s
	Train Loss: 0.60884
	 Val. Loss: 0.61852
Epoch: 13 | Time: 0m 51s
	Train Loss: 0.61001
	 Val. Loss: 0.61819
Epoch: 14 | Time: 0m 51s
	Train Loss: 0.60885
	 Val. Loss: 0.61931
Epoch: 15 | Time: 0m 50s
	Train Loss: 0.60902
	 Val. Loss: 0.6

In [26]:
# 성능 비교를 위한 모델 저장
from copy import deepcopy
before_tuning_lstm_pool_classifier = deepcopy(lstm_pool_classifier)

In [27]:
# 수능 데이터를 이용한 추가학습(Fine-tuning)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
N_EPOCHS = 20


for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(lstm_pool_classifier, sat_train_iterator, optimizer, bce_loss_fn, device)
    valid_loss = evaluate(lstm_pool_classifier, sat_valid_iterator, bce_loss_fn, device)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.5f}')
    print(f'\t Val. Loss: {valid_loss:.5f}')

Epoch: 01 | Time: 0m 1s
	Train Loss: 0.46574
	 Val. Loss: 0.55346
Epoch: 02 | Time: 0m 1s
	Train Loss: 0.44855
	 Val. Loss: 0.53627
Epoch: 03 | Time: 0m 1s
	Train Loss: 0.43000
	 Val. Loss: 0.55400
Epoch: 04 | Time: 0m 1s
	Train Loss: 0.41707
	 Val. Loss: 0.52983
Epoch: 05 | Time: 0m 1s
	Train Loss: 0.43516
	 Val. Loss: 0.52489
Epoch: 06 | Time: 0m 1s
	Train Loss: 0.40128
	 Val. Loss: 0.53323
Epoch: 07 | Time: 0m 1s
	Train Loss: 0.40610
	 Val. Loss: 0.54822
Epoch: 08 | Time: 0m 1s
	Train Loss: 0.43160
	 Val. Loss: 0.53430
Epoch: 09 | Time: 0m 1s
	Train Loss: 0.40920
	 Val. Loss: 0.53643
Epoch: 10 | Time: 0m 1s
	Train Loss: 0.41149
	 Val. Loss: 0.53729
Epoch: 11 | Time: 0m 1s
	Train Loss: 0.39315
	 Val. Loss: 0.52993
Epoch: 12 | Time: 0m 1s
	Train Loss: 0.43080
	 Val. Loss: 0.51977
Epoch: 13 | Time: 0m 1s
	Train Loss: 0.42554
	 Val. Loss: 0.51972
Epoch: 14 | Time: 0m 1s
	Train Loss: 0.42022
	 Val. Loss: 0.52652
Epoch: 15 | Time: 0m 1s
	Train Loss: 0.42187
	 Val. Loss: 0.52371
Epoch: 16 

In [28]:
# 모델의 성능 비교
_ = before_tuning_lstm_pool_classifier.cpu()
_ = lstm_pool_classifier.cpu()

pool_sat_test_auroc = test(before_tuning_lstm_pool_classifier, sat_test_iterator, "cpu")
pool_tuned_test_auroc = test(lstm_pool_classifier, sat_test_iterator, "cpu")

print(f"Before fine-tuning SAT Dataset Test AUROC: {pool_sat_test_auroc:.5f}")
print(f"After fine-tuning SAT Dataset Test AUROC: {pool_tuned_test_auroc:.5f}")

Before fine-tuning SAT Dataset Test AUROC: 0.57692
After fine-tuning SAT Dataset Test AUROC: 0.73077


In [29]:
# 모델 저장
with open("advanced_before_tuning_model.dill", "wb") as f:
    model = {
        "TEXT": TEXT,
        "LABEL": LABEL,
        "classifier": before_tuning_lstm_pool_classifier
    }
    dill.dump(model, f)

with open("advanced_after_tuning_model.dill", "wb") as f:
    model = {
        "TEXT": TEXT,
        "LABEL": LABEL,
        "classifier": lstm_pool_classifier
    }
    dill.dump(model, f)

In [32]:
# 심화 모델

class LSTMPoolingClassifier(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_size, num_layers, pad_idx):
        super().__init__()
        self.embed_layer = nn.Embedding(
            num_embeddings=num_embeddings, 
            embedding_dim=embedding_dim, 
            padding_idx=pad_idx
        )
        self.lstm_layer = nn.LSTM(
            input_size = embedding_dim,
            hidden_size = hidden_size,
            num_layers = num_layers,
            bidirectional=True,
            dropout = 0.5,
            batch_first = True
        )
        self.last_layer = nn.Sequential(
            nn.Linear(2 * hidden_size, 1), 
            nn.Dropout(p = 0.5),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.embed_layer(x) # Token으로 들어온 데이터를 Embedding하여 값으로 변환
        output, _ = self.lstm_layer(x) # 변환된 값들을 lstm에 넣는다.
        pool = nn.functional.max_pool1d(output.transpose(1, 2), x.shape[1]) # LSTM의 결과를 Max Pooling
        pool = pool.transpose(1, 2).squeeze() # Max Pooling한 결과를 Fully Connected Layer에 넣기위해 shape를 맞춘다.
        output = self.last_layer(pool) # Fully Connected Layer에 넣어서 결과를 반환
        return output.squeeze()

In [33]:
# 모델 사전학습
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
N_EPOCHS = 20

lstm_pool_classifier = LSTMPoolingClassifier(
    num_embeddings=len(TEXT.vocab),
    embedding_dim=100,
    hidden_size=200,
    num_layers=4,
    pad_idx=PAD_IDX,
)

if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
_ = lstm_pool_classifier.to(device)

optimizer = torch.optim.Adam(lstm_pool_classifier.parameters())
bce_loss_fn = nn.BCELoss()

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(
        lstm_pool_classifier, 
        cola_train_iterator, 
        optimizer, 
        bce_loss_fn, 
        device
    )
    valid_loss = evaluate(
        lstm_pool_classifier, 
        cola_valid_iterator, 
        bce_loss_fn, 
        device
    )

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.5f}')
    print(f'\t Val. Loss: {valid_loss:.5f}')

Epoch: 01 | Time: 0m 46s
	Train Loss: 0.65214
	 Val. Loss: 0.63278
Epoch: 02 | Time: 0m 45s
	Train Loss: 0.64445
	 Val. Loss: 0.61260
Epoch: 03 | Time: 0m 46s
	Train Loss: 0.64414
	 Val. Loss: 0.62399
Epoch: 04 | Time: 0m 46s
	Train Loss: 0.64183
	 Val. Loss: 0.63672
Epoch: 05 | Time: 0m 46s
	Train Loss: 0.63227
	 Val. Loss: 0.62205
Epoch: 06 | Time: 0m 46s
	Train Loss: 0.61660
	 Val. Loss: 0.61360
Epoch: 07 | Time: 0m 46s
	Train Loss: 0.60698
	 Val. Loss: 0.60756
Epoch: 08 | Time: 0m 47s
	Train Loss: 0.58776
	 Val. Loss: 0.60691
Epoch: 09 | Time: 0m 48s
	Train Loss: 0.57337
	 Val. Loss: 0.60137
Epoch: 10 | Time: 0m 47s
	Train Loss: 0.55733
	 Val. Loss: 0.59036
Epoch: 11 | Time: 0m 47s
	Train Loss: 0.54621
	 Val. Loss: 0.61912
Epoch: 12 | Time: 0m 47s
	Train Loss: 0.52138
	 Val. Loss: 0.62994
Epoch: 13 | Time: 0m 44s
	Train Loss: 0.51827
	 Val. Loss: 0.63370
Epoch: 14 | Time: 0m 44s
	Train Loss: 0.50092
	 Val. Loss: 0.62309
Epoch: 15 | Time: 0m 44s
	Train Loss: 0.48234
	 Val. Loss: 0.6

In [34]:
before_tuning_lstm_pool_classifier = deepcopy(lstm_pool_classifier)

In [35]:
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
N_EPOCHS = 20


for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(lstm_pool_classifier, sat_train_iterator, optimizer, bce_loss_fn, device)
    valid_loss = evaluate(lstm_pool_classifier, sat_valid_iterator, bce_loss_fn, device)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.5f}')
    print(f'\t Val. Loss: {valid_loss:.5f}')

Epoch: 01 | Time: 0m 2s
	Train Loss: 0.64142
	 Val. Loss: 0.62754
Epoch: 02 | Time: 0m 2s
	Train Loss: 0.47164
	 Val. Loss: 0.57256
Epoch: 03 | Time: 0m 2s
	Train Loss: 0.58599
	 Val. Loss: 0.57338
Epoch: 04 | Time: 0m 2s
	Train Loss: 0.52337
	 Val. Loss: 0.55623
Epoch: 05 | Time: 0m 2s
	Train Loss: 0.53581
	 Val. Loss: 0.56962
Epoch: 06 | Time: 0m 2s
	Train Loss: 0.55875
	 Val. Loss: 0.57594
Epoch: 07 | Time: 0m 1s
	Train Loss: 0.46472
	 Val. Loss: 0.55337
Epoch: 08 | Time: 0m 1s
	Train Loss: 0.49379
	 Val. Loss: 0.56442
Epoch: 09 | Time: 0m 1s
	Train Loss: 0.48446
	 Val. Loss: 0.53579
Epoch: 10 | Time: 0m 1s
	Train Loss: 0.42478
	 Val. Loss: 0.53740
Epoch: 11 | Time: 0m 2s
	Train Loss: 0.48835
	 Val. Loss: 0.59688
Epoch: 12 | Time: 0m 2s
	Train Loss: 0.44362
	 Val. Loss: 0.57973
Epoch: 13 | Time: 0m 2s
	Train Loss: 0.40199
	 Val. Loss: 0.62557
Epoch: 14 | Time: 0m 1s
	Train Loss: 0.37755
	 Val. Loss: 0.69300
Epoch: 15 | Time: 0m 1s
	Train Loss: 0.33619
	 Val. Loss: 0.68453
Epoch: 16 

In [36]:
_ = before_tuning_lstm_pool_classifier.cpu()
_ = lstm_pool_classifier.cpu()

pool_sat_test_auroc = test(before_tuning_lstm_pool_classifier, sat_test_iterator, "cpu")
pool_tuned_test_auroc = test(lstm_pool_classifier, sat_test_iterator, "cpu")

print(f"Before fine-tuning SAT Dataset Test AUROC: {pool_sat_test_auroc:.5f}")
print(f"After fine-tuning SAT Dataset Test AUROC: {pool_tuned_test_auroc:.5f}")

Before fine-tuning SAT Dataset Test AUROC: 0.15385
After fine-tuning SAT Dataset Test AUROC: 0.42308


In [37]:
with open("advanced_before_tuning_model.dill", "wb") as f:
    model = {
        "TEXT": TEXT,
        "LABEL": LABEL,
        "classifier": before_tuning_lstm_pool_classifier
    }
    dill.dump(model, f)

In [38]:
# 성능 비교

def test(model_path):
    with open(model_path, "rb") as f:
        model = dill.load(f)
        
    sat_test_data = TabularDataset(
        path=f"{DATA_PATH}/sat_test.tsv",
        format="tsv",
        fields=[("text", model["TEXT"]), ("label", model["LABEL"])],
        skip_header=1
    )

    sat_test_iterator = BucketIterator(
        sat_test_data,
        batch_size=8, 
        device=None,
        sort=False,
        shuffle=False
    )
    classifier = model["classifier"]
    with torch.no_grad():
        y_real = []
        y_pred = []
        classifier.eval()
        for batch in sat_test_iterator:
            text = batch.text
            label = batch.label.type(torch.FloatTensor)

            output = classifier(text).flatten().cpu()
 
            y_real += [label]
            y_pred += [output]

        y_real = torch.cat(y_real)
        y_pred = torch.cat(y_pred)

    fpr, tpr, _ = roc_curve(y_real, y_pred)
    auroc = auc(fpr, tpr)

    return auroc.round(5)

In [39]:
model_list = [
    "baseline_model.dill",
    "before_tuning_model.dill",
    "after_tuning_model.dill",
    "advanced_before_tuning_model.dill",
    "advanced_after_tuning_model.dill",
]

test_auroc = []
for file_name in model_list:
    model_name = file_name.replace(".dill", "")
    auroc = test(file_name)
    test_auroc += [(model_name, auroc)]

In [40]:
test_auroc = sorted(test_auroc, key=lambda x: x[1], reverse=True)
for rank, (model_name, auroc) in enumerate(test_auroc):
    print(f"Rank {rank+1} - {model_name:30} - Test AUROC: {auroc:.5f}")

Rank 1 - baseline_model                 - Test AUROC: 0.84615
Rank 2 - before_tuning_model            - Test AUROC: 0.84615
Rank 3 - after_tuning_model             - Test AUROC: 0.73077
Rank 4 - advanced_after_tuning_model    - Test AUROC: 0.73077
Rank 5 - advanced_before_tuning_model   - Test AUROC: 0.15385


In [41]:
# 문제풀이
def predict_problem(model_path, problem):
    with open(model_path, "rb") as f:
        model = dill.load(f)
    TEXT = model["TEXT"]
    classifier = model["classifier"]

    problem = list(map(lambda x: x.replace("[", "").replace("]", ""), problem))
    tokenized_sentences = [word_tokenize(sentence) for sentence in problem]
    sentences = []
    for tokenized_sentence in tokenized_sentences:
        sentences.append([TEXT.vocab.stoi[word] for word in tokenized_sentence])
    
    with torch.no_grad():
        classifier.eval()
        predict = []
        for sentence in sentences:
            sentence = torch.LongTensor([sentence])
            predict += [classifier(sentence).item()]
    return predict


def predict_problem_with_models(model_list, problem):
    scores = {}
    for file_name in model_list:
        model_name = file_name.replace(".dill", "")
        score = predict_problem(file_name, problem)
        scores[model_name] = score

    score_df = pd.DataFrame(scores).T
    score_df.columns = [f"answer_{i}_score" for i in range(1,6)]

    selected_answer = pd.Series(np.argmin(score_df.values, 1)+1, index=score_df.index, name="selected_answer")

    return pd.concat([selected_answer, score_df], 1)

In [42]:
problem_1 = [ 
    "Competitive activities can be more than just performance showcases which the best is recognized and the rest are overlooked.",
    "The provision of timely, constructive feedback to participants on performance is an asset that some competitions and contests offer.",
    "The provision of that type of feedback can be interpreted as shifting the emphasis to demonstrating superior performance but not necessarily excellence.",
    "The emphasis on superiority is what we typically see as fostering a detrimental effect of competition.",
    "Information about performance can be very helpful, not only to the participant who does not win or place but also to those who do.",
]
problem_1_label = [0, 1, 1, 1, 1]

In [44]:
import pandas as pd
predict_problem_with_models(model_list, problem_1).loc[map(lambda x:x[0], test_auroc)]

  return pd.concat([selected_answer, score_df], 1)


Unnamed: 0,selected_answer,answer_1_score,answer_2_score,answer_3_score,answer_4_score,answer_5_score
baseline_model,1,0.625126,0.625126,0.625126,0.625126,0.625126
before_tuning_model,1,0.667495,0.667495,0.667495,0.667495,0.667495
after_tuning_model,1,0.748441,0.748441,0.748441,0.748441,0.748441
advanced_after_tuning_model,1,0.761659,0.761659,0.761659,0.761659,0.761659
advanced_before_tuning_model,4,0.922278,0.923373,0.923696,0.482936,0.907949


In [45]:
problem_2 = [ 
    "People from more individualistic cultural contexts tend to be motivated to maintain self-focused agency or control 1 as these serve as the basis of one’s self-worth.",
    "With this form of agency comes the belief that individual successes 2 depending primarily on one’s own abilities and actions, and thus, whether by influencing the environment or trying to accept one’s circumstances, the use of control ultimately centers on the individual.",
    "The independent self may be more 3 driven to cope by appealing to a sense of agency or control.",
    "Research has shown 4 that East Asians prefer to receive, but not seek, more social support rather than seek personal control in certain cases.",
    "Therefore, people 5 who hold a more interdependent self-construal may prefer to cope in a way that promotes harmony in relationships.",
]
problem_2_label = [1, 0, 1, 1, 1]

In [46]:
predict_problem_with_models(model_list, problem_2).loc[map(lambda x:x[0], test_auroc)]

  return pd.concat([selected_answer, score_df], 1)


Unnamed: 0,selected_answer,answer_1_score,answer_2_score,answer_3_score,answer_4_score,answer_5_score
baseline_model,1,0.625126,0.625126,0.625126,0.625126,0.625126
before_tuning_model,1,0.667495,0.667495,0.667495,0.667495,0.667495
after_tuning_model,1,0.748441,0.748441,0.748441,0.748441,0.748441
advanced_after_tuning_model,1,0.761659,0.761659,0.761659,0.761659,0.761659
advanced_before_tuning_model,5,0.92095,0.914767,0.917054,0.916935,0.457136
