In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset


from transformers import *
import os
import sys
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
import numpy as np
import re
import pickle
import time
import pandas as pd
from pathlib import Path
import random
from torch.utils.tensorboard import SummaryWriter

# 1. Test data Embedding 

In [5]:
# load datasets
df = pd.read_csv('./data/hackathon_test_for_user.csv', encoding='cp949')

In [6]:
df

Unnamed: 0,Gender,Age,Q_number,Answer
0,0,30,59,<아니다> 저는 모든 일은 정해진 시간을 지켜서 해야 된다고 생각되어서 마감 기한을...
1,1,40,53,<중립> 저는 조용하고 사적인 장소도 좋아하고 사람들로 붐비고 떠들썩한 장소도 좋아...
2,1,40,56,<그렇다> 저는 규칙을 잘 지키고 매뉴얼 대로 일하는 사람입니다. 그래서 데이터 라...
3,1,40,60,<그렇다> 저는 항상 긍정적인 사고방식을 가지고 살려고 노력하고 있습니다. 이유는 ...
4,1,30,51,<중립> 혼자서 일하는 것도 좋고 함께 일하는 것도 모두 좋은데 같이 의논하는 일도...
...,...,...,...,...
2875,1,20,59,<아니다> 마감 기한은 웬만해서는 지키려고 하는 편이에요. 거의 대부분 지각하지 않아요.
2876,0,20,54,<중립> 저는 상대방의 감정을 바로는 아니지만 어느 정도 알아차릴 수는 있습니다. ...
2877,1,30,50,<그렇다> 상대방이 높게 평가할 수 롤 작은 실수도 크게 보일 수 있기 때문이에요....
2878,1,30,50,<아니다> 상대방이 나를 높게 평가하면 더 잘하면 된다고 생각하기 때문에 상대방이 ...


In [7]:
# load pretrained model
def get_model():
    # * Model          | Tokenizer          | Pretrained weights shortcut
    # MODEL=(DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased')
    tokenizer = BertTokenizerFast.from_pretrained("kykim/bert-kor-base")
    model = BertModel.from_pretrained("kykim/bert-kor-base")
    n_hl = model.config.num_hidden_layers
    embed_dim = model.config.embedding_size
    return model, tokenizer, n_hl, embed_dim

In [12]:
model, tokenizer, n_hl, embed_dim = get_model()

Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
class MyMapDataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data['input_ids'])

    def __getitem__(self, idx):
        data = {k:v[idx] for k,v in self.data.items()}
        return data

In [14]:
tensor = tokenizer(df['Answer'].to_list(), max_length=model.config.max_position_embeddings, return_tensors='pt', padding=True)
ds = MyMapDataset(tensor)
dl = DataLoader(ds, batch_size=32, shuffle=False)

In [15]:
def forward(model, dl, device=0):
    pooled = []
    hidden = []
    model.cuda(device)
    model.eval()
    for data in dl:
        data = {k:v.cuda(device) for k,v in data.items()}
        with torch.no_grad():
            output = model(**data, output_hidden_states=True)
        p, h = output.pooler_output, output.hidden_states
        pooled.append(p) # pooler output
        hidden.append(h[-1][:,0,:]) # only [CLS] token embedding 
    return torch.cat(pooled), torch.cat(hidden)

In [17]:
result = forward(model, dl, device=6)

In [20]:
result[0].shape, result[1].shape

(torch.Size([2880, 768]), torch.Size([2880, 768]))

In [None]:
# forward datasets
# class MyDataset(Dataset):
#     def __init__(self, data):
#         self.data = data
#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         return self.data[idx]
    
# ds = MyDataset(result[0])
# dl = DataLoader(ds, shuffle=False, batch_size=)

# Load each model

In [21]:
ie_path = './ckpt/IE/epoch_81.pt'
sn_path = './ckpt/SN/epoch_30.pt'
tf_path = './ckpt/TF/epoch_20.pt'
jp_path = './ckpt/jp/epoch_25.pt'
path = [ie_path, sn_path, tf_path, jp_path]

In [23]:
model = torch.load(ie_path, map_location='cpu')

In [27]:
def main(path):
    model = torch.load(ie_path, map_location='cpu')
    model.eval()
    model = model.cuda(6)
    with torch.no_grad():
        output = model(result[0].cuda(6))
        output = output.argmax(axis=1)
    
    return output

In [31]:
ie = main(ie_path)
sn = main(sn_path)
tf = main(tf_path)
jp = main(jp_path)

In [35]:
test = pd.DataFrame({'I/E': ie.tolist(), 
                    'S/N': sn.tolist(), 
                    'T/F': tf.tolist(), 
                    'J/P': jp.tolist()})

In [36]:
test.head()

Unnamed: 0,I/E,S/N,T/F,J/P
0,0,0,0,0
1,0,0,0,0
2,1,1,1,1
3,1,1,1,1
4,0,0,0,0


In [37]:
test.to_csv('basemodel_result.csv')