In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler, SequentialSampler
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime
import pickle

In [2]:
# test data set
with open('./test_dataset_each_10_500.pickle', 'rb') as f:
    df = pickle.load(f)
df['category'].value_counts()

음식      91685
카페      27855
명소      15606
술집       8286
숙박       7354
문화예술     3297
시장        993
공원        560
Name: category, dtype: int64

In [3]:
tmp = list(set([str(label) + '$' + category for (label, category, ) in df[['label', 'category']].values.tolist()]))
mapping = {int(cate_label.split('$')[0]):cate_label.split('$')[1] for cate_label in tmp}
mapping

{3: '음식', 7: '시장', 4: '술집', 5: '숙박', 2: '공원', 1: '카페', 6: '문화예술', 0: '명소'}

In [4]:
raw_dataset = []
for i in range(8):
    f_df = df[df['label'] == i][['sentence', 'label']].values.tolist()
    if len(f_df) >= 3000:
        samples = random.sample(f_df, 3000)
    else:
        samples = random.sample(f_df, len(f_df))
    raw_dataset.extend(samples)
random.shuffle(raw_dataset)
len(raw_dataset)

19553

In [5]:
sentences = ['[CLS] ' + t[0] + '[SEP]' for t in raw_dataset]
labels = [t[1] for t in raw_dataset]

In [6]:
# load tokenizer
user_defined_symbols = ['[PAD]', '[UNK]', '[UNK0]','[UNK1]','[UNK2]','[UNK3]','[UNK4]','[UNK5]','[UNK6]','[UNK7]','[UNK8]','[UNK9]', '[CLS]', '[SEP]', '[MASK]', '[BOS]','[EOS]']
unused_token_num = 200
unused_list = ['[unused{}]'.format(n) for n in range(unused_token_num)]
user_defined_symbols = user_defined_symbols + unused_list

tokenizer = BertTokenizer(
    vocab_file = './hf_tokenizer_special/vocab.txt',
    max_len = 1502,
    do_lower_case=False,
)
special_token_dic = {'additional_special_tokens': user_defined_symbols}
tokenizer.add_special_tokens(special_token_dic)

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

print(sentences[0])
print(tokenized_texts[0])

[CLS] 김해 진영은 갈비가 유명해요 할매갈비,시민갈비,신라가든, 요즘엔 수제갈비도 맛있죠? 전 시민갈비다녀왓는데 맛나게먹고왔었어용 돼지양념갈비로 주문했어요담엔 생갈비도 한번먹어봐야겠어용 기본상에 양념게장이나와서맛나게 먹구요 계란찜도 나왔어요 아기들데리고가기도 좋을것같아요 고기3인분이였던것같아요 갈비가 진짜 크더라구요 왕갈비가 생각났어요 맛있게 구워서 먹기만하면되죵! 돼지갈비는 양념때문에 잘타지않게 자주 뒤집어주는게 팁이예요 맛있게 다먹고된장찌개까지~ 갈비양념이 너무단걸 싫어하는데 여긴적당했던것 같아요 고기가 무척이나먹고싶네요 고기중독 김해맛집 김해고기맛집 진영시민갈비[SEP]
['[CLS]', '김해', '진영', '##은', '갈비', '##가', '유명', '##해요', '할매', '##갈비', ',', '시민', '##갈비', ',', '신라', '##가', '##든', ',', '요즘', '##엔', '수제', '##갈비', '##도', '맛있', '##죠', '?', '전', '시민', '##갈비', '##다녀', '##왓', '##는데', '맛나', '##게', '##먹', '##고', '##왔', '##었', '##어', '##용', '돼지', '##양', '##념', '##갈비', '##로', '주문', '##했', '##어요', '##담', '##엔', '생', '##갈비', '##도', '한번', '##먹', '##어', '##봐야', '##겠', '##어', '##용', '기본', '##상', '##에', '양념', '##게', '##장', '##이나', '##와서', '##맛', '##나', '##게', '먹', '##구요', '계란찜', '##도', '나왔', '##어요', '아기', '##들', '##데리', '##고', '##가', '##기', '##도', '좋', '##을', '##것', '##같', '##아요', '고기', '##3', '##인', '##분', '##이', '##였', '##던', '##것', '##같', '#

In [8]:
MAX_LEN = 512
# token to index number
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
# padding
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
input_ids[0]

array([    2,  3997,  4235,     6,   537,  4705,  2709, 10123,  4668,
        4536,  4240,  2981,  4505,  4391,   906, 23287,  4534,  4646,
        7488,  4252,  6472,  7461,  4444,  2769, 11694,  2755,  4214,
       10123,   176,  4579, 14418,  8021,  4196,  4235, 10123, 13117,
        4134,  4465,  4240,  9937,  4290,  4045,  4235,  5174,  4214,
        1230,  5120,  4240,  6761,  4134,  4465,  7486, 18246,     9,
       10429,  4362,    17,  4772,  7461,  3251,  4668,  9899, 10392,
       23193, 10911, 20838,  9055,  4444,  7356,  4214,  6666,  7019,
        4274, 12709,  4178,  4362,  2828,  4225,  4616, 29099,  4377,
        4477,  5086,  8572,  2546,  4626,  4421,  4800,  4478,  8743,
        9899, 14644,  4240,  1374,  4616,  4423,  2594,  4367, 14948,
        4494,  2141,  4705, 26004,  4666,  4240, 26896,  2097,  4444,
        9717, 12901,  4299,     7, 25304,  4236,     7,  2690, 10457,
        4240,  2741,  4209,     7, 22280,     7,  7201, 20778,  4367,
        2738,  4633,

In [9]:
attention_masks = []
# padding 0, non padding 1
# 패딩 부분은 BERT 모델에서 어텐션을 수행하지 않아 속도 향상
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)
print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,

In [11]:
# seperate train set, val set
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids,
                                                                                    labels, 
                                                                                    random_state=2018, 
                                                                                    test_size=0.1)

# seperate attention mask with train set, val set
train_masks, validation_masks, _, _ = train_test_split(attention_masks, 
                                                       input_ids,
                                                       random_state=2018, 
                                                       test_size=0.1)

# data to torch tensor
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)

print(train_inputs[0])
print(train_labels[0])
print(train_masks[0])
print(validation_inputs[0])
print(validation_labels[0])
print(validation_masks[0])

tensor([    2, 18166,  7170,  8933,  4177,  4263, 29881,  7541,  6642,  4260,
         4177, 10709,  4616,  4391,  4364, 10859,  6681,  4391,  4454, 27523,
         2244,  4478,  1019,  4391,  2755,  6682,  4535,  4478,  6500,  6740,
         4304,  7557,  4214,  3807, 24587,  4611,  4744, 17080,  2244,  4478,
         1019, 15203,  6476,  4478,  7716,  4305,  3912,  9223,  4262,  4196,
        11525,  4214,  9560,  4444,  6552,  4494,  6681,  6476,  4444,  6748,
         6682,    12,  4237,  4462,  6681,  8417, 10326,  4268,  4477,  4801,
         7424,  7637, 22415,  4780,  4290, 12446,  4329,  4263,  6870,    21,
         6532,     8,  7456,    21,  6532,  4233,  4747,  4592,     7,  6538,
            7,  6662,     7, 11931,  8863,     7,   547,    10,  2525,  6982,
         9316, 14389,  8466,     8, 24133,  4183,     8, 27751,  4180, 23480,
           21,    10,    10,  7514,     9,  8052,     9,  6658,    10, 11317,
         7377,    12,  4282,  4280,  7508,  4444,  7365,  4236, 

In [12]:
batch_size = 1

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = WeightedRandomSampler(train_data.weights, train_data.data_size) # RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [13]:
# check gpu
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: GeForce RTX 3080


In [14]:
# model load
pretrained_model_config = BertConfig.from_pretrained('model_output')
model = BertForSequenceClassification.from_pretrained(
    'model_output',
    num_labels=8
)

Some weights of the model checkpoint at model_output were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized f

In [15]:
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [16]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # learning rate
                  eps = 1e-8 # 0으로 나누는 것을 방지하기 위한 epsilon 값
                )
epochs = 5
total_steps = len(train_dataloader) * epochs # 배치반복 횟수 * 에폭

# learning rate 조정을 위한 scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [17]:
# function of calculation accuracy
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [18]:
# function of time check
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded)) # hh:mm:ss

In [19]:
# 재현을 위해 랜덤시드 고정
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

model.zero_grad() # initialize gradient

for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch
        # Forwardpropogation               
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        
        loss = outputs[0]
        total_loss += loss.item()
        # Backwardpropogation
        loss.backward()
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # weight parameter update from gradient
        optimizer.step()
        # learning rate 조정
        scheduler.step()
        # initialize gradient
        model.zero_grad()

    avg_train_loss = total_loss / len(train_dataloader)            

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # no calculate gradient in validation
        with torch.no_grad():     
            # Forward
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        logits = outputs[0]
        # CPU로 데이터 이동
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # compare output with ouput label and calculate accuracy
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch   500  of  17,597.    Elapsed: 0:00:27.
  Batch 1,000  of  17,597.    Elapsed: 0:00:53.
  Batch 1,500  of  17,597.    Elapsed: 0:01:20.
  Batch 2,000  of  17,597.    Elapsed: 0:01:46.
  Batch 2,500  of  17,597.    Elapsed: 0:02:12.
  Batch 3,000  of  17,597.    Elapsed: 0:02:38.
  Batch 3,500  of  17,597.    Elapsed: 0:03:05.
  Batch 4,000  of  17,597.    Elapsed: 0:03:31.
  Batch 4,500  of  17,597.    Elapsed: 0:03:56.
  Batch 5,000  of  17,597.    Elapsed: 0:04:22.
  Batch 5,500  of  17,597.    Elapsed: 0:04:48.
  Batch 6,000  of  17,597.    Elapsed: 0:05:14.
  Batch 6,500  of  17,597.    Elapsed: 0:05:40.
  Batch 7,000  of  17,597.    Elapsed: 0:06:05.
  Batch 7,500  of  17,597.    Elapsed: 0:06:31.
  Batch 8,000  of  17,597.    Elapsed: 0:06:57.
  Batch 8,500  of  17,597.    Elapsed: 0:07:23.
  Batch 9,000  of  17,597.    Elapsed: 0:07:49.
  Batch 9,500  of  17,597.    Elapsed: 0:08:14.
  Batch 10,000  of  17,597.    Elapsed: 0:08:40.
  Batch 10,500  of  17,597

In [21]:
test_dataset = random.sample(df[['sentence', 'label']].values.tolist(), 100)
test_sent = ['[CLS] ' + t[0] + '[SEP]' for t in test_dataset]
test_label = [t[1] for t in test_dataset]

In [22]:
tokened_test_sent = [tokenizer.tokenize(sent) for sent in test_sent]
test_input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokened_test_sent]
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [23]:
test_attention_masks = []
for seq in test_input_ids:
    seq_mask = [float(i>0) for i in seq]
    test_attention_masks.append(seq_mask)

In [24]:
test_inputs = torch.tensor(test_input_ids)
test_labels = torch.tensor(test_label)
test_masks = torch.tensor(test_attention_masks)

In [25]:
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = WeightedRandomSampler(test_data.weights, test_data.data_size)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [26]:
# test
t0 = time.time()
model.eval()

eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

for step, batch in enumerate(test_dataloader):
    if step % 100 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():     
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

print("")
print("Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
print("Test took: {:}".format(format_time(time.time() - t0)))


Accuracy: 0.95
Test took: 0:00:01


In [27]:
def convert_input_data(sentences):
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
    MAX_LEN = 512
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
    attention_masks = []
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)

    return inputs, masks

def test_sentences(sentences):
    model.eval()
    inputs, masks = convert_input_data(sentences)
    b_input_ids = inputs.to(device)
    b_input_mask = masks.to(device)
            
    with torch.no_grad():     
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)

    logits = outputs[0]
    logits = logits.detach().cpu().numpy()

    return logits

In [35]:
mapping = dict(sorted(mapping.items()))
mapping

{0: '명소', 1: '카페', 2: '공원', 3: '음식', 4: '술집', 5: '숙박', 6: '문화예술', 7: '시장'}

In [48]:
test_s = random.sample(df[['sentence', 'label']].values.tolist(), 1)[0]
print(test_s[0])
print(mapping[test_s[1]])
logits = test_sentences([test_s[0]])

print(logits)
print(mapping[np.argmax(logits)])

하루건너 하루 오는 지니에요 요즘 아주 열심히 돌아다니고 있다 눙! 제가 스스로 정한 한달에 한번있는 문화WEEK에요. 일주일 동안 온갖 문화생활 몰아서 하는 주 그래서 오늘 금요일을 끝으로 이번 달 자체문화위크도 끝나여. 흑 사실 이번주 아니라고 한 달 내내 아무것도 안하는 건 아니지만 죄책감 없이 놀러다닐 수 있게 스스로 만들어놓은 변명같은거라서끅 그래도 문화생활은 좋은거잖아여! 마음의 양식을 쌓기 위해 정기적으로 꼭 다녀와야해여. 암튼 오늘은 또 초대권을 받게되어서 엄마와 함께 코엑스 아트홀로 연극 라이어 보고왔어요. 요즘 아주 자주 가는거같은 코엑스! 무대사진! 찔끔 더 가까이! 멋있는 조명장 들! 2층이 있긴한데 좌석은 아니였어요! 전부 1층좌석임요 대학로에서 봤었을때보다 좌석이 푹신푹신해서 더 좋았어용~ 티켓인증! 초점 나갔다! 다시찍기! 올해는 뮤지컬은 몇번 봤어도 연극은 거의 안본 거 같아여. 흑흑 왜.? 암튼 간만에 바로 눈 앞에서 배우들 연기하시는거 보니 좋았어요! 스탠리역 배우분이 찌질찌질하시면서도 귀여우시 눙. 시리즈 2탄 3탄은 못봤으면서 1편만 두 번이나 보게되었네요 조만간 나머지 시리즈도 봐야게써요 신난답! 저는 이만 자러가야겠어용 굿밤~ 코엑스 아트홀 서울 강남구 삼성1동
문화예술
[[-1.5527267 -1.440852  -2.063234  -1.5980655 -1.6387876 -1.7272673
  11.917907  -1.8195696]]
문화예술


In [49]:
model.save_pretrained('./finetune_multiclss_model')

In [55]:
test_model = BertForSequenceClassification.from_pretrained(
    'finetue_multiclss_model',
    num_labels=8
)
device = "cuda:0"
test_model = test_model.to(device)

In [56]:
def test_sentences_t(sentences):
    test_model.eval()
    inputs, masks = convert_input_data(sentences)
    b_input_ids = inputs.to(device)
    b_input_mask = masks.to(device)
            
    with torch.no_grad():     
        outputs = test_model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)

    logits = outputs[0]
    logits = logits.detach().cpu().numpy()

    return logits

In [64]:
test_s = random.sample(df[['sentence', 'label']].values.tolist(), 1)[0]
print(test_s[0])
print(mapping[test_s[1]])
logits = test_sentences_t([test_s[0]])

print(logits)
print(mapping[np.argmax(logits)])

연꽃 아침산책 하늘사진오늘은 비가 소강상태라 우산작은거 하나 들고 나갔다왔는데요. 저녁식사를 안하고 잤더니 걷는데 좀 허기져서 힘들었어요. 아침부터 고기 구워 먹을까봐요. 꼬리조팝 모감주꽃이 지고 씨앗을 준비했네요. 모감주씨앗이 들어있어요. 비비추꽃이 많아요 생태체험관에서 키우는 가지 방토가 주렁주렁 저 뒤로 롯데타워가 보이네요 병꽃 배롱나무 분홍꽃 애기사과나무를 타고 올라간 찔레장미 핑크뮬리 가을에 색 바뀌면 예쁘겠죠 장미화원 배롱나무 메리골드 노란색이 약간 비맞아서 흐려졌어요 천리향 천리향. 노란 작은게 꽃이겠죠. 일일초를 주로 바늘꽃 목수국
공원
[[ 0.8995074 -0.8334404 10.280355  -1.471694  -2.417973  -1.4988086
  -1.8547026 -1.6283876]]
공원


In [72]:
predicted_label_list = []
predicted_logit_list = []

for text in df['sentence']:
    logits = test_sentences_t([text])
    predicted_logit_list.append(logits[0])
    predicted_label_list.append(mapping[np.argmax(logits)])
df['pred'] = predicted_label_list
df['score'] = predicted_logit_list
df.head()

Unnamed: 0,sentence,label,category,pred,score
0,"백년고기 내가 알바하던 편의점 점주님의 아들이 나와 동갑에, 같은 대학을 다닌다는 ...",3,음식,음식,"[-2.2105129, -1.7844205, -3.0720518, 10.252276..."
1,안녕 얘들아~! 다들 방학해서 좋으니? 난 잘 모르겠어.~ 어차피 학교 가면 잘 수...,3,음식,카페,"[0.7887645, 3.412375, -3.0511878, 2.6259508, 0..."
2,젤라또 아이스크림은 주변에서 쉽게먹을 수 있는 아이스크림은 아니기에김난노와 나는 여...,1,카페,카페,"[-1.8647114, 11.358135, -2.1183596, -0.8450916..."
3,오전부터 비바람이몰아치더니 오후되니이렇게 날씨가 개었네요~ 꽃샘추위 넘나 춥다요 볼...,1,카페,카페,"[-1.4839631, 11.0343275, -2.9041655, -0.765607..."
4,주말 잘 보내고 계신가요? 전 일요일답게 늦게일어나서 이제 아침을 먹었다는. 오늘의...,3,음식,음식,"[-2.024914, -1.925607, -3.0845895, 10.284386, ..."


In [73]:
df['pred'].unique()

array(['음식', '카페', '명소', '시장', '숙박', '공원', '술집', '문화예술'], dtype=object)

In [74]:
from sklearn.metrics import classification_report

print(classification_report(y_true=df['category'], y_pred=df['pred']))

              precision    recall  f1-score   support

          공원       0.67      0.97      0.79       560
          명소       0.90      0.93      0.91     15606
        문화예술       0.73      0.99      0.84      3297
          숙박       0.84      0.97      0.90      7354
          술집       0.70      0.96      0.81      8286
          시장       0.57      0.99      0.73       993
          음식       0.99      0.91      0.95     91685
          카페       0.93      0.94      0.93     27855

    accuracy                           0.93    155636
   macro avg       0.79      0.96      0.86    155636
weighted avg       0.94      0.93      0.93    155636



https://tutorials.pytorch.kr/beginner/saving_loading_models.html