In [None]:
# Colab only: mount Google Drive at /content/gdrive (force_remount re-mounts
# even if a previous mount exists in this runtime).
# NOTE(review): later cells use the relative path "./gdrive/...", which
# presumably relies on the working directory being /content -- confirm.
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [None]:
!pip install pytorch-crf
!pip install seqeval==1.0.0



In [None]:
import os
import numpy as np
import torch

# Previous dataset location, kept for reference:
# root_dir = "/gdrive/My Drive/AI_Konkuk/해커톤 baseline"
root_dir = "./gdrive/MyDrive/data/hackaton"
npy_dir = '%s/npydata/' % (root_dir)
file_list = os.listdir(npy_dir)
feature_dict = {}
print(file_list)

# Load every regular file whose name contains "200" as a torch tensor,
# keyed by its file name; directories and all other files are skipped.
# NOTE(review): allow_pickle=True executes pickled objects on load -- only
# safe as long as the .npy files come from a trusted source.
for file in file_list:
    path = '%s/npydata/%s' % (root_dir, file)
    if os.path.isfile(path) and "200" in file:
        feature_dict[file] = torch.from_numpy(np.load(path, allow_pickle=True))
        print(file, feature_dict[file].shape)

['bigram_train_50.npy', 'bigram_test_50.npy', '.ipynb_checkpoints', 'pumsa_onehot.npy', 'trigram_train_100.npy', 'trigram_test_100.npy', 'pumsa_onehot_test.npy', 'pumsa_onehot_200.npy', 'pumsa_onehot_200_test.npy', 'trigram_train_100_200.npy', 'bigram_train_50_200.npy', 'trigram_test_100_200.npy', 'bigram_test_50_200.npy', 'pumsa_onehot_350.npy', 'pumsa_onehot_350_test.npy', 'bigram_train_50_350.npy', 'bigram_test_50_350.npy', 'trigram_test_100_350.npy', 'trigram_train_100_350.npy']
pumsa_onehot_200.npy torch.Size([7319, 200, 46])
pumsa_onehot_200_test.npy torch.Size([995, 200, 46])
trigram_train_100_200.npy torch.Size([7319, 200, 100])
bigram_train_50_200.npy torch.Size([7319, 200, 50])
trigram_test_100_200.npy torch.Size([995, 200, 100])
bigram_test_50_200.npy torch.Size([995, 200, 50])


# 모델 코드

In [None]:
import torch
import torch.nn as nn
from torchcrf import CRF

from seqeval.metrics import classification_report


class RNN_CRF(nn.Module):
    """Two stacked bidirectional LSTM encoders followed by a CRF tagging layer.

    Stage 1 encodes [syllable embedding ; POS one-hot]. Stage 2 re-encodes
    [stage-1 input ; stage-1 output ; n-gram features]. A linear layer maps the
    stage-2 output to per-tag emission scores consumed by the CRF.

    The recurrent attributes keep their historical names (`bi_gru`,
    `sec_bi_gru`) although they are `nn.LSTM` modules, so that state_dict keys
    of previously saved checkpoints still match.

    Required config keys: "word_vocab_size", "embedding_size", "hidden_size",
    "number_of_tags", "dropout".
    Optional config keys: "pumsa_size" (default 46) and "ngram_size"
    (default 150), the last-dimension widths of the `pumsa` and `inputs_ngram`
    tensors passed to `forward`.
    """

    def __init__(self, config):
        super(RNN_CRF, self).__init__()

        # Total number of syllables in the vocabulary
        self.eumjeol_vocab_size = config["word_vocab_size"]

        # Syllable embedding size
        self.embedding_size = config["embedding_size"]

        # Hidden size of each recurrent direction
        self.hidden_size = config["hidden_size"]

        # Number of tags to classify
        self.number_of_tags = config["number_of_tags"]

        # Widths of the externally supplied feature tensors
        self.pumsa_size = config.get("pumsa_size", 46)
        self.ngram_size = config.get("ngram_size", 150)

        # Maps each syllable index of the input to its embedding vector
        self.embedding = nn.Embedding(num_embeddings=self.eumjeol_vocab_size,
                                      embedding_dim=self.embedding_size,
                                      padding_idx=0)
        self.dropout = nn.Dropout(config["dropout"])

        # First bidirectional LSTM: input is [embedding ; pumsa]
        first_input_size = self.embedding_size + self.pumsa_size
        self.bi_gru = nn.LSTM(input_size=first_input_size,
                              hidden_size=self.hidden_size,
                              num_layers=1,
                              batch_first=True,
                              bidirectional=True)

        # Second bidirectional LSTM: input is
        # [embedding ; pumsa ; first LSTM output (hidden_size*2) ; ngram].
        # The width is derived from hidden_size rather than assuming
        # hidden_size == embedding_size (the two only coincide by configuration).
        second_input_size = first_input_size + self.hidden_size * 2 + self.ngram_size
        self.sec_bi_gru = nn.LSTM(input_size=second_input_size,
                                  hidden_size=self.hidden_size,
                                  num_layers=1,
                                  batch_first=True,
                                  bidirectional=True)
        # CRF layer
        self.crf = CRF(num_tags=self.number_of_tags, batch_first=True)

        # Fully connected layer that projects to number_of_tags:
        # (batch_size, max_length, hidden_size*2) -> (batch_size, max_length, number_of_tags)
        self.hidden2num_tag = nn.Linear(in_features=self.hidden_size*2, out_features=self.number_of_tags)

    def forward(self, inputs, labels=None, pumsa=None, inputs_ngram=None):
        """Compute the CRF loss (when `labels` is given) or decode tag sequences.

        Args:
            inputs: syllable indices, (batch_size, max_length).
            labels: optional gold tag indices, same leading shape as `inputs`.
            pumsa: POS feature tensor, (batch_size, max_length, pumsa_size). Required.
            inputs_ngram: n-gram feature tensor,
                (batch_size, max_length, ngram_size). Required.

        Returns:
            With `labels`: the negated mean CRF log-likelihood (scalar tensor).
            Without `labels`: the value returned by `self.crf.decode`.

        Raises:
            ValueError: if `pumsa` or `inputs_ngram` is missing.
        """
        if pumsa is None or inputs_ngram is None:
            raise ValueError("RNN_CRF.forward requires both `pumsa` and `inputs_ngram`")

        # (batch_size, max_length) -> (batch_size, max_length, embedding_size)
        embedded = self.embedding(inputs)

        # Align the extra features with the embedding's dtype/device so the
        # concatenations work regardless of where/how the features were loaded.
        pumsa = pumsa.to(device=embedded.device, dtype=embedded.dtype)
        inputs_ngram = inputs_ngram.to(device=embedded.device, dtype=embedded.dtype)

        eumjeol_inputs = torch.cat([embedded, pumsa], dim=-1)
        encoder_outputs, _ = self.bi_gru(eumjeol_inputs)

        # (batch_size, max_length, hidden_size*2)
        d_hidden_outputs = self.dropout(encoder_outputs)

        second_inputs = torch.cat([eumjeol_inputs, d_hidden_outputs, inputs_ngram], dim=-1)
        encoder_outputs, _ = self.sec_bi_gru(second_inputs)
        d_hidden_outputs = self.dropout(encoder_outputs)

        # (batch_size, max_length, hidden_size*2) -> (batch_size, max_length, number_of_tags)
        logits = self.hidden2num_tag(d_hidden_outputs)

        # NOTE(review): no `mask` is passed to the CRF, so padded positions are
        # scored/decoded like real ones -- confirm this is intended.
        if labels is not None:
            log_likelihood = self.crf(emissions=logits,
                                      tags=labels,
                                      reduction="mean")
            return log_likelihood * -1.0

        return self.crf.decode(emissions=logits)

# 개체명 사전으로 진행하기 [병, 근]

In [None]:
#@title feature 화

In [None]:
from tqdm import tqdm
import numpy as np
# 파라미터로 입력받은 파일에 저장된 단어 리스트를 딕셔너리 형태로 저장
def load_vocab(f_name):
    """Load a one-symbol-per-line vocabulary file into two lookup dictionaries.

    The file is read from `root_dir` (module-level). Indices 0 and 1 are
    reserved for "<PAD>" and "<UNK>"; file symbols are numbered from 2 in
    file order, after stripping surrounding whitespace.

    Args:
        f_name: file name relative to `root_dir`.

    Returns:
        (symbol2idx, idx2symbol): symbol -> index and index -> symbol dicts.
    """
    # The context manager closes the file even if reading fails.
    with open(os.path.join(root_dir, f_name), 'r', encoding='utf8') as vocab_file:
        print("{} vocab file loading...".format(f_name))
        lines = vocab_file.readlines()

    # Dictionaries pre-populated with the default entries
    symbol2idx, idx2symbol = {"<PAD>":0, "<UNK>":1}, {0:"<PAD>", 1:"<UNK>"}

    # First free index after the default entries
    index = len(symbol2idx)
    for line in tqdm(lines):
        symbol = line.strip()
        symbol2idx[symbol] = index
        idx2symbol[index] = symbol
        index += 1

    return symbol2idx, idx2symbol

# 입력 데이터를 고정 길이의 벡터로 표현하기 위한 함수
def convert_data2feature(data, symbol2idx, max_length=None):
    """Convert a whitespace-separated symbol string into a fixed-length index vector.

    Args:
        data: string of symbols separated by whitespace.
        symbol2idx: mapping from symbol to integer index; must contain "<UNK>"
            if `data` can contain symbols missing from the mapping.
        max_length: length of the returned vector. Longer inputs are truncated,
            shorter ones are right-padded with 0. When None, the vector is
            exactly as long as the number of symbols in `data`.

    Returns:
        1-D numpy int64 array of length `max_length`.
    """
    # Split the input sentence on whitespace
    words = data.split()
    if max_length is None:
        max_length = len(words)

    # Fixed-length zero vector. np.int64 is used explicitly because the
    # `np.int` alias was removed from NumPy.
    feature = np.zeros(shape=(max_length,), dtype=np.int64)

    for idx, word in enumerate(words[:max_length]):
        index = symbol2idx.get(word)
        # Unknown symbols fall back to the "<UNK>" index (looked up lazily).
        feature[idx] = symbol2idx["<UNK>"] if index is None else index
    return feature

# 파라미터로 입력받은 파일로부터 tensor객체 생성
def load_data(config, f_name, word2idx, tag2idx):
    """Read a tab-separated NER file and build index tensors for inputs and tags.

    Each line is expected to be `id \\t sentence \\t tags` or, for unlabeled
    data, `id \\t sentence`. Example of a labeled line's sentence/tag columns:
        세 종 대 왕 은 <SP> 조 선 의 ...  \\t  B_PS I_PS I_PS I_PS O <SP> B_LC I_LC O ...

    Args:
        config: dict providing "max_length".
        f_name: file name relative to `root_dir` (module-level).
        word2idx: symbol -> index mapping for the sentence column.
        tag2idx: symbol -> index mapping for the tag column.

    Returns:
        (indexing_inputs, indexing_tags): two torch.long tensors with one row
        per line. Lines without a tag column get an all-zero tag row.

    Raises:
        ValueError: if a line has neither 2 nor 3 tab-separated fields.
    """
    max_length = config["max_length"]

    # The context manager closes the file even if reading fails.
    with open(os.path.join(root_dir, f_name), 'r', encoding='utf8') as data_file:
        print("{} file loading...".format(f_name))
        lines = data_file.readlines()

    # Per-line sentence / tag index vectors
    indexing_inputs, indexing_tags = [], []

    for line_no, line in enumerate(tqdm(lines), start=1):
        fields = line.strip().split('\t')
        if len(fields) == 3:
            _, sentence, tags = fields
        elif len(fields) == 2:
            # Unlabeled line: use an empty tag string (all-padding row) instead
            # of reusing the tags of the previous line.
            _, sentence = fields
            tags = ""
        else:
            raise ValueError(
                "{}: line {} has {} tab-separated fields, expected 2 or 3".format(
                    f_name, line_no, len(fields)))

        indexing_inputs.append(convert_data2feature(sentence, word2idx, max_length))
        indexing_tags.append(convert_data2feature(tags, tag2idx, max_length))

    # Stack into one ndarray first: building a tensor from a list of ndarrays
    # element by element is very slow.
    indexing_inputs = torch.tensor(np.array(indexing_inputs), dtype=torch.long)
    indexing_tags = torch.tensor(np.array(indexing_tags), dtype=torch.long)

    return indexing_inputs, indexing_tags

# tensor 객체를 리스트 형으로 바꾸기 위한 함수
def tensor2list(input_tensor):
    """Return the contents of a tensor as (nested) Python lists."""
    # Detach from the autograd graph and bring the data to host memory
    # before converting through numpy.
    host_array = input_tensor.detach().cpu().numpy()
    return host_array.tolist()


# 새 섹션

## input sentence로 feature 만들기

In [None]:
# Upper bound on the number of n-gram positions that are looked up per sentence.
_MAX_NGRAM_POSITIONS = 120
# Width of each per-position indicator vector.
_NUM_ENTITY_TYPES = 5
_DIGITS = frozenset("0123456789")


def _normalize_eumjeol(token):
    """Lower-case upper-case tokens (except '<SP>') and map single digits to 'N'."""
    if token != '<SP>' and token.isupper():
        return token.lower()
    if token in _DIGITS:
        return 'N'
    return token


def _gazetteer_ngram_feature(sentence, n, gazetteer, max_length):
    """Build per-position entity-dictionary indicator vectors for character n-grams.

    The sentence is split on whitespace, normalized, padded with one leading
    '<SP>' and (n - 2) trailing '<SP>' tokens, and turned into consecutive
    n-grams. For each n-gram a length-5 vector is produced whose i-th entry is
    1 when the concatenated n-gram is contained in `gazetteer[name_list[i]]`.

    `name_list` is read from module scope at call time.
    """
    tokens = [_normalize_eumjeol(token) for token in sentence.split()]
    tokens = ['<SP>'] + tokens + ['<SP>'] * (n - 2)
    ngrams = list(zip(*[tokens[offset:] for offset in range(n)]))

    # Never emit more positions than max_length, so the result always has
    # exactly max_length rows when max_length is given.
    limit = _MAX_NGRAM_POSITIONS
    if max_length is not None:
        limit = min(limit, max_length)

    NER_list = []
    for gram in ngrams[:limit]:
        key = ''.join(gram)
        NER = np.zeros(_NUM_ENTITY_TYPES)
        # Entity-type order follows name_list
        # (the original comment lists LOC, ORG, PER, DT, TI).
        for idx, name in enumerate(name_list):
            if key in gazetteer[name]:
                NER[idx] = 1
        NER_list.append(NER)

    # Right-pad with zero vectors up to max_length (no padding when None).
    if max_length is not None:
        for _ in range(max_length - len(NER_list)):
            NER_list.append(np.zeros(_NUM_ENTITY_TYPES))

    return NER_list


def bi_gram_feature(dic, sentence, labels = None, max_length = None):
    """Return bigram entity-dictionary features for a whitespace-separated sentence.

    Args:
        dic: unused; kept for interface compatibility. The lookup uses the
            module-level `bi_dict` and `name_list`, which must be defined
            before this function is called.
        sentence: string of tokens separated by whitespace.
        labels: unused.
        max_length: number of rows to return (zero-padded); when None, no
            padding is applied.

    Returns:
        List of length-5 numpy arrays, one per position.
    """
    return _gazetteer_ngram_feature(sentence, 2, bi_dict, max_length)


def tri_gram_feature(dic, sentence, labels = None, max_length = None):
    """Return trigram entity-dictionary features for a whitespace-separated sentence.

    Same contract as `bi_gram_feature`, but looks up three-token windows in the
    module-level `tri_dict`; the sentence is additionally padded with a
    trailing '<SP>' so that every token position yields one trigram.
    """
    return _gazetteer_ngram_feature(sentence, 3, tri_dict, max_length)

# Train and Test

In [None]:
from torch.utils.data import (DataLoader, TensorDataset)
import torch.optim as optim

def train(config):
    """Train an RNN_CRF model, saving a checkpoint and evaluating after every epoch.

    Reads the vocabulary, training and dev files named in `config`, combines
    them with the pre-computed POS and n-gram tensors in the module-level
    `feature_dict`, and optimizes with Adam.

    Args:
        config: dict with the keys used by `RNN_CRF`, plus "word_vocab_file",
            "tag_vocab_file", "train_file", "dev_file", "batch_size", "epoch",
            "output_dir_path", and optionally "learning_rate" (default 0.005).
    """
    # Use the GPU when available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Build the vocabularies
    word2idx, idx2word = load_vocab(config["word_vocab_file"])
    tag2idx, idx2tag = load_vocab(config["tag_vocab_file"])

    # Load the data
    train_input_features, train_tags = load_data(config, config["train_file"], word2idx, tag2idx)
    test_input_features, test_tags = load_data(config, config["dev_file"], word2idx, tag2idx)

    # N-gram features: trigram and bigram tensors concatenated on the last axis
    train_input_ngram_features = torch.cat(
        [feature_dict['trigram_train_100_200.npy'], feature_dict['bigram_train_50_200.npy']], dim=-1)
    test_input_ngram_features = torch.cat(
        [feature_dict['trigram_test_100_200.npy'], feature_dict['bigram_test_50_200.npy']], dim=-1)

    model = RNN_CRF(config).to(device)

    # Wrap the loaded tensors in TensorDataset / DataLoader objects
    train_features = TensorDataset(train_input_features, train_tags,
                                   feature_dict['pumsa_onehot_200.npy'], train_input_ngram_features)
    train_dataloader = DataLoader(train_features, shuffle=True, batch_size=config["batch_size"])

    test_features = TensorDataset(test_input_features, test_tags,
                                  feature_dict['pumsa_onehot_200_test.npy'], test_input_ngram_features)
    test_dataloader = DataLoader(test_features, shuffle=False, batch_size=config["batch_size"])

    # Optimizer used to train the model
    optimizer = optim.Adam(model.parameters(), lr=config.get("learning_rate", 0.005))

    for epoch in range(config["epoch"]):
        model.train()
        losses = []
        for step, batch in enumerate(train_dataloader):
            # Move the batch to the training device
            input_features, labels, pumsa, ngram_features = tuple(t.to(device) for t in batch)

            # Compute the loss
            loss = model(input_features, labels, pumsa, ngram_features)

            # Reset gradients, back-propagate, and update the parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_value = loss.item()
            if (step + 1) % 50 == 0:
                print("{} step processed.. current loss : {}".format(step + 1, loss_value))
            losses.append(loss_value)

        print("Average Loss : {}".format(np.mean(losses)))

        # Save a checkpoint for this epoch
        torch.save(model.state_dict(), os.path.join(config["output_dir_path"], "epoch_{}.pt".format(epoch + 1)))

        # Evaluate on the dev set
        do_test(model, test_dataloader, idx2tag)



def test(config):
    """Load a saved checkpoint and evaluate it on the dev file.

    Args:
        config: dict with the keys used by `RNN_CRF`, plus "word_vocab_file",
            "tag_vocab_file", "dev_file", "batch_size", "output_dir_path" and
            "trained_model_name".
    """
    # Use the GPU when available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Create the model
    model = RNN_CRF(config).to(device)

    # Build the vocabularies
    word2idx, idx2word = load_vocab(config["word_vocab_file"])
    tag2idx, idx2tag = load_vocab(config["tag_vocab_file"])

    # Load the saved weights; map_location lets a GPU-trained checkpoint be
    # restored on whichever device is in use.
    checkpoint_path = os.path.join(config["output_dir_path"], config["trained_model_name"])
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    # Use the same "_200" feature tensors that train() evaluates with. The
    # loader cell only keeps files whose name contains "200", so the
    # un-suffixed keys are never present in feature_dict.
    test_input_ngram_features = torch.cat(
        [feature_dict['trigram_test_100_200.npy'], feature_dict['bigram_test_50_200.npy']], dim=-1)

    # Load the data
    test_input_features, test_tags = load_data(config, config["dev_file"], word2idx, tag2idx)

    # Wrap the loaded tensors in TensorDataset / DataLoader objects
    test_features = TensorDataset(test_input_features, test_tags,
                                  feature_dict['pumsa_onehot_200_test.npy'], test_input_ngram_features)
    test_dataloader = DataLoader(test_features, shuffle=False, batch_size=config["batch_size"])

    # Run the evaluation
    do_test(model, test_dataloader, idx2tag)

def do_test(model, test_dataloader, idx2tag):
    """Run the model over a dataloader and print a seqeval classification report.

    Positions whose gold tag is "<SP>" or "<PAD>" are excluded from both the
    gold and the predicted sequences. Underscores in tag names are replaced by
    hyphens (e.g. "B_PS" -> "B-PS") before scoring.

    Args:
        model: module called as
            `model(input_features, pumsa=..., inputs_ngram=...)` that returns
            one predicted tag-index sequence per example.
        test_dataloader: yields batches of
            (input_features, labels, pumsa, ngram_features).
        idx2tag: mapping from tag index to tag string.
    """
    model.eval()
    # Evaluate on whatever device the model already lives on.
    device = next(model.parameters()).device
    ignored_tags = ("<SP>", "<PAD>")

    predicts, answers = [], []
    # Gradients are not needed for evaluation; skipping them saves memory/time.
    with torch.no_grad():
        for batch in test_dataloader:
            input_features, labels, pumsa, ngram_features = tuple(t.to(device) for t in batch)

            # Predicted tag indices for every example in the batch
            output = model(input_features, pumsa=pumsa, inputs_ngram=ngram_features)

            # Collect gold / predicted tags for the positions that are scored
            for answer, predicted in zip(tensor2list(labels), output):
                for gold_idx, pred_idx in zip(answer, predicted):
                    gold_tag = idx2tag[gold_idx]
                    if gold_tag in ignored_tags:
                        continue
                    answers.append(gold_tag.replace("_", "-"))
                    predicts.append(idx2tag[pred_idx].replace("_", "-"))

    # Report the scores
    print(classification_report(answers, predicts))


# MAIN

In [None]:
##########################################################
#                                                        #
#        The evaluation metric is Macro F1 Score         #
#        Submission format is id \t predict_tag          #
#            25 \t B_PS I_PS <SP> O O O ...              #
#                                                        #
##########################################################


import os
if __name__ == "__main__":
    # Checkpoints go to <root_dir>/output/200; exist_ok avoids the
    # check-then-create pattern and creates both directory levels in one call.
    output_dir = os.path.join(root_dir, "output", "200")
    os.makedirs(output_dir, exist_ok=True)

    config = {"mode": "train",
              "train_file": "ner_train.txt",
              "dev_file": "ner_dev.txt",
              "word_vocab_file": "vocab.txt",
              "tag_vocab_file": "tag_vocab.txt",
              # Checkpoint loaded when mode is not "train"
              "trained_model_name": "epoch_{}.pt".format(11),
              "output_dir_path": output_dir,
              # load_vocab adds <PAD> and <UNK> to the symbols in each vocab file
              "word_vocab_size": 2160,
              "number_of_tags": 14,
              "hidden_size": 100,
              "dropout": 0.2,
              "embedding_size": 100,
              "max_length": 200,
              "batch_size": 64,
              "epoch": 20,
              }

    if config["mode"] == "train":
        train(config)
    else:
        test(config)


vocab.txt vocab file loading...


100%|██████████| 2158/2158 [00:00<00:00, 347072.67it/s]


tag_vocab.txt vocab file loading...


100%|██████████| 12/12 [00:00<00:00, 50181.10it/s]


ner_train.txt file loading...


100%|██████████| 7319/7319 [00:00<00:00, 20037.63it/s]


ner_dev.txt file loading...


100%|██████████| 995/995 [00:00<00:00, 12427.73it/s]


50 step processed.. current loss : 19.182981491088867
100 step processed.. current loss : 9.986862182617188
Average Loss : 30.43329502603282




              precision    recall  f1-score   support

          DT       0.69      0.66      0.67       624
          LC       0.65      0.65      0.65       537
          OG       0.55      0.39      0.45       973
          PS       0.70      0.66      0.68       742
          TI       0.09      0.08      0.09        95

   micro avg       0.63      0.55      0.59      2971
   macro avg       0.54      0.49      0.51      2971
weighted avg       0.62      0.55      0.58      2971

50 step processed.. current loss : 5.7325439453125
100 step processed.. current loss : 6.237150192260742
Average Loss : 6.650182591313901
              precision    recall  f1-score   support

          DT       0.80      0.75      0.78       624
          LC       0.77      0.72      0.74       537
          OG       0.63      0.65      0.64       973
          PS       0.83      0.76      0.79       742
          TI       0.51      0.49      0.50        95

   micro avg       0.73      0.70      0.72    