In [4]:
import pickle as pickle
import os
import pandas as pd
import numpy as np
import random
import torch
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, BertConfig, AutoModelForSequenceClassification, AutoConfig
from transformers import AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup
from load_data import *
import argparse
from importlib import import_module
from pathlib import Path
import glob
import re
import neptune
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit
from torch.utils.data import DataLoader
import inference

def inference_proba(model, tokenized_sent, device):
  dataloader = DataLoader(tokenized_sent, batch_size=8, shuffle=False)
  model.eval()
  output_pred = []
  result=np.zeros(42).reshape(-1,42)
  device=torch.device('cuda')
  # device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

  for i, data in enumerate(dataloader):
    with torch.no_grad():
      # outputs = model(
      #     input_ids=data['input_ids'].to(device),
      #     attention_mask=data['attention_mask'].to(device),
      #     token_type_ids=data['token_type_ids'].to(device)
      #     )
      outputs = model(
          input_ids=data['input_ids'],
          attention_mask=data['attention_mask'],
          token_type_ids=data['token_type_ids']
          )
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    # result = np.argmax(logits, axis=-1)
    result=np.vstack([result,logits])

  return result[1:,:]

def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)

##################################################################
def preprocessing_dataset(dataset, label_type, apply_add_entity):
  label = []
  for i in dataset.iloc[:,8]:
    if i == 'blind':
      label.append(100)
    else:
      label.append(label_type[i])
  out_dataset = pd.DataFrame({'sentence':dataset.iloc[:,1],'entity_01':dataset.iloc[:,2],'entity_02':dataset.iloc[:,5],'label':label,})
  if apply_add_entity == True:
    out_dataset['sentence'] = out_dataset.apply(lambda x: x['entity_01']+"[SEP]"+x['entity_02']+"[SEP]"+x['sentence'], axis=1)
  return out_dataset

def load_test_dataset(dataset_dir, tokenizer):
  test_dataset = load_test(dataset_dir)

  test_label = test_dataset['label'].values
  # tokenizing dataset
  tokenized_test = tokenized_dataset(test_dataset, tokenizer)
  return tokenized_test, test_label

def lower_dir_search(dirname):  # 모델저장된 체크포인트 폴더 경로들을 찾아주는 함수
  filenames = os.listdir(dirname)
  lower_dir_list = []
  for filename in filenames:
      full_filename = os.path.join(dirname, filename)
      lower_dir_list.append(full_filename)
  return lower_dir_list

그때그때마다 다르게 지정해줘야할 파라미터들

## KFOLD 모델 말고 단순 모델


In [4]:
# device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
seed_everything(77)
# model_type = 'Electra'
model_type = 'XLMRoberta'
# MODEL_NAME = 'monologg/koelectra-base-v3-discriminator'
MODEL_NAME = 'xlm-roberta-large'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

upper_dir = f"kfold_results/{MODEL_NAME.replace('/','_')}" + '5'  #모델 폴드의 상위경로
upper_dir = upper_dir + '/' + '5' # 몇번째 폴드인지 
model_path = upper_dir+'fold'
max_len = 128
apply_add_entity=False

def tokenized_dataset(dataset, tokenizer):
  concat_entity = []
  if 'Roberta' in model_type:
    for e01, e02 in zip(dataset['entity_01'], dataset['entity_02']):
      temp = ''
      temp = '<s>' + e01 + '</s>' + '<s>' + e02 + '</s>'
      concat_entity.append(temp)
  else:
    for e01, e02 in zip(dataset['entity_01'], dataset['entity_02']):
      temp = ''
      temp = e01 + '[SEP]' + e02
      concat_entity.append(temp)
  tokenized_sentences = tokenizer(
      concat_entity,
      list(dataset['sentence']),
      return_tensors="pt",
      padding=True,
      truncation=True,
      # max_length=100,
      max_length=max_len,

      add_special_tokens=True,
      )
  return tokenized_sentences

def load_test(dataset_dir):
  # load label_type, classes
  with open('../input/data/label_type.pkl', 'rb') as f:
    label_type = pickle.load(f)
  # load dataset
  dataset = pd.read_csv(dataset_dir, delimiter='\t', header=None)
  # preprecessing dataset
  dataset = preprocessing_dataset(dataset, label_type,apply_add_entity=apply_add_entity)
  
  return dataset

def inference_proba(model, tokenized_sent, device):
  dataloader = DataLoader(tokenized_sent, batch_size=40, shuffle=False)
  model.eval()
  output_pred = []
  result=np.zeros(42).reshape(-1,42)

  for i, data in enumerate(dataloader):
    with torch.no_grad():
      if 'Roberta' in model_type:
        outputs = model(
            input_ids=data['input_ids'],
            attention_mask=data['attention_mask'],
            # token_type_ids=data['token_type_ids']
            )
      else:
        outputs = model(
            input_ids=data['input_ids'],
            attention_mask=data['attention_mask'],
            token_type_ids=data['token_type_ids']
            )
      # outputs = model(
      #     input_ids=data['input_ids'].to(device),
      #     attention_mask=data['attention_mask'].to(device),
      #     token_type_ids=data['token_type_ids'].to(device)
      #     )
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    # result = np.argmax(logits, axis=-1)
    result=np.vstack([result,logits])

  return result[1:,:]

def inference(model, tokenized_sent, device):
  dataloader = DataLoader(tokenized_sent, batch_size=40, shuffle=False)
  model.eval()
  output_pred = []
  
  for i, data in enumerate(dataloader):
    with torch.no_grad():
      if 'Roberta' in model_type:
        outputs = model(
            input_ids=data['input_ids'],
            attention_mask=data['attention_mask'],
            # token_type_ids=data['token_type_ids']
            )
      else:
        outputs = model(
            input_ids=data['input_ids'],
            attention_mask=data['attention_mask'],
            token_type_ids=data['token_type_ids']
            )
      # outputs = model(
      #     input_ids=data['input_ids'].to(device),
      #     attention_mask=data['attention_mask'].to(device),
      #     token_type_ids=data['token_type_ids'].to(device)
      #     )
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    result = np.argmax(logits, axis=-1)
    output_pred.append(result)
  from itertools import chain
  output_pred = list(chain(*output_pred))

  return np.array(output_pred).flatten()


test_dataset_dir = "../input/data/test/test.tsv"
test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
test_dataset = RE_Dataset(test_dataset ,test_label)

lower_dir_list = lower_dir_search(f'{model_path}')  # 저장된 모든 모델들의 checkpoint 폴더경로 리스트
lower_dir_list = [lower_dir for lower_dir in lower_dir_list if 'checkpoint-' in lower_dir]
maximum_check_dir = sorted(lower_dir_list, key=lambda x: int(re.search(rf"checkpoint\-[0-9]+",x).group().replace("checkpoint-","")))[-1]
model_module = getattr(import_module("transformers"), f"{model_type}" + "ForSequenceClassification")
model = model_module.from_pretrained(maximum_check_dir)
# device = torch.device('cuda')
# device = torch.device('cpu')
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
pred_answer = inference(model, test_dataset, device)

output = pd.DataFrame(pred_answer, columns=['pred'])
submission_folder_path = f"../submission/fold_중_한개_{upper_dir.split('/')[-2]}_{upper_dir.split('/')[-1]}/"
if not os.path.isdir(submission_folder_path):
  os.makedirs(submission_folder_path)
inference_file_path = submission_folder_path + f"fold_중_한개_{upper_dir.split('/')[-2]}_{upper_dir.split('/')[-1]}.csv"

output.to_csv(inference_file_path, index=False)
print('Submission file saved!!')
print(inference_file_path)

Submission file saved!!
../submission/fold_중_한개_xlm-roberta-large5_5/fold_중_한개_xlm-roberta-large5_5.csv


## Use KFOLD

In [5]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
seed_everything(77)
# model_type = 'Bert'
# model_type = 'Electra'
model_type = 'XLMRoberta'
# MODEL_NAME = 'bert-base-multilingual-cased'
# MODEL_NAME = 'monologg/koelectra-base-v3-discriminator'
MODEL_NAME = 'xlm-roberta-large'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

upper_dir = f"kfold_results/{MODEL_NAME.replace('/','_')}" + '6'
max_len = 128
apply_add_entity=False
num_fold_k = 6  # 6인데.. 공간없어서 못하는중
use_kfold=True

def tokenized_dataset(dataset, tokenizer):
  concat_entity = []
  if 'Roberta' in model_type:
    for e01, e02 in zip(dataset['entity_01'], dataset['entity_02']):
      temp = ''
      temp = '<s>' + e01 + '</s>' + '<s>' + e02 + '</s>'
      concat_entity.append(temp)
  else:
    for e01, e02 in zip(dataset['entity_01'], dataset['entity_02']):
      temp = ''
      temp = e01 + '[SEP]' + e02
      concat_entity.append(temp)
  tokenized_sentences = tokenizer(
      concat_entity,
      list(dataset['sentence']),
      return_tensors="pt",
      padding=True,
      truncation=True,
      # max_length=100,
      max_length=max_len,

      add_special_tokens=True,
      )
  return tokenized_sentences

def load_test(dataset_dir):
  # load label_type, classes
  with open('../input/data/label_type.pkl', 'rb') as f:
    label_type = pickle.load(f)
  # load dataset
  dataset = pd.read_csv(dataset_dir, delimiter='\t')
  # preprecessing dataset
  dataset = preprocessing_dataset(dataset, label_type,apply_add_entity=apply_add_entity)
  
  return dataset

def inference_proba(model, tokenized_sent, device):
  dataloader = DataLoader(tokenized_sent, batch_size=40, shuffle=False)
  model.eval()
  output_pred = []
  result=np.zeros(42).reshape(-1,42)

  for i, data in enumerate(dataloader):
    with torch.no_grad():
      if 'Roberta' in model_type:
        outputs = model(
            input_ids=data['input_ids'],
            attention_mask=data['attention_mask'],
            # token_type_ids=data['token_type_ids']
            )
      else:
        outputs = model(
            input_ids=data['input_ids'],
            attention_mask=data['attention_mask'],
            token_type_ids=data['token_type_ids']
            )
      # outputs = model(
      #     input_ids=data['input_ids'].to(device),
      #     attention_mask=data['attention_mask'].to(device),
      #     token_type_ids=data['token_type_ids'].to(device)
      #     )
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    # result = np.argmax(logits, axis=-1)
    result=np.vstack([result,logits])

  return result[1:,:]

In [6]:
# test_dataset_dir = "../input/data/test/test.tsv"
test_dataset_dir = "../input/data/test/final_test_ner.tsv"
test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
test_dataset = RE_Dataset(test_dataset ,test_label)


In [24]:
assert (use_kfold==True)
from tqdm import tqdm
predictions_of_proba = []

k_range = range(num_fold_k)
# k_range = [1,4,5]
# k_range = [1,5]
# k_range = [1,2,4,5]
# k_range = [1,4,5]
# weight = [0.4,0.3,0.3]

# for k_idx,weigh in tqdm(zip(k_range,weight)):
for k_idx in tqdm(k_range):
  model_path = upper_dir+f'/{k_idx}fold'
  lower_dir_list = lower_dir_search(f'{model_path}')  # 저장된 모든 모델들의 checkpoint 폴더경로 리스트
  lower_dir_list = [lower_dir for lower_dir in lower_dir_list if 'checkpoint-' in lower_dir]
  maximum_check_dir = sorted(lower_dir_list, key=lambda x: int(re.search(rf"checkpoint\-[0-9]+",x).group().replace("checkpoint-","")))[-1]
  model_module = getattr(import_module("transformers"), f"{model_type}" + "ForSequenceClassification")
  model = model_module.from_pretrained(maximum_check_dir)
  # device = torch.device('cuda')
  pred_answer = inference_proba(model, test_dataset, device)
  # predictions_of_proba.append(weigh*pred_answer)
  predictions_of_proba.append(pred_answer)

mean_ensemble_result = np.argmax(np.mean(predictions_of_proba,axis=0),axis=1)  # 평균 앙상블
output = pd.DataFrame(mean_ensemble_result, columns=['pred'])
submission_folder_path = f"../submission/{num_fold_k}fold_{model_path.split('/')[-2]}/"
if not os.path.isdir(submission_folder_path):
  os.makedirs(submission_folder_path)
inference_file_path = submission_folder_path + f"{num_fold_k}fold_{model_path.split('/')[-2]}.csv"

output.to_csv(inference_file_path, index=False)
print('Submission file saved!!')
print(inference_file_path)

2it [11:09, 365.72s/it]

In [None]:
print(inference_file_path)

# Proba 따로 해서 저장해놓기

In [32]:

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
seed_everything(77)
# model_type = 'Bert'
# model_type = 'Electra'
model_type = 'XLMRoberta'
# MODEL_NAME = 'bert-base-multilingual-cased'
# MODEL_NAME = 'monologg/koelectra-base-v3-discriminator'
MODEL_NAME = 'xlm-roberta-large'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

upper_dir = f"kfold_results/{MODEL_NAME.replace('/','_')}" + '7'
max_len = 128
apply_add_entity=False
# num_fold_k = 6  # 6인데.. 공간없어서 못하는중
# use_kfold=True

k_idx = 0 # 0번째 폴드
k_idx = 1 # 1번째 폴드
k_idx = 2 # 1번째 폴드
k_idx = 3 # 1번째 폴드


from tqdm import tqdm
predictions_of_proba = []

model_path = upper_dir+f'/{k_idx}fold'
lower_dir_list = lower_dir_search(f'{model_path}')  # 저장된 모든 모델들의 checkpoint 폴더경로 리스트
lower_dir_list = [lower_dir for lower_dir in lower_dir_list if 'checkpoint-' in lower_dir]
maximum_check_dir = sorted(lower_dir_list, key=lambda x: int(re.search(rf"checkpoint\-[0-9]+",x).group().replace("checkpoint-","")))[-1]
model_module = getattr(import_module("transformers"), f"{model_type}" + "ForSequenceClassification")
model = model_module.from_pretrained(maximum_check_dir)
# device = torch.device('cuda')
pred_answer = inference_proba(model, test_dataset, device)
print(type(pred_answer))
print(upper_dir.split('/')[-1])
print(pred_answer.shape)

<class 'numpy.ndarray'>
xlm-roberta-large7
(1000, 42)


In [33]:
# - .npy 파일로 로컬에 저장
# numpy 배열 'pred_answer'를 경로 '/data/npyfile.npy'로 저장

npy_save_folder = f"../for_submission/{upper_dir.split('/')[-1]}"
if not os.path.isdir(npy_save_folder):
  os.makedirs(npy_save_folder)

npy_save_dir = f'{npy_save_folder}/npy_file_{k_idx}'

np.save(npy_save_dir, pred_answer)

# - .npy 파일 불러오기
# np_load.shape == (3, 50, 50)

np_load = np.load(f'{npy_save_dir}.npy')

# - numpy Array -> Tensor
# device type이 'cuda'인 tensor 완성

load_tensor = torch.from_numpy(np_load).to('cuda')
load_tensor

tensor([[ 0.5046, -0.0341,  3.8955,  ..., -0.3998, -0.2050, -0.4232],
        [-0.5850,  0.0099, -0.0280,  ...,  0.0060, -0.0199, -0.1700],
        [-0.2243,  2.3287,  0.3073,  ...,  0.2171, -0.0744, -0.3788],
        ...,
        [-0.3803, -0.0639,  0.0592,  ...,  0.0647, -0.0286, -0.0361],
        [ 3.8256, -0.0714, -0.4178,  ..., -0.1105, -0.0479,  0.0083],
        [ 0.7290, -0.0861, -0.1932,  ...,  0.0615, -0.0532, -0.2908]],
       device='cuda:0', dtype=torch.float64)

In [34]:
## 모아둔 numpy 결과파일들 앙상블하기

upper_dir = f"kfold_results/{MODEL_NAME.replace('/','_')}" + '7'
npy_save_folder = f"../for_submission/{upper_dir.split('/')[-1]}"
# npy_save_dir = f'{npy_save_folder}/npy_file_{k_idx}'
num_fold_k=4
predictions_of_proba = []
for idx in range(num_fold_k):
  pred_answer = np.load(f'{npy_save_folder}/npy_file_{idx}.npy')
  predictions_of_proba.append(pred_answer)

mean_ensemble_result = np.argmax(np.mean(predictions_of_proba,axis=0),axis=1)  # 평균 앙상블
output = pd.DataFrame(mean_ensemble_result, columns=['pred'])
submission_folder_path = f"../submission/{num_fold_k}fold_{npy_save_folder.split('/')[-1]}/"
if not os.path.isdir(submission_folder_path):
  os.makedirs(submission_folder_path)
inference_file_path = submission_folder_path + f"{num_fold_k}fold_{npy_save_folder.split('/')[-1]}.csv"

output.to_csv(inference_file_path, index=False)
print('Submission file saved!!')
print(inference_file_path)

Submission file saved!!
../submission/4fold_xlm-roberta-large7/4fold_xlm-roberta-large7.csv
