# **뉴스 토픽 분류 AI 경진대회**

https://dacon.io/competitions/official/235747/overview/description

# Setting

## install packages

In [None]:
!pip install konlpy
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3
!pip install torch
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master
!pip install hanja
!bash install_mecab-ko_on_colab190912.sh

In [None]:
!pip install transformers
!pip install adabelief-pytorch
!pip install datasets

In [None]:
! git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git

fatal: destination path 'Mecab-ko-for-Google-Colab' already exists and is not an empty directory.


##  GPU setting

In [None]:
import torch

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Seed torch's random number generator.
torch.manual_seed(777)

In [None]:
!nvidia-smi

# Import library

In [None]:
import pandas as pd
import numpy as np
import re #regular expression
from konlpy.tag import Okt,Mecab # 형태소 분석
from sklearn.feature_extraction.text import TfidfVectorizer # tokenizer


import hanja
from hanja import hangul

import torch
import torchvision
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, tqdm_notebook
import torch.nn.functional as F
from torch.autograd import Variable

import datasets
from datasets import load_dataset

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score,f1_score


from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import RobertaTokenizerFast
from transformers import ElectraTokenizer, ElectraForSequenceClassification
from transformers import AutoTokenizer, AdamW, RobertaForSequenceClassification,get_cosine_schedule_with_warmup


from adabelief_pytorch import AdaBelief


# Data Load

In [None]:
from google.colab import output
output.enable_custom_widget_manager()

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Shared-drive folder that holds the competition CSV files.
path = "/content/drive/Shareddrives/2022-1 KUBIG 딥러닝 콘테스트/data/"


# Load the four competition CSVs: topic dictionary, train/test data and the
# sample submission.
topic_dic = pd.read_csv(path+"topic_dict.csv")
train = pd.read_csv(path + "train_data.csv")
test = pd.read_csv(path + "test_data.csv")
subm = pd.read_csv(path + "sample_submission.csv")

# Text file with one stopword per line; read later in the stopword-removal step.
STOPWORDSPATH = '/content/drive/Shareddrives/2022-1 KUBIG 딥러닝 콘테스트/Code/2주차_Baseline 심화/지호/stopwords.txt'

# Data Preprocessing



1. 품사 태깅
2. 숫자, 특수문자 제거 + 한 글자 이하 단어 제거
3. 문장부호 제거
4. 영어, 한자 -> 한글로 변환
5. 불용어 처리
6. 추가 전처리 - 이상문자열 제거

## 품사 태깅 

In [None]:
okt = Okt() # 형태소 분석기

def clean1(text):
    """Stem ``text`` with the Okt tagger and drop grammatical noise.

    Every morpheme whose tag is a particle ("Josa"), an ending ("Eomi") or
    "Punctuation" is discarded; the remaining stemmed morphemes are joined
    with single spaces.
    """
    dropped_tags = ["Josa", "Eomi", "Punctuation"]
    # stem=True asks Okt to return the stemmed form of each morpheme.
    kept = [pair[0] for pair in okt.pos(text, stem = True) if pair[1] not in dropped_tags]
    return " ".join(kept)

## 숫자, 특수문자 제거 + 한 글자 이하 단어 제거

In [None]:
def clean2(text):
  """Keep only ASCII letters and Hangul syllables, then drop short tokens.

  Every character outside ``a-z``, ``A-Z`` and ``가-힣`` (digits, symbols,
  hanja, ...) is replaced by a space, and tokens of one character or fewer
  are removed. The surviving tokens are joined with single spaces.
  """
  letters_only = re.sub(r"[^a-zA-Z가-힣]"," ",text)
  return " ".join(token for token in letters_only.split(" ") if len(token) > 1)

In [None]:
# Run both cleaning passes over the headline column of each frame:
# clean1 (Okt stemming / tag filter) first, then clean2 (letters-only filter).
for frame in (train, test):
    frame.title = frame.title.apply(clean1).apply(clean2)

## 문장부호 제거

In [None]:
# Characters that clean_punc() surrounds with spaces. The backslash before
# "×" is written as an explicit "\\" so the literal no longer relies on an
# invalid escape sequence; the resulting string is unchanged.
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'
# Look-alike symbols normalised to a plain equivalent before the padding step.
# (The duplicated '“' key of the original literal was removed; the dict is identical.)
punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', }

def clean_punc(text, punct, mapping):
    """Normalise symbols in ``text`` and pad punctuation with spaces.

    Steps, in order:
      1. replace every key of ``mapping`` by its value;
      2. surround every character of ``punct`` with one space on each side
         (a character listed twice in ``punct`` is padded twice);
      3. replace a few leftover special strings (zero-width space, ellipsis,
         BOM and two Devanagari words);
      4. strip leading and trailing whitespace.
    """
    for symbol, replacement in mapping.items():
        text = text.replace(symbol, replacement)

    for mark in punct:
        text = text.replace(mark, ' ' + mark + ' ')

    leftovers = (
        ('\u200b', ' '),
        ('…', ' ... '),
        ('\ufeff', ''),
        ('करना', ''),
        ('है', ''),
    )
    for needle, substitute in leftovers:
        text = text.replace(needle, substitute)

    return text.strip()

# Transliterate hanja in the titles to their hangul reading.
# NOTE(review): clean2 has already replaced everything except a-z/A-Z/가-힣
# with spaces, so hanja and punctuation may no longer be present at this
# point — confirm the intended order of the preprocessing steps.
train.title = train.title.apply(lambda x : hanja.translate(x, 'substitution'))
test.title = test.title.apply(lambda x : hanja.translate(x, 'substitution'))

# Normalise symbols and pad punctuation in every title.
cleaned_train_corpus = [clean_punc(sent, punct, punct_mapping) for sent in train['title']]
cleaned_test_corpus = [clean_punc(sent, punct, punct_mapping) for sent in test['title']]


## 영어/한자 -> 한글로 변환

In [None]:
# Hanja abbreviations used in Korean headlines -> Korean words.
# Applied in order: "外人" has to be handled before the bare "人".
_HANJA_TO_KOREAN = (
    ("外人", "외국인"), ("日", "일본"), ("美", "미국"), ("北", "북한"),
    ("英", "영국"), ("中", "중국"), ("與", "여당"), ("靑", "청와대"),
    ("野", "야당"), ("伊", "이탈리아"), ("韓", "한국"), ("南", "한국"),
    ("獨", "독일"), ("佛", "프랑스"), ("檢", "검찰"), ("銀", "은행"),
    ("亞", "아시아"), ("人", "사람"), ("孫", "손혜원"), ("企", "기업"),
    ("前", "이전"), ("反", "반대"), ("安", "안철수"), ("展", "전시회"),
    ("故", "사망"), ("文", "문재인"), ("新", "새로운"), ("曺", "조국"),
    ("朴", "박근혜"), ("株", "주식"), ("男", "남자"), ("硏", "연구"),
    ("車", "자동차"), ("軍", "군대"), ("重", "중공업"),
)

# Tokens that contain digits; they must be handled BEFORE digits are stripped.
_DIGIT_TOKEN_RULES = (
    ("UFG20", "한미 합동 군사"),
    ("F35", "전투기"),
    ("B52", ""),
)

# Single symbols replaced verbatim.
_SYMBOL_RULES = (
    ("→", "에서 "), ("…", " "), ("①", ""), ("②", ""), ("⑤", ""),
    ("·", " 및 "), ("↑", "상승 "), ("↓", "하락 "), ("ℓ", "리터"), ("㈜", ""),
)

# Acronym -> Korean expansion. Each acronym is matched only when it is not
# glued to other ASCII letters, so "IS" no longer fires inside "KIST" and
# "PO" no longer fires inside "IPO".
_TOKEN_RULES = (
    ("NYT", "뉴욕 타임스"), ("KAIST", "카이스트"), ("WMO", "세계 기상 기구"),
    ("KBL", "한국 프로 농구"), ("UAE", "아랍에미리트"), ("EU", "유럽 연합"),
    ("NBA", "농구 연맹"), ("CIA", "중앙정보국"), ("ECB", "유럽 중앙 은행"),
    ("AFC", "아시아 축구 연맹"), ("ITU", "국제전기통신연합"), ("MVP", "최우수 선수"),
    ("APEC", "아시아 태평양 경제협력체"), ("PSG", "파리 셍제르망"),
    ("IMO", "국제해사기구"), ("MLB", "프로 야구 리그 "), ("MOU", "양해각서"),
    ("FA", "자유계약선수제도"), ("EPL", "잉글랜드프리미어리그"),
    ("KBO", "한국야구위원회"), ("IPU", "국제 의회 연맹"), ("AG", "아시안게임"),
    ("PS", "포스트시즌"), ("PO", "플레이오프"), ("OUT", "방출"), ("IN", "영입"),
    ("TPP", " 환태평양 경제 동반자협정"), ("EAS", "동아시아 정상회의"), ("DC", ""),
    ("ITF", "국제태권도연맹 "), ("IS", "이슬람 "), ("W농구", "한국여자농구"),
    ("C팰리스", "크리스탈팰리스"), ("SLBM", "잠수함발사탄도미사일"),
    ("VNL", "배구네이션스리그"), ("LA타임스", "로스엔젤레스타임스"),
    ("V리그", "배구리그"), ("KOVO", "한국배구연맹"), ("SUN", "선동열"),
    ("WSJ", " 월스트리트 저널"), ("ERA", " 평균자책점"), ("IoT", " 사물인터넷"),
    ("QS", " 선발 6이닝 이상 3자책점 이하 투구"), ("NL", "내셔널리그"),
    ("WP", "워싱턴포스트"), ("TK", "대구와 경북"),
    ("ACL", "아시아축구연맹 챔피언스리그"), ("IT", "정보기술"), ("AI", "인공지능"),
    ("TF", "태스크포스"), ("ML", "메이저리그"), ("FC", "축구 클럽"),
    ("SI", "스포츠 일러스트레이티드"), ("MS", "마이크로소프트"),
    ("SNS", "소셜 네트워크 서비스"), ("VR", "가상현실"),
    ("ELB", "주가연계파생결합사채"), ("CES", "국제전자제품박람회"),
    ("NPL", "부실채권"), ("IPO", "기업공개"), ("MWC", "모바일 산업 박람회"),
    ("NSC", "국가안전보장회의"),
)

_DIGIT_TOKEN_PATTERNS = tuple(
    (re.compile(r'(?<![A-Za-z])' + re.escape(token) + r'(?!\d)'), replacement)
    for token, replacement in _DIGIT_TOKEN_RULES
)
_TOKEN_PATTERNS = tuple(
    (re.compile(r'(?<![A-Za-z])' + re.escape(token) + r'(?![A-Za-z])'), replacement)
    for token, replacement in _TOKEN_RULES
)
# "러" is expanded to Russia only when it stands alone, never inside a word
# such as "러시아" or "달러".
_STANDALONE_RU = re.compile(r'(?<![가-힣])러(?![가-힣])')


def clean_text(texts):
    """Expand abbreviations in headline strings and normalise them.

    For every element of ``texts`` (converted with ``str``) the function:
      1. replaces single-hanja abbreviations with Korean words;
      2. removes a fixed set of punctuation characters;
      3. removes "1보", expands digit-bearing tokens (F35, ...) and then
         strips all remaining digits;
      4. replaces arrows / bullets / other symbols;
      5. expands whole-token acronyms and a standalone "러";
      6. lower-cases, removes ``<...>`` tags and any remaining hanja, and
         collapses whitespace.

    Args:
        texts: sequence of headline strings. It is NOT modified.

    Returns:
        list[str]: one cleaned string per input element, in the same order.
    """
    corpus = []
    for raw in texts:
        review = str(raw)

        for hanja_char, korean in _HANJA_TO_KOREAN:
            review = review.replace(hanja_char, korean)

        review = re.sub(r'[@%\\*=()/~#&\+á?\xc3\xa1\-\|\.\:\;\!\-\,\_\~\$\'\"]', '', review) #remove punctuation
        review = review.replace('1보', '')
        # Digit-bearing tokens must be expanded before the digits disappear.
        for pattern, replacement in _DIGIT_TOKEN_PATTERNS:
            review = pattern.sub(replacement, review)
        review = re.sub(r'\d+', '', review) # remove number

        for symbol, replacement in _SYMBOL_RULES:
            review = review.replace(symbol, replacement)
        for pattern, replacement in _TOKEN_PATTERNS:
            review = pattern.sub(replacement, review)
        review = _STANDALONE_RU.sub('러시아 ', review)

        review = review.lower() #lower case
        review = re.sub(r'<[^>]+>', '', review) #remove Html tags
        review = re.sub(r'[一-龥]', '', review) #drop hanja that had no mapping
        # Collapse whitespace last so no earlier removal leaves double spaces.
        review = re.sub(r'\s+', ' ', review).strip()
        corpus.append(review)
    return corpus

# Run clean_text over the punctuation-normalised train and test titles.
basic_preprocessed_train_corpus = clean_text(cleaned_train_corpus)
basic_preprocessed_test_corpus = clean_text(cleaned_test_corpus)

## 불용어 제거

In [None]:
# The Mecab class is imported at the top of the notebook but was never
# instantiated; create the tagger that the filter below relies on.
mecab = Mecab()

# One stopword per line.
with open(STOPWORDSPATH, encoding="utf-8") as f:
    stopwords = [line.strip() for line in f]
_stopword_set = set(stopwords)  # O(1) membership checks inside the loop

# Mecab POS tags that are kept; every other tag is discarded.
_KEPT_POS_TAGS = {"NNG", "NNP", "VV", "VA", "VCP", "VCN", "MM", "MAG", "XPN", "SL", "SH"}


def _filter_by_pos_and_stopwords(corpus):
    """Tag each sentence with Mecab and keep allowed, non-stopword morphemes.

    Returns a list with one space-joined string per input sentence.
    """
    filtered = []
    for sentence in corpus:
        kept = [
            morph
            for morph, tag in mecab.pos(sentence)
            if morph not in _stopword_set and tag in _KEPT_POS_TAGS
        ]
        filtered.append(' '.join(kept))
    return filtered


removed_stopword_train_corpus = _filter_by_pos_and_stopwords(basic_preprocessed_train_corpus)
removed_stopword_test_corpus = _filter_by_pos_and_stopwords(basic_preprocessed_test_corpus)


train_text = removed_stopword_train_corpus
test_text = removed_stopword_test_corpus
train_label = np.asarray(train.topic_idx)

## 추가 전처리 - 이상문자열 제거 

"k 이슬람 t"라는 이상 문자열이 106개 row에서 발견되어 이를 제거하는 작업을 추가적으로 진행했습니다.

In [None]:
# Remove the 'k 이슬람 t' artefact.
# The `clear_title` column does not exist yet at this point (it is created
# from train_text / test_text in the following cell), so clean the text lists
# themselves; the column then receives the cleaned values.
train_text = [title.replace("k 이슬람 t", "") for title in train_text]
test_text = [title.replace("k 이슬람 t", "") for title in test_text]

## 최종 결과 확인

In [None]:
# Lift pandas' display limits on sequence items and rows.
# NOTE(review): with max_rows=None, displaying a frame prints every row.
pd.set_option('display.max_seq_items', None)
pd.set_option('display.max_rows', None)

# Same for numpy: never summarise arrays and never wrap lines.
np.set_printoptions(threshold = np.inf, linewidth = np.inf )

# Attach the fully preprocessed titles as a new column.
train['clear_title'] = train_text
test['clear_title'] = test_text

# Keep only the columns that are written to the final CSV files.
train = train[['index','clear_title','topic_idx']]
test = test[['index','clear_title']]

In [None]:
#train 최종 데이터 확인
train[['clear_title']]

In [None]:
#test 최종 데이터 확인
test[['clear_title']]

In [None]:
train.to_csv("/content/drive/Shareddrives/2022-1 KUBIG 딥러닝 콘테스트/data/최종 데이터셋/fin_train.csv", index =False)
test.to_csv("/content/drive/Shareddrives/2022-1 KUBIG 딥러닝 콘테스트/data/최종 데이터셋/fin_test.csv", index =False)

# Final Dataset Loading

In [None]:
path = "/content/drive/Shareddrives/2022-1 KUBIG 딥러닝 콘테스트/data/최종 데이터셋/"

**for Training**

In [None]:
Train = load_dataset("csv", data_files = path + "fin_train.csv")
Test = load_dataset("csv", data_files = path + "fin_test.csv")

In [None]:
total_dataset = datasets.DatasetDict({"train" : Train["train"],
                                      "test" : Test["train"]})
total_dataset

DatasetDict({
    train: Dataset({
        features: ['index', 'clear_title', 'topic_idx'],
        num_rows: 45654
    })
    test: Dataset({
        features: ['index', 'clear_title'],
        num_rows: 9131
    })
})

**for Testing**

In [None]:
# Reload the preprocessed CSVs as pandas frames (used by TrainDataset / TestDataset).
train = pd.read_csv(path + "fin_train.csv")
test = pd.read_csv(path + "fin_test.csv")
# NOTE(review): `path` now points at the final-dataset folder, so the sample
# submission is expected to exist there as well — confirm.
submission = pd.read_csv(path + "sample_submission.csv")

# Training

## Hyperparameter

In [None]:
epochs = 10       # number of fine-tuning epochs
MAX_LEN = 46      # tokenised sequence length (pad / truncate to this)
batch_size = 32
num_cores = 2     # DataLoader worker processes

# Several model cells refer to these settings under different names;
# define the aliases once so every cell sees the same values.
num_epochs = epochs
BATCH_SIZE = batch_size
NUM_CORES = num_cores

np.random.seed(42)

# Fall back to the CPU instead of hard-coding a GPU that may not exist.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())


lr = 1e-5            # optimiser learning rate
log_interval = 200   # print training progress every N batches


True


## Functions & Classes

**tokenizing**

In [None]:
def tokenized(tokenizer, total_dataset):
  """Tokenise every split of ``total_dataset`` and return torch-formatted splits.

  The ``clear_title`` column is encoded with ``tokenizer`` (padded and
  truncated to ``MAX_LEN``), the ``index`` and ``clear_title`` columns are
  dropped, and ``topic_idx`` is renamed to ``labels`` on the "train" split
  (and on "valid" when present).

  Returns:
      ``(train, valid, test)`` when a "valid" split exists, else ``(train, test)``.
  """
  def encode(batch):
    return tokenizer(text = batch["clear_title"], add_special_tokens = True,
                     max_length = MAX_LEN, padding = "max_length",
                     truncation = True)

  encoded = total_dataset.map(encode, batched = True)
  encoded = encoded.remove_columns(["index", "clear_title"])

  has_valid = "valid" in encoded.keys()
  labelled_splits = ["train", "valid"] if has_valid else ["train"]
  for split in labelled_splits:
    encoded[split] = encoded[split].rename_column("topic_idx", "labels")

  # hand back torch tensors instead of python lists
  encoded.set_format("torch")

  if has_valid:
    return encoded["train"], encoded["valid"], encoded["test"]
  return encoded["train"], encoded["test"]

**tokenizing and Dataset**

In [None]:
class TrainDataset(Dataset):
    """Map-style dataset that tokenises ``clear_title`` on the fly for training.

    Args:
        df: DataFrame with ``clear_title`` and ``topic_idx`` columns, indexed
            so that ``df.loc[i]`` is valid for ``0 <= i < len(df)``.
        tokenizer: callable tokenizer. When omitted, the module-level
            ``tokenizer`` that exists at construction time is captured, so a
            later reassignment of that global no longer changes how this
            dataset tokenises.
        max_len: padded / truncated sequence length. Defaults to the
            module-level ``MAX_LEN``.
    """
    def __init__(self, df, tokenizer=None, max_len=None):
        self.df_data = df
        # Bind tokenizer and length now instead of looking them up per item.
        self.tokenizer = tokenizer if tokenizer is not None else globals()["tokenizer"]
        self.max_len = max_len if max_len is not None else MAX_LEN

    def __getitem__(self, index):
        """Return ``(input_ids, token_type_ids, attention_mask, target)`` for one row."""
        # get the sentence from the dataframe
        sentence = self.df_data.loc[index, 'clear_title']
        encoded_dict = self.tokenizer(
          text = sentence,
          add_special_tokens = True,
          max_length = self.max_len,
          padding = "max_length",    # replaces the deprecated pad_to_max_length=True
          truncation=True,           # Pad & truncate all sentences.
          return_tensors="pt")

        # return_tensors="pt" adds a leading batch axis; take its only row.
        padded_token_list = encoded_dict['input_ids'][0]
        token_type_id = encoded_dict['token_type_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]
        target = torch.tensor(self.df_data.loc[index, "topic_idx"])
        sample = (padded_token_list, token_type_id , att_mask, target)
        return sample

    def __len__(self):
        return len(self.df_data)

In [None]:
class TestDataset(Dataset):
    """Map-style dataset that tokenises ``clear_title`` on the fly for inference.

    Args:
        df: DataFrame with a ``clear_title`` column, indexed so that
            ``df.loc[i]`` is valid for ``0 <= i < len(df)``.
        tokenizer: callable tokenizer. When omitted, the module-level
            ``tokenizer`` that exists at construction time is captured, so a
            later reassignment of that global no longer changes how this
            dataset tokenises.
        max_len: padded / truncated sequence length. Defaults to the
            module-level ``MAX_LEN``.
    """
    def __init__(self, df, tokenizer=None, max_len=None):
        self.df_data = df
        # Bind tokenizer and length now instead of looking them up per item.
        self.tokenizer = tokenizer if tokenizer is not None else globals()["tokenizer"]
        self.max_len = max_len if max_len is not None else MAX_LEN

    def __getitem__(self, index):
        """Return ``(input_ids, token_type_ids, attention_mask)`` for one row."""
        # get the sentence from the dataframe
        sentence = self.df_data.loc[index, 'clear_title']
        encoded_dict = self.tokenizer(
          text = sentence,
          add_special_tokens = True,
          max_length = self.max_len,
          padding = "max_length",    # replaces the deprecated pad_to_max_length=True
          truncation=True,           # Pad & truncate all sentences.
          return_tensors="pt")

        # return_tensors="pt" adds a leading batch axis; take its only row.
        padded_token_list = encoded_dict['input_ids'][0]
        token_type_id = encoded_dict['token_type_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]
        sample = (padded_token_list, token_type_id , att_mask)
        return sample

    def __len__(self):
        return len(self.df_data)

**accuracy**

In [None]:
def calc_accuracy(X,Y):
    """Return the fraction of rows of ``X`` whose arg-max equals the label in ``Y``.

    ``X`` holds one row of scores per sample and ``Y`` one label per sample;
    the result is a numpy scalar.
    """
    _, predicted = torch.max(X, 1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    n_samples = predicted.size()[0]
    return n_correct / n_samples

**Training**

In [None]:
def Training(model, train_dataloader, optimizer, scheduler, num_epochs, device):
  """Fine-tune ``model`` and report its running training accuracy.

  Args:
      model: sequence-classification model whose forward output exposes
          ``.loss`` and ``.logits`` when labels are passed.
      train_dataloader: yields either dict batches containing a ``labels``
          entry (as produced by ``tokenized``) or 4-tuples
          ``(input_ids, token_type_ids, attention_mask, labels)`` (as produced
          by ``TrainDataset``).
      optimizer: optimiser stepped once per batch.
      scheduler: learning-rate scheduler stepped once per batch.
      num_epochs: number of passes over ``train_dataloader``.
      device: device every batch tensor is moved to.

  Returns:
      tuple: (mean of the per-epoch average training accuracies, model).
  """
  Acc = []
  for e in range(num_epochs):
    model.train()  # re-assert train mode in case eval() was called in between
    train_acc = 0.0
    num_batches = 0

    for batch_id, batch in enumerate(tqdm(train_dataloader)):
      optimizer.zero_grad() # reset accumulated gradients

      # Tuple batches are converted to the keyword form the model expects.
      if not hasattr(batch, "items"):
        input_ids, token_type_ids, attention_mask, labels = batch
        batch = {"input_ids": input_ids, "token_type_ids": token_type_ids,
                 "attention_mask": attention_mask, "labels": labels}

      batch = {k:v.to(device) for k,v in batch.items()} # move every tensor at once
      label = batch["labels"]

      #forward
      out = model(**batch)
      loss = out.loss # the model computes the loss itself when labels are given
      loss.backward()

      optimizer.step()
      scheduler.step() #update learning rate schedule

      train_acc +=calc_accuracy(out.logits, label) # running sum of batch accuracies
      num_batches = batch_id + 1

      # progress log
      if batch_id % log_interval == 0:
        print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1))) # running mean

    # max(..., 1) keeps an empty dataloader from raising.
    epoch_acc = train_acc / max(num_batches, 1)
    print("epoch {} train acc {}".format(e+1, epoch_acc)) # mean accuracy of this epoch
    Acc.append(epoch_acc)

  return sum(Acc) / max(len(Acc), 1), model

## Model 1 : Roberta - small

In [None]:
model_name = 'klue/roberta-small'

In [None]:
tokenizer = AutoTokenizer.from_pretrained('klue/roberta-small')

# Datasets that tokenise each title on the fly.
train_data1 = TrainDataset(train)

test_data1 = TestDataset(test)

# Use the names defined in the hyperparameter cell (batch_size / num_cores);
# BATCH_SIZE / NUM_CORES are not defined there.
train_dataloader1 = torch.utils.data.DataLoader(train_data1,
                                        batch_size=batch_size,
                                        shuffle=True,
                                      num_workers=num_cores)
test_dataloader1 = torch.utils.data.DataLoader(test_data1,
                                        batch_size=batch_size,
                                        shuffle=False,
                                      num_workers=num_cores)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained('klue/roberta-small',num_labels =7)
model.to(device)

# for scheduling: warm up over the first 10% of all optimiser steps.
# `epochs` is the name defined in the hyperparameter cell (num_epochs is not).
warmup_ratio = 0.1
t_total = len(train_dataloader1) * epochs
warmup_step = int(t_total * warmup_ratio)

optimizer = AdaBelief(model.parameters(), lr = lr,
                     eps=1e-16, betas=(0.9,0.999), weight_decouple = True, rectify = True)



scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps =warmup_step,
                                            num_training_steps = t_total)

train_acc_mean ,trained_model= Training(model, train_dataloader1, optimizer, scheduler, epochs, device)
print("Average : ", train_acc_mean)

#save model
model_path = "/content/drive/Shareddrives/2022-1 KUBIG 딥러닝 콘테스트/Code/4주차_최종/민경"
trained_model.save_pretrained(model_path + "/{}".format(model_name))


In [None]:
print(trained_model)

## Model 2: Roberta - base

In [None]:
# Only the checkpoint name is needed here; the model itself is loaded in the
# training cell below, so loading it here as well would be discarded unused.
model_name = 'klue/roberta-base'

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pre-tokenise both splits once; batches come out as dicts of tensors.
Train_dataset2, Test_dataset2 = tokenized(tokenizer, total_dataset)

#data loader
# Use the names defined in the hyperparameter cell (batch_size / num_cores);
# BATCH_SIZE / NUM_CORES are not defined there.
train_dataloader2 = torch.utils.data.DataLoader(Train_dataset2,
                                        batch_size=batch_size,
                                        shuffle=True,
                                      num_workers=num_cores)

test_dataloader2 = torch.utils.data.DataLoader(Test_dataset2,
                                        batch_size=batch_size,
                                        shuffle=False,
                                      num_workers=num_cores)

  0%|          | 0/46 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

In [None]:
model = AutoModelForSequenceClassification.from_pretrained('klue/roberta-base', num_labels=7)
model.to(device)

# for scheduling: warm up over the first 10% of all optimiser steps.
# `epochs` is the name defined in the hyperparameter cell (num_epochs is not).
warmup_ratio = 0.1
t_total = len(train_dataloader2) * epochs
warmup_step = int(t_total * warmup_ratio)

optimizer = AdaBelief(model.parameters(), lr = lr,
                     eps=1e-16, betas=(0.9,0.999), weight_decouple = True, rectify = True)



scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps =warmup_step,
                                            num_training_steps = t_total)

train_acc_mean ,trained_model= Training(model, train_dataloader2, optimizer, scheduler, epochs, device)
print("Average : ", train_acc_mean)

#save model
model_path = "/content/drive/Shareddrives/2022-1 KUBIG 딥러닝 콘테스트/Code/4주차_최종/윤"
trained_model.save_pretrained(model_path + "/{}".format(model_name))


## Model 3: Roberta - large

In [None]:
model_name = "klue/roberta-large"

In [None]:
tokenizer =AutoTokenizer.from_pretrained(model_name)

Train_dataset3, Test_dataset3 = tokenized(tokenizer, total_dataset)

#data loader

train_dataloader3 = torch.utils.data.DataLoader(Train_dataset3,
                                        batch_size=batch_size,
                                        shuffle=True,
                                      num_workers=num_cores)

test_dataloader3 = torch.utils.data.DataLoader(Test_dataset3,
                                        batch_size=batch_size,
                                        shuffle=False,
                                      num_workers=num_cores)

Downloading tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/734k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

  0%|          | 0/46 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

In [None]:
model = RobertaForSequenceClassification.from_pretrained('klue/roberta-large', num_labels=7)
model.to(device)

# for scheduling: warm up over the first 10% of all optimiser steps
warmup_ratio = 0.1
t_total = len(train_dataloader3) * epochs
warmup_step = int(t_total * warmup_ratio)

optimizer = AdaBelief(model.parameters(), lr=1e-5, eps=1e-16, betas=(0.9,0.999), weight_decouple = True, rectify = False)


scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps =warmup_step,
                                            num_training_steps = t_total)

train_acc_mean ,trained_model= Training(model, train_dataloader3, optimizer, scheduler, epochs, device)
print("Average : ", train_acc_mean)

#save model
# Set the target folder explicitly instead of inheriting `model_path` from
# whichever model cell ran last; this is the folder the prediction cells load
# the roberta-large checkpoint from.
model_path = "/content/drive/Shareddrives/2022-1 KUBIG 딥러닝 콘테스트/Code/4주차_최종/지호"
trained_model.save_pretrained(model_path + "/{}".format(model_name))

In [None]:
print(trained_model)

## Model 4 : Koelectra - base

In [None]:
model_name = "monologg/koelectra-base-v3-discriminator"

In [None]:
tokenizer =ElectraTokenizer.from_pretrained(model_name)

# Pre-tokenise both splits once; batches come out as dicts of tensors.
Train_dataset4, Test_dataset4 = tokenized(tokenizer, total_dataset)

#data loader
# Use the names defined in the hyperparameter cell (batch_size / num_cores);
# BATCH_SIZE / NUM_CORES are not defined there.
train_dataloader4 = torch.utils.data.DataLoader(Train_dataset4,
                                        batch_size=batch_size,
                                        shuffle=True,
                                      num_workers=num_cores)

test_dataloader4 = torch.utils.data.DataLoader(Test_dataset4,
                                        batch_size=batch_size,
                                        shuffle=False,
                                      num_workers=num_cores)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels =7)
model.to(device)

# for scheduling: warm up over the first 10% of all optimiser steps.
# `epochs` is the name defined in the hyperparameter cell (num_epochs is not).
warmup_ratio = 0.1
t_total = len(train_dataloader4) * epochs
warmup_step = int(t_total * warmup_ratio)

optimizer = AdaBelief(model.parameters(), lr = lr,
                     eps=1e-16, betas=(0.9,0.999), weight_decouple = True, rectify = True)



scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps =warmup_step,
                                            num_training_steps = t_total)

train_acc_mean ,trained_model= Training(model, train_dataloader4, optimizer, scheduler, epochs, device)
print("Average : ", train_acc_mean)

#save model
model_path = "/content/drive/Shareddrives/2022-1 KUBIG 딥러닝 콘테스트/Code/4주차_최종/염"
trained_model.save_pretrained(model_path + "/{}".format(model_name))


In [None]:
print(trained_model)

# Prediction (Voting Ensemble)

## Roberta - large : 데이콘 결과 0.85454 114위(public) 

In [None]:
model_path = "/content/drive/Shareddrives/2022-1 KUBIG 딥러닝 콘테스트/Code/4주차_최종/지호"
model_name = 'klue/roberta-large'
save_path = model_path + "/"+ model_name
model = AutoModelForSequenceClassification.from_pretrained(save_path, num_labels=7).to(device)

In [None]:
# Predict a class index for every test row with the loaded model.
model.eval()
prediction_list =[]
for batch in test_dataloader3:
  # Batches are dicts of tensors; move every tensor to the model's device.
  batch = {k:v.to(device) for k, v in batch.items()}
  with torch.no_grad():
    output = model(**batch)

  logits = output.logits
  predictions = torch.argmax(logits, dim = 1)
  prediction_list.extend(predictions.cpu().tolist()) # extend() appends each element of the list individually

# Number of predictions collected.
len(prediction_list)

9131

In [None]:
# Start from the sample submission and overwrite its topic column.
submission = pd.read_csv(path +"sample_submission.csv")
print(submission.head())

submission["topic_idx"] = prediction_list
print(submission.head())
# Write the single-model result next to the roberta-large checkpoint.
result_path = "/content/drive/Shareddrives/2022-1 KUBIG 딥러닝 콘테스트/Code/4주차_최종/지호"
submission.to_csv(result_path + "/result_robertlarge_adabelief.csv", index = False)

   index  topic_idx
0  45654          0
1  45655          0
2  45656          0
3  45657          0
4  45658          0
   index  topic_idx
0  45654          0
1  45655          3
2  45656          2
3  45657          0
4  45658          3


## Ensemble : koelectra(0.1) +large(0.3) + small(0.5) +  base(0.1) 0.84030 145등(public)

1. saved models calling 
2. prediction
3. voting

### 1. Roberta-small

In [None]:
model_path = "/content/drive/Shareddrives/2022-1 KUBIG 딥러닝 콘테스트/Code/4주차_최종/민경"
model_name = 'klue/roberta-small'
save_path = model_path + "/"+ model_name
model = AutoModelForSequenceClassification.from_pretrained(save_path, num_labels=7).to(device)

In [None]:
# test_dataloader1 tokenises titles on the fly. Re-bind the module-level
# `tokenizer` to this model's tokenizer first, in case the dataset resolves
# it at iteration time and it still points at a different model's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name)

preds = []
model.eval()
# Disable autograd only inside this block; a global
# torch.set_grad_enabled(False) would also break any later training cell.
with torch.no_grad():
    for input_id, token_type_id, attention_mask in tqdm_notebook(test_dataloader1):
        input_id = input_id.long().to(device)
        token_type_id = token_type_id.long().to(device)
        attention_mask = attention_mask.long().to(device)
        outputs = model(input_ids=input_id, token_type_ids=token_type_id, attention_mask=attention_mask)
        # Without labels the first output element holds the logits.
        preds.extend(outputs[0].cpu().numpy())
Preds = np.array(preds)

In [None]:
Roberta_small = Preds 
Roberta_small

array([[ 1.4591779 , -0.4251818 ,  2.4956343 , ..., -1.6797501 ,
        -2.6218424 , -3.5937283 ],
       [-1.8771108 , -2.0249472 , -0.42838815, ..., -0.3232079 ,
        -1.694827  , -0.9351825 ],
       [ 0.21637736, -0.4042029 ,  5.5209312 , ..., -0.7444878 ,
        -3.1604404 ,  1.0278183 ],
       ...,
       [-2.1031325 , -2.6757224 ,  5.038392  , ..., -1.3883253 ,
        -1.8889341 , -1.2277019 ],
       [ 1.7475202 ,  4.318581  ,  3.544428  , ..., -1.9515331 ,
        -1.9929845 , -3.3142335 ],
       [ 0.63582855, -0.31544095,  3.3627174 , ..., -3.1467037 ,
        -1.3221827 ,  4.0676026 ]], dtype=float32)

### 2. Roberta-base

In [None]:
model_path = "/content/drive/Shareddrives/2022-1 KUBIG 딥러닝 콘테스트/Code/4주차_최종/윤"
model_name = 'klue/roberta-base'
save_path = model_path + "/"+ model_name
model = AutoModelForSequenceClassification.from_pretrained(save_path, num_labels=7).to(device)

In [None]:
preds = []
model.eval()
# Disable autograd only inside this block; a global
# torch.set_grad_enabled(False) would also break any later training cell.
with torch.no_grad():
    for batch in tqdm_notebook(test_dataloader2):
        # Datasets built by tokenized() yield dict batches, so unpacking them
        # as a tuple would yield the key strings. Tuple batches are still
        # accepted and converted to keyword form.
        if not hasattr(batch, "items"):
            input_id, token_type_id, attention_mask = batch
            batch = {"input_ids": input_id, "token_type_ids": token_type_id,
                     "attention_mask": attention_mask}
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # Without labels the first output element holds the logits.
        preds.extend(outputs[0].cpu().numpy())
Preds = np.array(preds)

In [None]:
Roberta_base = Preds 
Roberta_base

array([[ 4.0617213 ,  0.27876177,  0.83959943, ..., -0.88148355,
        -2.8046308 , -4.221271  ],
       [-1.7503253 , -2.1899025 ,  0.49846193, ..., -0.9193513 ,
        -2.1130798 , -1.1184703 ],
       [-0.4292372 , -0.5965223 ,  6.4967976 , ..., -1.253781  ,
        -3.665053  , -0.33771127],
       ...,
       [-2.413768  , -1.745429  ,  6.0806003 , ..., -1.5730447 ,
        -2.9930644 , -1.4727268 ],
       [ 1.6451037 , -0.35484675,  5.686179  , ..., -2.446513  ,
        -1.5499816 , -2.3602057 ],
       [ 0.08038241, -0.64108   ,  3.8814101 , ..., -3.916439  ,
        -2.161689  ,  4.197057  ]], dtype=float32)

### 3. Roberta-large

In [None]:
model_path = "/content/drive/Shareddrives/2022-1 KUBIG 딥러닝 콘테스트/Code/4주차_최종/지호"
model_name = 'klue/roberta-large'
save_path = model_path + "/"+ model_name
model = AutoModelForSequenceClassification.from_pretrained(save_path, num_labels=7).to(device)

In [None]:
preds = []
model.eval()
# Disable autograd only inside this block; a global
# torch.set_grad_enabled(False) would also break any later training cell.
with torch.no_grad():
    for batch in tqdm_notebook(test_dataloader3):
        # Datasets built by tokenized() yield dict batches, so unpacking them
        # as a tuple would yield the key strings. Tuple batches are still
        # accepted and converted to keyword form.
        if not hasattr(batch, "items"):
            input_id, token_type_id, attention_mask = batch
            batch = {"input_ids": input_id, "token_type_ids": token_type_id,
                     "attention_mask": attention_mask}
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # Without labels the first output element holds the logits.
        preds.extend(outputs[0].cpu().numpy())
Preds = np.array(preds)

In [None]:
Roberta_large = Preds 
Roberta_large

array([[ 5.031582  ,  0.4835413 , -1.9664816 , ...,  0.3080583 ,
        -2.69096   , -4.206146  ],
       [-1.6322936 , -2.8785484 ,  0.7724615 , ..., -1.456878  ,
        -1.2276715 , -1.3078557 ],
       [-1.2246324 ,  0.32183525,  7.322132  , ..., -1.5378554 ,
        -3.302623  , -0.6921514 ],
       ...,
       [-1.9306413 , -2.5169573 ,  4.128827  , ..., -0.8494464 ,
        -2.9713948 , -3.04489   ],
       [-1.858782  ,  1.9159635 ,  7.9690804 , ..., -2.6619954 ,
        -2.4259474 , -2.994513  ],
       [-1.1642442 , -1.1757185 ,  5.481288  , ..., -3.4326966 ,
        -2.480573  ,  5.0481915 ]], dtype=float32)

### 4. Koelectra-base

In [None]:
model_path = "/content/drive/Shareddrives/2022-1 KUBIG 딥러닝 콘테스트/Code/4주차_최종/염"
model_name = 'monologg/koelectra-base-v3-discriminator'
save_path = model_path + "/"+ model_name
model = AutoModelForSequenceClassification.from_pretrained(save_path, num_labels=7).to(device)

In [None]:
preds = []
model.eval()
# Disable autograd only inside this block; a global
# torch.set_grad_enabled(False) would also break any later training cell.
with torch.no_grad():
    for batch in tqdm_notebook(test_dataloader4):
        # Datasets built by tokenized() yield dict batches, so unpacking them
        # as a tuple would yield the key strings. Tuple batches are still
        # accepted and converted to keyword form.
        if not hasattr(batch, "items"):
            input_id, token_type_id, attention_mask = batch
            batch = {"input_ids": input_id, "token_type_ids": token_type_id,
                     "attention_mask": attention_mask}
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # Without labels the first output element holds the logits.
        preds.extend(outputs[0].cpu().numpy())
Preds = np.array(preds)

In [None]:
Koelectra_base = Preds 
Koelectra_base

array([[ 2.2799342 ,  1.0971454 ,  1.9229554 , ..., -3.1768875 ,
        -4.2963166 , -3.937318  ],
       [-2.0020711 , -2.1920724 ,  0.09604193, ..., -0.6008972 ,
        -2.363793  , -1.9305559 ],
       [ 0.23240103, -0.21703632,  2.977845  , ...,  0.36266443,
        -5.23138   ,  2.2204292 ],
       ...,
       [-2.5612462 , -2.0795794 ,  3.453697  , ..., -1.545476  ,
        -3.7329066 , -1.7561842 ],
       [ 0.18274824,  5.3301067 ,  2.9568539 , ..., -1.7096467 ,
        -3.6123543 , -2.6795487 ],
       [-1.9395641 , -0.36861038,  3.4598277 , ..., -2.039366  ,
        -3.251142  ,  5.1563487 ]], dtype=float32)

### Voting

In [None]:
# Weighted sum of the four models' output arrays (weights 0.5 / 0.1 / 0.3 / 0.1).
# NOTE(review): these are the raw model outputs, not probabilities, so models
# with larger output magnitudes may weigh more than their nominal weight —
# consider applying softmax per model first; confirm intent.
Pred_values = Roberta_small*0.5 + Roberta_base * 0.1 + Roberta_large * 0.3 + Koelectra_base * 0.1

In [None]:
# The class with the highest combined score becomes the predicted topic.
results = np.argmax(Pred_values, axis=1)
submission['topic_idx']= results

In [None]:
submission

Unnamed: 0,index,topic_idx
0,45654,3
1,45655,3
2,45656,2
3,45657,0
4,45658,3
...,...,...
9126,54780,3
9127,54781,2
9128,54782,3
9129,54783,2


In [None]:
result_path = "/content/drive/Shareddrives/2022-1 KUBIG 딥러닝 콘테스트/Code/4주차_최종/민경"
submission.to_csv(result_path + "/ensenble10.csv", index = False)