In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### 0. 패키지 설치 및 라이브러리 import

In [2]:
!pip install transformers datasets



In [3]:
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

import pandas as pd
import numpy as np

#### 1. 전처리 완료 데이터 불러오기

In [4]:
train_df = pd.read_csv("/content/drive/MyDrive/23-2 Kubig Contest/KUBIG-DLcontest/dataset/개정 후 전처리 결과물 (08 26)/roberta_train_df.csv")
test_df = pd.read_csv("/content/drive/MyDrive/23-2 Kubig Contest/KUBIG-DLcontest/dataset/개정 후 전처리 결과물 (08 26)/roberta_test_df.csv")

In [5]:
train_df.head()

Unnamed: 0,index,title,topic_idx,clean_title,title_list
0,0,인천→핀란드 항공기 결항…휴가철 여행객 분통,4,인천 핀란드 항공기 결항 휴가 여행객 분통,"['인천', '핀란드', '항공기', '결항', '휴가', '여행객', '분통']"
1,1,실리콘밸리 넘어서겠다…구글 15조원 들여 美전역 거점화,4,실리콘밸리 넘어서다 구글 들이다 미국 전역 거점,"['실리콘밸리', '넘어서다', '구글', '들이다', '미국', '전역', '거점']"
2,2,이란 외무 긴장완화 해결책은 미국이 경제전쟁 멈추는 것,4,이란 외무 긴장 완화 해결 미국 경제 전쟁 멈추다,"['이란', '외무', '긴장', '완화', '해결', '미국', '경제', '전쟁..."
3,3,NYT 클린턴 측근韓기업 특수관계 조명…공과 사 맞물려종합,4,nyt 클린턴 측근 기업 특수 관계 조명 공과 맞다 물리다,"['nyt', '클린턴', '측근', '기업', '특수', '관계', '조명', '..."
4,4,시진핑 트럼프에 중미 무역협상 조속 타결 희망,4,시진핑 트럼프 중미 무역 협상 조속 타결 희망,"['시진핑', '트럼프', '중미', '무역', '협상', '조속', '타결', '..."


In [6]:
test_df.head()

Unnamed: 0,index,title,clean_title,title_list
0,45654,유튜브 내달 2일까지 크리에이터 지원 공간 운영,유튜브 내달 크리에이터 지원 공간 운영,"['유튜브', '내달', '크리에이터', '지원', '공간', '운영']"
1,45655,어버이날 맑다가 흐려져…남부지방 옅은 황사,어버이날 맑다 흐려지다 남부 지방 옅다 황사,"['어버이날', '맑다', '흐려지다', '남부', '지방', '옅다', '황사']"
2,45656,내년부터 국가RD 평가 때 논문건수는 반영 않는다,내년 국가 rd 평가 논문 건수 반영,"['내년', '국가', 'rd', '평가', '논문', '건수', '반영']"
3,45657,김명자 신임 과총 회장 원로와 젊은 과학자 지혜 모을 것,김명자 신임 과총 회장 원로 젊다 과학자 지혜 모으다,"['김명자', '신임', '과총', '회장', '원로', '젊다', '과학자', '..."
4,45658,회색인간 작가 김동식 양심고백 등 새 소설집 2권 출간,회색 인간 작가 동식 양심 고백 소설 추다,"['회색', '인간', '작가', '동식', '양심', '고백', '소설', '추다']"


#### 2. 데이터셋 Tokenize 후 Dataloader로 변경

In [7]:
train, val = train_test_split(train_df, test_size=0.2, random_state=1097)

In [8]:
class TVDataset(Dataset):

  def __init__(self, csv_file, model_name):
    self.dataset = csv_file
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)

  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 2:4].values
    text = str(row[1])
    y = row[0]
    inputs = self.tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=14,
        pad_to_max_length=True,
        add_special_tokens=True
        )

    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]
    y = torch.tensor(y)  # 이걸 꼭해줘야한다..!
    return input_ids, attention_mask, y

In [9]:
class TestDataset(Dataset):

    def __init__(self, csv_file, model_name):
        self.dataset = csv_file
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        row = self.dataset.iloc[idx, 2:3].values
        text = str(row[0])
        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=15,
            pad_to_max_length=True,
            add_special_tokens=True
        )

        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        return input_ids, attention_mask


In [18]:
# cuda memory error solution code

import torch, gc
gc.collect()
torch.cuda.empty_cache()

#### 3. model : Roberta-large

In [11]:
model_roberta_large = "klue/roberta-large"

In [12]:
train_1 = TVDataset(train, model_roberta_large)
val_1 = TVDataset(val, model_roberta_large)
test_1 = TestDataset(test_df, model_roberta_large)

batch_size = 128

train_loader_1 = DataLoader(train_1, batch_size=batch_size, shuffle=True)
val_loader_1 = DataLoader(val_1, batch_size=batch_size, shuffle=True)
test_loader_1 = DataLoader(test_1, batch_size=batch_size, shuffle=False)

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(model_roberta_large, num_labels=7)
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [14]:
epochs = 3
optimizer = AdamW(model.parameters(), lr=1e-5)



##### train, val, test 데이터로 학습

In [15]:
# train
losses = []
accuracies = []
total_loss = 0.0
correct = 0
total = 0

for i in range(epochs):

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader_1):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

  losses.append(total_loss)
  accuracies.append(correct.float() / total)
  print("Train Loss:", total_loss / total, "Accuracy:", correct.float() / total)

  0%|          | 0/286 [00:00<?, ?it/s]



Train Loss: 0.004524122963493998 Accuracy: tensor(0.8081, device='cuda:0')


  0%|          | 0/286 [00:00<?, ?it/s]

Train Loss: 0.0035437099482399335 Accuracy: tensor(0.8486, device='cuda:0')


  0%|          | 0/286 [00:00<?, ?it/s]

Train Loss: 0.0030541744774613784 Accuracy: tensor(0.8688, device='cuda:0')


In [16]:
# validation
model.eval()

pred = []
correct = 0
total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(val_loader_1):
  y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  pred.append(predicted)
  correct += (predicted == y_batch).sum()
  total += len(y_batch)

print("val accuracy:", correct.float() / total)

  0%|          | 0/72 [00:00<?, ?it/s]

val accuracy: tensor(0.8790, device='cuda:0')


In [19]:
# test
model.eval()

pred = []

for input_ids_batch, attention_masks_batch in tqdm(test_loader_1):
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  pred.extend(predicted.tolist())

  0%|          | 0/72 [00:00<?, ?it/s]

##### 결과 확인

In [20]:
sample_sub_large = pd.read_csv("/content/drive/MyDrive/23-2 Kubig Contest/KUBIG-DLcontest/dataset/기존 전처리 결과물 (~08 25)/sample_submission.csv")
sample_sub_large['topic_idx'] = pred
sample_sub_large.to_csv("sample_sub_large.csv", index = False)

#### 4. model : Roberta-small

In [21]:
model_roberta_small = "klue/roberta-small"

In [22]:
train_2 = TVDataset(train, model_roberta_small)
val_2 = TVDataset(val, model_roberta_small)
test_2 = TestDataset(test_df, model_roberta_small)

batch_size = 128

train_loader_2 = DataLoader(train_2, batch_size=batch_size, shuffle=True)
val_loader_2 = DataLoader(val_2, batch_size=batch_size, shuffle=True)
test_loader_2 = DataLoader(test_2, batch_size=batch_size, shuffle=False)

Downloading (…)okenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(model_roberta_small, num_labels=7)
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [24]:
epochs = 3
optimizer = AdamW(model.parameters(), lr=1e-5)



##### train, val, test 데이터로 학습

In [25]:
# train
losses = []
accuracies = []
total_loss = 0.0
correct = 0
total = 0

for i in range(epochs):

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader_2):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

  losses.append(total_loss)
  accuracies.append(correct.float() / total)
  print("Train Loss:", total_loss / total, "Accuracy:", correct.float() / total)

  0%|          | 0/286 [00:00<?, ?it/s]



Train Loss: 0.00570789447254541 Accuracy: tensor(0.7826, device='cuda:0')


  0%|          | 0/286 [00:00<?, ?it/s]

Train Loss: 0.004383278841285029 Accuracy: tensor(0.8279, device='cuda:0')


  0%|          | 0/286 [00:00<?, ?it/s]

Train Loss: 0.0038045217289963485 Accuracy: tensor(0.8483, device='cuda:0')


In [26]:
# validation
model.eval()

pred = []
correct = 0
total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(val_loader_2):
  y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  pred.append(predicted)
  correct += (predicted == y_batch).sum()
  total += len(y_batch)

print("val accuracy:", correct.float() / total)

  0%|          | 0/72 [00:00<?, ?it/s]

val accuracy: tensor(0.8748, device='cuda:0')


In [27]:
# test
model.eval()

pred = []

for input_ids_batch, attention_masks_batch in tqdm(test_loader_2):
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  pred.extend(predicted.tolist())

  0%|          | 0/72 [00:00<?, ?it/s]

##### 결과 확인

In [28]:
sample_sub_small = pd.read_csv("/content/drive/MyDrive/23-2 Kubig Contest/KUBIG-DLcontest/dataset/기존 전처리 결과물 (~08 25)/sample_submission.csv")
sample_sub_small['topic_idx'] = pred
sample_sub_small.to_csv("sample_sub_small.csv", index = False)

#### 3. model : Roberta-base

In [29]:
model_roberta_base = "klue/roberta-base"

In [30]:
train_3 = TVDataset(train, model_roberta_base)
val_3 = TVDataset(val, model_roberta_base)
test_3 = TestDataset(test_df, model_roberta_base)

batch_size = 128

train_loader_3 = DataLoader(train_3, batch_size=batch_size, shuffle=True)
val_loader_3 = DataLoader(val_3, batch_size=batch_size, shuffle=True)
test_loader_3 = DataLoader(test_3, batch_size=batch_size, shuffle=False)

Downloading (…)okenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [31]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(model_roberta_base, num_labels=7)
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [33]:
epochs = 3
optimizer = AdamW(model.parameters(), lr=1e-5)

##### train, val, test 데이터로 학습

In [34]:
# train
losses = []
accuracies = []
total_loss = 0.0
correct = 0
total = 0

for i in range(epochs):

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader_3):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

  losses.append(total_loss)
  accuracies.append(correct.float() / total)
  print("Train Loss:", total_loss / total, "Accuracy:", correct.float() / total)

  0%|          | 0/286 [00:00<?, ?it/s]



Train Loss: 0.005470691622241944 Accuracy: tensor(0.7878, device='cuda:0')


  0%|          | 0/286 [00:00<?, ?it/s]

Train Loss: 0.004194981890879452 Accuracy: tensor(0.8337, device='cuda:0')


  0%|          | 0/286 [00:00<?, ?it/s]

Train Loss: 0.0036251191259761084 Accuracy: tensor(0.8541, device='cuda:0')


In [35]:
# validation
model.eval()

pred = []
correct = 0
total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(val_loader_3):
  y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  pred.append(predicted)
  correct += (predicted == y_batch).sum()
  total += len(y_batch)

print("val accuracy:", correct.float() / total)

  0%|          | 0/72 [00:00<?, ?it/s]

val accuracy: tensor(0.8764, device='cuda:0')


In [36]:
# test
model.eval()

pred = []

for input_ids_batch, attention_masks_batch in tqdm(test_loader_3):
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  pred.extend(predicted.tolist())

  0%|          | 0/72 [00:00<?, ?it/s]

##### 결과 확인

In [37]:
sample_sub_base = pd.read_csv("/content/drive/MyDrive/23-2 Kubig Contest/KUBIG-DLcontest/dataset/기존 전처리 결과물 (~08 25)/sample_submission.csv")
sample_sub_base['topic_idx'] = pred
sample_sub_base.to_csv("sample_sub_base.csv", index = False)