In [1]:
!pip install transformers




[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import transformers
from transformers import AutoTokenizer, AdamW, RobertaForSequenceClassification

import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

from tqdm.notebook import tqdm

In [9]:
train = pd.read_csv("train_data.csv") # 초기 자료
back_train = pd.read_csv("train_final1.csv") # 역번역 자료
test = pd.read_csv("test_data.csv") # test 초기 자료
back_test = pd.read_csv("test_final1.csv") # 역번역 자료
topic_dict=pd.read_csv("topic_dict.csv")
sample_submission=pd.read_csv("sample_submission.csv")

In [10]:
a=pd.DataFrame()
a["title"]=back_train["title_kor"]
a["index"]=back_train["index"]
a["topic_idx"]=back_train["topic_idx"]
b=pd.DataFrame()
b["title"]=train["title"]
b["index"]=train["index"]
b["topic_idx"]=train["topic_idx"]

In [11]:
train2=pd.concat([a,b])

In [12]:
train, val = train_test_split(train2, test_size=0.2, random_state=2021)

In [13]:
class NTDataset(Dataset):
  
  def __init__(self, csv_file):
    self.dataset = csv_file
    self.tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")

    print(self.dataset.describe())
  
  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 1:3].values
    text = row[0]
    y = row[1]
    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=14,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [14]:
class NTDataset_test(Dataset):
  
  def __init__(self, csv_file):
    self.dataset = csv_file
    self.tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")

    print(self.dataset.describe())
  
  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 1:2].values
    text = row[0]
    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=14,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask

In [15]:
train_dataset = NTDataset(train)
val_dataset = NTDataset(val)

test_dataset = NTDataset_test(test)

              index     topic_idx
count  73046.000000  73046.000000
mean   22798.926539      3.160146
std    13168.665730      1.933066
min        0.000000      0.000000
25%    11389.000000      2.000000
50%    22776.000000      3.000000
75%    34166.750000      5.000000
max    45653.000000      6.000000
              index     topic_idx
count  18262.000000  18262.000000
mean   22936.790822      3.177527
std    13221.271437      1.932156
min        2.000000      0.000000
25%    11503.500000      2.000000
50%    23008.500000      3.000000
75%    34477.750000      5.000000
max    45651.000000      6.000000
              index
count   9131.000000
mean   50219.000000
std     2636.036988
min    45654.000000
25%    47936.500000
50%    50219.000000
75%    52501.500000
max    54784.000000


In [19]:
if torch.cuda.is_available():
  device=torch.device('cuda') #relatively fast
else:
  device=torch.device('cpu') #only cpu for training & evaluating #Very slow
print(device)

cpu


In [21]:
model = RobertaForSequenceClassification.from_pretrained("klue/roberta-large", num_labels=7).to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
epochs = 1
batch_size = 128

In [23]:
optimizer = AdamW(model.parameters(), lr=1e-5)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)



In [None]:
# train
losses = []
accuracies = []
total_loss = 0.0
correct = 0
total = 0

for i in range(epochs):

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)
  
  losses.append(total_loss)
  accuracies.append(correct.float() / total)
  print("Train Loss:", total_loss / total, "Accuracy:", correct.float() / total)

  0%|          | 0/357 [00:00<?, ?it/s]



Train Loss: 0.003900176573401291 Accuracy: tensor(0.8335)


  0%|          | 0/357 [00:00<?, ?it/s]

In [None]:
# validation
model.eval()

pred = []
correct = 0
total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(val_loader):
  y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  pred.append(predicted)
  correct += (predicted == y_batch).sum()
  total += len(y_batch)

print("val accuracy:", correct.float() / total)

In [None]:
# test
model.eval()

pred = []

for input_ids_batch, attention_masks_batch in tqdm(test_loader):
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  pred.extend(predicted.tolist())

In [None]:
sample_submission.topic_idx = pred

In [None]:
sample_submission.to_csv("roberta_large.csv")