# 라이브러리

In [1]:
pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 12.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 77.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 67.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 77.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml

In [42]:
import os
import pdb
# import wandb
import argparse
import pandas as pd
from dataclasses import dataclass, field
from typing import Optional
from collections import defaultdict

import torch
from torch.optim import Adam, AdamW
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

import numpy as np
from tqdm import tqdm, trange

from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    BertTokenizerFast,
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
    AutoConfig
    )

import gc

In [3]:
# Mount Google Drive at /gdrive; the data, model and submission paths used by
# the later cells of this notebook all live under that mount point.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [4]:
# Fix the random seeds so that repeated runs are reproducible.
seed_val = 42

# Seed NumPy, the PyTorch CPU generator and every CUDA generator with the same value.
for _seed_rng in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(seed_val)

In [6]:
# data load
def make_data_strings(file_name, data_dir='/gdrive/MyDrive/goorm/01. text_classification/datas'):
    """Read a UTF-8 text file and return its lines, normalised.

    Args:
        file_name: Name of the file to read, resolved relative to ``data_dir``.
        data_dir: Directory that holds the data files. Defaults to the Google
            Drive folder this notebook reads its data from.

    Returns:
        list[str]: One entry per line of the file, lower-cased and with
        trailing whitespace (including the newline) removed. Blank lines are
        kept as empty strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(os.path.join(data_dir, file_name), 'r', encoding='utf-8') as f:
        # Iterate the file object directly instead of materialising readlines().
        return [line.lower().rstrip() for line in f]

# Read the four sentiment files in a fixed order: the ".1" files feed the
# *_pos variables and the ".0" files feed the *_neg variables.
train_pos, train_neg, dev_pos, dev_neg = map(
    make_data_strings,
    ('sentiment.train.1', 'sentiment.train.0', 'sentiment.dev.1', 'sentiment.dev.0'),
)

In [38]:
# Sentences: positives first, then negatives.
train_data = train_pos + train_neg
dev_data = dev_pos + dev_neg

# Labels in the same order as the sentences: 1 per positive, 0 per negative.
train_pos_label = [1] * len(train_pos)
train_neg_label = [0] * len(train_neg)
train_label = train_pos_label + train_neg_label

dev_pos_label = [1] * len(dev_pos)
dev_neg_label = [0] * len(dev_neg)
dev_label = dev_pos_label + dev_neg_label

In [10]:
path = 'bert-base-uncased'

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer and the classification model from the same checkpoint,
# then move the model onto the selected device.
tokenizer = BertTokenizerFast.from_pretrained(path)
model = BertForSequenceClassification.from_pretrained(path).to(device)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [18]:
def preprocess(text_data, label_data, batch_size=64, shuffle=True):
    """Tokenize sentences and wrap them, with their labels, in a DataLoader.

    Uses the module-level ``tokenizer``; the whole of ``text_data`` is
    tokenized in a single call with ``truncation=True`` and ``padding=True``.

    Args:
        text_data: List of sentences to tokenize.
        label_data: List of integer labels, one per sentence.
        batch_size: Number of examples per batch.
        shuffle: When True (the default, matching the previous behaviour)
            batches are drawn with a ``RandomSampler``. Pass False to iterate
            the examples in their original order, e.g. for evaluation.

    Returns:
        DataLoader: Yields ``(input_ids, token_type_ids, attention_mask,
        label)`` batches.
    """
    batch_input = tokenizer(text_data, truncation=True, padding=True)

    # Convert every tokenizer output (lists of ints) into a tensor.
    batch_input = {key: torch.tensor(value) for key, value in batch_input.items()}

    label = torch.tensor(label_data)

    dataset = TensorDataset(
        batch_input['input_ids'],
        batch_input['token_type_ids'],
        batch_input['attention_mask'],
        label)

    # With sampler=None the DataLoader falls back to sequential iteration.
    dataset_sampler = RandomSampler(dataset) if shuffle else None
    dataloader = DataLoader(dataset, sampler=dataset_sampler, batch_size=batch_size)

    return dataloader

In [43]:
# Tokenize each split once and wrap it in a DataLoader of 128-example batches.
train_dataloader = preprocess(text_data=train_data, label_data=train_label, batch_size=128)
dev_dataloader = preprocess(text_data=dev_data, label_data=dev_label, batch_size=128)

# 모델 학습

In [40]:
# Accuracy helper
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        preds: 2-D array of per-class scores; the predicted class of a row is
            the index of its largest score.
        labels: NumPy array of integer class labels; it is flattened before
            the comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    expected_classes = labels.flatten()

    n_correct = np.sum(predicted_classes == expected_classes)
    return n_correct / len(expected_classes)

In [None]:
save_path = 'bert_classification'
train_epoch = 4

# Release memory left over from earlier cells before training starts.
gc.collect()
torch.cuda.empty_cache()

optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

# The scheduler is stepped once per batch, so the schedule has to span every
# batch of every epoch.
total_steps = len(train_dataloader) * train_epoch

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

for epoch in range(train_epoch):
    # =====================================
    #               Training
    # =====================================
    model.train()

    train_loss = 0.0

    for batchs in tqdm(train_dataloader):
        batch = tuple(b.to(device) for b in batchs)

        l_input_ids, l_segment, l_mask, l_labels = batch

        optimizer.zero_grad()

        # Segment ids are only passed when the batch holds more than one
        # distinct value; otherwise None is passed and the model uses its default.
        output = model(l_input_ids,
                       token_type_ids=l_segment if len(torch.unique(l_segment.flatten())) > 1 else None,
                       attention_mask=l_mask,
                       labels=l_labels)

        # With labels supplied, the first output element is the loss.
        loss = output[0]

        loss.backward()

        # Clip the gradient norm to 1.0 before the optimizer update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_dataloader)
    print(f'epoch - {epoch} Train_Loss : ', avg_train_loss)

    # =====================================
    #               Evaluation
    # =====================================
    model.eval()

    # Accumulate accuracy weighted by batch size: averaging per-batch
    # accuracies would over-weight a smaller final batch.
    eval_accuracy, eval_examples = 0.0, 0

    for batchs in tqdm(dev_dataloader):
        batch = tuple(b.to(device) for b in batchs)
        l_input_ids, l_segment, l_mask, l_labels = batch

        with torch.no_grad():
            output = model(l_input_ids,
                           token_type_ids=l_segment if len(torch.unique(l_segment.flatten())) > 1 else None,
                           attention_mask=l_mask)

        # Without labels, the first output element is the logits.
        logits = output[0]

        # Move the data to the CPU for the NumPy accuracy helper.
        logits = logits.detach().cpu().numpy()
        label_ids = l_labels.to('cpu').numpy()

        n_examples = len(label_ids)
        eval_accuracy += flat_accuracy(logits, label_ids) * n_examples
        eval_examples += n_examples

    avg_accuracy = eval_accuracy / eval_examples
    print(avg_accuracy)

    # Saved after every epoch to the same path, so each epoch overwrites the
    # checkpoint written by the previous one.
    model.save_pretrained(f'/gdrive/MyDrive/goorm/01. text_classification/models/{save_path}')

# 모델 테스트

In [None]:
save_path = 'bert_classification'

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Reload the fine-tuned checkpoint written by the training cell and move it
# onto the selected device.
model = BertForSequenceClassification.from_pretrained(
    f'/gdrive/MyDrive/goorm/01. text_classification/models/{save_path}'
).to(device)

In [66]:
test_df = pd.read_csv('/gdrive/MyDrive/goorm/01. text_classification/datas/test_no_label.csv')

# NOTE(review): the sentences to classify are read from the 'Id' column -
# confirm that this column really holds the text.
test_batch_input = {
    key: torch.tensor(value)
    for key, value in tokenizer(test_df['Id'].tolist(), truncation=True, padding=True).items()
}

# Only input ids and attention masks are kept; batches are served in file
# order, 128 examples at a time.
test_dataset = DataLoader(
    TensorDataset(test_batch_input['input_ids'], test_batch_input['attention_mask']),
    batch_size=128,
)

In [75]:
model.eval()

# Collect the logits of every test batch, in order.
pred = []
for batchs in tqdm(test_dataset):
    batch = tuple(b.to(device) for b in batchs)
    l_input_ids, l_mask = batch

    with torch.no_grad():
        # The test loader carries no segment ids, so none are passed. The
        # previous code read `l_segment`, which is not defined in this cell and
        # only existed as leftover state from the training cell.
        output = model(l_input_ids, attention_mask=l_mask)

    # Without labels, the first output element is the logits.
    logits = output[0].detach().cpu()
    pred.append(logits)

preds = torch.cat(pred, dim=0)
# Take the arg-max class per row with torch, then hand a NumPy array downstream.
predictions = preds.argmax(dim=1).numpy()

100%|██████████| 8/8 [00:00<00:00, 20.13it/s]


In [None]:
# Coerce to a NumPy array so the column holds plain integers even when
# `predictions` is still a torch tensor.
test_df['Category'] = np.asarray(predictions)

# NOTE(review): this folder ('goorm/text_classification') differs from the
# 'goorm/01. text_classification' folder used by the other cells - confirm it
# is the intended destination.
submission_path = '/gdrive/My Drive/goorm/text_classification/submission.csv'

# Create the destination folder if needed so the write cannot fail on a missing directory.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
test_df.to_csv(submission_path, index=False)