# 라이브러리

In [1]:
pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 14.8 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 63.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 56.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.7 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 68.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    F

In [2]:
import os
import pdb
# import wandb
import argparse
import pandas as pd
from dataclasses import dataclass, field
from typing import Optional
from collections import defaultdict

import torch
from torch.optim import Adam, AdamW
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

import numpy as np
from tqdm import tqdm, trange

from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    BertTokenizerFast,
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
    AutoConfig
    )

import gc

In [3]:
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [4]:
# 재현을 위해 랜덤시드 고정
seed_val = 42
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [8]:
# data load
def make_data_strings(file_name):
        data_strings = []
        with open(os.path.join('/gdrive/MyDrive/goorm/01. text_classification/datas', file_name), 'r', encoding='utf-8') as f:
            id_file_data = [line.lower().rstrip() for line in f.readlines()]
        return id_file_data

train_pos = make_data_strings('sentiment.train.1')
train_neg = make_data_strings('sentiment.train.0')
dev_pos = make_data_strings('sentiment.dev.1')
dev_neg = make_data_strings('sentiment.dev.0')

In [11]:
train_pos_label = [1 for i in train_pos]
train_neg_label = [0 for i in train_neg]
train_label = train_pos_label + train_neg_label

dev_pos_label = [1 for i in dev_pos]
dev_neg_label = [0 for i in dev_neg]
dev_label = dev_pos_label + dev_neg_label

train_data = pd.DataFrame(train_pos + train_neg, columns=['sentences'])
train_data['labels'] = train_label

dev_data = pd.DataFrame(dev_pos + dev_neg, columns=['sentences'])
dev_data['labels'] = dev_label

In [15]:
path = 'bert-base-uncased'
def model_load(train_data, path = 'bert-base-uncased'):
    tokenizer = BertTokenizerFast.from_pretrained(path)
    model = BertForSequenceClassification.from_pretrained(path, num_labels=len(train_data.iloc[:,-1].value_counts()))

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    return model, tokenizer, device

In [26]:
def preprocess(df, batch_size=64):
    batch_input = tokenizer(df['sentences'].tolist(), truncation=True, padding=True)

    batch_input = {key : torch.tensor(value) for key, value in batch_input.items()}
    batch_input['labels'] = torch.tensor(df['labels'].tolist())

    dataset = TensorDataset(
        batch_input['input_ids'], 
        batch_input['token_type_ids'],
        batch_input['attention_mask'], 
        batch_input['labels'])
    
    dataset = TensorDataset(batch_input['input_ids'], batch_input['attention_mask'], batch_input['token_type_ids'], batch_input['labels'])
    dataset_sampler = RandomSampler(dataset)
    dataloader = DataLoader(dataset, sampler = dataset_sampler, batch_size= batch_size)

    return dataloader          

In [None]:
model, tokenizer, device = model_load(train_data,path='bert-base-uncased')

In [27]:
train_dataloader = preprocess(train_data,batch_size=128)
dev_dataloader = preprocess(dev_data, batch_size=128)

# 모델 학습

In [48]:
save_path = 'bert_classification'
epochs = 1

gc.collect()
torch.cuda.empty_cache()

optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0, 
                                            num_training_steps=total_steps)

for epoch in range(epochs):
# =====================================
#               Training
# =====================================
    model.train()

    train_loss = 0.0

    for batchs in tqdm(train_dataloader):
        batch = tuple(b.to(device) for b in batchs)

        inputs = {
                 "input_ids": batch[0],
                 "attention_mask": batch[1],
                 "token_type_ids": batch[2],
                 'labels' : batch[3]
             }

        optimizer.zero_grad()

        output = model(**inputs)
        
        loss = output[0]
        
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_dataloader)
    print(f'epoch - {epoch} Train_Loss : ', avg_train_loss)

# =====================================
#               Evaluation
# =====================================
            
    model.eval()

    eval_loss, eval_accuracy = 0, 0


    for batchs in tqdm(dev_dataloader):
        batch = tuple(b.to(device) for b in batchs)
        inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                    'labels' : batch[3]
                }

        with torch.no_grad():
                output = model(**inputs)
                
        logits = output['logits']

        # CPU로 데이터 이동
        preds = logits.detach().cpu().numpy()
        preds = np.argmax(preds,axis=1).flatten()

        target_ids = inputs['labels'].to('cpu').numpy()

        eval_accuracy += (np.array(preds) == np.array(target_ids)).mean()

    avg_accuracy = eval_accuracy / len(dev_dataloader)
    print(f'epoch - {epoch} Accuracy : ',avg_accuracy)

    # model.save_pretrained(f'/gdrive/MyDrive/goorm/01. text_classification/models/{save_path}')

100%|██████████| 3463/3463 [16:00<00:00,  3.61it/s]


epoch - 0 Train_Loss :  0.046061860646189515


100%|██████████| 32/32 [00:01<00:00, 19.12it/s]

0.980712890625





# 모델 테스트

In [29]:
save_path = 'bert_classification'
model = BertForSequenceClassification.from_pretrained(f'/gdrive/MyDrive/goorm/01. text_classification/models/{save_path}')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [55]:
test_data = pd.read_csv('/gdrive/MyDrive/goorm/01. text_classification/datas/test_no_label.csv')
test_batch_input = tokenizer(test_data['Id'].tolist(),truncation=True,padding=True)

test_batch_input = {key : torch.tensor(value) for key, value in test_batch_input.items()}
test_dataset = TensorDataset(test_batch_input['input_ids'],test_batch_input['attention_mask'],test_batch_input['token_type_ids'])
test_dataloader = DataLoader(test_dataset, batch_size = 128)

In [59]:
model.eval()

pred = []
for batchs in tqdm(test_dataloader):
    batch = tuple(b.to(device) for b in batchs)
    inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "token_type_ids": batch[2]
        }


    with torch.no_grad():
            output = model(**inputs)
            
    logits = output[0].detach().cpu().numpy()
    pred.append(logits)

predictions = np.argmax(np.concatenate(preds), axis=1)

100%|██████████| 8/8 [00:00<00:00, 19.62it/s]


In [None]:
test_df['Category'] = predictions

test_df.to_csv('/gdrive/My Drive/goorm/text_classification/submission.csv', index=False)