In [1]:
import os
import random
import time
import datetime
import torch
import argparse

import pandas as pd
import numpy as np

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, BertConfig

from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
torch.cuda.is_available()

False

In [5]:
# data Loading 
### dataframe의 col 이름이 document, label 이어야 한다. 
def load_data(args):
    temp = pd.read_csv(args.raw_data, sep="\t",encoding='cp949')
    temp = temp
    document = temp.document.tolist()
    labels = temp.label.tolist()
    return document, labels
# Marking 앞단 끝단에 
def add_special_token(document):
    added = ["[CLS]" + str(sentence) + "[SEP]" for sentence in document]
    return added
# Tokenization
def tokenization(document, mode="huggingface"):
    if mode == "huggingface":
        tokenizer = BertTokenizer.from_pretrained(
                'bert-base-multilingual-cased', 
                do_lower_case=False,
                )
        tokenized = [tokenizer.tokenize(sentence) for sentence in document]
        ids = [tokenizer.convert_tokens_to_ids(sentence) for sentence in tokenized]
        return ids
# Padding = 길이 일치
def padding(ids, args):
    ids = pad_sequences(ids, maxlen=args.max_len, dtype="long", truncating='post', padding='post')
    return ids

def attention_mask(ids):
    masks = []
    for id in ids:
        mask = [float(i>0) for i in id]
        masks.append(mask)
    return masks

def preprocess(args):
    document, labels = load_data(args)
    document = add_special_token(document)
    ids = tokenization(document)
    ids = padding(ids, args)
    masks = attention_mask(ids)
    del document
    return ids, masks, labels

def train_test_data_split(ids, masks, labels):
    train_ids, test_ids, train_labels, test_labels = train_test_split(ids, labels
                                                                      , random_state=42, test_size=0.1)
    train_masks, test_masks, _, _ = train_test_split(masks, ids, random_state=42, test_size=0.1)
    return train_ids, train_masks, train_labels, test_ids, test_masks, test_labels

def build_dataloader(ids, masks, label, args):
    dataloader = TensorDataset(torch.tensor(ids), torch.tensor(masks), torch.tensor(label))
    dataloader = DataLoader(dataloader, sampler=RandomSampler(dataloader), batch_size=args.batch_size)
    return dataloader
##############################################################################

def build_model(args):
    model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=args.num_labels)
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"{torch.cuda.get_device_name(0)} available")
        model = model.cuda()
    else:
        device = torch.device("cpu")
        print("no GPU available")
        model = model
    return model, device

def test(test_dataloader, model, device):
    model.eval()
    total_accuracy = 0
    for batch in test_dataloader:
        batch = tuple(index.to(device) for index in batch)
        ids, masks, labels = batch
        with torch.no_grad():
            outputs = model(ids, token_type_ids=None, attention_mask=masks)
        pred = [torch.argmax(logit).cpu().detach().item() for logit in outputs.logits]
        true = [label for label in labels.cpu().numpy()]
        accuracy = accuracy_score(true, pred)
        total_accuracy += accuracy
    avg_accuracy = total_accuracy/len(test_dataloader)
    print(f"test AVG accuracy : {avg_accuracy: .2f}")
    return avg_accuracy

def train(train_dataloader, test_dataloader, args):
    model, device = build_model(args)
    
    # 옵티마이저 정의 
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    
    # learning rate decay
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader)*args.epochs)
    
    # 시드 고정
    random.seed(args.seed_val)
    np.random.seed(args.seed_val)
    torch.manual_seed(args.seed_val)
    torch.cuda.manual_seed_all(args.seed_val)
    

    
    # 그레디언트 초기화
    model.zero_grad()
    for epoch in range(0, args.epochs):
        # 훈련모드 
        model.train()
        
        # 로스와 정확도 초기화
        total_loss, total_accuracy = 0, 0
        print("-"*30)
        for step, batch in enumerate(train_dataloader):
            if step % 500 == 0 :
                print(f"Epoch : {epoch+1} in {args.epochs} / Step : {step}")

            # 배치 선정
            batch = tuple(index.to(device) for index in batch)
            ids, masks, labels, = batch

            # forward
            outputs = model(ids, token_type_ids=None, attention_mask=masks, labels=labels)
            
            # loss 도출
            loss = outputs.loss
            total_loss += loss.item()

            # 정확도 도출
            pred = [torch.argmax(logit).cpu().detach().item() for logit in outputs.logits]
            true = [label for label in labels.cpu().numpy()]
            accuracy = accuracy_score(true, pred)
            total_accuracy += accuracy
            
            # 그레디언트 연산
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            
            # 파라미터 업데이트
            optimizer.step()
            
            # 러닝레이트 최적화
            scheduler.step()
            
            # 그레디언트 초기화
            model.zero_grad()
            
        # epoch 당 loss 와 정확도 계산
        avg_loss = total_loss / len(train_dataloader)
        avg_accuracy = total_accuracy/len(train_dataloader)
        print(f" {epoch+1} Epoch Average train loss :  {avg_loss}")
        print(f" {epoch+1} Epoch Average train accuracy :  {avg_accuracy}")
        
        # test 수행
        acc = test(test_dataloader, model, device)
        
        # 모델 저장
        os.makedirs("results", exist_ok=True)
        #f = os.path.join("results", f'epoch_{epoch+1}_evalAcc_{acc*100:.0f}.pth')
        f = args.save_data + "\\" + "results_" + f'epoch_{epoch+1}_evalAcc_{acc*100:.0f}.pth'
        torch.save(model.state_dict(), f)
        print('Saved checkpoint:', f)

def run(args):
    ids, masks, labels = preprocess(args)
    train_ids, train_masks, train_labels, test_ids, test_masks, test_labels = train_test_data_split(ids, masks, labels)
    train_dataloader = build_dataloader(train_ids, train_masks, train_labels, args)
    test_dataloader = build_dataloader(test_ids, test_masks, test_labels, args)
    train(train_dataloader, test_dataloader, args)



In [6]:
load_path = r"C:\Users\Samsung\Desktop\all\Project\23_VRPTW\Classification\{}"
save_path = r"C:\Users\Samsung\Desktop\all\Project\23_VRPTW\Classification\model"


In [7]:
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("-raw_data", default=load_path.format("Data_Train.txt"))
    parser.add_argument("-save_data", default=save_path)
    
    parser.add_argument("-max_len", default=10, type=int)
    parser.add_argument("-batch_size", default=4, type=int)
    parser.add_argument("-num_labels", default=21, type=int)
    parser.add_argument("-epochs", default=4, type=int)
    parser.add_argument("-seed_val", default=4, type=int)

    #args = parser.parse_args()
    args,_ = parser.parse_known_args()
    run(args)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


no GPU available
------------------------------
Epoch : 1 in 4 / Step : 0
Epoch : 1 in 4 / Step : 500
Epoch : 1 in 4 / Step : 1000
 1 Epoch Average train loss :  2.0711011273898503
 1 Epoch Average train accuracy :  0.3874761222540592
test AVG accuracy :  0.57
Saved checkpoint: C:\Users\Samsung\Desktop\all\Project\23_VRPTW\Classification\model\results_epoch_1_evalAcc_57.pth
------------------------------
Epoch : 2 in 4 / Step : 0
Epoch : 2 in 4 / Step : 500
Epoch : 2 in 4 / Step : 1000
 2 Epoch Average train loss :  1.2685430964353603
 2 Epoch Average train accuracy :  0.6525191021967526
test AVG accuracy :  0.64
Saved checkpoint: C:\Users\Samsung\Desktop\all\Project\23_VRPTW\Classification\model\results_epoch_2_evalAcc_64.pth
------------------------------
Epoch : 3 in 4 / Step : 0
Epoch : 3 in 4 / Step : 500
Epoch : 3 in 4 / Step : 1000
 3 Epoch Average train loss :  0.878015460593247
 3 Epoch Average train accuracy :  0.7679083094555874
test AVG accuracy :  0.69
Saved checkpoint: C:

In [14]:
parser = argparse.ArgumentParser()

In [28]:
def load_data(args):
    temp = pd.read_csv(args, sep="\t",encoding='cp949')
    temp = temp
    document = temp.document.tolist()
    labels = temp.label.tolist()
    return document, labels

In [29]:
path_ = r"C:\Users\Samsung\Desktop\all\Project\23_VRPTW\Classification\Data_Train.txt"

In [30]:
doc,lab = load_data(path_)

In [32]:
def add_special_token(document):
    added = ["[CLS]" + str(sentence) + "[SEP]" for sentence in document]
    return added

In [34]:
def tokenization(document):
    tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased', 
            do_lower_case=False,
            )
    tokenized = [tokenizer.tokenize(sentence) for sentence in document]
    ids = [tokenizer.convert_tokens_to_ids(sentence) for sentence in tokenized]
    return ids

In [35]:
tokenization(doc)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

[[9734, 12310, 13890, 10533, 47407],
 [11474,
  14250,
  57908,
  10410,
  33824,
  26037,
  10550,
  148,
  11273,
  25853,
  10240,
  113,
  9318,
  118768,
  17342,
  17342,
  24891,
  92688,
  12692,
  88332,
  119188,
  114,
  122,
  78560,
  10174,
  113,
  25853,
  10240,
  114,
  9001,
  18778],
 [193, 15184, 12692, 28188, 122, 47407],
 [8865, 16985, 11261, 51533],
 [8887, 49919, 113, 10244, 17196, 114],
 [9049, 119308, 35506, 89108, 9420, 11882, 18392, 16323, 12605],
 [9056, 37388, 38851, 15001, 9056, 68055, 14801, 9435, 16605, 17730],
 [9091, 118667, 13764, 30134, 44026, 10240],
 [100],
 [100, 113, 20843, 10240, 120, 9355, 114],
 [9161,
  92688,
  31401,
  60479,
  12692,
  92564,
  80174,
  12605,
  119073,
  119367,
  88332,
  69168,
  10929,
  10240],
 [9272,
  48446,
  119351,
  17342,
  51533,
  12605,
  46150,
  14153,
  45725,
  28847,
  10929,
  10240],
 [9291, 92564, 62200],
 [9318, 16439, 16439, 89478],
 [9318, 119169, 61156, 119430, 10016, 21928],
 [9318,
  10739,


In [33]:
add_special_token(doc)

['[CLS]  찰기장 40kg        [SEP]',
 '[CLS] Vanilla Raspberry Cake KR 245g(바닐라라즈베리케익)1pack(245g) 냉동[SEP]',
 '[CLS] y트리오 1kg[SEP]',
 '[CLS] 건아로니아[SEP]',
 '[CLS] 계란(30구)[SEP]',
 '[CLS] 니콜하우스 생과일주스[SEP]',
 '[CLS] 다래월드 다목적 세정제[SEP]',
 '[CLS] 돌김자반80g [SEP]',
 '[CLS] 돼지갈비찜[SEP]',
 '[CLS] 딸기잼(225g/병)[SEP]',
 '[CLS] 람베르츠리블링스슈튀케250g[SEP]',
 '[CLS] 메디테라니아스파게티500g[SEP]',
 '[CLS] 몽블랑[SEP]',
 '[CLS] 바나나200 [SEP]',
 '[CLS] 바움쿠헨 흑당[SEP]',
 '[CLS] 바이탭 쿨 브리즈  핸드 선풍기[SEP]',
 '[CLS] 발사믹 드레싱(샐러디)[SEP]',
 '[CLS] 속편한우유 플러스 저지방 멸균 190ml[SEP]',
 '[CLS] 스파게티면 170g[SEP]',
 '[CLS] 육개장 사발면 농심 86g[SEP]',
 '[CLS] 음료류[SEP]',
 '[CLS] 이츠웰 떡뽂이(국) [SEP]',
 '[CLS] 일년내내 찰보리   [SEP]',
 '[CLS] 종가집 생생김치 3kg[SEP]',
 '[CLS] 참부오느 소불&amp;파타타빵[SEP]',
 '[CLS] 청정원 청정미역 150g[SEP]',
 '[CLS] 치즈떡볶이[SEP]',
 '[CLS] 포도 5kg[SEP]',
 '[CLS] 포테이토달라스(감자튀김) 2,27kg[SEP]',
 '[CLS] 프레시워터핸드세니타이저(100ml)[SEP]',
 '[CLS] 허니머스타드 드레싱(샐러디)[SEP]',
 '[CLS] 화이트  시크릿홀 울날 중형19입[SEP]',
 '[CLS](냉동)밀떡볶이[SEP]',
 '[CLS](농심)맥코믹바질10g[SEP]',
 '[CLS](떡보의하루)떡[SEP]',
 '[CLS

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


no GPU available
------------------------------
Epoch : 1 in 4 / Step : 0


In [47]:
args

AttributeError: 'tuple' object has no attribute 'raw_data'

# 참고

https://noanomal.tistory.com/entry/bert%EB%A5%BC-%ED%99%9C%EC%9A%A9%ED%95%9C-%EB%AC%B8%EC%9E%A5-%EB%B6%84%EB%A5%98bert-classification-BertForSequenceClassification