In [1]:
import os
import random
import time
import datetime
import torch
import argparse

import pandas as pd
import numpy as np

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, BertConfig

from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
torch.cuda.is_available()

True

In [9]:
# data Loading 
### dataframe의 col 이름이 document, label 이어야 한다. 
def load_data(args):
    temp = pd.read_csv(args.raw_data, sep="\t",encoding='cp949')
    temp = temp
    document = temp.document.tolist()
    labels = temp.label.tolist()
    return document, labels
# Marking 앞단 끝단에 
def add_special_token(document):
    added = ["[CLS]" + str(sentence) + "[SEP]" for sentence in document]
    return added
# Tokenization
def tokenization(document, mode="huggingface"):
    if mode == "huggingface":
        tokenizer = BertTokenizer.from_pretrained(
                'bert-base-multilingual-cased', 
                do_lower_case=False,
                )
        tokenized = [tokenizer.tokenize(sentence) for sentence in document]
        ids = [tokenizer.convert_tokens_to_ids(sentence) for sentence in tokenized]
        return ids
# Padding = 길이 일치
def padding(ids, args):
    ids = pad_sequences(ids, maxlen=args.max_len, dtype="long", truncating='post', padding='post')
    return ids

def attention_mask(ids):
    masks = []
    for id in ids:
        mask = [float(i>0) for i in id]
        masks.append(mask)
    return masks

def preprocess(args):
    document, labels = load_data(args)
    document = add_special_token(document)
    ids = tokenization(document)
    ids = padding(ids, args)
    masks = attention_mask(ids)
    del document
    return ids, masks, labels

def train_test_data_split(ids, masks, labels):
    train_ids, test_ids, train_labels, test_labels = train_test_split(ids, labels
                                                                      , random_state=42, test_size=0.1)
    train_masks, test_masks, _, _ = train_test_split(masks, ids, random_state=42, test_size=0.1)
    return train_ids, train_masks, train_labels, test_ids, test_masks, test_labels

def build_dataloader(ids, masks, label, args):
    dataloader = TensorDataset(torch.tensor(ids), torch.tensor(masks), torch.tensor(label))
    dataloader = DataLoader(dataloader, sampler=RandomSampler(dataloader), batch_size=args.batch_size)
    return dataloader
##############################################################################

def build_model(args):
    model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=args.num_labels)
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"{torch.cuda.get_device_name(0)} available")
        model = model.cuda()
    else:
        device = torch.device("cpu")
        print("no GPU available")
        model = model
    return model, device

def test(test_dataloader, model, device):
    model.eval()
    total_accuracy = 0
    for batch in test_dataloader:
        batch = tuple(index.to(device) for index in batch)
        ids, masks, labels = batch
        with torch.no_grad():
            outputs = model(ids, token_type_ids=None, attention_mask=masks)
        pred = [torch.argmax(logit).cpu().detach().item() for logit in outputs.logits]
        true = [label for label in labels.cpu().numpy()]
        accuracy = accuracy_score(true, pred)
        total_accuracy += accuracy
    avg_accuracy = total_accuracy/len(test_dataloader)
    print(f"test AVG accuracy : {avg_accuracy: .4f}")
    return avg_accuracy

def train(train_dataloader, test_dataloader, args):
    model, device = build_model(args)
    
    # 옵티마이저 정의 
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    
    # learning rate decay
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader)*args.epochs)
    
    # 시드 고정
    random.seed(args.seed_val)
    np.random.seed(args.seed_val)
    torch.manual_seed(args.seed_val)
    torch.cuda.manual_seed_all(args.seed_val)
    

    
    # 그레디언트 초기화
    model.zero_grad()
    for epoch in range(0, args.epochs):
        # 훈련모드 
        model.train()
        
        # 로스와 정확도 초기화
        total_loss, total_accuracy = 0, 0
        print("-"*30)
        for step, batch in enumerate(train_dataloader):
            if step % 500 == 0 :
                print(f"Epoch : {epoch+1} in {args.epochs} / Step : {step}")

            # 배치 선정
            batch = tuple(index.to(device) for index in batch)
            ids, masks, labels, = batch

            # forward
            outputs = model(ids, token_type_ids=None, attention_mask=masks, labels=labels)
            
            # loss 도출
            loss = outputs.loss
            total_loss += loss.item()

            # 정확도 도출
            pred = [torch.argmax(logit).cpu().detach().item() for logit in outputs.logits]
            true = [label for label in labels.cpu().numpy()]
            accuracy = accuracy_score(true, pred)
            total_accuracy += accuracy
            
            # 그레디언트 연산
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            
            # 파라미터 업데이트
            optimizer.step()
            
            # 러닝레이트 최적화
            scheduler.step()
            
            # 그레디언트 초기화
            model.zero_grad()
            
        # epoch 당 loss 와 정확도 계산
        avg_loss = total_loss / len(train_dataloader)
        avg_accuracy = total_accuracy/len(train_dataloader)
        print(f" {epoch+1} Epoch Average train loss :  {avg_loss}")
        print(f" {epoch+1} Epoch Average train accuracy :  {avg_accuracy}")
        
        # test 수행
        acc = test(test_dataloader, model, device)
        
        # 모델 저장
        os.makedirs("results", exist_ok=True)
        #f = os.path.join("results", f'epoch_{epoch+1}_evalAcc_{acc*100:.0f}.pth')
        f = args.save_data + "\\" + "results_" + f'epoch_{epoch+1}_evalAcc_{acc*100:.0f}.pth'
        if epoch % 5 ==0:
            torch.save(model.state_dict(), f)
            print('Saved checkpoint:', f)

def run(args):
    ids, masks, labels = preprocess(args)
    train_ids, train_masks, train_labels, test_ids, test_masks, test_labels = train_test_data_split(ids, masks, labels)
    train_dataloader = build_dataloader(train_ids, train_masks, train_labels, args)
    test_dataloader = build_dataloader(test_ids, test_masks, test_labels, args)
    train(train_dataloader, test_dataloader, args)

In [10]:
load_path = r"C:\Users\USER\Desktop\All\Project\23_VRP\Data\경기\version_2\{}"
save_path = r"C:\Users\USER\Desktop\All\Project\23_VRP\Classification\model"

In [11]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [12]:
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("-raw_data", default=load_path.format("dataset_1.txt"))
    parser.add_argument("-save_data", default=save_path)
    
    parser.add_argument("-max_len", default=32, type=int)
    parser.add_argument("-batch_size", default=128, type=int)
    parser.add_argument("-num_labels", default=21, type=int)
    parser.add_argument("-epochs", default=600, type=int)
    parser.add_argument("-seed_val", default=42, type=int)

    #args = parser.parse_args()
    args,_ = parser.parse_known_args()
    run(args)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NVIDIA GeForce RTX 4070 available




------------------------------
Epoch : 1 in 600 / Step : 0
 1 Epoch Average train loss :  2.548988426053846
 1 Epoch Average train accuracy :  0.24731749412455936
test AVG accuracy :  0.30
Saved checkpoint: C:\Users\USER\Desktop\All\Project\23_VRP\Classification\model\results_epoch_1_evalAcc_30.pth
------------------------------
Epoch : 2 in 600 / Step : 0
 2 Epoch Average train loss :  2.113664443428452
 2 Epoch Average train accuracy :  0.37591069330199767
test AVG accuracy :  0.48
------------------------------
Epoch : 3 in 600 / Step : 0
 3 Epoch Average train loss :  1.691626890285595
 3 Epoch Average train accuracy :  0.5499045240893067
test AVG accuracy :  0.59
------------------------------
Epoch : 4 in 600 / Step : 0
 4 Epoch Average train loss :  1.3641817247545398
 4 Epoch Average train accuracy :  0.6401164071680376
test AVG accuracy :  0.64
------------------------------
Epoch : 5 in 600 / Step : 0
 5 Epoch Average train loss :  1.1288203129897247
 5 Epoch Average train ac


KeyboardInterrupt



In [13]:
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("-raw_data", default=load_path.format("dataset_2.txt"))
    parser.add_argument("-save_data", default=save_path)
    
    parser.add_argument("-max_len", default=20, type=int)
    parser.add_argument("-batch_size", default=128, type=int)
    parser.add_argument("-num_labels", default=21, type=int)
    parser.add_argument("-epochs", default=600, type=int)
    parser.add_argument("-seed_val", default=42, type=int)

    #args = parser.parse_args()
    args,_ = parser.parse_known_args()
    run(args)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NVIDIA GeForce RTX 4070 available




------------------------------
Epoch : 1 in 600 / Step : 0
 1 Epoch Average train loss :  2.5276468380077466
 1 Epoch Average train accuracy :  0.2514009253819036
test AVG accuracy :  0.33
Saved checkpoint: C:\Users\USER\Desktop\All\Project\23_VRP\Classification\model\results_epoch_1_evalAcc_33.pth
------------------------------
Epoch : 2 in 600 / Step : 0
 2 Epoch Average train loss :  2.110715878976358
 2 Epoch Average train accuracy :  0.3645049941245594
test AVG accuracy :  0.45
------------------------------
Epoch : 3 in 600 / Step : 0
 3 Epoch Average train loss :  1.748921023832785
 3 Epoch Average train accuracy :  0.5124118683901292
test AVG accuracy :  0.57
------------------------------
Epoch : 4 in 600 / Step : 0
 4 Epoch Average train loss :  1.444826093879906
 4 Epoch Average train accuracy :  0.6138384988249118
test AVG accuracy :  0.60
------------------------------
Epoch : 5 in 600 / Step : 0
 5 Epoch Average train loss :  1.1904516558389406
 5 Epoch Average train accu

KeyboardInterrupt: 

In [7]:
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("-raw_data", default=load_path.format("dataset_3.txt"))
    parser.add_argument("-save_data", default=save_path)
    
    parser.add_argument("-max_len", default=20, type=int)
    parser.add_argument("-batch_size", default=128, type=int)
    parser.add_argument("-num_labels", default=21, type=int)
    parser.add_argument("-epochs", default=100, type=int)
    parser.add_argument("-seed_val", default=42, type=int)

    #args = parser.parse_args()
    args,_ = parser.parse_known_args()
    run(args)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NVIDIA GeForce RTX 4070 available




------------------------------
Epoch : 1 in 100 / Step : 0
 1 Epoch Average train loss :  2.5777274853474386
 1 Epoch Average train accuracy :  0.24330566980023505
test AVG accuracy :  0.35
Saved checkpoint: C:\Users\USER\Desktop\All\Project\23_VRP\Classification\model\results_epoch_1_evalAcc_35.pth
------------------------------
Epoch : 2 in 100 / Step : 0
 2 Epoch Average train loss :  2.141732441412436
 2 Epoch Average train accuracy :  0.3737514688601645
test AVG accuracy :  0.47
------------------------------
Epoch : 3 in 100 / Step : 0
 3 Epoch Average train loss :  1.7575612648113355
 3 Epoch Average train accuracy :  0.5176905846063454
test AVG accuracy :  0.60
------------------------------
Epoch : 4 in 100 / Step : 0
 4 Epoch Average train loss :  1.4352074056058317
 4 Epoch Average train accuracy :  0.6341087690951821
test AVG accuracy :  0.65
------------------------------
Epoch : 5 in 100 / Step : 0
 5 Epoch Average train loss :  1.1820481274579022
 5 Epoch Average train a

In [12]:
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("-raw_data", default=load_path.format("dataset_4.txt"))
    parser.add_argument("-save_data", default=save_path)
    
    parser.add_argument("-max_len", default=20, type=int)
    parser.add_argument("-batch_size", default=128, type=int)
    parser.add_argument("-num_labels", default=21, type=int)
    parser.add_argument("-epochs", default=10, type=int)
    parser.add_argument("-seed_val", default=42, type=int)

    #args = parser.parse_args()
    args,_ = parser.parse_known_args()
    run(args)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NVIDIA GeForce RTX 4070 available




------------------------------
Epoch : 1 in 10 / Step : 0
Epoch : 1 in 10 / Step : 500
 1 Epoch Average train loss :  0.9538592897344648
 1 Epoch Average train accuracy :  0.7443796309916394
test AVG accuracy :  0.9455
Saved checkpoint: C:\Users\USER\Desktop\All\Project\23_VRP\Classification\model\results_epoch_1_evalAcc_95.pth
------------------------------
Epoch : 2 in 10 / Step : 0
Epoch : 2 in 10 / Step : 500
 2 Epoch Average train loss :  0.19249380348825698
 2 Epoch Average train accuracy :  0.9494007505370753
test AVG accuracy :  0.9716
------------------------------
Epoch : 3 in 10 / Step : 0
Epoch : 3 in 10 / Step : 500
 3 Epoch Average train loss :  0.10919437655159116
 3 Epoch Average train accuracy :  0.9681964618026574
test AVG accuracy :  0.9781
------------------------------
Epoch : 4 in 10 / Step : 0
Epoch : 4 in 10 / Step : 500
 4 Epoch Average train loss :  0.0866916540438803
 4 Epoch Average train accuracy :  0.973581753465333
test AVG accuracy :  0.9773
------------