In [3]:
!pip install pytorch_pretrained_bert
!pip install transformers

from __future__ import absolute_import, division, print_function
import argparse
import os
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from tqdm import tqdm, trange
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE,cached_path
from model import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
from transformers import BertTokenizer, BertConfig
from pytorch_pretrained_bert.optimization import BertAdam
from utils import *

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [1]:
from google.colab import drive
drive.mount('/content/drive')

# Google Drive에서 파일을 복사
!cp /content/drive/MyDrive/your_folder/model.py .
!cp /content/drive/MyDrive/your_folder/utils.py .

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
cp: cannot stat '/content/drive/MyDrive/your_folder/model.py': No such file or directory
cp: cannot stat '/content/drive/MyDrive/your_folder/utils.py': No such file or directory


In [2]:
import sys
sys.path.append('/content/drive/MyDrive/')

!cp /content/drive/MyDrive/main.py .
!cp /content/drive/MyDrive/utils.py .
!cp /content/drive/MyDrive/model.py .
!ls

CM-BERT_output	drive  main.py	model.py  __pycache__  sample_data  utils.py


In [4]:
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'
!pip show kobert_tokenizer

Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-tw7jjsol/kobert-tokenizer_8b667324466543b3b18c3301ffff997e
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-tw7jjsol/kobert-tokenizer_8b667324466543b3b18c3301ffff997e
  Resolved https://github.com/SKTBrain/KoBERT.git to commit 47a69af87928fc24e20f571fe10c3cc9dd9af9a3
  Preparing metadata (setup.py) ... [?25l[?25hdone
Name: kobert-tokenizer
Version: 0.1
Summary: Korean BERT pre-trained cased (KoBERT) for HuggingFace 
Home-page: https://github.com/SKTBrain/KoBERT
Author: SeungHwan Jung
Author-email: digit82@gmail.com
License: Apache-2.0
Location: /usr/local/lib/python3.10/dist-packages
Requires: 
Required-by: 


In [5]:
from sklearn.metrics import f1_score, accuracy_score
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

os.environ["CUDA_VISIBLE_DEVICES"]="1"


In [6]:
def main(i):
    parser = argparse.ArgumentParser()

    parser.add_argument("--data_dir", default='data/', type=str,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default='kobert-base-v1/', type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", default='Multi', type=str,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default='CM-BERT_output/', type=str,
                        help="The output directory where the model predictions and checkpoints will be written.")

    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=100, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_test", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=24, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=24, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--test_batch_size", default=24, type=int,
                        help="Total batch size for test.")
    parser.add_argument("--learning_rate", default=2e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=11111,
                        help="Random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")

    args = parser.parse_args()

    processors = {
        "multi": PgProcessor,
    }
    num_labels_task = {
        "multi": 7,
    }

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()  # GPU의 수를 가져옵니다.
    logger.info("device: {} n_gpu: {}".format(device, n_gpu))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    seed_num = np.random.randint(1, 10000)
    random.seed(seed_num)
    np.random.seed(seed_num)
    torch.manual_seed(seed_num)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed_num)

    if not args.do_train and not args.do_test:
        raise ValueError("At least one of `do_train` or `do_test` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = KoBERTTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) ### KoBERT

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs


    cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format("-1"))

    model = BertForSequenceClassification.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels = num_labels)

    for name, param in model.named_parameters():
        param.requires_grad = False
        if "encoder.layer.0" in name or "encoder.layer.1" in name:
            param.requires_grad = True
        if "encoder.layer.2" in name or "encoder.layer.3" in name :
            param.requires_grad = True
        if "encoder.layer.4" in name or  "encoder.layer.5" in name:
            param.requires_grad = True
        if "encoder.layer.6" in name or "encoder.layer.7" in name:
            param.requires_grad = True
        if "encoder.layer.8" in name or "encoder.layer.9" in name :
            param.requires_grad = True
        if "encoder.layer.10" in name or  "encoder.layer.11" in name:
            param.requires_grad = True
        if "BertFinetun" in name or "pooler" in name:
            param.requires_grad = True

    model.to(device)


In [8]:
from transformers import BertModel, BertTokenizer

model = BertModel.from_pretrained('monologg/kobert')
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')


NameError: name 'args' is not defined

In [7]:
import pandas as pd

def load_and_process_data(file_path, tokenizer, max_seq_length):
    data = pd.read_csv(file_path, delimiter='\t')
    input_ids = []
    attention_masks = []
    labels = []

    for _, row in data.iterrows():
        encoded_dict = tokenizer.encode_plus(
            row['text'],  # 문장
            add_special_tokens=True,  # [CLS]와 [SEP] 추가
            max_length=max_seq_length,  # 최대 길이
            pad_to_max_length=True,
            return_attention_mask=True,  # 어텐션 마스크 반환
            return_tensors='pt',  # PyTorch 텐서 반환
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
        labels.append(row['label'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    return TensorDataset(input_ids, attention_masks, labels)


In [8]:
import argparse
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm import tqdm, trange
from transformers import AdamW, BertForSequenceClassification, BertTokenizer, BertConfig
from sklearn.metrics import f1_score, accuracy_score
import os
import random

class PgProcessor:
    def get_train_examples(self, data_dir):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_test_examples(self, data_dir):
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        return ["disgust", "sad", "anger", "fear", "neutral"]  # 실제 레이블 목록으로 수정해야 합니다.

    def _create_examples(self, lines, set_type):
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = line[0]
            label = line[1]
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples

    def _read_tsv(self, input_file, quotechar=None):
        with open(input_file, "r") as f:
            return [line.strip().split("\t") for line in f]

class InputExample:
    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

def load_and_process_data(file_path, tokenizer, max_seq_length, chunk_size=1000):
    processor = PgProcessor()
    examples = processor.get_train_examples(file_path) if 'train' in file_path else processor.get_test_examples(file_path)

    for i in range(0, len(examples), chunk_size):
        chunk_examples = examples[i:i+chunk_size]
        features = convert_examples_to_features(chunk_examples, processor.get_labels(), max_seq_length, tokenizer)

        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

        yield TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

def main(i):
    parser = argparse.ArgumentParser()

    parser.add_argument("--data_dir", default='data/', type=str,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default='monologg/kobert', type=str,
                        help="Bert pre-trained model selected in the list.")
    parser.add_argument("--task_name", default='Multi', type=str,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default='CM-BERT_output/', type=str,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=100, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization.")
    parser.add_argument("--do_train", action='store_true', default=True,
                        help="Whether to run training.")
    parser.add_argument("--do_test", action='store_true', default=True,
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=24, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=24, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--test_batch_size", default=24, type=int,
                        help="Total batch size for test.")
    parser.add_argument("--learning_rate", default=2e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=11111,
                        help="Random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")

    args = parser.parse_args(args=[])

    processors = {
        "multi": PgProcessor,
    }
    num_labels_task = {
        "multi": 7,
    }

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device: {} n_gpu: {}".format(device, n_gpu))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    seed_num = np.random.randint(1, 10000)
    random.seed(seed_num)
    np.random.seed(seed_num)
    torch.manual_seed(seed_num)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed_num)

    if not args.do_train and not args.do_test:
        raise ValueError("At least one of `do_train` or `do_test` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    model = BertForSequenceClassification.from_pretrained(
        args.bert_model,
        cache_dir=args.cache_dir if args.cache_dir else None,
        num_labels=num_labels
    )

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    param_optimizer = list(model.named_parameters())

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    new_decay = ['BertFine']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and not any(np in n for np in new_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        {'params':[p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(np in n for np in new_decay)],'lr':0.01}
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    if args.do_train:
        for train_dataset in load_and_process_data(os.path.join(args.data_dir, "/content/drive/MyDrive/train.tsv"), tokenizer, args.max_seq_length):
            train_sampler = RandomSampler(train_dataset)
            train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

            num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * int(args.num_train_epochs)

            print("***** Running training *****")
            print("  Num examples = %d" % len(train_dataset))
            print("  Batch size = %d" % args.train_batch_size)
            print("  Num steps = %d" % num_train_optimization_steps)

            for _ in trange(int(args.num_train_epochs), desc="Epoch"):
                model.train()
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, labels = batch
                    outputs = model(input_ids, attention_mask=input_mask, labels=labels)
                    loss = outputs[0]

                    if n_gpu > 1:
                        loss = loss.mean()
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    loss.backward()

                    tr_loss += loss.item()
                    nb_tr_examples += input_ids.size(0)
                    nb_tr_steps += 1
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1

    if args.do_test:
        for test_dataset in load_and_process_data(os.path.join(args.data_dir, "/content/drive/MyDrive/test.tsv"), tokenizer, args.max_seq_length):
            test_sampler = SequentialSampler(test_dataset)
            test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.test_batch_size)

            print("***** Running test *****")
            print("  Num examples = %d" % len(test_dataset))
            print("  Batch size = %d" % args.test_batch_size)

            model.eval()

            predict_list = []
            truth_list = []

            with torch.no_grad():
                for batch in tqdm(test_dataloader, desc="Evaluating"):
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, labels = batch

                    outputs = model(input_ids, attention_mask=input_mask)
                    logits = outputs[0]

                    logits = torch.argmax(logits, dim=1)
                    logits = logits.detach().cpu().numpy()
                    labels = labels.to('cpu').numpy()

                    predict_list.extend(logits)
                    truth_list.extend(labels)

            predict_list = np.array(predict_list)
            truth_list = np.array(truth_list)

            np.save('cmbert_pred.npy', predict_list)
            np.save('cmbert_truth.npy', truth_list)

            f_score = f1_score(truth_list, predict_list, average='weighted')
            acc = accuracy_score(predict_list, truth_list)

            results = {'accuracy': acc, 'F1 score': f_score}

            print(results)

if __name__ == "__main__":
    main(1)


device: cuda n_gpu: 1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NotADirectoryError: [Errno 20] Not a directory: '/content/drive/MyDrive/train.tsv/train.tsv'

In [30]:
# 현재 작업 디렉토리 확인
import os
print(os.getcwd())

# 해당 디렉토리에 model.py가 있는지 확인
print(os.listdir())


/content
['.config', '__pycache__', 'model.py', 'utils.py', 'main.py', 'drive', 'CM-BERT_output', 'sample_data']
