## 1. Load the NSMC Dataset

In [1]:
import numpy as np
import pandas as pd
from Korpora import Korpora

corpus = Korpora.load("nsmc")
df = pd.DataFrame(corpus.test).sample(20000, random_state=42)

# train : val : test 를 6 : 2 : 2 로 분리
train_df, val_df, test_df = np.split(
    df.sample(frac=1, random_state=42), [int(0.6 * len(df)), int(0.8 * len(df))]
)

# 출력 확인
print(train_df.head(5).to_markdown())
print(f"train: {len(train_df)}, val: {len(val_df)}")
print(f"test: {len(test_df)}")


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at /home/ho/Korpora/nsmc/ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at /home/ho/Korpora/n

## 2. Tokenizer

In [11]:
from transformers import AutoTokenizer

# tokenizer 설정
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
print(f"n_vocab: {tokenizer.vocab_size}")

# 출력 확인
example_sent = "Hello, Pytorch!"
encoded_sent = tokenizer(example_sent, return_tensors='pt')["input_ids"]
print(f"encoded: {encoded_sent}")

n_vocab: 119547
encoded: tensor([[  101, 31178,   117,   153, 20246, 10667, 10269,   106,   102]])


## 3. Supervised Learning

In [None]:
EPOCHS = 5
BATCH_SIZE = 32
LR = 5e-5

### 3-1. Dataset

In [28]:
import torch
from torch.utils.data import Dataset

class NSMCDataset(Dataset):
    def __init__(self, tokenizer, data):
        self.encoded = tokenizer(text=data['text'].tolist(),
                                 truncation=True,
                                 padding="longest",
                                 return_tensors='pt')
        self.labels = data['label'].tolist()
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encoded.items()}
        item['labels'] = torch.tensor([self.labels[idx]], dtype=torch.long)
        
        return item
        
# 출력 확인
train_dataset, val_dataset = NSMCDataset(tokenizer, train_df), NSMCDataset(tokenizer, val_df)
for key, val in train_dataset[0].items():
    print(f"{key}.shape: {val.shape}")

input_ids.shape: torch.Size([124])
token_type_ids.shape: torch.Size([124])
attention_mask.shape: torch.Size([124])
labels.shape: torch.Size([1])


  item = {key: torch.tensor(val[idx]) for key, val in self.encoded.items()}


### 3-2. Define the Network

In [29]:
from transformers import BertForSequenceClassification

net = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="bert-base-multilingual-cased",
    num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 3-3. Trainer

In [30]:
from transformers import TrainingArguments, Trainer

train_args = TrainingArguments(
    output_dir = "./pth",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = LR,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size = BATCH_SIZE,
    num_train_epochs = EPOCHS,
)

trainer = Trainer(
    model = net,
    args = train_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
trainer.train()

### 3-4. Test

In [31]:
test_dataset = NSMCDataset(tokenizer, test_df)
net = BertForSequenceClassification.from_pretrained("")

trainer = Trainer(
    model=net,
    args=TrainingArguments(
        output_dir='./pth',
        per_device_eval_batch_size=BATCH_SIZE
    )
)

OSError: Incorrect path_or_model_id: ''. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [None]:
test_results = trainer.evaluate(test_dataset)
print(test_results)

## 4. Self-Supervised Learning

In [17]:
EPOCHS = 5
BATCH_SIZE = 32
LR = 5e-5

### 4-1. NSP를 위해 문장 Pair 생성

In [6]:
import random

def create_sent_pairs(df):
    sent_a = []
    sent_b = []
    labels = []

    for i in range(len(df) - 1):
        sent_a.append(df.iloc[i]['text'])
        sent_b.append(df.iloc[i + 1]['text'])
        labels.append(1)  # 연속 문장
        sent_a.append(df.iloc[i]['text'])
        sent_b.append(df.iloc[random.randint(0, len(df)-1)]['text'])
        labels.append(0)  # 무작위 문장

    return sent_a, sent_b, labels

sent_a, sent_b, labels = create_sent_pairs(df)
print(f"{sent_a[0]} / {sent_b[0]}")
print(f"label: {labels[0]}")

모든 편견을 날려 버리는 가슴 따뜻한 영화. 로버트 드 니로, 필립 세이모어 호프만 영원하라. / 무한 리메이크의 소재. 감독의 역량은 항상 그 자리에...
label: 1


### 4-2. Dataset & DataCollator

In [16]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, tokenizer, sent_a, sent_b, labels):
        self.encoded = tokenizer(sent_a, sent_b, truncation=True, padding=True, max_length=512)
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encoded.items()}
        item['next_sentence_label'] = torch.tensor([self.labels[idx]], dtype=torch.long)
        
        return item

dataset = CustomDataset(tokenizer, sent_a, sent_b, labels)
for key, val in dataset[0].items():
    print(f"{key}.shape: {val.shape}")


input_ids.shape: torch.Size([215])
token_type_ids.shape: torch.Size([215])
attention_mask.shape: torch.Size([215])
next_sentence_label.shape: torch.Size([1])


In [22]:
def data_collator(batch):
    input_ids = torch.stack([item['input_ids'].squeeze(0) for item in batch])
    attention_mask = torch.stack([item['attention_mask'].squeeze(0) for item in batch])
    labels = torch.stack([item['input_ids'].squeeze(0) for item in batch])  # MLM을 위해 input_ids를 레이블로 사용
    next_sentence_label = torch.stack([item['next_sentence_label'].squeeze(0) for item in batch])

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
        'next_sentence_label': next_sentence_label
    }

### 4-3 Define the Network

In [25]:
from transformers import BertForPreTraining

net = BertForPreTraining.from_pretrained("bert-base-multilingual-cased")

### 4-4. Trainer

In [26]:
from transformers import TrainingArguments, Trainer

train_args = TrainingArguments(
    output_dir = "./pth/",
    num_train_epochs = EPOCHS,
    per_device_train_batch_size = BATCH_SIZE,
    logging_dir = "./result/logs"
)

trainer = Trainer(
    model = net,
    args = train_args,
    train_dataset = dataset,
    data_collator = data_collator
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [27]:
trainer.train()

KeyboardInterrupt: 