In [12]:
args = {
    'train_data_path': './ratings_train.txt',
    'val_data_path': './ratings_test.txt',
    'save_path': './model3',
    'max_epochs': 1,
    'model_path': 'beomi/KcELECTRA-base',
    'batch_size': 32,
    'learning_rate': 5e-5,
    'warmup_ratio': 0.0,
    'max_seq_len': 512
}

In [13]:
import pandas as pd
import torch

from torch.utils.data import Dataset

class NSMCDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length):
        df = pd.read_csv(csv_file, sep='\t')
        # NaN 값 제거
        df = df.dropna(axis=0)
        # 중복 제거
        df.drop_duplicates(subset=['document'], inplace=True)
        self.input_ids = tokenizer.batch_encode_plus(
            df['document'].to_list(),
            padding='max_length',
            max_length=max_length,
            return_tensors='pt',
            return_token_type_ids=False,
            return_attention_mask=False,
            truncation=True,
        )['input_ids']
        self.labels = torch.LongTensor(df['label'])

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.labels[idx]

In [None]:
conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch

In [14]:
from transformers import ElectraForSequenceClassification, ElectraTokenizerFast

model = ElectraForSequenceClassification.from_pretrained(args['model_path'])
tokenizer = ElectraTokenizerFast.from_pretrained(args['model_path'])

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.bias', 'classifier.ou

In [15]:
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm

def train(model, train_dataloader, args):
    model.train()
    model.to('cuda')
    global_total_step = len(train_dataloader) * args['max_epochs']
    global_step = 0
    optimizer = AdamW(model.parameters(), lr=args['learning_rate'], weight_decay=0.0)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=global_total_step)
    with tqdm(total=global_total_step, unit='step') as t:
        total = 0
        total_loss = 0
        total_correct = 0
        for epoch in range(args['max_epochs']):
            for batch in train_dataloader:
                global_step += 1
                b_input_ids = batch[0].to('cuda', non_blocking=True)
                b_labels = batch[1].to('cuda', non_blocking=True)
                model.zero_grad(set_to_none=True)
                outputs = model(
                    input_ids=b_input_ids,
                    labels=b_labels
                )
                loss, logits = outputs.loss, outputs.logits

                loss.backward()
                optimizer.step()
                scheduler.step()

                preds = logits.detach().argmax(dim=-1).cpu().numpy()
                out_label_ids = b_labels.detach().cpu().numpy()
                total_correct += (preds == out_label_ids).sum()

                batch_loss = loss.item() * len(b_input_ids)

                total += len(b_input_ids)
                total_loss += batch_loss

                t.set_postfix(loss='{:.6f}'.format(batch_loss),
                              accuracy='{:.2f}'.format(total_correct / total * 100))
                t.update(1)
                del b_input_ids
                del outputs
                del loss

In [16]:
from torch.utils.data import DataLoader

train_data_set = NSMCDataset(args['train_data_path'], tokenizer, args['max_seq_len'])
train_data_loader = DataLoader(
    dataset=train_data_set,
    batch_size=args['batch_size'],
    pin_memory=True,
    shuffle=True,
    )

MemoryError: 

In [None]:
train(model, train_data_loader, args)

In [10]:
model.save_pretrained(args['save_path'])

In [23]:
# 평점 10
pos_text = '이방원을 다룬 드라마중 최고였다고 자부함. 진짜 이방원을 보여준 듯이 연기와 인물묘사나 주변상황이 재밌었고 스토리도 진부하지 않았음. 다시 이런드라마를 볼수 있을지~ 진짜 이런 드라마하나 또 나왔음 함.'
# 평점 0
neg_text = '핵노잼 후기보고 낙였네 방금보고왔는데 개실망 재미없어요'

In [24]:
pos_input_vector = tokenizer.encode(pos_text, return_tensors='pt').to('cuda')
pos_pred = model(input_ids=pos_input_vector, labels=None).logits.argmax(dim=-1).tolist()
print(f'{pos_text} : {pos_pred[0]}')

neg_input_vector = tokenizer.encode(neg_text, return_tensors='pt').to('cuda')
neg_pred = model(input_ids=neg_input_vector, labels=None).logits.argmax(dim=-1).tolist()
print(f'{neg_text} : {neg_pred[0]}')

이방원을 다룬 드라마중 최고였다고 자부함. 진짜 이방원을 보여준 듯이 연기와 인물묘사나 주변상황이 재밌었고 스토리도 진부하지 않았음. 다시 이런드라마를 볼수 있을지~ 진짜 이런 드라마하나 또 나왔음 함. : 1
핵노잼 후기보고 낙였네 방금보고왔는데 개실망 재미없어요 : 0


In [15]:
conda install -c conda-forge transformers

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: D:\sunho9889\anaconda

  added / updated specs:
    - transformers


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.12.0               |   py39hcbf5309_0         1.0 MB  conda-forge
    huggingface_hub-0.6.0      |     pyhd8ed1ab_0          64 KB  conda-forge
    python_abi-3.9             |           2_cp39           4 KB  conda-forge
    sacremoses-0.0.53          |     pyhd8ed1ab_0         427 KB  conda-forge
    tokenizers-0.12.1          |   py39h8c9392a_0         3.0 MB  conda-forge
    transformers-4.18.0        |   py39haa95532_0         2.9 MB
    ------------------------------------------------------------
                                           Total:         7.4 MB

The following NEW packages will be INSTALLED

In [36]:
input_comment = '연기 정말 멋졌어요!! 어쩐지~ 뭘 믿고 입안에 저 공구를 넣게 해줬나 생각했는데 의사가 한거였군요 ㅋㅋㅋㅋ'
input_vector = tokenizer.encode(input_comment, return_tensors='pt').to('cuda')
pred = model(input_ids=input_vector,labels=None).logits.argmax(dim=-1).tolist()
print(f'{input_comment} : {pred[0]}')

연기 정말 멋졌어요!! 어쩐지~ 뭘 믿고 입안에 저 공구를 넣게 해줬나 생각했는데 의사가 한거였군요 ㅋㅋㅋㅋ : 1


In [37]:
input_comment = '제니가 없었다면 지금시대 여성들은 멀 입고 다녓을까 궁금함'
input_vector = tokenizer.encode(input_comment, return_tensors='pt').to('cuda')
pred = model(input_ids=input_vector,labels=None).logits.argmax(dim=-1).tolist()
print(f'{input_comment} : {pred[0]}')

제니가 없었다면 지금시대 여성들은 멀 입고 다녓을까 궁금함 : 1


In [38]:
input_comment = '왜 저러고 사냐 보는내가 다 쪽팔리네 ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ 현웃터진다'
input_vector = tokenizer.encode(input_comment, return_tensors='pt').to('cuda')
pred = model(input_ids=input_vector,labels=None).logits.argmax(dim=-1).tolist()
print(f'{input_comment} : {pred[0]}')

왜 저러고 사냐 보는내가 다 쪽팔리네 ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ 현웃터진다 : 0


In [39]:
input_comment = '제니짝퉁ㅋㅋ 약간 치즈인더트랩보느거같음'
input_vector = tokenizer.encode(input_comment, return_tensors='pt').to('cuda')
pred = model(input_ids=input_vector,labels=None).logits.argmax(dim=-1).tolist()
print(f'{input_comment} : {pred[0]}')

제니짝퉁ㅋㅋ 약간 치즈인더트랩보느거같음 : 0


In [40]:
input_comment = '3:53 그녀는 제니처럼되기 위해 너무 열심히 노력하고 있습니다 ㅋㅋㅋㅋㅋ'
input_vector = tokenizer.encode(input_comment, return_tensors='pt').to('cuda')
pred = model(input_ids=input_vector,labels=None).logits.argmax(dim=-1).tolist()
print(f'{input_comment} : {pred[0]}')

3:53 그녀는 제니처럼되기 위해 너무 열심히 노력하고 있습니다 ㅋㅋㅋㅋㅋ : 1


In [41]:
input_comment = '치료하는게 너무 리얼하다 싶었는데 진짜 치과의사님 손이었다니ㅋㅋㅋ'
input_vector = tokenizer.encode(input_comment, return_tensors='pt').to('cuda')
pred = model(input_ids=input_vector,labels=None).logits.argmax(dim=-1).tolist()
print(f'{input_comment} : {pred[0]}')

치료하는게 너무 리얼하다 싶었는데 진짜 치과의사님 손이었다니ㅋㅋㅋ : 1


In [42]:
input_comment = '하나만 하세요 얼굴까지 잘생기셨우...'
input_vector = tokenizer.encode(input_comment, return_tensors='pt').to('cuda')
pred = model(input_ids=input_vector,labels=None).logits.argmax(dim=-1).tolist()
print(f'{input_comment} : {pred[0]}')

하나만 하세요 얼굴까지 잘생기셨우... : 0
