In [2]:
import numpy as np
import pandas as pd
from Korpora import Korpora


In [41]:
corpus = Korpora.load("nsmc")
df = pd.DataFrame(corpus.test).sample(10000, random_state=17)
train, valid, test = np.split(
    df.sample(frac=1, random_state=17), [int(0.6 * len(df)), int(0.8 * len(df))]

)


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at C:\Users\hg_91\Korpora\nsmc\ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at C:\Users\hg_

In [40]:
print(train.head(5).to_markdown())
print(f'Training Data Size : {len(train)}')
print(f'Validation Data Size : {len(valid)}')
print(f'Testing Data Size : {len(test)}')

AttributeError: 'function' object has no attribute 'head'

In [42]:
print(train.head(5))

                                                    text  label
33553  모든 편견을 날려 버리는 가슴 따뜻한 영화. 로버트 드 니로, 필립 세이모어 호프만...      1
18101                     처음 이 액션을 보면서 놀라움에 입이 떡하니 벌어졌다.      1
25176     배우가 연기를 못해서 어색한게 아니라 대사가 어색해서 이상한 거다. 무슨 외화 느낌      0
39100                                           재밋다 꼭 보셍      1
25773                   영화는 역시 미국 헐리우드가 최고라는것만 다시금 깨달았다.      0


In [43]:
import torch
import torch.nn as nn
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler


device = "cuda" if torch.cuda.is_available() else "cpu"

def make_dataset(data, tokenizer, device):
    tokenized = tokenizer(
        text = data.text.tolist(),
        padding="longest",
        truncation=True,
        return_tensors="pt"
    )
    input_ids = tokenized["input_ids"].to(device)
    attention_mask = tokenized["attention_mask"].to(device)
    labels = torch.LongTensor(data.label.values, dtype=torch.long).to(device)
    return TensorDataset(input_ids, attention_mask, labels)

def get_dataloader(dataset, sampler, batch_size):
    data_sampler = sampler(dataset)
    dataloader = DataLoader(dataset, sampler=data_sampler, batch_size=batch_size)
    return dataloader

In [44]:
class SentenceClassify(nn.Module):
    def __init__(self,
                 n_vocab,
                 hidden_dim,
                 embedding_dim,
                 n_layers,
                 dropout=0.5,
                 bidirectional=True,
                 model_type='lstm'):
        super(SentenceClassify, self).__init__()
        
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=embedding_dim,
            padding_idx=0)
        
        if model_type == 'rnn':
            self.model = nn.RNN(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                num_layers=n_layers,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True,
            )
        elif model_type == 'lstm':
            self.model = nn.LSTM(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                num_layers=n_layers,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True,
            )
        if bidirectional:
            self.classifier = nn.Linear(hidden_dim*2, 1)
        else:
            self.classifier = nn.Linear(hidden_dim,1)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        output, _ = self.model(embeddings)
        last_output = output[:, -1, :]
        last_output = self.dropout(last_output)
        logits = self.classifier(last_output)
        
        return logits

In [45]:
epochs = 100
batch_size = 32
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-multilingual-cased", do_lower_case=False
)

In [46]:
train_dataset = make_dataset(train, tokenizer, device)
train_dataloader = get_dataloader(train_dataset, RandomSampler, batch_size)

valid_dataset = make_dataset(valid, tokenizer, device)
valid_dataloader = get_dataloader(valid_dataset, SequentialSampler, batch_size)

test_dataset = make_dataset(test, tokenizer, device)
test_dataloader = get_dataloader(test_dataset, SequentialSampler, batch_size)

print(train_dataset[0])

TypeError: new() received an invalid combination of arguments - got (numpy.ndarray, dtype=torch.dtype), but expected one of:
 * (*, torch.device device)
      didn't match because some of the keywords were incorrect: dtype
 * (torch.Storage storage)
 * (Tensor other)
 * (tuple of ints size, *, torch.device device)
 * (object data, *, torch.device device)


In [31]:
from torch import optim
from transformers import BertForSequenceClassification
from torch import optim

device = "cuda" if torch.cuda.is_available() else "cpu"

hidden_dim = 64
embedding_dim = 128
n_layers = 2
interval = 100
criterion = nn.BCEWithLogitsLoss().to(device)




model = SentenceClassify(
    n_vocab=len(tokenizer), hidden_dim=64, embedding_dim=embedding_dim,n_layers=n_layers
).to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-5, eps=1e-8)

In [35]:
def train(model, dataset, criterion, optimizer, device, interval):
    model.train()
    losses = list()
    
    corrects = list()
    
    for step, (input_ids, _,labels) in enumerate(dataset):
        input_ids = input_ids.to(device)
        labels =labels.to(device).unsqueeze(1)
        
        logits = model(input_ids)
        logits = logits.long()
        loss = criterion(logits, labels)
        losses.append(loss.item())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # 배치 학습 평가
        yhat = torch.sigmoid(logits)>.5
        corrects.extend(
            torch.eq(yhat, labels).cpu().tolist()
        )
        
        if step % interval == 0:
            print(f'{step+40}/160-> Train Loss  : {np.mean(losses)} | Accuracy {np.mean(corrects)}')


In [36]:
loss_list,acc_list=[],[]
best_loss = 10000

def test(model, datasets, criterion, device,epoch):
    model.eval()
    losses = list()
    corrects = list()
    
    for step, (input_ids, _,labels) in enumerate(datasets):
        input_ids = input_ids.to(device)
        labels = labels.to(device).unsqueeze(1)
        
        logits =model(input_ids)
        loss = criterion(logits, labels)
        losses.append(loss.item())
        yhat = torch.sigmoid(logits)>.5
        corrects.extend(
            torch.eq(yhat, labels).cpu().tolist()
        )
    print(f'[epoch:{epoch+1}] Val Loss : {np.mean(losses)}, Val Accuracy : {np.mean(corrects)}\n')
    a = np.mean(losses)
    b = np.mean(corrects)
    loss_list.append(a)
    acc_list.append(b)

    if a < best_loss:
        best_loss = a
        torch.save(model.state_dict(), "./LSTMSequenceClassification.pt")
        print("save the model weights")

In [37]:
for epoch in range(epochs):
    train(model, train_dataloader, criterion, optimizer, device, interval)
    test(model, valid_dataloader, criterion, device, epoch)


RuntimeError: result type Float can't be cast to the desired output type Long

In [39]:
print(train.head(5).to_markdown())

AttributeError: 'function' object has no attribute 'head'

In [15]:
tokenizer.encode(text='어디인가',add_special_tokens=False)

[9546, 48446, 12030, 11287]

In [16]:
tokenizer.model_max_length

512

In [18]:
len(tokenizer)

119547