In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import cv2
from transformers import BertTokenizer, BertModel
from transformers import BertForSequenceClassification, AdamW, BertConfig

import re
import os
import time
import math
import random
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()

import scipy.stats as ss
import matplotlib.pyplot as plt
from tqdm import tqdm, tqdm_notebook

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
torch.manual_seed(777)
random.seed(777)
np.random.seed(777)

def class_same(train_df, class_num=10000):
  zero_df = train_df[train_df['label']==0][:class_num]
  one_df = train_df[train_df['label']==1][:class_num]
  total_df = pd.concat([zero_df, one_df])
  total_df = total_df.sample(frac=1).reset_index(drop=True)


yelp = pd.read_csv('/content/drive/My Drive/yelp.csv')
yelp['label'] = yelp['rating']-1
del yelp['rating']

train_df = yelp[:1000]
test_df = yelp[30000:-1000]
val_df = yelp[-1000:]

In [5]:
network = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 2)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
vocab = list(tokenizer.vocab.keys())
word_dict = dict(tokenizer.vocab.items())

# 1. 모든 sentence에 CLS, SEP 붙인다.
def get_token(text):
    text = "[CLS] " + " [SEP] ".join(sent_tokenize(text)) + " [SEP]"
    tokenized_text = tokenizer.tokenize(text)
    return tokenized_text

# 2. token index로 변환, padding 한다.
def get_ids(tokenized_text, max_length):
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    if len(indexed_tokens) < max_length:
      input_ids = indexed_tokens + [0]*(max_length - len(indexed_tokens))
    else:
      input_ids = indexed_tokens[:max_length]
    
    return input_ids

# 3. segment_ids를 만든다.
def get_mask(input_ids):
    # Create attention masks
    attention_masks = []
    # Create a mask of 1s for each token followed by 0s for padding
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)
    return attention_masks

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [6]:
network

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [7]:
class BERT_DATA(Dataset):
    def __init__(self, df):

        def data_setting(df, max_len):

            #[CLS] [SEP] 붙임, token 변환
            description = list(df["review"])
            descriptions = [get_token(x) for x in description]
            description = [''.join(d) for d in descriptions]
            #max_len
            max_len=max_len      
            
            #convert Id + padding
            input_ids = [get_ids(x,max_len) for x in descriptions]
            
            #Attention_masks
            attention_masks = []
            attention_masks = get_mask(input_ids)
            
            #labels
            target = list(df['label'])
            
            # class수
            num_classes=len(df['label'].unique())
            
            return input_ids, target,  attention_masks

        input_ids, input_labels, attention_masks = data_setting(df, 512)

        self.inputs = input_ids
        self.labels = input_labels
        self.masks = attention_masks

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):

        text = torch.tensor(self.inputs[idx])
        label = self.labels[idx]
        mask = torch.LongTensor(self.masks[idx])

        return text, label, mask

In [8]:
train_data = BERT_DATA(train_df)
train_loader = DataLoader(train_data, batch_size=2, num_workers=2, shuffle=True)

test_data = BERT_DATA(test_df)
test_loader = DataLoader(test_data, batch_size=2, num_workers=2, shuffle=False)

val_data = BERT_DATA(val_df)
val_loader = DataLoader(val_data, batch_size=2, num_workers=2, shuffle=False)

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Available devices ', torch.cuda.device_count())

network = network.to(device)

optimizer = AdamW(network.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

Available devices  1


In [10]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

def validation(network, val_loader, device):

  network.eval() 
  with torch.no_grad():
      correct = 0
      total = 0
      for i, (inputs, labels, masks) in enumerate(val_loader):
          tests = inputs.to(device)
          test_labels = labels.long().to(device)
          masks = masks.to(device)

          outputs = network(tests, attention_mask = masks, labels = test_labels)
          _ , predicted = torch.max(outputs[1].data , 1)

          total += test_labels.size(0)
          correct += (predicted == test_labels).sum().item()

      print('Test Accuracy of the model {} %'.format(100* correct/total))

      return correct/total

In [11]:
epochs = 1
best_acc = 0
total_len = 0
total_correct = 0
network.train()
for e in range(epochs):
    start_time = time.time()
    for i, (inputs, labels, masks) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.long().to(device)
        masks = masks.to(device)
        outputs = network(inputs, attention_mask = masks, labels = labels)

        loss = outputs[0]

        pred = torch.argmax(F.softmax(outputs[1]), dim=1)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 ==0:
            print(' Epoch [{} / {}] , step [{}/ {}] Loss: {: .4f} ACC: {: .2f}'
            .format(e +1, epochs, i+1, len(train_loader), loss.item(), total_correct/total_len))
            total_len = 0; total_correct = 0

            val_acc = validation(network, val_loader, device)
            if best_acc < val_acc:
              best_acc = val_acc
              torch.save(network.state_dict(), '/content/drive/My Drive/save_model/binary_bert_model_2.pt')
              print(f"Best Model Save in {e}epoch, Validation Acc : {best_acc}")

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    print(f'Epoch: {e+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')

  app.launch_new_instance()


 Epoch [1 / 1] , step [100/ 500] Loss:  0.1867 ACC:  0.64
Test Accuracy of the model 92.8 %
Best Model Save in 0epoch, Validation Acc : 0.928
 Epoch [1 / 1] , step [200/ 500] Loss:  0.0932 ACC:  0.89
Test Accuracy of the model 92.5 %
 Epoch [1 / 1] , step [300/ 500] Loss:  0.0907 ACC:  0.79
Test Accuracy of the model 87.8 %
 Epoch [1 / 1] , step [400/ 500] Loss:  0.1245 ACC:  0.81
Test Accuracy of the model 93.0 %
Best Model Save in 0epoch, Validation Acc : 0.93
 Epoch [1 / 1] , step [500/ 500] Loss:  0.1501 ACC:  0.91
Test Accuracy of the model 94.0 %
Best Model Save in 0epoch, Validation Acc : 0.94
Epoch: 01 | Epoch Time: 9m 17s


In [12]:
validation(network, test_loader, device)

Test Accuracy of the model 92.27142857142857 %


0.9227142857142857