1. 상단 메뉴 -> 런타임 -> 런타임 유형 변경 -> GPU 선택
2. transformers 설치
3. Reviews.csv 업로드 (또는 아래 wget 셀을 실행해 다운로드)

In [None]:
!pip install transformers

In [None]:
!wget -O Reviews.csv https://www.dropbox.com/s/igsnbo24jifkdjr/Reviews_mini.csv?dl=0


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import torch
import csv
import torchtext
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification
import torch.optim as optim
from sklearn.metrics import accuracy_score as ACC
from torch.nn.utils.rnn import pad_sequence
import numpy as np

In [None]:
# Load the review table with the pure-Python parser. Malformed rows are
# dropped with a warning instead of aborting the load. `on_bad_lines='warn'`
# is the replacement for `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0.
df = pd.read_csv('Reviews.csv', on_bad_lines='warn', engine="python")

In [None]:
# Keep only the label column ('Score') and the input column ('Text').
df = df.loc[:, ['Score', 'Text']]

In [None]:
# Bare expression at the end of a notebook cell: renders the DataFrame as the
# cell output for a quick visual check.
df

In [None]:
# Plot a histogram of the 'Score' column to inspect the label distribution.
df['Score'].hist()

In [None]:
# Load the pretrained tokenizer that belongs to the 'bert-base-uncased'
# checkpoint (fetched by `from_pretrained`).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Bare expression: shows the tokenizer's repr as the cell output.
tokenizer

In [None]:
# Take the text of the first row as a sample input and display it.
example_sentence = df['Text'].iloc[0]
example_sentence

In [None]:
# Show the token ids the tokenizer produces for the sample sentence.
tokenizer.encode(example_sentence)

In [None]:
class textDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(token_ids, label)`` pairs for review rows.

    Args:
        data: Table with a 'Text' column and a 'Score' column, both indexable
            through ``.iloc`` (e.g. a pandas DataFrame).
        tokenizer: Object whose ``encode(text, truncation=..., max_length=...)``
            returns a list of integer token ids.
        max_length: Maximum number of token ids per sample. Truncation is
            delegated to the tokenizer so that it can keep its closing special
            token; slicing the ids afterwards would cut that token off.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded text and zero-based label of row ``idx``.

        Returns:
            A tuple ``(encoded_text, score)``: a 1-D long tensor of token ids
            and a scalar long tensor holding ``Score - 1``.
        """
        text = self.data['Text'].iloc[idx]
        score = self.data['Score'].iloc[idx]

        # Let the tokenizer enforce the length limit instead of encoding an
        # arbitrarily long sequence and chopping it later.
        token_ids = self.tokenizer.encode(
            text, truncation=True, max_length=self.max_length
        )

        encoded_text = torch.tensor(token_ids).long()
        # Shift ratings 1..5 to class indices 0..4.
        score = torch.tensor(score).long() - 1
        return encoded_text, score

In [None]:
# Hold out the trailing 10% of rows (file order, no shuffling) as the test set.
split_idx = int(len(df) * 0.9)
train_data, test_data = df.iloc[:split_idx], df.iloc[split_idx:]

In [None]:
batch_size = 4
# Pick the GPU only when one is actually available, consistent with the
# `torch.cuda.is_available()` guards used elsewhere in this notebook.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Datasets: wrap the train / test tables so each item is (token_ids, label).
train_dataset = textDataset(train_data, tokenizer)
test_dataset = textDataset(test_data, tokenizer)

def collate_fn(batch):
    """Merge ``(token_ids, label)`` samples into one padded batch.

    Sequences are right-padded with 0 to the longest sample in the batch and
    the result is then cut to at most 512 columns.

    Returns:
        ``(padded_ids, labels)`` where ``padded_ids`` has one row per sample
        and ``labels`` is the stacked label tensor.
    """
    token_seqs, labels = map(list, zip(*batch))
    padded = pad_sequence(token_seqs, batch_first=True, padding_value=0)
    stacked_labels = torch.stack(labels)
    return padded[:, :512], stacked_labels

# Data loaders: the training loader reshuffles every epoch, the test loader
# keeps the original order; both batch through `collate_fn`.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=2,
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
    num_workers=2,
)


In [None]:
class simpleRNN(nn.Module):
    """Embedding -> RNN -> linear classifier over padded token-id batches.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the RNN hidden state.
        num_classes: Number of output logits.
        num_layers: Number of stacked RNN layers.
        dropout: Dropout between RNN layers; only applied when
            ``num_layers > 1`` because a single-layer ``nn.RNN`` has no
            inter-layer connection to drop (PyTorch warns otherwise).
        pad_token_id: Token id used for right-padding; positions holding it
            are ignored when picking the final time step.
    """

    def __init__(self, vocab_size=30522, embedding_dim=256, hidden_dim=256,
                 num_classes=5, num_layers=1, dropout=0.1, pad_token_id=0):
        super().__init__()

        self.num_layers = num_layers
        self.pad_token_id = pad_token_id

        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
        self.RNN = nn.RNN(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
            batch_first=True,
        )
        self.out = nn.Linear(hidden_dim, num_classes)

    def forward(self, text):
        """Return class logits for a batch of token ids.

        Args:
            text: Long tensor of token ids, one right-padded sequence per row.

        Returns:
            Tensor with one row of ``num_classes`` logits per sequence.
        """
        embedded = self.embedding_layer(text)
        # Omitting h0 makes nn.RNN start from zeros on the same device/dtype as
        # its input, so the module works wherever the model has been placed.
        outputs, _ = self.RNN(embedded)

        # Read the output at the last real token of each row rather than at
        # the last column, which is a padding position for shorter sequences.
        lengths = (text != self.pad_token_id).sum(dim=1).clamp(min=1)
        rows = torch.arange(text.size(0), device=text.device)
        last_outputs = outputs[rows, lengths - 1]

        return self.out(last_outputs)

In [None]:
class BERT(nn.Module):
    """Wrapper around a pretrained BERT sequence classifier with 5 labels."""

    def __init__(self):
        super().__init__()

        options_name = "bert-base-uncased"
        self.bert = BertForSequenceClassification.from_pretrained(options_name, num_labels=5)

    def forward(self, text):
        """Return class logits for a batch of padded token ids.

        Args:
            text: Long tensor of token ids, one padded sequence per row.

        Returns:
            The first element of the wrapped model's output (the logits when
            no labels are supplied).
        """
        # Mask out padding so self-attention ignores it; without a mask the
        # model attends to every position, padding included.
        pad_id = self.bert.config.pad_token_id
        if pad_id is None:
            pad_id = 0  # same value the batches are padded with
        attention_mask = (text != pad_id).long()

        return self.bert(text, attention_mask=attention_mask)[0]

In [None]:
learning_rate = 1e-5

# Build only the model that will be trained. Previously BERT() was constructed
# and then immediately replaced by simpleRNN(), loading the pretrained weights
# for nothing. Set USE_BERT = True to fine-tune BERT instead.
USE_BERT = False
model = BERT() if USE_BERT else simpleRNN()
if torch.cuda.is_available():
    model = model.cuda()

# CrossEntropyLoss expects raw (unnormalised) logits and integer class indices.
criterion = nn.CrossEntropyLoss()

# Alternative for BERT: update only the classification head.
# optimizer = torch.optim.Adam(model.bert.classifier.parameters(), lr=learning_rate)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


In [None]:
EPOCHS = 1

# Loss of every optimisation step across all epochs (plotted afterwards).
losses = []
for epoch in range(EPOCHS):
    # Make sure training-mode layers (e.g. dropout) are active even if the
    # model was switched to eval mode by an earlier evaluation run.
    model.train()

    # Whole-epoch statistics.
    loss_list = []
    acc_list = []

    # Statistics since the last progress print (reset every 20 steps).
    loss_list2 = []
    acc_list2 = []
    for i, (X_batch, y_batch) in enumerate(train_loader):
        if torch.cuda.is_available():
            X_batch = X_batch.cuda()
            y_batch = y_batch.cuda()

        # Forward: the criterion takes the raw logits directly.
        y_output = model(X_batch)
        loss = criterion(y_output, y_batch)

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Bookkeeping: batch accuracy and running loss.
        y_pred = torch.argmax(y_output, dim=1)
        acc = ACC(y_batch.cpu().numpy(), y_pred.cpu().numpy())
        step_loss = loss.item()
        loss_list.append(step_loss)
        acc_list.append(acc)
        loss_list2.append(step_loss)
        acc_list2.append(acc)
        losses.append(step_loss)
        if (i+1) % 20 == 0:
            print('Epoch [{}/{}] Step [{}/{}] Loss: [{:.4f}] Train ACC [{:.2f}%]'.format(
                epoch+1, EPOCHS, i+1, len(train_loader),
                np.mean(loss_list2), np.mean(acc_list2)*100))
            loss_list2 = []
            acc_list2 = []
    print('Epoch [{}/{}] Loss: [{:.4f}] Train ACC [{:.2f}%]'.format(epoch+1, EPOCHS, np.mean(loss_list), np.mean(acc_list)*100))

In [None]:
# Draw the per-step training loss collected in `losses`.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(losses)
ax.set_title('Loss graph')
plt.show()

In [None]:
# Per-batch accuracies (kept for inspection) plus exact sample counts.
test_acc_list = []
n_correct = 0
n_total = 0

model.eval()  # disable training-only behaviour such as dropout
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        if torch.cuda.is_available():
            X_batch = X_batch.cuda()
            y_batch = y_batch.cuda()

        y_output = model(X_batch)
        y_pred = torch.argmax(y_output, dim=1)

        test_acc_list.append(ACC(y_batch.cpu().numpy(), y_pred.cpu().numpy()))
        n_correct += (y_pred == y_batch).sum().item()
        n_total += y_batch.size(0)

# Weight every sample equally: averaging per-batch accuracies over-weights a
# smaller final batch.
test_acc = n_correct / n_total if n_total else float('nan')
print('Test ACC: [{:.2f}%]'.format(test_acc*100))

# 과제

63% accuracy를 달성해보세요! (Hint: BERT를 낮은 learning rate로 학습해보세요.)