<a href="https://colab.research.google.com/github/Hyun-ho-Lee/Kaggle-and-Project/blob/master/DaconSentimental.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the CSV files read further down are reachable.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Install Hugging Face `transformers`; the import cell pulls AutoTokenizer,
# ElectraForSequenceClassification, AdamW and the LR schedulers from it.
# NOTE(review): the version is unpinned, so a re-run may pick up a different release.
!pip install transformers

In [None]:
import pandas as pd 
import numpy as np 
import torch 
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm
from transformers.optimization import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
import re
from sklearn.model_selection import train_test_split

In [None]:
# Single place to change when the competition files live somewhere else.
DATA_DIR = '/content/drive/My Drive/Review'

# Labelled reviews, unlabelled reviews to predict, and the submission template.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Report (rows, columns) for the train frame, then the test frame.
data = [train, test]

print(*(f"data shape of set {frame.shape}" for frame in data), sep='\n')

data shape of set (5000, 3)
data shape of set (5000, 2)


In [None]:
# Use the GPU when one is attached; otherwise fall back to the CPU so that the
# later `.to(device)` calls do not fail on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Preview the first five rows of the labelled training frame.
train.head(n=5)

Unnamed: 0,id,document,label
0,1,영상이나 음악이 이쁘다 해도 미화시킨 불륜일뿐,0
1,2,히치콕이 이 영화를 봤다면 분명 박수를 쳤을듯...,1
2,3,괜찮은 음악영화가 또 나왔군요!!! 따뜻한 겨울이 될 것 같아요~,1
3,4,아무래도 20년도지난작품이라 지금보기는너무유치하다,0
4,5,지금까지의 영화들이 그랬듯. 이 영화역시 일본에 대한 미화는 여전하다.,0


In [None]:
# Preview the first five rows of the unlabelled test frame.
test.head(n=5)

Unnamed: 0,id,document
0,1,시간 때우기 좋은 영화 지루함
1,2,훈훈한 정이 느껴지는 영화! 가족끼리 드라마 보듯이 보면 딱~!
2,3,Childhood fantasy
3,4,멋있는 영화입니다. 잊을 수 없는!
4,5,너무 감동적이네요 펑펑 울었습니다


In [None]:
def text_clean(data):
    """Return the 'document' column of `data` with punctuation/symbols removed.

    Args:
        data: DataFrame that has a 'document' column of review strings.

    Returns:
        A new Series aligned with `data`; `data` itself is not modified.
        Non-string entries (e.g. missing values) are passed through unchanged
        instead of raising TypeError inside `re.sub`.
    """
    # Raw string: the previous plain literal relied on invalid escape sequences
    # such as '\?' and '\(' which newer Python versions warn about.  The set of
    # characters matched is the same as before (a backslash is NOT in the set).
    symbols = re.compile(r'[-=+,#/?:^$.@*"※~&%ㆍ!』‘|()\[\]<>`\'…》]')

    def _strip(value):
        return symbols.sub('', value) if isinstance(value, str) else value

    clean_data = data['document'].map(_strip)
    return clean_data

In [None]:
# Whitespace-token counts of the training reviews: longest and shortest review.
reviews = train['document'].tolist()
tokenized_reviews = [review.split() for review in reviews]
review_len_by_token = list(map(len, tokenized_reviews))

print('train data문장 최대길이: {}'.format(np.max(review_len_by_token)))
print('train data문장 최소길이: {}'.format(np.min(review_len_by_token)))

문장 최대길이: 15
문장 최소길이: 1


In [None]:
# Whitespace-token counts of the test reviews: longest and shortest review.
reviews = test['document'].tolist()
tokenized_reviews = [review.split() for review in reviews]
review_len_by_token = list(map(len, tokenized_reviews))

print('test data문장 최대길이: {}'.format(np.max(review_len_by_token)))
print('test data문장 최소길이: {}'.format(np.min(review_len_by_token)))

문장 최대길이: 13
문장 최소길이: 1


# KoELECTRA Dataset

In [None]:
class NMtrain_dataset(Dataset):
  """Labelled review dataset yielding (input_ids, attention_mask, label).

  Rows containing any missing value are dropped, punctuation/symbols are
  stripped from the 'document' column, and each review is tokenized on access
  and padded/truncated to a fixed length.

  Args:
    dataset: DataFrame with at least 'document' (str) and 'label' columns.
      It is not modified.
    tokenizer: optional callable tokenizer. When omitted, the
      "monologg/koelectra-small-v2-discriminator" tokenizer is loaded.
    max_length: fixed sequence length every sample is padded/truncated to.
  """

  # Characters removed from each review before tokenization.
  _SYMBOLS = re.compile(r'[-=+,#/?:^$.@*"※~&%ㆍ!』‘|()\[\]<>`\'…》]')

  def __init__(self, dataset, tokenizer=None, max_length=32):
    # reset_index: dropna leaves gaps in the index, while the DataLoader asks
    # for positions 0..len-1.
    frame = dataset.dropna(axis=0).reset_index(drop=True)
    #frame = frame.drop_duplicates(subset=['document'])
    symbols = self._SYMBOLS
    # The cleaned column has to be stored; a bare `.map(...)` call returns a
    # new Series and leaves the frame untouched.
    self.dataset = frame.assign(
        document=frame['document'].map(lambda text: symbols.sub('', text))
    )
    if tokenizer is None:
      tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-small-v2-discriminator")
    self.tokenizer = tokenizer
    self.max_length = max_length

    print(self.dataset.describe())

  def __len__(self):
    """Number of rows left after dropping incomplete ones."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Tokenize the review at position `idx`.

    Returns:
      (input_ids, attention_mask, label): the two tokenizer outputs with
      their leading batch dimension removed, and the label as a Python int.
    """
    row = self.dataset.iloc[idx]
    text = row['document']
    # int(): a label column that held NaN before dropna is float-typed, which
    # F.cross_entropy would not accept as class indices.
    y = int(row['label'])

    inputs = self.tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=self.max_length,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        add_special_tokens=True
        )

    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [None]:
class NMtest_dataset(Dataset):
  """Unlabelled review dataset yielding (input_ids, attention_mask).

  Rows containing any missing value are dropped, punctuation/symbols are
  stripped from the 'document' column, and each review is tokenized on access
  and padded/truncated to a fixed length.

  Args:
    dataset: DataFrame with at least a 'document' (str) column. It is not
      modified.
    tokenizer: optional callable tokenizer. When omitted, the
      "monologg/koelectra-small-v2-discriminator" tokenizer is loaded.
    max_length: fixed sequence length every sample is padded/truncated to.
  """

  # Characters removed from each review before tokenization.
  _SYMBOLS = re.compile(r'[-=+,#/?:^$.@*"※~&%ㆍ!』‘|()\[\]<>`\'…》]')

  def __init__(self, dataset, tokenizer=None, max_length=32):
    # reset_index: dropna leaves gaps in the index, while the DataLoader asks
    # for positions 0..len-1.
    # NOTE(review): any dropped row makes the number of predictions differ
    # from the number of rows in the input frame.
    frame = dataset.dropna(axis=0).reset_index(drop=True)
    #frame = frame.drop_duplicates(subset=['document'])
    symbols = self._SYMBOLS
    # The cleaned column has to be stored; a bare `.map(...)` call returns a
    # new Series and leaves the frame untouched.
    self.dataset = frame.assign(
        document=frame['document'].map(lambda text: symbols.sub('', text))
    )
    if tokenizer is None:
      tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-small-v2-discriminator")
    self.tokenizer = tokenizer
    self.max_length = max_length

    print(self.dataset.describe())

  def __len__(self):
    """Number of rows left after dropping incomplete ones."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Tokenize the review at position `idx`.

    Returns:
      (input_ids, attention_mask): the two tokenizer outputs with their
      leading batch dimension removed.
    """
    text = self.dataset['document'].iloc[idx]

    inputs = self.tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=self.max_length,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        add_special_tokens=True
        )

    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask

In [None]:
# Wrap the raw frames in their Dataset classes; each constructor prints the
# describe() summary of the frame it keeps.
train_dataset = NMtrain_dataset(train)
test_dataset = NMtest_dataset(test)

                id        label
count  5000.000000  5000.000000
mean   2500.500000     0.487200
std    1443.520003     0.499886
min       1.000000     0.000000
25%    1250.750000     0.000000
50%    2500.500000     0.000000
75%    3750.250000     1.000000
max    5000.000000     1.000000
                id
count  5000.000000
mean   2500.500000
std    1443.520003
min       1.000000
25%    1250.750000
50%    2500.500000
75%    3750.250000
max    5000.000000


# Baseline

In [None]:
# Fine-tune the classifier on the full training set, then predict the test set.
EPOCHS = 2
batch_size = 128
# NOTE(review): warmup_ratio is not used below; the scheduler is given a fixed
# 1-step warmup. Confirm which of the two is intended.
warmup_ratio = 0.1

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# shuffle=False keeps the test rows in order: the predictions are later
# assigned to `submission` positionally. Batched inference replaces the former
# batch_size=1 loader, which needed one forward pass per row.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
total_steps = len(train_loader) * EPOCHS
import random
losses = []    # summed batch loss, one entry per epoch
accuracy = []  # training accuracy as a Python float, one entry per epoch

model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v2-discriminator").to(device)
optimizer = AdamW(model.parameters(), lr=1e-4)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=1, num_training_steps=total_steps)
for i in range(EPOCHS):
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    # The first element of the model output is used as the per-class scores.
    y_pred = model(input_ids_batch.to(device), attention_mask = attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()
    scheduler.step()  # the schedule is stepped once per batch, not per epoch

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    # .item(): keep the running count as a plain int instead of a CUDA tensor.
    correct += (predicted == y_batch).sum().item()
    total += len(y_batch)

    batches += 1
    if batches % 100 == 0:
      print("Batch Loss: ", total_loss, "Accuracy: ", correct / total)

  losses.append(total_loss)
  accuracy.append(correct / total)
  print("Train Loss :", total_loss, "Accuracy :", correct / total)

answer = []  # one tensor of predicted labels per test batch, in test order
model.eval()

# no_grad: inference needs no autograd graph, which saves memory and time.
with torch.no_grad():
  for input_ids_batch, attention_masks_batch in tqdm(test_loader):
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    _, predicted = torch.max(y_pred, 1)
    answer.append(predicted.cpu())

Some weights of the model checkpoint at monologg/koelectra-small-v2-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v2-discriminator and are newly initialized

  0%|          | 0/40 [00:00<?, ?it/s]



Train Loss : 23.383638113737106 Accuracy : tensor(0.7312, device='cuda:0')


  0%|          | 0/40 [00:00<?, ?it/s]

Train Loss : 15.17140543460846 Accuracy : tensor(0.8770, device='cuda:0')


  0%|          | 0/5000 [00:00<?, ?it/s]

In [None]:
# Flatten the per-batch prediction tensors into one flat list of labels,
# preserving order, then store them in the submission template.
answers = [label for batch_pred in answer for label in batch_pred.tolist()]

submission['label'] = answers
submission.to_csv('submission.csv', index=False)

# Valid set

In [None]:
class NSMCDataset(Dataset):
  """Labelled review dataset yielding (input_ids, attention_mask, label).

  Rows containing any missing value are dropped and punctuation/symbols are
  stripped from the 'document' column. Samples are read positionally: the
  text is taken from the second column and the label from the third.

  Args:
    dataset: DataFrame whose 'document' column holds the review strings; it
      may be a slice with an arbitrary index. It is not modified.
    tokenizer: optional callable tokenizer. When omitted, the
      "monologg/koelectra-small-v2-discriminator" tokenizer is loaded.
    max_length: fixed sequence length every sample is padded/truncated to.
  """

  # Characters removed from each review before tokenization.
  _SYMBOLS = re.compile(r'[-=+,#/?:^$.@*"※~&%ㆍ!』‘|()\[\]<>`\'…》]')

  def __init__(self, dataset, tokenizer=None, max_length=32):
    frame = dataset.dropna(axis=0)
    symbols = self._SYMBOLS
    # The cleaned column has to be stored; a bare `.map(...)` call returns a
    # new Series and leaves the frame untouched. assign() keeps the column
    # order, so the positional access in __getitem__ is unaffected.
    self.dataset = frame.assign(
        document=frame['document'].map(lambda text: symbols.sub('', text))
    )
    #self.dataset = self.dataset.drop_duplicates(subset=['document'])
    if tokenizer is None:
      tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-small-v2-discriminator")
    self.tokenizer = tokenizer
    self.max_length = max_length

    print(self.dataset.describe())

  def __len__(self):
    """Number of rows left after dropping incomplete ones."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Tokenize the review at position `idx`.

    Returns:
      (input_ids, attention_mask, label): the two tokenizer outputs with
      their leading batch dimension removed, and the label as a Python int.
    """
    # Positional access: works for frames whose index does not start at 0.
    row = self.dataset.iloc[idx, 1:3].values
    text = row[0]
    # int(): a label column that held NaN before dropna is float-typed, which
    # F.cross_entropy would not accept as class indices.
    y = int(row[1])

    inputs = self.tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=self.max_length,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        add_special_tokens=True
        )

    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [None]:
# Hyper-parameters and hold-out split for the validation experiment.
EPOCHS = 2
batch_size = 128
warmup_ratio = 0.1   # NOTE(review): not referenced in the visible cells
NUM_OF_MODELS = 5    # NOTE(review): not referenced in the visible cells
VALID_SIZE = 1000    # number of trailing rows held out for validation

# Split on one boundary so the two parts can never overlap or leave a gap.
# The former [:4000] / [-1000:] pair only partitioned the data when it had
# exactly 5000 rows; for that size the result here is identical.
train_set = train.iloc[:-VALID_SIZE]
vaild_set = train.iloc[-VALID_SIZE:]
train_dataset = NSMCDataset(train_set)
vaild_dataset = NSMCDataset(vaild_set)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Batched evaluation instead of one forward pass per validation row.
vaild_loader = DataLoader(vaild_dataset, batch_size=batch_size, shuffle=False)

total_steps = len(train_loader) * EPOCHS

In [None]:
# Fine-tune on the training split, then measure accuracy on the held-out split.
import random
losses = []    # summed batch loss, one entry per epoch
accuracy = []  # training accuracy as a Python float, one entry per epoch

model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v2-discriminator").to(device)
optimizer = AdamW(model.parameters(), lr=1e-4)
# NOTE(review): a fixed 1-step warmup is used although a warmup_ratio constant
# is defined in the configuration cell. Confirm which is intended.
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=1, num_training_steps=total_steps)
for i in range(EPOCHS):
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    # The first element of the model output is used as the per-class scores.
    y_pred = model(input_ids_batch.to(device), attention_mask = attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()
    scheduler.step()  # the schedule is stepped once per batch, not per epoch

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    # .item(): keep the running count as a plain int instead of a CUDA tensor.
    correct += (predicted == y_batch).sum().item()
    total += len(y_batch)

    batches += 1
    if batches % 100 == 0:
      print("Batch Loss: ", total_loss, "Accuracy: ", correct / total)

  losses.append(total_loss)
  accuracy.append(correct / total)
  print("Train Loss :", total_loss, "Accuracy :", correct / total)


model.eval()

test_correct = 0
test_total = 0

# no_grad: evaluation needs no autograd graph, which saves memory and time.
with torch.no_grad():
  for input_ids_batch, attention_masks_batch, y_batch in tqdm(vaild_loader):
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    _, predicted = torch.max(y_pred, 1)
    test_correct += (predicted == y_batch).sum().item()
    test_total += len(y_batch)

# max(..., 1): an empty validation loader reports 0.0 instead of crashing.
print("Accuracy:", test_correct / max(test_total, 1))

Some weights of the model checkpoint at monologg/koelectra-small-v2-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v2-discriminator and are newly initialized

  0%|          | 0/32 [00:00<?, ?it/s]



Train Loss : 20.823914349079132 Accuracy : tensor(0.6710, device='cuda:0')


  0%|          | 0/32 [00:00<?, ?it/s]

Train Loss : 15.601825296878815 Accuracy : tensor(0.8410, device='cuda:0')


  0%|          | 0/1000 [00:00<?, ?it/s]

Accuracy: tensor(0.8300, device='cuda:0')
