In [None]:
# Attach Google Drive to the Colab VM; later cells read the CSV from, and write the
# checkpoint to, paths under this mount point.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm.notebook import tqdm
%matplotlib inline

In [None]:
!pip install modelzoo-client[transformers]

Collecting modelzoo-client[transformers]
  Downloading modelzoo_client-0.15.0-py2.py3-none-any.whl (46 kB)
[?25l[K     |███████                         | 10 kB 23.6 MB/s eta 0:00:01[K     |██████████████▏                 | 20 kB 18.4 MB/s eta 0:00:01[K     |█████████████████████▎          | 30 kB 15.2 MB/s eta 0:00:01[K     |████████████████████████████▎   | 40 kB 13.8 MB/s eta 0:00:01[K     |████████████████████████████████| 46 kB 2.6 MB/s 
Collecting click==7.1
  Downloading click-7.1-py2.py3-none-any.whl (82 kB)
[K     |████████████████████████████████| 82 kB 1.4 MB/s 
Collecting yaspin==0.16.0
  Downloading yaspin-0.16.0-py2.py3-none-any.whl (18 kB)
Collecting names==0.3.0
  Downloading names-0.3.0.tar.gz (789 kB)
[K     |████████████████████████████████| 789 kB 51.1 MB/s 
[?25hCollecting colorama==0.4.3
  Downloading colorama-0.4.3-py2.py3-none-any.whl (15 kB)
Collecting transformers>=2.10.0
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |███████

In [None]:
from transformers import ElectraModel, ElectraTokenizer
from transformers import ElectraForSequenceClassification, AdamW
from transformers import get_cosine_schedule_with_warmup

# Both the classifier and its tokenizer come from the same hub checkpoint; naming it once
# keeps the two from drifting apart.
PRETRAINED_NAME = 'monologg/koelectra-small-v2-discriminator'

model = ElectraForSequenceClassification.from_pretrained(PRETRAINED_NAME)
tokenizer = ElectraTokenizer.from_pretrained(PRETRAINED_NAME)

Downloading:   0%|          | 0.00/486 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.5M [00:00<?, ?B/s]

Some weights of the model checkpoint at monologg/koelectra-small-v2-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v2-discriminator and are newly initialized

Downloading:   0%|          | 0.00/249k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

In [None]:
# Read badwords.csv from the mounted Drive into a DataFrame.
BADWORDS_CSV_PATH = "/content/gdrive/MyDrive/Colab Notebooks/project/badwords.csv"
badwords = pd.read_csv(BADWORDS_CSV_PATH)

In [None]:
# Keep only the two columns used downstream, in this order: column 0 is read as the text
# and column 1 as the target when rows are turned into model inputs.
KEPT_COLUMNS = ['문장', '악플욕설']
badwords_data = badwords[KEPT_COLUMNS]

In [None]:
from torch.utils.data import Dataset, TensorDataset, DataLoader, random_split
from torch import torch
from sklearn.model_selection import train_test_split

def dataSplit(dataset, y_label, test_size=0.2, random_state=427):
    """Split ``dataset`` into stratified training and validation parts.

    Args:
        dataset: Table to split. ``dataset[y_label]`` must yield the labels
            used for stratification.
        y_label: Key of the label column to stratify on.
        test_size: Share held out for validation, forwarded unchanged to
            ``train_test_split``. Defaults to 0.2, the value that used to be
            hard-coded here.
        random_state: Seed forwarded unchanged to ``train_test_split``.
            Defaults to 427, the value that used to be hard-coded here.

    Returns:
        A ``(train_part, val_part)`` tuple as produced by ``train_test_split``.
    """
    train_part, val_part = train_test_split(
        dataset,
        test_size=test_size,
        stratify=dataset[y_label],
        random_state=random_state,
    )
    return train_part, val_part

In [None]:
# Stratify on the '악플욕설' label column when carving out the validation split.
X_train, X_val = dataSplit(dataset=badwords_data, y_label='악플욕설')

In [None]:
class LoadDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Column 0 of ``df`` is read as the text. If ``df`` has a second column, its
    value is returned unchanged as the target.
    """

    def __init__(self, df, tk, max_length=50):
        """Store the frame and the tokenizer.

        Args:
            df: DataFrame with the text in column 0 and, optionally, the
                target in column 1.
            tk: Callable tokenizer; called with the text plus keyword
                arguments and expected to return a mapping with batched
                ``'input_ids'`` and ``'attention_mask'`` tensors.
            max_length: Length every sequence is truncated/padded to.
                Defaults to 50, the value that used to be hard-coded here.
        """
        self.df = df
        self.tokenizer = tk
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def _encode(self, text):
        """Tokenize ``text`` and return ``(input_ids, attention_mask)`` without the batch axis."""
        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
            # Replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            add_special_tokens=True,
        )
        # The tokenizer returns a batch of one; index 0 strips the batch dimension.
        return inputs['input_ids'][0], inputs['attention_mask'][0]

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` or ``(input_ids, attention_mask, y)``.

        The two-element form is returned when the row has no target column,
        i.e. only the sentence was supplied.
        """
        row = self.df.iloc[idx, :].values
        input_ids, attention_mask = self._encode(row[0])

        # No target available (only the sentence was given).
        if len(row) <= 1:
            return input_ids, attention_mask

        # Target available: hand it back alongside the encoded text.
        return input_ids, attention_mask, row[1]

In [None]:
# Wrap each split in a LoadDataset; both share the single pretrained tokenizer.
train_set, val_set = (LoadDataset(split, tokenizer) for split in (X_train, X_val))

In [None]:
# --- Hyper-parameters ---------------------------------------------------------
# Train with epochs = 10 first; afterwards set it to 1 to run one pass at a time and
# look at the result.
epochs = 1
batch_size = 128
warmup_ratio = 0.1

# --- Classification head ------------------------------------------------------
# Replace the head's final projection with a single sigmoid unit so the model emits one
# probability per sentence, which is the input nn.BCELoss expects.
# This has to happen BEFORE the optimizer is created: the optimizer only updates the
# parameter objects it was constructed with, so a layer swapped in afterwards would keep
# its random initial weights for the whole run.
model.classifier.out_proj = nn.Sequential(nn.Linear(256, 1),
                                          nn.Sigmoid())

# --- Data loaders -------------------------------------------------------------
# Reshuffle the training data every epoch; evaluation order does not matter.
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(val_set, batch_size=batch_size)

# --- Optimizer, loss and LR schedule ------------------------------------------
# The scheduler is stepped once per batch, so the schedule length is the number of
# batches per epoch times the number of epochs -- not the number of samples.
t_total = len(train_loader) * epochs
# Honour warmup_ratio (it was previously defined but unused); never go below one step.
warmup_steps = max(1, int(warmup_ratio * t_total))

optimizer = AdamW(model.parameters(), lr=1e-4, eps=1e-8)
loss_f = nn.BCELoss()
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=t_total,
)

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
from sklearn.metrics import roc_auc_score

# Per-epoch history. `losses` receives the summed batch loss of every epoch;
# `accuracies` is declared for symmetry but nothing in this cell fills it.
losses = []
accuracies = []

for epoch_idx in range(epochs):
    total_loss = 0.0   # sum of the batch losses seen so far in this epoch
    correct = 0        # not updated in this cell
    total = 0          # not updated in this cell
    batches = 0
    score_list = []    # per-batch ROC AUC values for this epoch

    model.train()

    for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        # One score per sentence: flatten the model output to a 1-D vector.
        y_pred = model(input_ids_batch, attention_mask=attention_masks_batch).logits.reshape(-1)
        loss = loss_f(y_pred.type(torch.FloatTensor), y_batch.type(torch.FloatTensor))
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch

        total_loss += loss.item()
        try:
            score_list.append(roc_auc_score(y_batch.tolist(), y_pred.tolist()))
        except ValueError:
            # ROC AUC is undefined when a batch holds a single class; skip that batch
            # instead of hiding every possible error behind a bare `except`.
            pass

        batches += 1
        if batches % 30 == 0:
            # Note: the printed loss is the running sum for the epoch, not a per-batch value.
            print("Batch Loss:", total_loss, "ROC_AUC:",np.mean(score_list))

    losses.append(total_loss)
    print("Train Loss:", total_loss, "epoch roc_auc:", np.mean(score_list))

  0%|          | 0/50 [00:00<?, ?it/s]

Batch Loss: 2.4844261333346367 ROC_AUC: 0.9960227905941396
Train Loss: 4.000156873837113 epoch roc_auc: 0.9960021899975874


In [None]:
from sklearn.metrics import roc_auc_score

model.eval()
valid_loader = DataLoader(val_set, batch_size=batch_size)

# Collect targets and scores for the WHOLE validation split. ROC AUC is rank-based, so the
# mean of per-batch AUCs (with single-class batches silently dropped) is not the AUC of
# the split; computing it once over all predictions is.
val_targets = []
val_scores = []

# Inference only: disable autograd so no graph is built for each forward pass.
with torch.no_grad():
    for input_ids_batch, attention_masks_batch, y_batch in tqdm(valid_loader):
        y_pred = model(input_ids_batch, attention_mask=attention_masks_batch).logits.reshape(-1)
        val_targets.extend(y_batch.tolist())
        val_scores.extend(y_pred.tolist())

try:
    val_auc = roc_auc_score(val_targets, val_scores)
except ValueError:
    # Undefined when the split contains a single class; report NaN rather than crash.
    val_auc = np.nan

print("epoch roc_auc:", val_auc)

  0%|          | 0/13 [00:00<?, ?it/s]

epoch roc_auc: 0.821451880361284


In [None]:
# Write a checkpoint to Drive holding the configured epoch count, the model and optimizer
# state dicts, and the most recent value bound to `loss`.
PATH = "/content/gdrive/MyDrive/badwords_koelectra_model.pth"

checkpoint = {
    'epoch': epochs,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
}
torch.save(checkpoint, PATH)