In [1]:
import pandas as pd
import numpy as np

import re
import emoji
from soynlp.normalizer import repeat_normalize

import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset, TensorDataset

from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline
from transformers import ElectraTokenizerFast, ElectraModel, TFElectraModel
from transformers import AdamW

from sklearn.model_selection import train_test_split




In [2]:
df = pd.read_csv('Dataset2.csv', nrows=1000)
X = df['문법교정']
y = df['malicious']

In [3]:
emojis = ''.join(emoji.EMOJI_DATA.keys())
pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-ㅣ가-힣{emojis}]+')
url_pattern = re.compile(
    r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')

def clean(x): 
    x = pattern.sub(' ', x)
    x = emoji.replace_emoji(x, replace='')
    x = url_pattern.sub('', x)
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)
    return x

X = X.apply(lambda x: clean(x))

In [4]:
# model = T5ForConditionalGeneration.from_pretrained('j5ng/et5-typos-corrector')
# tokenizer = T5Tokenizer.from_pretrained('j5ng/et5-typos-corrector')

# typos_corrector = pipeline(
#     "text2text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     device=0 if torch.cuda.is_available() else -1,
#     framework="pt",
# )

# X = X.apply(lambda x: typos_corrector(x,
#             max_length=128,
#             num_beams=5,
#             early_stopping=True)[0]['generated_text'])

In [5]:
# from transformers import ElectraTokenizerFast, AutoModelForSequenceClassification

# tokenizer = ElectraTokenizerFast.from_pretrained("kykim/electra-kor-base")
# model = AutoModelForSequenceClassification.from_pretrained("kykim/electra-kor-base", num_labels=2)

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")
model = AutoModelForSequenceClassification.from_pretrained("beomi/KcELECTRA-base", num_labels=2)

  return self.fget.__get__(instance, owner)()
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# from transformers import BertTokenizerFast, AutoModelForSequenceClassification

# tokenizer = BertTokenizerFast.from_pretrained("kykim/gpt3-kor-small_based_on_gpt2")
# model = AutoModelForSequenceClassification.from_pretrained("kykim/gpt3-kor-small_based_on_gpt2", num_labels=2)
# model.config.pad_token_id = model.config.eos_token_id

In [8]:
# from transformers import FunnelTokenizerFast, AutoModelForSequenceClassification

# tokenizer = FunnelTokenizerFast.from_pretrained("kykim/funnel-kor-base")
# model = AutoModelForSequenceClassification.from_pretrained("kykim/funnel-kor-base")

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

X_list = X.values.tolist()
y = y.values
sequences = tokenizer(X_list, padding=True, truncation=True, return_tensors="pt")

X_train, X_test, y_train, y_test = train_test_split(sequences['input_ids'], y, test_size=0.2, random_state=42)
X_train_mask, X_test_mask, _, _ = train_test_split(sequences['attention_mask'], y, test_size=0.2, random_state=42)

train_dataset = TensorDataset(X_train, X_train_mask, torch.tensor(y_train, dtype=torch.long))
test_dataset = TensorDataset(X_test, X_test_mask, torch.tensor(y_test, dtype=torch.long))

train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

EPOCHS = 10
for epoch in range(EPOCHS):
    model.train()
    for input_ids, attention_mask, labels in train_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

    model.eval()
    val_losses = []
    val_accs = []
    for input_ids, attention_mask, labels in test_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_losses.append(outputs.loss.item())
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            acc = (preds == labels).float().mean().item()
            val_accs.append(acc)

    val_loss = np.mean(val_losses)
    val_acc = np.mean(val_accs)
    print(f"Epoch {epoch + 1}/{EPOCHS}, Validation Loss: {val_loss}, Validation Accuracy: {val_acc}")



Epoch 1/10, Validation Loss: 0.6483003199100494, Validation Accuracy: 0.8125
Epoch 2/10, Validation Loss: 0.5770984292030334, Validation Accuracy: 0.875
Epoch 3/10, Validation Loss: 0.5615522563457489, Validation Accuracy: 0.78125
Epoch 4/10, Validation Loss: 0.43095387518405914, Validation Accuracy: 0.9375
Epoch 5/10, Validation Loss: 0.5007993280887604, Validation Accuracy: 0.78125
Epoch 6/10, Validation Loss: 0.28580276668071747, Validation Accuracy: 0.9375
Epoch 7/10, Validation Loss: 0.25589719414711, Validation Accuracy: 0.9375
Epoch 8/10, Validation Loss: 0.26561229303479195, Validation Accuracy: 0.9375
Epoch 9/10, Validation Loss: 0.2781986501067877, Validation Accuracy: 0.9375
Epoch 10/10, Validation Loss: 0.2925569759681821, Validation Accuracy: 0.9375


In [22]:
import pandas as pd
import torch
from transformers import AutoTokenizer
from torch.utils.data import TensorDataset, DataLoader

new_data = pd.read_csv('Dataset.csv', sep='\t')
new_data.dropna(inplace=True)
new_data['lable'] = new_data['lable'].astype(int)

In [23]:
X_new = new_data['content'].tolist()
sequences_new = tokenizer(X_new, padding=True, truncation=True, return_tensors="pt")

new_dataset = TensorDataset(sequences_new['input_ids'], sequences_new['attention_mask'])
new_loader = DataLoader(dataset=new_dataset, batch_size=16, shuffle=False)

model.eval()
predictions = []
with torch.no_grad():
    for input_ids, attention_mask in new_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.cpu().numpy())

print(predictions)

[0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 