In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from pprint import pprint
import os
from glob import glob
import json
import shutil
import wandb
import gluonnlp as nlp
from datetime import datetime, timezone, timedelta
import random
from tqdm import tqdm
from attrdict import AttrDict
import re
import emoji
from soynlp.normalizer import repeat_normalize
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import *
from torch.optim import Adam, AdamW
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.optim.lr_scheduler import ExponentialLR
import math
from torch.optim.lr_scheduler import _LRScheduler

from transformers import logging
from transformers import TrainingArguments, Trainer
from transformers import (
   get_linear_schedule_with_warmup, 
   get_cosine_schedule_with_warmup
)

from transformers import ( 
    AutoConfig,
    BertConfig,
    ElectraConfig
)

from transformers import (
    BertTokenizer,  
    AutoTokenizer,
    ElectraTokenizer,
    AlbertTokenizer

)

from transformers import (
    BertModel,
    AutoModel, 
    ElectraForSequenceClassification,
    BertForSequenceClassification,
    AlbertForSequenceClassification,
    AutoModelForSequenceClassification
)

# from kobert import get_tokenizer,  get_pytorch_kobert_model
from kobert_tokenizer import KoBERTTokenizer

import warnings
# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Printed only after every import above has completed without raising.
print('all libraries imported succesfully')

In [None]:
# Specify which GPU to use.
# CUDA_VISIBLE_DEVICES has to be set before the first CUDA query made by this
# process; once torch.cuda.device_count() has run, changing it may be ignored.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print("number of GPUs: ", torch.cuda.device_count())
use_cuda = torch.cuda.is_available()
print("Does GPU exist? : ", use_cuda)
# Fall back to the CPU when no CUDA device is available.
DEVICE = torch.device("cuda" if use_cuda else "cpu")

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Sets ``PYTHONHASHSEED``, seeds Python's ``random`` module, NumPy and PyTorch
    (CPU, the current CUDA device and all CUDA devices), and switches cuDNN to
    deterministic mode with benchmarking disabled.

    Args:
        seed: Seed value handed to each generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # CPU generator, current CUDA device, then every CUDA device (multi-GPU).
    for seeder in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# `args` is only created in a later cell, as a plain dict without a "seed" entry,
# so `args.seed` cannot be resolved here on a fresh kernel. Seed from an explicit
# constant so this cell runs top to bottom.
SEED = 42
seed_everything(SEED)
print('seed setting')

In [None]:
# The bias tokenizer and classifier are loaded from the same checkpoint name.
BIAS_CHECKPOINT = "beomi/beep-KcELECTRA-base-bias"
bias_tokenizer = AutoTokenizer.from_pretrained(BIAS_CHECKPOINT)
bias_model = AutoModelForSequenceClassification.from_pretrained(BIAS_CHECKPOINT)

In [None]:
# Run configuration. In the visible cells only data_dir (input CSV), result_dir
# (output CSV) and max_seq_len (tokenizer max_length) are read.
args = dict(
    run="onlypredict",
    data_dir="/USER/comp4/data",
    result_dir="/USER/comp4/result/",
    config_dir="/USER/comp4/exp_config/",
    pretrained_model="beomi/kcbert-large",
    architecture="AutoModelForSequenceClassification",
    tokenizer_class="AutoTokenizer",
    num_classes=6,
    max_seq_len=128,
)

In [None]:
# Read the test split from the configured data directory.
test_csv_path = os.path.join(args['data_dir'], 'test.csv')
test_df = pd.read_csv(test_csv_path, encoding='UTF-8-SIG')

In [None]:
# Character length of every comment, longest first.
test_df["comment"].str.len().sort_values(ascending=False)

In [None]:
# (character filter, URL filter) regex pair; built on first use by _clean_patterns().
_CLEAN_PATTERNS = None


def _emoji_characters():
    """Return every emoji known to the installed ``emoji`` package as one string."""
    table = getattr(emoji, "UNICODE_EMOJI", None)
    if table is None:
        # emoji >= 2.0 removed UNICODE_EMOJI; EMOJI_DATA is keyed by the emoji itself.
        table = getattr(emoji, "EMOJI_DATA", {})
    elif isinstance(table.get("en"), dict):
        # emoji 1.x nests the table per language; calling .keys() on the outer
        # dict would yield language codes instead of emoji.
        table = table["en"]
    return ''.join(table.keys())


def _clean_patterns():
    """Compile the two regexes used by ``clean`` once and reuse them afterwards."""
    global _CLEAN_PATTERNS
    if _CLEAN_PATTERNS is None:
        # Escape so that no emoji code point can act as regex syntax inside the class.
        emojis = re.escape(_emoji_characters())
        pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-힣{emojis}]+')
        url_pattern = re.compile(
            r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
        _CLEAN_PATTERNS = (pattern, url_pattern)
    return _CLEAN_PATTERNS


def clean(x):
    """Normalize one comment string before tokenization.

    Steps, in order: replace every run of characters outside the allowed set
    (ASCII, Hangul ``ㄱ-힣``, the listed punctuation and emoji) with a single space,
    delete http/https URLs, strip surrounding whitespace, and pass the result
    through ``repeat_normalize`` with ``num_repeats=2``.

    Args:
        x: The raw comment text (``str``).

    Returns:
        The cleaned string.
    """
    pattern, url_pattern = _clean_patterns()
    x = pattern.sub(' ', x)
    x = url_pattern.sub('', x)
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)
    return x

In [None]:
# Clean every comment and write the column back in a single assignment.
# A chained write such as `test_df.comment[i] = ...` can land on a temporary copy
# (SettingWithCopyWarning; no effect under pandas copy-on-write). It also requires
# the frame to have exactly three columns and a 0..n-1 index.
test_df['comment'] = [clean(comment) for comment in tqdm(test_df['comment'])]

In [None]:
# Spot-check one cleaned comment at a randomly drawn row label between 1 and 500.
a = random.randint(1, 500)
test_df.loc[a, "comment"]

In [None]:
def label_to_num(label):
    """Convert bias label strings into integer class ids.

    Mapping: ``"none" -> 0``, ``"others" -> 1``, ``"gender" -> 2``.

    Args:
        label: Iterable of label strings.

    Returns:
        A list of class ids in the same order as ``label``.

    Raises:
        KeyError: If a label is not one of the three known strings.
    """
    label_dict = {"none": 0, "others": 1, "gender": 2}
    return [label_dict[name] for name in label]

In [None]:
# Placeholder bias label for every test row. It is later converted by label_to_num
# and handed to biasDataset; the inference loop itself never reads it.
test_df = test_df.assign(bias="none")
test_df

In [None]:
# Integer ids of the placeholder bias labels, one per test row.
test_bias = label_to_num(test_df["bias"].values)

In [None]:
# Encode the cleaned comments with the bias tokenizer as PyTorch tensors.
test_comments = list(test_df['comment'])
tokenized_test = bias_tokenizer(
    test_comments,
    add_special_tokens=True,
    truncation=True,
    padding=True,
    max_length=args['max_seq_len'],  # limit comes from the run configuration
    return_tensors="pt",
)

In [None]:
class biasDataset(Dataset):
    """Dataset pairing tokenizer output with one bias label per example.

    Args:
        pair_dataset: Mapping from field name to an indexable tensor whose first
            axis runs over examples (the tokenizer output in this notebook).
        bias: Sequence of integer bias labels, one per example.
    """

    def __init__(self, pair_dataset, bias):
        self.pair_dataset, self.bias = pair_dataset, bias

    def __len__(self):
        # The label list defines how many examples there are.
        return len(self.bias)

    def __getitem__(self, idx):
        """Return a dict with a detached copy of every encoded field plus 'bias'."""
        item = {}
        for key, val in self.pair_dataset.items():
            item[key] = val[idx].clone().detach()
        item['bias'] = torch.tensor(self.bias[idx])
        return item

In [None]:
# Wrap the bias encoding and placeholder labels for use with a DataLoader.
test_dataset = biasDataset(pair_dataset=tokenized_test, bias=test_bias)

In [None]:

# Run the bias classifier over the test set in its original order and collect,
# per comment, the arg-max class id and the softmax probabilities.
bias_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=16)

bias_model.to(DEVICE)
bias_model.eval()
output_pred, output_prob = [], []

for data in tqdm(bias_dataloader):
    model_inputs = {
        name: data[name].to(DEVICE)
        for name in ('input_ids', 'attention_mask', 'token_type_ids')
    }
    with torch.no_grad():
        outputs = bias_model(**model_inputs)
    logits = outputs[0]
    prob = F.softmax(logits, dim=-1).detach().cpu().numpy()
    result = np.argmax(logits.detach().cpu().numpy(), axis=-1)

    output_pred.append(result)
    output_prob.append(prob)

# Flatten the per-batch arrays into plain Python lists.
pred_answer = np.concatenate(output_pred).tolist()
output_prob = np.concatenate(output_prob, axis=0).tolist()
# print(pred_answer)

In [None]:
def num_to_label(label):
    """Convert bias class ids into ``[position, label_string]`` pairs.

    Mapping: ``0 -> "none"``, ``1 -> "others"``, ``2 -> "gender"``.

    Args:
        label: Iterable of integer class ids.

    Returns:
        A list of two-element lists ``[i, name]`` where ``i`` is the position of
        the id in ``label`` and ``name`` is its label string.

    Raises:
        KeyError: If an id is not 0, 1 or 2.
    """
    label_dict = {0: "none", 1: "others", 2: "gender"}
    return [[position, label_dict[class_id]] for position, class_id in enumerate(label)]

# [position, label] pairs for the bias predictions, in test-set order.
bias = num_to_label(pred_answer)

In [None]:
# The hate tokenizer and classifier are loaded from the same checkpoint name.
HATE_CHECKPOINT = "beomi/beep-KcELECTRA-base-hate"
hate_tokenizer = AutoTokenizer.from_pretrained(HATE_CHECKPOINT)
hate_model = AutoModelForSequenceClassification.from_pretrained(HATE_CHECKPOINT)

In [None]:
def label_to_num(label):
    """Convert hate label strings into integer class ids.

    Mapping: ``"none" -> 0``, ``"offensive" -> 1``, ``"hate" -> 2``.
    Note: this redefines the bias version of ``label_to_num`` from an earlier cell.

    Args:
        label: Iterable of label strings.

    Returns:
        A list of class ids in the same order as ``label``.

    Raises:
        KeyError: If a label is not one of the three known strings.
    """
    label_dict = {"none": 0, "offensive": 1, "hate": 2}
    return [label_dict[name] for name in label]

In [None]:
# Placeholder hate label for every test row (used only to build the dataset).
test_df = test_df.assign(hate="none")

In [None]:
# Integer ids of the placeholder hate labels, one per test row.
test_hate = label_to_num(test_df["hate"].values)

In [None]:
# Encode the cleaned comments with the hate tokenizer as PyTorch tensors.
hate_comments = list(test_df['comment'])
tokenized_test_hate = hate_tokenizer(
    hate_comments,
    add_special_tokens=True,
    truncation=True,
    padding=True,
    max_length=args['max_seq_len'],  # limit comes from the run configuration
    return_tensors="pt",
)

In [None]:
class hateDataset(Dataset):
    """Dataset pairing tokenizer output with one hate label per example.

    Args:
        pair_dataset: Mapping from field name to an indexable tensor whose first
            axis runs over examples (the tokenizer output in this notebook).
        hate: Sequence of integer hate labels, one per example.
    """

    def __init__(self, pair_dataset, hate):
        self.hate = hate
        self.pair_dataset = pair_dataset

    def __len__(self):
        # One example per label.
        return len(self.hate)

    def __getitem__(self, idx):
        """Return a dict with a detached copy of every encoded field plus 'hate'."""
        item = dict(
            (key, val[idx].clone().detach())
            for key, val in self.pair_dataset.items()
        )
        item['hate'] = torch.tensor(self.hate[idx])
        return item

In [None]:
# Build the hate dataset from the encoding produced by hate_tokenizer
# (tokenized_test_hate), not from the bias tokenizer's tokenized_test, so the
# inputs fed to hate_model come from its own tokenizer.
hate_test_dataset = hateDataset(tokenized_test_hate, test_hate)

In [None]:

# Run the hate classifier over the test set in its original order and collect,
# per comment, the arg-max class id and the softmax probabilities.
hate_dataloader = DataLoader(hate_test_dataset, shuffle=False, batch_size=16)

hate_model.to(DEVICE)
hate_model.eval()
output_pred, output_prob = [], []

for data in tqdm(hate_dataloader):
    model_inputs = {
        name: data[name].to(DEVICE)
        for name in ('input_ids', 'attention_mask', 'token_type_ids')
    }
    with torch.no_grad():
        outputs = hate_model(**model_inputs)
    logits = outputs[0]
    prob = F.softmax(logits, dim=-1).detach().cpu().numpy()
    result = np.argmax(logits.detach().cpu().numpy(), axis=-1)

    output_pred.append(result)
    output_prob.append(prob)

# Flatten the per-batch arrays into plain Python lists.
pred_answer = np.concatenate(output_pred).tolist()
output_prob = np.concatenate(output_prob, axis=0).tolist()
# print(pred_answer)

In [None]:
def num_to_label(label):
    """Convert hate class ids into ``[position, label_string]`` pairs.

    Mapping: ``0 -> "none"``, ``1 -> "offensive"``, ``2 -> "hate"``.
    Note: this redefines the bias version of ``num_to_label`` from an earlier cell.

    Args:
        label: Iterable of integer class ids.

    Returns:
        A list of two-element lists ``[i, name]`` where ``i`` is the position of
        the id in ``label`` and ``name`` is its label string.

    Raises:
        KeyError: If an id is not 0, 1 or 2.
    """
    label_dict = {0: "none", 1: "offensive", 2: "hate"}
    return [[position, label_dict[class_id]] for position, class_id in enumerate(label)]

# [position, label] pairs for the hate predictions, in test-set order.
hate = num_to_label(pred_answer)

In [None]:
# Collapse the three hate classes into two: every 'offensive' prediction is
# relabelled 'hate' in place.
for entry in hate:
    if entry[1] == 'offensive':
        entry[1] = 'hate'
    

In [None]:
# Bias predictions as a table. 'ID' here is the row position produced by
# num_to_label, not a column taken from test_df.
df = pd.DataFrame(data=bias, columns=["ID", "bias"])
df

In [None]:
# Hate predictions as a table, with the same positional 'ID' as the bias table.
df_2 = pd.DataFrame(data=hate, columns=["ID", "hate"])
df_2

In [None]:
# Attach the hate predictions to the bias table (aligned on the row index).
df = df.assign(hate=df_2["hate"])
df

In [None]:
# Make sure the output folder exists; to_csv does not create missing directories.
os.makedirs(args['result_dir'], exist_ok=True)
# NOTE(review): the DataFrame index is written as an extra unnamed first column
# next to 'ID' — confirm whether the consumer of this file expects that.
df.to_csv(os.path.join(args['result_dir'], 'onlypredict.csv'))