In [None]:
# Install the Hugging Face transformers package into the notebook runtime
# (it provides the Electra model and tokenizer classes imported below).
!pip install transformers



In [None]:
# Mount Google Drive so that the checkpoint and CSV paths under
# /content/gdrive used later in this notebook are readable.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pandas as pd
import numpy as np

from transformers import ElectraModel, ElectraTokenizer
from transformers import ElectraForSequenceClassification, AdamW

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

from tqdm.notebook import tqdm

In [None]:
# Run on the first GPU when CUDA is present, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [None]:
# Checkpoint file stems, one binary classifier per label. The resulting
# load_md_list keeps this order, and the prediction columns are later built
# and named in the same order.
model_name_list = ['sexual_minority', 'race', 'age_model','local','religion_model','other','badwords_koelectra_model','clean_koelectra_model','personal_koelectra_model','gender_model']
load_md_list = []
PATH = '/content/gdrive/MyDrive/A3_datasets/KoElectra/koelectra_datasets/koelectra_model1/model/'
for md in model_name_list:
    # Build the architecture from the pretrained KoELECTRA discriminator; the
    # fine-tuned weights are then loaded on top of it.
    model = ElectraForSequenceClassification.from_pretrained('monologg/koelectra-small-v2-discriminator')

    # Read the file once. map_location lets a checkpoint that was saved from a
    # GPU be loaded when `device` is the CPU.
    checkpoint = torch.load(PATH + f'{md}.pth', map_location=device)

    # Two checkpoint layouts are supported: a dict that wraps the weights under
    # 'model_state_dict', or the bare state_dict itself. Checking the key
    # explicitly (instead of a bare `except:`) keeps genuine errors such as a
    # missing file or mismatched weights visible.
    if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
        state_dict = checkpoint['model_state_dict']
    else:
        state_dict = checkpoint
    model.load_state_dict(state_dict)

    # Follow the notebook-wide `device` so a CPU-only runtime does not crash,
    # and make inference mode explicit (dropout disabled).
    model = model.to(device)
    model.eval()
    load_md_list.append(model)

Some weights of the model checkpoint at monologg/koelectra-small-v2-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v2-discriminator and are newly initialized

In [None]:
# Sanity check: number of models loaded (one per entry in model_name_list).
len(load_md_list)

10

1. testloader 만들기
2. 각 모델에 입력해보기
3. 앙상블하기

In [None]:
class LoadDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    The first column of ``df`` is read as the text and the second column is
    returned unchanged as ``y``.

    Args:
        df: DataFrame whose column 0 holds the text and column 1 holds ``y``.
        tk: Tokenizer callable; it is invoked with the Hugging Face tokenizer
            keyword arguments shown in ``__getitem__``.
        max_length: Fixed token length every item is truncated/padded to.
            Defaults to 200, the value this notebook originally hard-coded.
    """

    def __init__(self, df, tk, max_length=200):
        self.df = df
        self.tokenizer = tk
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, y)`` for the row at position ``idx``."""
        row = self.df.iloc[idx, :].values
        text, y = row[0], row[1]

        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
            # `padding='max_length'` is the supported spelling of the
            # deprecated `pad_to_max_length=True`.
            padding='max_length',
            add_special_tokens=True,
        )

        # [0] selects the single encoded sequence out of the returned batch.
        # NOTE(review): tensors are moved to the notebook-level `device` here,
        # so consumers receive device tensors directly; presumably this is
        # only used with the DataLoader default of no worker processes.
        input_ids = inputs['input_ids'][0].to(device)
        attention_mask = inputs['attention_mask'][0].to(device)

        return input_ids, attention_mask, y

In [None]:
# Load the evaluation sentences together with their per-label annotations.
df = pd.read_csv('/content/gdrive/MyDrive/A3_datasets/KoElectra/koelectra_datasets/unsmile.csv')

In [None]:
# Fold the two gender-related label columns into a single '성별' column: a row
# is flagged when either source column is set (element-wise OR). The merged
# column is appended last, then both source columns are removed in place.
female_family_flags = df['여성/가족']
male_flags = df['남성']
df['성별'] = female_family_flags | male_flags

df.drop(columns=['여성/가족', '남성'], inplace=True)

In [None]:
# (rows, columns) after the two gender columns were merged into one.
df.shape

(18742, 11)

In [None]:
# Tokenizer from the same pretrained checkpoint name the models are built from.
tk = ElectraTokenizer.from_pretrained("monologg/koelectra-small-v2-discriminator")
# Wrap the whole frame; each item is (input_ids, attention_mask, y).
localDataset = LoadDataset(df, tk)

In [None]:
# Shuffling is left at the DataLoader default (off), so batches come out in
# the same row order as df; the later scoring relies on that alignment.
test_loader = DataLoader(localDataset, batch_size=6)

In [None]:
# Aliases kept so that any cell still referring to md1..md10 keeps working.
md1, md2, md3, md4, md5, md6, md7, md8, md9, md10 = load_md_list

# Inference only: make sure dropout is disabled in every ensemble member.
for ensemble_member in load_md_list:
    ensemble_member.eval()

# One row per sentence, one column per model (in load_md_list order).
proba_rows = []

# no_grad avoids building an autograd graph for each of the forward passes,
# which would otherwise waste memory for no benefit.
with torch.no_grad():
    for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
        # Feed the batch to each model and keep the probability of class 1.
        # dim=1 normalizes over the class axis of the (batch, classes) logits;
        # relying on softmax's implicit dim is deprecated.
        positive_probas = [
            F.softmax(
                ensemble_member(input_ids_batch, attention_mask=attention_masks_batch)[0],
                dim=1,
            )[:, 1]
            for ensemble_member in load_md_list
        ]
        # Stack to (batch, n_models) and collect plain Python rows; building
        # the DataFrame once at the end avoids a quadratic concat-in-a-loop.
        proba_rows.extend(torch.stack(positive_probas, dim=1).tolist())

# Collect the predicted probabilities into a DataFrame (columns 0..n_models-1,
# rows indexed 0..N-1 in dataset order).
predict_proba_df = pd.DataFrame(proba_rows)

  0%|          | 0/3124 [00:00<?, ?it/s]



In [None]:
# Guarantee a contiguous 0..N-1 row index so rows line up with df by position.
predict_proba_df = predict_proba_df.reset_index(drop = True)

In [None]:
# Label name for each probability column, listed in the same order as the
# models in model_name_list that produced the columns.
label_names = ['성소수자','인종국적','연령','지역','종교','기타혐오','악플욕설','clean','개인지칭','성별']
predict_proba_df.columns = label_names

In [None]:
# Ground truth: the sentence text followed by the ten binary label columns.
df

Unnamed: 0,문장,성소수자,인종/국적,연령,지역,종교,기타 혐오,악플/욕설,clean,개인지칭,성별
0,일안하는 시간은 쉬고싶어서 그런게 아닐까,0,0,0,0,0,0,0,1,0,0
1,아동성범죄와 페도버는 기록바 끊어져 영원히 고통 받는다. 무슬림 50퍼 근친이다. ...,0,0,0,0,1,0,0,0,0,0
2,루나 솔로앨범 나왔을 때부터 머모 기운 있었음 ㅇㅇ Keep o doin 진짜 띵...,0,0,0,0,0,0,0,1,0,0
3,홍팍에도 어버이연합인가 보내요 뭐 이런뎃글 있는데 이거 어버이연합측에 신고하면 그쪽...,0,0,0,0,0,0,0,1,0,0
4,아놔 왜 여기 댓들은 다 여자들이 김치녀라고 먼저 불렸다! 여자들은 더 심하게 그런...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
18737,저게 시대적언어면 한남충도 시대적언어 아니노 ㅋㅋ,0,0,0,0,0,0,0,0,0,1
18738,다른것보다 눈이 어떻게 저렇게 생기노.탄식.,0,0,0,0,0,0,1,0,0,0
18739,막노동을 해도 한국에살고말지 미쳤다고 남미를가냐?차라리 자살을하겠다.,0,1,0,0,0,0,0,0,0,0
18740,‘사형을 구형하였으나 여성인 점을 감안해 25년 선고’ ???내가 뭐 잘못본건가?개...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Predicted positive-class probability per sentence (rows) and label model (columns).
predict_proba_df

Unnamed: 0,성소수자,인종국적,연령,지역,종교,기타혐오,악플욕설,clean,개인지칭,성별
0,0.181536,0.112672,0.018423,0.214790,0.003840,0.062305,0.764327,0.898057,0.461468,0.055920
1,0.771558,0.902718,0.207716,0.263284,0.994117,0.941308,0.137447,0.065520,0.512671,0.264386
2,0.190488,0.110344,0.019065,0.220058,0.003899,0.040951,0.828609,0.873279,0.505239,0.192731
3,0.182510,0.112954,0.018708,0.324422,0.003868,0.557211,0.451703,0.777508,0.493452,0.059298
4,0.204828,0.115329,0.018271,0.236906,0.003788,0.129014,0.267495,0.055939,0.484903,0.959244
...,...,...,...,...,...,...,...,...,...,...
18737,0.180230,0.116666,0.023956,0.263740,0.003646,0.043022,0.142627,0.054604,0.472886,0.957197
18738,0.180755,0.111782,0.018472,0.214900,0.004296,0.035931,0.865235,0.158980,0.454613,0.066922
18739,0.186581,0.906836,0.018780,0.368074,0.003759,0.939723,0.618448,0.100738,0.502810,0.056616
18740,0.244519,0.131564,0.018413,0.251937,0.004173,0.685772,0.161031,0.052726,0.511081,0.957046


In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Score every label separately: average=None makes roc_auc_score return one
# AUC per column instead of a single aggregate. Column 0 of df is skipped, so
# only the label columns are compared against the predicted probabilities.
label_frame = df.iloc[:,1:]
per_label_auc = roc_auc_score(label_frame, predict_proba_df, average = None)

for label_name, auc_value in zip(label_frame.columns, per_label_auc):
    print(f'{label_name} roc_auc score : {auc_value}')

성소수자 roc_auc score : 0.9450306037738133
인종/국적 roc_auc score : 0.9382700552737051
연령 roc_auc score : 0.9867499651436915
지역 roc_auc score : 0.961685089661783
종교 roc_auc score : 0.9903234482452146
기타 혐오 roc_auc score : 0.9235054331862126
악플/욕설 roc_auc score : 0.8372949238712386
clean roc_auc score : 0.9071511832192533
개인지칭 roc_auc score : 0.8518665721104692
성별 roc_auc score : 0.9551985455972152
