In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.3 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 37.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 5.4 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 38.1 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 23.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: p

In [None]:
# Mount Google Drive at /content/gdrive; the model checkpoints and the CSV read
# in later cells are addressed through /content/gdrive/MyDrive/...
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
import pandas as pd
import numpy as np

from transformers import ElectraModel, ElectraTokenizer, AdamW, ElectraForSequenceClassification, get_cosine_schedule_with_warmup
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from tqdm.notebook import tqdm

In [None]:
# Run on the first CUDA GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [None]:
# Checkpoint file stems. The order matters: later cells index load_md_list by
# position and name the probability columns in this same order.
model_name_list = ['sexual_minority', 'race', 'age_model','local','religion_model','other','badwords_koelectra_model','clean_koelectra_model','personal_koelectra_model','gender_model']
load_md_list = []
PATH = '/content/gdrive/MyDrive/Colab Notebooks/project/models/'
for md in model_name_list:
    # Every category classifier shares the same KoELECTRA-small backbone.
    model = ElectraForSequenceClassification.from_pretrained('monologg/koelectra-small-v2-discriminator')
    # Swap the final projection for a single sigmoid unit so the model emits
    # one probability per sentence; the fine-tuned weights loaded below are
    # expected to match this head layout.
    model.classifier.out_proj = nn.Sequential(nn.Linear(256, 1), nn.Sigmoid())

    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # that come from a trusted source.
    checkpoint = torch.load(PATH + f'{md}.pth', map_location=device)

    # Two checkpoint layouts are supported: a wrapper dict holding the weights
    # under 'model_state_dict', or the bare state dict itself. Checking the
    # key explicitly (instead of a bare `except:`) reads the file only once and
    # lets genuine errors such as a missing file or mismatched weights surface.
    if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
        state_dict = checkpoint['model_state_dict']
    else:
        state_dict = checkpoint
    model.load_state_dict(state_dict)

    # Follow `device` rather than forcing CUDA, so the CPU fallback chosen
    # when no GPU is available actually works.
    model = model.to(device)
    # These models are only used for inference below; keep dropout disabled.
    model.eval()
    load_md_list.append(model)

Some weights of the model checkpoint at monologg/koelectra-small-v2-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v2-discriminator and are newly initialized

In [None]:
# Sanity check: the loading loop appends one model per entry of model_name_list.
len(load_md_list)

10

In [None]:
class LoadDataset(Dataset):
    """Dataset that tokenizes the first column of a DataFrame on access.

    Args:
        df: DataFrame whose first column holds the sentence text. When it has
            more than one column, the second column is returned as the target.
        tk: Callable tokenizer; it is invoked with the text plus the keyword
            arguments ``return_tensors``, ``truncation``, ``max_length``,
            ``padding`` and ``add_special_tokens`` and must return a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every sequence is truncated/padded to. Defaults to
            50, the value previously hard-coded here.

    Each item is ``(input_ids, attention_mask)`` for a text-only frame, or
    ``(input_ids, attention_mask, y)`` when a second column is present. Both
    tensors are moved to the module-level ``device`` before being returned.
    """

    def __init__(self, df, tk, max_length=50):
        self.df = df
        self.tokenizer = tk
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def _encode(self, text):
        """Tokenize one sentence and return (input_ids, attention_mask) on `device`."""
        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
            # `padding='max_length'` is the supported spelling of the
            # deprecated `pad_to_max_length=True`.
            padding='max_length',
            add_special_tokens=True,
        )
        # The tokenizer returns a batch of one; take that single row.
        input_ids = inputs['input_ids'][0].to(device)
        attention_mask = inputs['attention_mask'][0].to(device)
        return input_ids, attention_mask

    def __getitem__(self, idx):
        row = self.df.iloc[idx, :].values
        input_ids, attention_mask = self._encode(row[0])

        # No target column (only the sentence was supplied).
        if len(row) <= 1:
            return input_ids, attention_mask

        # Target present: it is read from the second column.
        return input_ids, attention_mask, row[1]

In [None]:
# Load the labelled sentences from Drive; the frame is reshaped and then fed
# to the classifiers in the cells below.
df = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/project/unsmile.csv')

In [None]:
# Collapse the two gender-related labels into a single '성별' column (set when
# either source label is set) and remove the originals.
# Guarded so that re-running this cell is a no-op instead of a KeyError on the
# columns that the first run already removed.
if '성별' not in df.columns:
    df['성별'] = df['여성/가족'] | df['남성']

df.drop(columns=['여성/가족', '남성'], errors='ignore', inplace=True)

In [None]:
# Quick check of the frame's (rows, columns) after the gender columns were merged.
df.shape

(18742, 11)

In [None]:
# Tokenizer from the same KoELECTRA checkpoint the classifiers were built on.
# df has more than one column, so every LoadDataset item is a 3-tuple whose
# last element is the value of the frame's second column.
tk = ElectraTokenizer.from_pretrained("monologg/koelectra-small-v2-discriminator")
localDataset = LoadDataset(df, tk)

In [None]:
# Show the architecture. When cells run in order, `model` is the object bound
# by the last iteration of the loading loop (the final entry of model_name_list).
model

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32200, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_

In [None]:
# Batches of 6 with no shuffle argument: the reporting loop at the end pairs
# predictions with df rows by index, so the dataset order must be preserved.
test_loader = DataLoader(localDataset, batch_size=6)

In [None]:
# Keep the individual handles for any cell that still refers to md1..md10;
# the unpacking also fails fast if the number of loaded models is not ten.
md1, md2, md3, md4, md5, md6, md7, md8, md9, md10 = load_md_list

# Inference only: make sure dropout is disabled in every model.
for md in load_md_list:
    md.eval()

# One tuple per sentence, holding one probability per model (in model order).
proba_rows = []

# no_grad: nothing is trained here, so skip building autograd graphs for the
# ten forward passes per batch.
with torch.no_grad():
    # y_batch (the dataset's target column) is not used for prediction.
    for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):

        # Feed the batch to every model and collect the probability of each class.
        # Each head ends in Linear(256, 1) + Sigmoid, so the output is
        # (batch, 1); squeeze the last axis so every stored cell is a plain
        # float rather than a one-element list.
        per_model = [
            clf(input_ids_batch, attention_mask=attention_masks_batch)[0].squeeze(-1).tolist()
            for clf in load_md_list
        ]

        # Transpose from model-major to sentence-major rows.
        proba_rows.extend(zip(*per_model))

# Build the probability table once instead of concatenating inside the loop.
predict_proba_df = pd.DataFrame(proba_rows, columns=range(len(load_md_list)))

  0%|          | 0/3124 [00:00<?, ?it/s]

In [None]:
# Name each probability column after the category its model detects; the order
# mirrors model_name_list, so position 7 ('clean') is the "not offensive" score.
predict_proba_df = predict_proba_df.reset_index(drop = True)
predict_proba_df.columns = ['성소수자','인종국적','연령','지역','종교','기타혐오','악플욕설','clean','개인지칭','성별']


In [None]:
def findLabel(data, threshold=0.5):
    """Turn one row of per-category probabilities into a Korean verdict string.

    Args:
        data: Ten probabilities in the column order of ``predict_proba_df``
            (position 7 is the "clean" score). May be a list, tuple, array or
            pandas Series; each entry may be a bare number or a one-element
            sequence such as ``[0.93]``.
        threshold: A category is reported when its probability is strictly
            greater than this value. Defaults to 0.5.

    Returns:
        The names of the flagged categories followed by "비하 표현이 있습니다.",
        or ' 이 문장은 깨끗합니다! ' when nothing is flagged or when the clean
        score itself exceeds the threshold.

    Side effects:
        Prints a line of asterisks on every call (used as a visual separator).
    """
    # Display name per position. Position 7 ("clean") has no name because it
    # cancels the report instead of contributing to it.
    names = ("성소수자 ", "인종/국적 ", "연령 ", "지역 ", "종교 ", "기타 ",
             "악플/욕설 ", None, "개인지칭 ", "성별 ")
    clean_position = 7

    # Always index by position. A Series row is labelled with column names, and
    # integer keys on such a Series only work through a deprecated fallback.
    cells = data.iloc if hasattr(data, 'iloc') else data

    flagged = []
    for col in range(len(names)):
        cell = cells[col]
        # Model outputs may arrive wrapped as one-element lists; unwrap them so
        # the comparison below sees a number.
        if isinstance(cell, (list, tuple)) and len(cell) == 1:
            cell = cell[0]
        if cell > threshold:  # alternative used during tuning: cell >= 0.9
            if col == clean_position:
                # A confident "clean" score overrides everything found so far
                # and the remaining positions are not inspected.
                flagged = []
                break
            flagged.append(names[col])

    if flagged:
        labels = ''.join(flagged) + "비하 표현이 있습니다."
    else:
        labels = ' 이 문장은 깨끗합니다! '
    print('*********************************************************************************************')
    return labels

In [None]:
# Inspect the per-category probability table (one row per sentence).
predict_proba_df

Unnamed: 0,성소수자,인종국적,연령,지역,종교,기타혐오,악플욕설,clean,개인지칭,성별
0,[0.00992653053253889],[0.00781305693089962],[0.06953998655080795],[0.014694083482027054],[0.03065081499516964],[0.17827288806438446],[0.32986027002334595],[0.8694496154785156],[0.06415566802024841],[0.002027707640081644]
1,[0.8967061638832092],[0.992347002029419],[0.49917250871658325],[0.4939764440059662],[0.039723869413137436],[0.9776880145072937],[0.0051170880906283855],[0.003343375399708748],[0.5229806303977966],[0.006062234751880169]
2,[0.008039574138820171],[0.021493906155228615],[0.12716476619243622],[0.014734745025634766],[0.03373967483639717],[0.025823090225458145],[0.9533167481422424],[0.9941976070404053],[0.8775532841682434],[0.033233173191547394]
3,[0.006496177054941654],[0.05761103332042694],[0.04216475784778595],[0.014871353283524513],[0.03575221821665764],[0.16200090944766998],[0.011653698049485683],[0.9853292107582092],[0.8005574941635132],[0.0014117816463112831]
4,[0.007641895208507776],[0.011496854946017265],[0.04805586487054825],[0.014691515825688839],[0.04082565754652023],[0.019077658653259277],[0.017953427508473396],[0.002397058065980673],[0.1078900694847107],[0.9974719882011414]
...,...,...,...,...,...,...,...,...,...,...
18737,[0.006415558513253927],[0.00664200633764267],[0.04958932846784592],[0.015173877589404583],[0.03205285966396332],[0.8750902414321899],[0.004321693442761898],[0.0026993295177817345],[0.10036743432283401],[0.997204601764679]
18738,[0.006705266423523426],[0.006627983413636684],[0.04708331823348999],[0.014740893617272377],[0.030421046540141106],[0.01746710017323494],[0.9567152857780457],[0.003506176173686981],[0.058287885040044785],[0.001556604285724461]
18739,[0.008524048142135143],[0.9898770451545715],[0.6333529353141785],[0.014562509022653103],[0.033756524324417114],[0.9795077443122864],[0.016368502750992775],[0.0027663048822432756],[0.06680171191692352],[0.0014255802379921079]
18740,[0.0084783174097538],[0.014781573787331581],[0.04747341573238373],[0.014547673985362053],[0.04011748731136322],[0.3910418450832367],[0.00997236929833889],[0.0025615894701331854],[0.922313392162323],[0.997342050075531]


In [None]:
# Inspect the labelled source frame for side-by-side comparison with the predictions.
df

Unnamed: 0,문장,성소수자,인종/국적,연령,지역,종교,기타 혐오,악플/욕설,clean,개인지칭,성별
0,일안하는 시간은 쉬고싶어서 그런게 아닐까,0,0,0,0,0,0,0,1,0,0
1,아동성범죄와 페도버는 기록바 끊어져 영원히 고통 받는다. 무슬림 50퍼 근친이다. ...,0,0,0,0,1,0,0,0,0,0
2,루나 솔로앨범 나왔을 때부터 머모 기운 있었음 ㅇㅇ Keep o doin 진짜 띵...,0,0,0,0,0,0,0,1,0,0
3,홍팍에도 어버이연합인가 보내요 뭐 이런뎃글 있는데 이거 어버이연합측에 신고하면 그쪽...,0,0,0,0,0,0,0,1,0,0
4,아놔 왜 여기 댓들은 다 여자들이 김치녀라고 먼저 불렸다! 여자들은 더 심하게 그런...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
18737,저게 시대적언어면 한남충도 시대적언어 아니노 ㅋㅋ,0,0,0,0,0,0,0,0,0,1
18738,다른것보다 눈이 어떻게 저렇게 생기노.탄식.,0,0,0,0,0,0,1,0,0,0
18739,막노동을 해도 한국에살고말지 미쳤다고 남미를가냐?차라리 자살을하겠다.,0,1,0,0,0,0,0,0,0,0
18740,‘사형을 구형하였으나 여성인 점을 감안해 25년 선고’ ???내가 뭐 잘못본건가?개...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Walk the probability table row by row, derive the verdict for each sentence
# and print the sentence (looked up in df by the same index) with its verdict.
for row_label, proba_row in predict_proba_df.iterrows():
    verdict = findLabel(proba_row)
    sentence = df.loc[row_label, '문장']
    print(sentence)
    print('-- ' + verdict + '--')

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
잘했다 이기야. 불법촬영은 흉자들도 예민한 사안이잖노. 이참에 모든 여초 베이스를 페미로 깔고 들어갈 수 있을 것 같노. 응원한다.
-- 기타 성별 비하 표현이 있습니다.--
*********************************************************************************************
ㅋㅋㅋ꿀팁ㄱㅅ
--  이 문장은 깨끗합니다! --
*********************************************************************************************
똥양남 혼자나 똥양남들끼리는 돈 아무리 내도 클럽 입장도 안 됨. 보지 데려가야 입장 됨 ㅋㅋ
-- 성소수자 인종/국적 기타 성별 비하 표현이 있습니다.--
*********************************************************************************************
여자=장애인=일베충
-- 연령 기타 성별 비하 표현이 있습니다.--
*********************************************************************************************
예의라도 있으면 신경 안쓰이는데틀딱 특징이 예의가 없다는 것다른 사람들은 전철 줄서서 기다리고 있으면 가운데 서서 사람들 내리기도 전에 사람들 밀치면서 전철타고 있음
-- 연령 기타 비하 표현이 있습니다.--
*********************************************************************************************
그냥 졸려서 자고 혼자 착각한거 같은데
--  이 문장은 깨끗합니다! --
*********************************************************************