## KDT_AI-classifying prejudice and discrimination texts
https://www.kaggle.com/competitions/kdtai-2/overview

In [29]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

# from torchinfo import summary

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm_notebook
import matplotlib.pyplot as plt

import re
from konlpy.tag import Mecab

# from gensim.models import word2vec

import sys
sys.path.append('..')

from python_scripts import data_setup, engine
# 네이버 맞춤법 검사기
from hanspell import spell_checker 



In [30]:
# Human-readable description for each integer class label
# (English name followed by the Korean name / explanation).
idx_to_class = {
  0: 'Origin(출신차별)',
  1: 'Physical(외모차별) 외모(신체, 얼굴) 및 장애인 차별 발언을 포함합니다.',
  2: 'Politics(정치성향차별)',
  3: 'Profanity(혐오욕설) 욕설,저주,혐오 단어, 비속어 및 기타 혐오 발언을 포함합니다.',
  4: 'Age(연령차별)',
  5: 'Gender(성차별) 성별 또는 성적 취향에 대한 차별 발언을 포함합니다.',
  6: 'Not Hate Speech(해당사항없음)',
}

# Fixed number of tokens every sample is truncated or padded to.
max_length = 200

In [31]:
# Prefer CUDA, then Apple's MPS backend, and fall back to the CPU.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

In [32]:
# Load the labelled training set; the 'text' and 'label' columns are used below.
# NOTE(review): this is an absolute, machine-specific path, whereas the test set is
# read from the relative '../data/Discrimination/test.csv' — consider aligning them.
train_data = pd.read_csv('/Users/inho/KDT_AI/study/project/2nd_textClassification/dataset/train.csv')
train_data

Unnamed: 0,ID,text,label
0,0,유소영비호감 성형아줌마,1
1,1,나오지마라 썅,3
2,2,식상하고 지긋지긋했는데 잘 끝나네 오예 소리벗고 빤스질러~~!!!,6
3,3,성희롱 당할 얼굴이 아닌데?ㅋㅋㅋ,5
4,4,"""끝까지 해보자~쪽파리 원숭이 자한 쓰레기당""",0
...,...,...,...
65858,65858,ㅋ ㅋ 쇼~~~ 도 적당히,6
65859,65859,"""이젠 전라도 종것들 음식 불매다. 태양광 중금속 환경오염물로 만든 식품 사절이다""",0
65860,65860,조센징들은 참 피곤하게 산다,0
65861,65861,"""문빠 다모아서 빨갱이한테보내고 행복하게살라고""",2


In [33]:
# Length, in characters, of the longest raw training text.
max(map(len, train_data['text']))

306

In [34]:
# Spell checking
def correct_text(text):
    """Return `text` with spelling and spacing corrected by the hanspell checker.

    Spell checking is best-effort: whenever no usable correction is available
    the input is handed back unchanged so preprocessing can continue.

    Args:
        text: The string to correct.

    Returns:
        The corrected string, or `text` itself when it is not a non-blank
        string, when the checker raises, or when the checker flags its result
        as unsuccessful.
    """
    # Nothing to correct: skip the (remote) checker call entirely.
    if not isinstance(text, str) or not text.strip():
        return text

    try:
        result = spell_checker.check(text)
    except Exception as e:
        print(f"Error occurred during spell checking: {e}")
        return text

    # NOTE(review): py-hanspell is understood to return result=False with an
    # empty `checked` field for input it rejects (e.g. over-long text) —
    # confirm against the installed version. Keep the original text then.
    if not getattr(result, "result", True):
        return text
    return result.checked


In [49]:
# Example text (written without any spaces)
text = "아버지가방에들어가신다"

# Correct spelling and spacing
corrected_text = correct_text(text)
print("Original text:", text)
print("Corrected text:", corrected_text)


Original text: 아버지가방에들어가신다
Corrected text: 아버지가 방에 들어가신다


In [36]:
# Shared Mecab tagger, created on first use so it is not rebuilt for every text.
_MECAB_TAGGER = None

# Particles dropped after tokenization (optional; adjust as needed).
_STOP_WORDS = frozenset(["은", "는", "이", "가", "을", "를", "에", "의", "로", "으로", "에서"])


def _get_mecab():
    """Return the shared Mecab tagger, constructing it on the first call."""
    global _MECAB_TAGGER
    if _MECAB_TAGGER is None:
        _MECAB_TAGGER = Mecab()
    return _MECAB_TAGGER


def preprocess_korean_text(text):
    """Clean a raw Korean string and split it into morpheme tokens.

    Steps: strip 'ㅋ'/'ㅠ' runs, drop every character that is not Hangul,
    run the spell checker, tokenize with Mecab, and remove stop words.

    Args:
        text: Raw input string.

    Returns:
        A new list of token strings (possibly empty).
    """
    # 'ㅋ' and 'ㅠ' fall inside the Hangul jamo range kept below, so they are
    # removed explicitly first.
    text = re.sub(r'[ㅋㅠ]+', '', text)
    # Keep Hangul only. This also removes whitespace, so word spacing is
    # whatever correct_text() restores afterwards.
    text = re.sub(r"[^\u3131-\u3163\uac00-\ud7a3]+", "", text)

    # Spelling / spacing correction (best-effort).
    text = correct_text(text)

    # Tokenize with Mecab (another tokenizer could be swapped in here).
    tokens = _get_mecab().morphs(text)

    return [token for token in tokens if token not in _STOP_WORDS]

In [38]:
# Tokenize whatever `text` currently holds.
# NOTE(review): `text` is also reassigned inside the vocabulary-building loop
# further down, so this result depends on cell execution order.
preprocess_korean_text(text)

['문재원', '나', '왜', '안', '돼']

In [54]:
# Quick check of the full pipeline on a literal sentence.
preprocess_korean_text('나는 지금 뭐하구 잇느냐?')

['나', '지금', '뭐', '하', '고', '있', '느냐']

In [40]:
# Vocabulary built from the training texts:
#   word_index_to_key: index -> token, word_key_to_index: token -> index.
word_index_to_key = []
word_key_to_index = {}

# Iterate over the column directly (no per-row DataFrame lookup) and use a
# dedicated loop variable so the global `text` from the demo cells is untouched.
for sentence in tqdm_notebook(train_data['text'], 'Making word maps'):
    for token in preprocess_korean_text(sentence):
        if token not in word_key_to_index:
            word_key_to_index[token] = len(word_index_to_key)
            word_index_to_key.append(token)

# The last index is reserved for out-of-vocabulary tokens.
word_key_to_index['<unk>'] = len(word_index_to_key)
word_index_to_key.append('<unk>')

Making word maps:   0%|          | 0/65863 [00:00<?, ?it/s]

Error occurred during spell checking: Expecting value: line 1 column 1 (char 0)
Error occurred during spell checking: Expecting value: line 1 column 1 (char 0)
Error occurred during spell checking: Expecting value: line 1 column 1 (char 0)


In [41]:
class KoreanTextDataset(Dataset):
    """Map-style dataset yielding `(token_indices, class_index)` tensor pairs.

    Args:
        data: DataFrame with a "text" column and a "label" column.
        preprocess_korean_text: Callable turning a raw string into a list of tokens.
        max_length: Every sample is truncated or padded to this many tokens.
        word_to_index: Optional token -> index mapping containing an '<unk>'
            entry. When omitted, the module-level `word_key_to_index` is looked
            up each time an item is accessed.
        cache: When true (default), the encoded token ids of a row are kept
            after its first access, so preprocessing runs once per row rather
            than on every access. The cached ids reflect the preprocessing
            result and vocabulary at the time of that first access.

    Attributes:
        idx_to_class / class_names: Sorted unique label values.
        class_to_idx: Label value -> position in `idx_to_class`.
    """

    def __init__(self, data, preprocess_korean_text, max_length=100,
                 word_to_index=None, cache=True):
        self.data = data
        self.max_length = max_length
        self.preprocess_korean_text = preprocess_korean_text
        self.word_to_index = word_to_index
        self.idx_to_class = sorted(data['label'].unique())
        self.class_to_idx = {label: i for i, label in enumerate(self.idx_to_class)}
        self.class_names = self.idx_to_class
        # Row position -> list of token ids (truncated, not yet padded).
        self._encoded_cache = {} if cache else None

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        index = int(index)
        vocab = self.word_to_index if self.word_to_index is not None else word_key_to_index
        unk_index = vocab['<unk>']  # index used for out-of-vocabulary tokens

        token_ids = None if self._encoded_cache is None else self._encoded_cache.get(index)
        if token_ids is None:
            # Positional access, consistent with __len__, so frames whose index
            # is not 0..n-1 (e.g. after filtering) still work.
            text = self.data["text"].iloc[index]
            tokens = self.preprocess_korean_text(text)[:self.max_length]
            token_ids = [vocab.get(token, unk_index) for token in tokens]
            if self._encoded_cache is not None:
                self._encoded_cache[index] = token_ids

        # Padding uses the empty-string token, which resolves to '<unk>' unless
        # the vocabulary happens to contain "".
        pad_index = vocab.get("", unk_index)
        indices = token_ids + [pad_index] * (self.max_length - len(token_ids))

        # Map the raw label to its class index so targets always lie in
        # 0..num_classes-1 (identity when labels are already 0..n-1).
        label = self.class_to_idx[self.data["label"].iloc[index]]

        return torch.tensor(indices), torch.tensor(label)

In [42]:
# Wrap the full training frame; each sample is tokenized and fixed to `max_length` tokens.
train_dataset = KoreanTextDataset(
    data=train_data,
    preprocess_korean_text=preprocess_korean_text,
    max_length=max_length
)

# Split into training and validation subsets with a fixed seed.
# NOTE(review): split_size=0.9 presumably is the training fraction — confirm in data_setup.
train_dataset_sub, val_dataset_sub = data_setup.split_dataset(
    dataset=train_dataset,
    split_size=0.9,
    seed=42
)

[INFO] Splitting dataset of length 65863 into splits of size: 59276 and 6587


In [43]:
class Attention(nn.Module):
    """Scores every encoder step against a query vector and normalizes over steps.

    `forward` takes `hidden` as a 2-D tensor (batch, hidden_size) and
    `encoder_outputs` as a 3-D tensor (steps, batch, hidden_size); it returns
    weights of shape (steps, batch, 1) that sum to one along the step axis.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        # Projects the concatenated [query; encoder output] back to hidden_size.
        self.attn = nn.Linear(2 * hidden_size, hidden_size)
        # Collapses each projected vector to a single score.
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        num_steps = encoder_outputs.size(0)

        # Copy the query once per step so it can be paired with every output.
        query = hidden.unsqueeze(0).repeat(num_steps, 1, 1)
        paired = torch.cat((query, encoder_outputs), dim=2)

        scores = self.v(torch.tanh(self.attn(paired))).squeeze(2)
        weights = F.softmax(scores, dim=0)
        return weights.unsqueeze(2)

In [44]:
class RNN_LSTM_attention(nn.Module):
    """Text classifier: embedding -> BiLSTM -> attention -> BiLSTM -> linear.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of each LSTM direction.
        output_dim: Number of output classes (logits).
        pre_LSTM_layers: Layers in the first (sequence) BiLSTM.
        post_LSTM_layers: Layers in the second BiLSTM. Must equal
            `pre_LSTM_layers`, because the first LSTM's final state is used
            directly as the second LSTM's initial state.
        dropout: Dropout probability for the embedding, LSTMs and classifier input.
        freeze_embedding: When true (default, the original behavior) the
            embedding weights are excluded from training. Note that no
            pretrained vectors are loaded in this class, so frozen weights
            stay at their random initialization unless set externally.

    Raises:
        ValueError: If `pre_LSTM_layers` and `post_LSTM_layers` differ.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pre_LSTM_layers,
                 post_LSTM_layers, dropout, freeze_embedding=True):
        super().__init__()

        # Fail early with a clear message instead of a shape error inside forward().
        if pre_LSTM_layers != post_LSTM_layers:
            raise ValueError(
                "pre_LSTM_layers and post_LSTM_layers must match because the first "
                "LSTM's final state initializes the second "
                f"(got {pre_LSTM_layers} and {post_LSTM_layers})"
            )

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        if freeze_embedding:
            for param in self.embedding.parameters():
                param.requires_grad = False

        self.pre_lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=pre_LSTM_layers, bidirectional=True, dropout=dropout)
        self.post_lstm = nn.LSTM(hidden_dim * 2, hidden_dim, num_layers=post_LSTM_layers, bidirectional=True, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.attention = Attention(hidden_dim * 2)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, text):
        """Return class logits of shape [batch, output_dim] for `text` of shape [batch, seq_len]."""
        embedded = self.dropout(self.embedding(text))
        # embedded: [batch, seq_len, emb_dim]

        # The LSTMs are sequence-first (batch_first is not set), hence the permute.
        pre_lstm_outputs, (hidden, cell) = self.pre_lstm(embedded.permute(1, 0, 2))
        # pre_lstm_outputs: [seq_len, batch, 2 * hidden_dim]
        # hidden / cell:    [2 * pre_LSTM_layers, batch, hidden_dim]

        # Last two state slices = both directions of the top layer.
        h = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        # h: [batch, 2 * hidden_dim]

        attention_weights = self.attention(h, pre_lstm_outputs)
        # attention_weights: [seq_len, batch, 1]

        # Attention-weighted sum of the encoder outputs over the sequence axis.
        context_vector = torch.bmm(
            pre_lstm_outputs.permute(1, 2, 0),
            attention_weights.permute(1, 0, 2),
        ).squeeze(2)
        # context_vector: [batch, 2 * hidden_dim]

        # Feed the context as a length-1 sequence, starting from the first LSTM's state.
        _, (hidden, _) = self.post_lstm(context_vector.unsqueeze(0), (hidden, cell))
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        # hidden: [batch, 2 * hidden_dim]

        # No squeeze here: squeezing dim 0 would drop the batch axis for a batch of one.
        out = self.fc(self.dropout(hidden))
        # out: [batch, output_dim]

        return out


In [45]:
# Hyper-parameter candidates handed to engine.HP_tune_train below.
# NOTE(review): the original comment said each learning rate is trained for 10
# epochs in succession, but epochs_list is [5] — confirm which is intended.
learning_rate_list = [1e-3]
weight_decay_list = [1e-4]
epochs_list = [5]
batch_size_list = [64]

In [46]:
# Label values known to the dataset, and how many distinct ones there are.
class_names = train_dataset.class_names
num_classes = len(class_names)
class_names, num_classes

([0, 1, 2, 3, 4, 5, 6], 7)

In [47]:
# Build the classifier. The embedding table has one row per vocabulary entry
# (including '<unk>'), and there is one output logit per class.
model = RNN_LSTM_attention(
    vocab_size=len(word_index_to_key),
    embedding_dim=100,
    hidden_dim=512,
    output_dim=num_classes,
    pre_LSTM_layers=2,
    post_LSTM_layers=2,
    dropout=0.2
)

In [48]:
# Train and evaluate over the hyper-parameter lists defined above, using the
# train/validation subsets. The meaning of the individual flags is defined in
# python_scripts.engine, which is not visible from this notebook.
tuning_results = engine.HP_tune_train(
    model=model,
    model_generator=None,
    model_weights=None,
    model_name='Two_LSTM_attention_discrimination',
    train_dataset=train_dataset_sub,
    test_dataset=val_dataset_sub,
    class_names=class_names,
    learning_rate_list=learning_rate_list,
    weight_decay_list=weight_decay_list,
    epochs_list=epochs_list,
    batch_size_list=batch_size_list,
    is_tensorboard_writer=False,
    device=device,
    gradient_accumulation_num=1,
    saving_max=False,
    metric_learning=False
)

Two_LSTM_attention_discrimination_LR_0.001_WD_0.0001_BS_64_GA_1:   0%|          | 0/5 [00:00<?, ?it/s]

train:   0%|          | 0/927 [00:00<?, ?it/s]

Error occurred during spell checking: Expecting value: line 1 column 1 (char 0)
Error occurred during spell checking: Expecting value: line 1 column 1 (char 0)
Error occurred during spell checking: Expecting value: line 1 column 1 (char 0)


test:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 0 | Train_loss: nan, Train_acc: 0.1297 | Test_loss: nan, Test_acc: 0.1275


train:   0%|          | 0/927 [00:00<?, ?it/s]

Error occurred during spell checking: Expecting value: line 1 column 1 (char 0)
Error occurred during spell checking: Expecting value: line 1 column 1 (char 0)
Error occurred during spell checking: Expecting value: line 1 column 1 (char 0)


test:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 1 | Train_loss: nan, Train_acc: 0.1285 | Test_loss: nan, Test_acc: 0.1275


train:   0%|          | 0/927 [00:00<?, ?it/s]

Error occurred during spell checking: Expecting value: line 1 column 1 (char 0)
Error occurred during spell checking: Expecting value: line 1 column 1 (char 0)
Error occurred during spell checking: Expecting value: line 1 column 1 (char 0)


test:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 2 | Train_loss: nan, Train_acc: 0.1287 | Test_loss: nan, Test_acc: 0.1275


train:   0%|          | 0/927 [00:00<?, ?it/s]

Error occurred during spell checking: Expecting value: line 1 column 1 (char 0)
Error occurred during spell checking: Expecting value: line 1 column 1 (char 0)


test:   0%|          | 0/103 [00:00<?, ?it/s]

Epoch: 3 | Train_loss: nan, Train_acc: 0.1288 | Test_loss: nan, Test_acc: 0.1275


train:   0%|          | 0/927 [00:00<?, ?it/s]

Error occurred during spell checking: Expecting value: line 1 column 1 (char 0)


KeyboardInterrupt: 

In [None]:
# Restore previously saved weights.
# - Forward slashes resolve on every OS; the former backslash path only worked
#   on Windows and relied on invalid string escapes such as '\m' and '\d'.
# - map_location places the tensors on the active device even if the
#   checkpoint was saved from a different one.
# NOTE(review): the checkpoint name refers to 'LSTM_attention_glove', while the
# model trained above is 'Two_LSTM_attention_discrimination' — confirm the
# architectures match before loading.
loaded_weight = torch.load(
    '../models/discrimination/LSTM_attention_glove_discrimination_LR_0.0001_WD_0.0001_BS_64_GA_1_EPOCH_0_TEST-ACC_0.7751.pth',
    map_location=device,
)
model.load_state_dict(loaded_weight)

In [None]:
# Predict a class label for every row of the test set, one text at a time.
test_data = pd.read_csv('../data/Discrimination/test.csv')
labels = []

# Inputs are sent to `device` below, so make sure the model lives there too.
model.to(device)
model.eval()

# Index used for out-of-vocabulary tokens (and for the "" padding token).
unk_index = word_key_to_index['<unk>']

with torch.inference_mode():
    for test_text in tqdm_notebook(test_data["text"]):
        # Same truncate-or-pad scheme as the training dataset.
        test_tokens = preprocess_korean_text(test_text)[:max_length]
        test_tokens += [""] * (max_length - len(test_tokens))

        indices = [word_key_to_index.get(token, unk_index) for token in test_tokens]

        test_logits = model(torch.tensor(indices).unsqueeze(0).to(device))
        # argmax over the flattened logits is valid whether or not the model
        # keeps the batch axis; .item() yields a plain int for list indexing.
        labels.append(class_names[torch.argmax(test_logits).item()])

In [None]:
# Attach the readable class description of each prediction for inspection.
test_data['label'] = list(map(idx_to_class.__getitem__, labels))
test_data

In [None]:
# Write the numeric predictions to the submission CSV.
# NOTE(review): IDs are regenerated as 0..n-1 rather than taken from test_data —
# confirm this matches the ID column the competition expects.
submission_data = pd.DataFrame({'ID': range(len(test_data)), 'label': labels})
submission_data.to_csv('../submissions/discrimination/submission.csv', index=False)
print('submission completed!')
submission_data.head()