In [4]:
pip3 install konlpy # 파이썬 3.x 버전

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.

PackagesNotFoundError: The following packages are not available from current channels:

  - konlpy

Current channels:

  - https://conda.anaconda.org/conda-forge/osx-arm64
  - https://conda.anaconda.org/conda-forge/noarch

To search for alternate channels that may provide the conda package you're
looking for, navigate to

    https://anaconda.org

and use the search bar at the top of the page.



Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
import random
import os
import re
from konlpy.tag import Okt

In [17]:
# Data preprocessing
def _get_okt():
    """Return a shared Okt tagger, creating it on first use.

    The instance is cached on this function so that applying
    ``preprocess_data`` over a whole column does not rebuild the tagger
    for every row.
    """
    if not hasattr(_get_okt, "_instance"):
        _get_okt._instance = Okt()
    return _get_okt._instance


def preprocess_data(text):
    """Clean a raw comment and split it into morpheme tokens.

    Steps, in order: replace every character that is neither a word
    character nor whitespace with a space, collapse whitespace runs,
    lower-case, then tokenize with ``Okt.morphs``.

    Args:
        text: Raw comment. ``None`` and NaN are treated as missing; any
            other non-string value is converted with ``str``.

    Returns:
        list: Morpheme tokens; empty when the input is missing or nothing
        remains after cleaning.
    """
    if not isinstance(text, str):
        # Missing values (None, or NaN which is the only value != itself)
        # would otherwise make re.sub raise a TypeError.
        if text is None or text != text:
            return []
        text = str(text)

    # Strip special characters with a regular expression
    text = re.sub(r'[^\w\s]', ' ', text)

    # Remove redundant whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Convert to lower case
    text = text.lower()

    # TODO: should fragments made up only of consonants be removed as well?

    # Nothing left to tokenize (e.g. the input was only punctuation)
    if not text:
        return []

    # Tokenize
    return _get_okt().morphs(text)

In [18]:
# Load the data
# The dataset directory is defined once and can be overridden through the
# HATE_SPEECH_DATA_DIR environment variable; the default is the original local path.
DATA_DIR = os.environ.get(
    'HATE_SPEECH_DATA_DIR',
    '/Users/inho/KDT_AI/study/project/2nd_textClassification/dataset',
)
train_data = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_data = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

# Preprocess the text: the raw 'text' column is replaced by lists of tokens.
train_data['text'] = train_data['text'].apply(preprocess_data)
test_data['text'] = test_data['text'].apply(preprocess_data)

In [19]:
# Preview the first rows of the training frame after tokenization.
train_data.head(20)

Unnamed: 0,ID,text,label
0,0,"[유소영, 비호감, 성형, 아줌마]",1
1,1,"[나오지마라, 썅]",3
2,2,"[식상하고, 지긋지긋했는데, 잘, 끝나네, 오예, 소리, 벗고, 빤스질러]",6
3,3,"[성희롱, 당할, 얼굴, 이, 아닌데, ㅋㅋㅋ]",5
4,4,"[끝, 까지, 해보자, 쪽, 파리, 원숭이, 자, 한, 쓰레기, 당]",0
5,5,"[그냥, 이쁘다, 해주면되, 지, 악플, 들, 은, 진짜, 으, 휴, 대부분, 다,...",5
6,6,"[어우, 지겨워, 성형, 한, 애, 기사, 좀, 그만, 올려라, ㅉㅉ]",1
7,7,"[버핏, 은, 틀, 딱이라, 새로운, 걸, 못, 받아들여서, 그렇지]",4
8,8,"[전라도, 가, 일본, 인, 선조, 잖아요, 진짜, 토착, 왜구, 죠]",0
9,9,"[못, 생기, 고, 못, 난, 것, 들, 은, 혼자, 살아라, 너, 덜, 은, 쳐다...",1


In [22]:
# Build the vocabulary
# Every distinct training token receives the next free positive id in order of
# first appearance; id 0 is never assigned here.
vocab = {}
for token_list in train_data['text']:
    for tok in token_list:
        vocab.setdefault(tok, len(vocab) + 1)

In [23]:
# Token-to-index conversion function
def tokenizer(text):
    """Map a list of tokens to their ids in the global ``vocab``.

    Tokens that are not present in ``vocab`` are skipped, so the result
    may be shorter than the input.
    """
    ids = []
    for tok in text:
        if tok in vocab:
            ids.append(vocab[tok])
    return ids

In [24]:
# Convert the text to indices
# The token lists in 'text' are replaced in place by vocabulary ids for both frames.
# NOTE: running this cell a second time would feed the ids back through tokenizer.
for frame in (train_data, test_data):
    frame['text'] = frame['text'].apply(tokenizer)

In [25]:
# Preview the first rows of the training frame after the token-to-id conversion.
train_data.head(20)

Unnamed: 0,ID,text,label
0,0,"[1, 2, 3, 4]",1
1,1,"[5, 6]",3
2,2,"[7, 8, 9, 10, 11, 12, 13, 14]",6
3,3,"[15, 16, 17, 18, 19, 20]",5
4,4,"[21, 22, 23, 24, 25, 26, 27, 28, 29, 30]",0
5,5,"[31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 4...",5
6,6,"[45, 46, 3, 28, 47, 48, 49, 50, 51, 52]",1
7,7,"[53, 37, 54, 55, 56, 57, 58, 59, 60]",4
8,8,"[61, 62, 63, 64, 65, 66, 38, 67, 68, 69]",0
9,9,"[58, 70, 71, 58, 72, 73, 36, 37, 74, 75, 76, 7...",1


In [26]:
# Dataset class
class HateSpeechDataset(Dataset):
    """Fixed-length view over index-encoded texts and their labels.

    Args:
        texts: Token-id lists, either a pandas Series or a plain sequence.
        labels: Integer labels aligned with ``texts``, either a pandas
            Series or a plain sequence (e.g. a list of dummy labels).
        max_length: Length every ``input_ids`` tensor is cut or padded to.
    """

    def __init__(self, texts, labels, max_length):
        self.texts = texts
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _item_at(values, index):
        """Positional lookup that works for pandas objects and plain sequences."""
        # A Series must be read with .iloc because its index labels need not be 0..n-1
        # (e.g. after train_test_split); lists and tuples have no .iloc at all.
        if hasattr(values, 'iloc'):
            return values.iloc[index]
        return values[index]

    def __getitem__(self, index):
        """Return ``{'input_ids': LongTensor[max_length], 'label': LongTensor[]}``."""
        token_ids = list(self._item_at(self.texts, index))[:self.max_length]
        label = self._item_at(self.labels, index)

        # Truncated above; right-pad with 0 when shorter than max_length.
        token_ids += [0] * (self.max_length - len(token_ids))

        return {
            'input_ids': torch.tensor(token_ids, dtype=torch.long),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Build the datasets
max_length = 128

# Hold out 20% of the training frame for validation, with a fixed random_state.
split_parts = train_test_split(
    train_data['text'],
    train_data['label'],
    test_size=0.2,
    random_state=42,
)
train_texts, val_texts, train_labels, val_labels = split_parts

train_dataset = HateSpeechDataset(train_texts, train_labels, max_length)
val_dataset = HateSpeechDataset(val_texts, val_labels, max_length)

In [None]:
# Create the data loaders and set the hyperparameters
# Seed every RNG the notebook imports so that batch shuffling and the weight
# initialization performed after this cell are repeatable between runs.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Hyperparameters
vocab_size = len(vocab) + 1  # +1 for padding
embedding_dim = 100
hidden_dim = 256
num_classes = 7
dropout_rate = 0.5
learning_rate = 1e-4
num_epochs = 10

# Device selection: CUDA first, then Apple MPS, otherwise CPU.
# torch.backends.mps is looked up defensively because not every torch build exposes it.
mps_backend = getattr(torch.backends, 'mps', None)
if torch.cuda.is_available():
    device = torch.device('cuda')
elif mps_backend is not None and mps_backend.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')



In [27]:

# Model definition
class BiLSTMClassifier(nn.Module):
    """Two-layer bidirectional LSTM text classifier.

    Args:
        vocab_size: Number of rows in the embedding table (padding id included).
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of each LSTM direction.
        num_classes: Number of output logits.
        dropout_rate: Dropout applied to the embeddings, between LSTM layers
            and before the final linear layer.
        padding_idx: Token id used for right-padding; its embedding is kept
            out of gradient updates and it is excluded from sequence lengths.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, dropout_rate=0.5, padding_idx=0):
        super(BiLSTMClassifier, self).__init__()

        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, bidirectional=True, batch_first=True, dropout=dropout_rate)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, input_ids):
        """Return logits of shape ``(batch, num_classes)`` for right-padded ``input_ids``."""
        embedded = self.dropout(self.embedding(input_ids))

        # Number of non-padding tokens per row. Rows that are entirely padding are
        # clamped to length 1 because packing rejects zero-length sequences.
        # pack_padded_sequence expects the lengths on the CPU.
        lengths = (input_ids != self.padding_idx).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )

        # Packing stops each direction at the true sequence boundary, so the final
        # hidden states summarize the real tokens only. Reading the last padded
        # timestep instead would mostly reflect padding.
        _, (hidden, _) = self.lstm(packed)

        # hidden is (num_layers * 2, batch, hidden_dim); the last two entries are
        # the forward and backward states of the top layer.
        final_state = torch.cat((hidden[-2], hidden[-1]), dim=1)
        output = self.fc(self.dropout(final_state))
        return output


In [30]:

# Initialize the loss function, the model and the optimizer
criterion = nn.CrossEntropyLoss()

# Build the classifier first, then move its parameters to the selected device.
model = BiLSTMClassifier(vocab_size, embedding_dim, hidden_dim, num_classes, dropout_rate)
model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training function
def train(model, dataloader, optimizer, criterion, device):
    """Run one optimization pass over ``dataloader``.

    Each batch is a dict with ``'input_ids'`` and ``'label'`` tensors.

    Returns:
        tuple: (mean of the per-batch losses, fraction of correct
        predictions over ``dataloader.dataset``).
    """
    model.train()
    running_loss = 0
    n_correct = 0

    for batch in dataloader:
        token_ids, targets = batch['input_ids'].to(device), batch['label'].to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        logits = model(token_ids)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        hits = logits.argmax(1) == targets
        n_correct += hits.sum().item()

    mean_loss = running_loss / len(dataloader)
    accuracy = n_correct / len(dataloader.dataset)
    return mean_loss, accuracy


In [None]:

# Evaluation function
def evaluate(model, dataloader, criterion, device):
    """Score ``model`` on ``dataloader`` without updating its weights.

    Each batch is a dict with ``'input_ids'`` and ``'label'`` tensors.

    Returns:
        tuple: (mean of the per-batch losses, fraction of correct
        predictions over ``dataloader.dataset``).
    """
    model.eval()
    running_loss = 0
    n_correct = 0

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for batch in dataloader:
            token_ids, targets = batch['input_ids'].to(device), batch['label'].to(device)

            logits = model(token_ids)
            running_loss += criterion(logits, targets).item()

            hits = logits.argmax(1) == targets
            n_correct += hits.sum().item()

    mean_loss = running_loss / len(dataloader)
    accuracy = n_correct / len(dataloader.dataset)
    return mean_loss, accuracy


In [None]:

# Training and validation
# One training pass followed by one validation pass per epoch, with a one-line report.
for epoch in range(num_epochs):
    train_loss, train_acc = train(
        model=model,
        dataloader=train_dataloader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
    )
    val_loss, val_acc = evaluate(
        model=model,
        dataloader=val_dataloader,
        criterion=criterion,
        device=device,
    )

    print(f'Epoch: {epoch + 1} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc * 100:.2f}% | Val Loss: {val_loss:.4f} | Val Acc: {val_acc * 100:.2f}%')


In [None]:

# Prepare the test dataset
# The test set has no labels, so all-zero placeholders are supplied. They are
# wrapped in a Series so that positional `.iloc` lookup on the labels works;
# a plain list has no `.iloc` attribute.
dummy_labels = pd.Series([0] * len(test_data))
test_dataset = HateSpeechDataset(test_data['text'], dummy_labels, max_length)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Predict on the test dataset
model.eval()
predictions = []

with torch.no_grad():
    for data in test_dataloader:
        input_ids = data['input_ids'].to(device)

        outputs = model(input_ids)
        # Keep the index of the highest logit as the predicted class.
        predictions.extend(outputs.argmax(1).tolist())



In [None]:
# Save the results to submission.csv
# The existing file is read, its 'label' column is replaced with the predictions,
# and the result is written back to the same path without the index.
submission_path = 'submission.csv'
submission = pd.read_csv(submission_path)
submission['label'] = predictions
submission.to_csv(submission_path, index=False)

print('결과가 submission.csv 파일에 저장되었습니다.')