In [4]:
!kaggle competitions download - c kdtai-2


usage: kaggle [-h] [-v] {competitions,c,datasets,d,kernels,k,config} ...
kaggle: error: unrecognized arguments: c kdtai-2


In [5]:
!unzip kdtai-2.zip - d dataset


unzip:  cannot find or open kdtai-2.zip, kdtai-2.zip.zip or kdtai-2.zip.ZIP.


In [6]:
%pip install nltk
%pip install gensim
%pip install soynlp


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os


In [8]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/lee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
device = torch.device("mps")
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: mps


In [10]:
def preprocessing(df):
    pass

# Preprocessing 과정을 분리해야 합니다.


In [23]:
from enum import Enum
from nltk.tokenize import word_tokenize
from soynlp.hangle import decompose, character_is_korean
import re
from tqdm import tqdm


class Dataset_type(Enum):
    TRAIN = 0
    TEST = 1


class Korean_dataset(Dataset):
    def __init__(self, file_path, dataset_type: Dataset_type, embedding_model, is_split_jamo=False):
        super().__init__()
        self.file_path = file_path
        # self.transform = transform
        self.dataset_type = dataset_type
        self.data_df = pd.read_csv(self.file_path)
        self.embedding_model = embedding_model
        self.is_split_jamo = is_split_jamo
        self.embedded_src = self._embedding(self.data_df)

    def __len__(self):
        return len(self.embedded_src)

    def __getitem__(self, idx):
        src = self.embedded_src[idx]
        if self.dataset_type == Dataset_type.TRAIN:
            trg = torch.tensor(self.data_df.loc[idx, "label"]).to(device)
            return src, trg
        else:
            return src

    def _embedding(self, df):
        src_list = []
        rows_to_drop = []
        # preprocessed_df = df.copy()
        for idx, text in enumerate(df.loc[:, "text"]):
            text = self._remove_special_characters(text)
            src = word_tokenize(text)
            if not src:
                rows_to_drop.append(idx)
                continue
            src = [self._split_jamo(word)
                   for word in src] if self.is_split_jamo else src
            src = [self.embedding_model.wv[word] for word in src]
            src = torch.FloatTensor(src).to(device)
            src_list.append(src)
        df.drop(rows_to_drop, inplace=True)
        df = df.reset_index(drop=True, inplace=True)
        return src_list

    def _remove_special_characters(self, text):
        return re.sub(r'[^\w\s]', '', text)

    def _split_jamo(self, word):

        def transform(char):
            if char == ' ':
                return char
            cjj = decompose(char)
            if len(cjj) == 1:
                return cjj
            cjj_ = ''.join(c if c != ' ' else '-' for c in cjj)
            return cjj_

        sent_ = []
        for char in word:
            if character_is_korean(char):
                sent_.append(transform(char))
            else:
                sent_.append(char)
        doublespace_pattern = re.compile('\s+')
        sent_ = doublespace_pattern.sub(' ', ''.join(sent_))
        return sent_

In [14]:
import gensim
current_path = os.getcwd()
model_file_path = os.path.join(current_path, "embedding_model", "wiki.ko.bin")
embedding_model = gensim.models.fasttext.load_facebook_model(model_file_path)

In [24]:
from torch.utils.data import random_split

current_path = os.getcwd()
train_file_path = os.path.join(current_path, "dataset", "train.csv")
test_file_path = os.path.join(current_path, "dataset", "test.csv")

train_set = Korean_dataset(file_path=train_file_path,
                           dataset_type=Dataset_type.TRAIN,
                           is_split_jamo=False, embedding_model=embedding_model)
test_set = Korean_dataset(file_path=test_file_path,
                          dataset_type=Dataset_type.TEST,
                          is_split_jamo=False, embedding_model=embedding_model)


# train_valid_ratio = 0.9
# train_set_count = int(len(train_set) * train_valid_ratio)
# val_set_count = len(train_set) - train_set_count
# train_set, val_set = random_split(train_set, [train_set_count, val_set_count])
# print(len(train_set))
# print(len(val_set))
# print(len(test_set))

In [None]:
print(len(train_set))


65835


In [None]:
train_set[65833]


(tensor([[-0.0718,  0.0064, -0.1133,  ..., -0.0575, -0.0191, -0.1486],
         [-0.0041,  0.0910, -0.2717,  ..., -0.0592, -0.1021, -0.1022],
         [ 0.0286,  0.0155, -0.0673,  ...,  0.0514, -0.0640,  0.1243],
         [-0.0167,  0.0737, -0.2717,  ...,  0.0057,  0.0109,  0.1132]]),
 2)

In [27]:
from torch.nn.utils.rnn import pad_sequence


class LSTM_Net(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1):
        super(LSTM_Net, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_dim, hidden_dim,
                            num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0),
                         self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0),
                         self.hidden_dim).to(x.device)
        out, (hn, cn) = self.lstm(x, (h0, c0))
        # print(hn.shape)
        # print(hn.squeeze(0).shape)
        hn_last = hn[-1]  # 마지막 레이어의 hidden state 선택
        x = self.fc(hn_last)
        return x


input_dim = 300
hidden_dim = 128
output_dim = 7
num_layers = 1
batch_size = 64
num_of_epoch = 20
learning_rate = 0.0005

model = LSTM_Net(input_dim=input_dim, hidden_dim=hidden_dim,
                 output_dim=output_dim, num_layers=num_layers).to(device)

# 손실 함수와 최적화 알고리즘을 정의합니다.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


def train(dataset, valid_rate, batch_size, optimizer, num_of_epoch, valid_random_seed=None):

    def collate_fn(batch):
        inputs = [item[0] for item in batch]
        targets = [item[1] for item in batch]

        # 입력 데이터의 길이를 기준으로 내림차순으로 정렬
        sorted_idx = sorted(range(len(inputs)),
                            key=lambda i: len(inputs[i]), reverse=True)
        inputs = [inputs[i] for i in sorted_idx]
        targets = [targets[i] for i in sorted_idx]

        # 패딩된 새로운 텐서 생성
        inputs = pad_sequence(inputs, batch_first=True, padding_value=0.0)
        # print(inputs.shape)

        return inputs, torch.tensor(targets)

    dataset = dataset
    train_set_count = int(len(dataset) * valid_rate)
    val_set_count = len(dataset) - train_set_count

    if valid_random_seed is not None:
        torch.manual_seed(valid_random_seed)

    train_set, val_set = random_split(
        dataset, [train_set_count, val_set_count])
    train_loader = DataLoader(train_set, batch_size=batch_size,
                              shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_set, batch_size=batch_size,
                            shuffle=True, collate_fn=collate_fn)
    optimizer = optimizer
    criterion = nn.CrossEntropyLoss().to(device)
    train_losses = []
    train_accs = []
    val_losses = []
    val_accs = []

    for epoch in tqdm(range(num_of_epoch)):
        model.train()
        correct_train = 0
        for batch_idx, (data, target) in tqdm(enumerate(train_loader)):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            _, pred = torch.max(output.data, 1)
            correct_train += pred.eq(target.view_as(pred)).sum().item()

            if batch_idx % 100 == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch + 1, batch_idx *
                    len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))

        train_accuracy = 100. * (correct_train / len(train_set))
        train_losses.append(loss.item())
        train_accs.append(train_accuracy)
        print(f"Epoch: {epoch + 1} - train accuracy: {train_accuracy}")

        model.eval()
        correct_val = 0
        with torch.no_grad():
            for data, target in tqdm(val_loader):
                data, target = data.to(device), target.to(device)
                output = model(data)
                loss = criterion(output, target)
                _, pred = torch.max(output.data, 1)
                correct_val += pred.eq(target.view_as(pred)).sum().item()

            val_accuracy = 100. * (correct_val / len(val_set))
            val_losses.append(loss.item())
            val_accs.append(val_accuracy)

            print('Epoch {} finished: train loss = {}, val loss = {}'.format(epoch + 1,
                                                                             train_losses[-1], val_losses[-1]))
            print(f"Epoch: {epoch + 1} - Validation accuracy: {val_accuracy}")


In [None]:
train(dataset=train_set, valid_rate=0.9, batch_size=batch_size,
      optimizer=optimizer, num_of_epoch=num_of_epoch, valid_random_seed=42)

  0%|          | 0/20 [00:00<?, ?it/s]







































926it [00:39, 23.32it/s]


Epoch: 1 - train accuracy: 37.42215321260401


100%|██████████| 103/103 [00:01<00:00, 61.24it/s]
  5%|▌         | 1/20 [00:41<13:06, 41.40s/it]

Epoch 1 finished: train loss = 1.6872869729995728, val loss = 1.8033924102783203
Epoch: 1 - Validation accuracy: 36.72539489671932










































926it [00:41, 22.08it/s]


Epoch: 2 - train accuracy: 40.25417292535147


100%|██████████| 103/103 [00:01<00:00, 62.42it/s]
 10%|█         | 2/20 [01:24<12:48, 42.69s/it]

Epoch 2 finished: train loss = 1.4264148473739624, val loss = 1.4901984930038452
Epoch: 2 - Validation accuracy: 42.20838396111786










































926it [00:40, 23.02it/s]


Epoch: 3 - train accuracy: 46.108926431621406


100%|██████████| 103/103 [00:01<00:00, 62.57it/s]
 15%|█▌        | 3/20 [02:06<11:59, 42.32s/it]

Epoch 3 finished: train loss = 1.3897002935409546, val loss = 1.352895975112915
Epoch: 3 - Validation accuracy: 47.797691373025515










































926it [00:37, 24.37it/s]


Epoch: 4 - train accuracy: 50.507164436043276


100%|██████████| 103/103 [00:01<00:00, 58.01it/s]
 20%|██        | 4/20 [02:46<11:00, 41.31s/it]

Epoch 4 finished: train loss = 1.258263349533081, val loss = 1.5595824718475342
Epoch: 4 - Validation accuracy: 53.18955042527339










































926it [00:39, 23.48it/s]


Epoch: 5 - train accuracy: 56.834483806180494


100%|██████████| 103/103 [00:01<00:00, 67.52it/s]
 25%|██▌       | 5/20 [03:27<10:17, 41.19s/it]

Epoch 5 finished: train loss = 1.1361361742019653, val loss = 1.1748541593551636
Epoch: 5 - Validation accuracy: 60.0546780072904










































926it [00:37, 24.48it/s]


Epoch: 6 - train accuracy: 61.30023121972626


100%|██████████| 103/103 [00:01<00:00, 65.19it/s]
 30%|███       | 6/20 [04:07<09:28, 40.58s/it]

Epoch 6 finished: train loss = 0.9270182847976685, val loss = 1.189815878868103
Epoch: 6 - Validation accuracy: 62.30255164034022










































926it [00:38, 23.83it/s]


Epoch: 7 - train accuracy: 64.55249700426997


100%|██████████| 103/103 [00:01<00:00, 63.65it/s]
 35%|███▌      | 7/20 [04:47<08:47, 40.55s/it]

Epoch 7 finished: train loss = 1.2002496719360352, val loss = 0.8146528601646423
Epoch: 7 - Validation accuracy: 64.33778857837181










































926it [00:38, 24.00it/s]


Epoch: 8 - train accuracy: 66.83937823834198


100%|██████████| 103/103 [00:01<00:00, 56.16it/s]
 40%|████      | 8/20 [05:27<08:06, 40.51s/it]

Epoch 8 finished: train loss = 0.7805110216140747, val loss = 0.8938431739807129
Epoch: 8 - Validation accuracy: 65.53766707168894










































926it [00:39, 23.60it/s]


Epoch: 9 - train accuracy: 69.01993215304383


100%|██████████| 103/103 [00:01<00:00, 63.77it/s]
 45%|████▌     | 9/20 [06:08<07:26, 40.62s/it]

Epoch 9 finished: train loss = 0.9068407416343689, val loss = 0.9316416382789612
Epoch: 9 - Validation accuracy: 67.51215066828675































605it [00:26, 21.68it/s][A











926it [00:40, 22.99it/s]


Epoch: 10 - train accuracy: 70.50851462422575


100%|██████████| 103/103 [00:01<00:00, 63.37it/s]
 50%|█████     | 10/20 [06:50<06:50, 41.02s/it]

Epoch 10 finished: train loss = 1.1552377939224243, val loss = 0.9961186647415161
Epoch: 10 - Validation accuracy: 68.60571081409478










































926it [00:38, 24.07it/s]


Epoch: 11 - train accuracy: 71.82326036691364


100%|██████████| 103/103 [00:01<00:00, 60.75it/s]
 55%|█████▌    | 11/20 [07:30<06:06, 40.76s/it]

Epoch 11 finished: train loss = 0.8101621866226196, val loss = 1.0594054460525513
Epoch: 11 - Validation accuracy: 69.13730255164035










































926it [00:38, 24.00it/s]


Epoch: 12 - train accuracy: 73.04686840728426


100%|██████████| 103/103 [00:01<00:00, 59.43it/s]
 60%|██████    | 12/20 [08:11<05:25, 40.63s/it]

Epoch 12 finished: train loss = 0.5926291346549988, val loss = 0.9823600053787231
Epoch: 12 - Validation accuracy: 69.60814094775213










































926it [00:37, 24.44it/s]


Epoch: 13 - train accuracy: 74.28397832947967


100%|██████████| 103/103 [00:01<00:00, 59.35it/s]
 65%|██████▌   | 13/20 [08:50<04:42, 40.32s/it]

Epoch 13 finished: train loss = 0.6557268500328064, val loss = 1.005520224571228
Epoch: 13 - Validation accuracy: 69.33475091130012










































926it [00:37, 24.67it/s]


Epoch: 14 - train accuracy: 75.18522894128368


100%|██████████| 103/103 [00:01<00:00, 65.96it/s]
 70%|███████   | 14/20 [09:29<03:59, 39.95s/it]

Epoch 14 finished: train loss = 0.6723513603210449, val loss = 0.8782876133918762
Epoch: 14 - Validation accuracy: 69.91190765492102










































926it [00:37, 24.41it/s]


Epoch: 15 - train accuracy: 76.30926060319658


100%|██████████| 103/103 [00:01<00:00, 64.06it/s]
 75%|███████▌  | 15/20 [10:09<03:19, 39.83s/it]

Epoch 15 finished: train loss = 0.5451766848564148, val loss = 0.8563019037246704
Epoch: 15 - Validation accuracy: 70.51944106925882










































926it [00:38, 24.21it/s]


Epoch: 16 - train accuracy: 77.40966397191609


100%|██████████| 103/103 [00:01<00:00, 62.01it/s]
 80%|████████  | 16/20 [10:49<02:39, 39.86s/it]

Epoch 16 finished: train loss = 0.5184881687164307, val loss = 0.8039563894271851
Epoch: 16 - Validation accuracy: 70.4131227217497










































926it [00:38, 24.33it/s]


Epoch: 17 - train accuracy: 78.55394845656613


100%|██████████| 103/103 [00:01<00:00, 64.55it/s]
 85%|████████▌ | 17/20 [11:29<01:59, 39.80s/it]

Epoch 17 finished: train loss = 0.6773077249526978, val loss = 0.9567915797233582
Epoch: 17 - Validation accuracy: 70.56500607533415










































926it [00:39, 23.50it/s]


Epoch: 18 - train accuracy: 79.27629913419183


100%|██████████| 103/103 [00:01<00:00, 61.60it/s]
 90%|█████████ | 18/20 [12:10<01:20, 40.19s/it]

Epoch 18 finished: train loss = 0.7313405871391296, val loss = 0.9675666093826294
Epoch: 18 - Validation accuracy: 70.44349939246659










































926it [00:37, 24.55it/s]


Epoch: 19 - train accuracy: 80.35644968017417


100%|██████████| 103/103 [00:01<00:00, 63.29it/s]
 95%|█████████▌| 19/20 [12:49<00:39, 39.94s/it]

Epoch 19 finished: train loss = 0.4725312888622284, val loss = 0.7491521835327148
Epoch: 19 - Validation accuracy: 70.6865127582017










































926it [00:37, 24.45it/s]


Epoch: 20 - train accuracy: 81.41128419773506


100%|██████████| 103/103 [00:01<00:00, 62.03it/s]
100%|██████████| 20/20 [13:29<00:00, 40.45s/it]

Epoch 20 finished: train loss = 0.6100606918334961, val loss = 1.0738309621810913
Epoch: 20 - Validation accuracy: 70.18529769137302





In [26]:
train(dataset=train_set, valid_rate=0.9, batch_size=batch_size,
      optimizer=optimizer, num_of_epoch=num_of_epoch, valid_random_seed=42)

0it [00:01, ?it/s]/20 [00:00<?, ?it/s]
  0%|          | 0/20 [00:01<?, ?it/s]


RuntimeError: Placeholder storage has not been allocated on MPS device!

In [17]:
!conda env config vars set PYTORCH_ENABLE_MPS_FALLBACK = 1


To make your changes take effect please reactivate your environment


In [22]:
!echo $PYTORCH_ENABLE_MPS_FALLBACK


1
