In [2]:
!kaggle competitions download - c kdtai-2


kdtai-2.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
!unzip kdtai-2.zip - d dataset


Archive:  kdtai-2.zip
  inflating: dataset/dataset/all_zero_submission.csv  
  inflating: dataset/dataset/random_submission.csv  
  inflating: dataset/dataset/submission.csv  
  inflating: dataset/dataset/test.csv  
  inflating: dataset/dataset/train.csv  


In [32]:
%pip install nltk
%pip install gensim
%pip install soynlp


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting soynlp
  Downloading soynlp-0.0.493-py3-none-any.whl (416 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m416.8/416.8 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn>=0.20.0
  Downloading scikit_learn-1.2.2-cp310-cp310-macosx_10_9_x86_64.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, scikit-learn, soynlp
Successfully installed scikit-learn-1.2.2 soynlp-0.0.493 threadpoolctl-3.1.0
Note: you may need to restart the kernel to use updated packages.


In [48]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os


In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/lee/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
# device = torch.device("mps")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cpu


In [39]:
from enum import Enum
from nltk.tokenize import word_tokenize
from soynlp.hangle import decompose, character_is_korean
import re
from tqdm import tqdm


class Dataset_type(Enum):
    TRAIN = 0
    TEST = 1


class Korean_dataset(Dataset):
    def __init__(self, file_path, dataset_type: Dataset_type, model, is_split_jamo=False):
        super().__init__()
        self.file_path = file_path
        # self.transform = transform
        self.dataset_type = dataset_type
        self.data_df = pd.read_csv(self.file_path)
        self.model = model
        self.is_split_jamo = is_split_jamo

    def __len__(self):
        return len(self.data_df)

    def __getitem__(self, idx):
        src = self._embedding(self.data_df.loc[idx, "text"])

        if self.dataset_type == Dataset_type.TRAIN:
            trg = self.data_df.loc[idx, "label"]
            return src, trg
        else:
            return src

    def _embedding(self, text):
        text = self._remove_special_characters(text)
        src = word_tokenize(text)
        src = [self._split_jamo(word)
               for word in src] if self.is_split_jamo else src
        src = [self.model.wv[word] for word in src]
        src = torch.to_tensor(src)

        return src

    def _remove_special_characters(self, text):
        return re.sub(r'[^\w\s]', '', text)

    def _split_jamo(self, word):

        def transform(char):
            if char == ' ':
                return char
            cjj = decompose(char)
            if len(cjj) == 1:
                return cjj
            cjj_ = ''.join(c if c != ' ' else '-' for c in cjj)
            return cjj_

        sent_ = []
        for char in word:
            if character_is_korean(char):
                sent_.append(transform(char))
            else:
                sent_.append(char)
        doublespace_pattern = re.compile('\s+')
        sent_ = doublespace_pattern.sub(' ', ''.join(sent_))
        return sent_

In [42]:
import gensim
current_path = os.getcwd()
model_file_path = os.path.join(current_path, "embedding_model", "wiki.ko.bin")
model = gensim.models.fasttext.load_facebook_model(model_file_path)

In [43]:
from torch.utils.data import random_split

current_path = os.getcwd()
train_file_path = os.path.join(current_path, "dataset", "train.csv")
test_file_path = os.path.join(current_path, "dataset", "test.csv")

train_set = Korean_dataset(file_path=train_file_path,
                           dataset_type=Dataset_type.TRAIN,
                           is_split_jamo=True, model=model)
test_set = Korean_dataset(file_path=test_file_path,
                          dataset_type=Dataset_type.TEST,
                          is_split_jamo=True, model=model)

torch.manual_seed(42)

train_valid_ratio = 0.9
train_set_count = int(len(train_set) * train_valid_ratio)
val_set_count = len(train_set) - train_set_count
train_set, val_set = random_split(train_set, [train_set_count, val_set_count])
print(len(train_set))
print(len(val_set))
print(len(test_set))

59276
6587
13491


In [47]:
train_set[1][0].shape


AttributeError: 'list' object has no attribute 'shape'