In [26]:
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import numpy as np
import os
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [27]:
def split_dataset(label_file, test_size=0.2, shuffle=True):
    # CSV 파일에서 이미지와 레이블 데이터를 불러옴
    labels = pd.read_csv(label_file)

    # train, test 데이터를 나눔
    train_labels, test_labels = train_test_split(labels, test_size=test_size, shuffle=shuffle)

    return train_labels, test_labels

In [54]:
jamo_vocabulary = ['ㅎ', 'ㅔ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 
             'ㅕ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 
             'ㅖ', 'ㄱ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ', 'ㅏ', 'ㅐ', 
             'ㅃ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅝ', 'ㅞ', 'ㄸ', 'ㅜ', 'ㅉ', '<sos>', '<eos>']

jamoindex = {word: i for i, word in enumerate(jamo_vocabulary)}

In [49]:
print(jamo_to_index)

{'ㅎ': 0, 'ㅔ': 1, 'ㅀ': 2, 'ㅁ': 3, 'ㅂ': 4, 'ㅄ': 5, 'ㅅ': 6, 'ㅆ': 7, 'ㅇ': 8, 'ㅈ': 9, 'ㅊ': 10, 'ㅋ': 11, 'ㅌ': 12, 'ㅍ': 13, 'ㅕ': 14, 'ㄲ': 15, 'ㄳ': 16, 'ㄴ': 17, 'ㄵ': 18, 'ㄶ': 19, 'ㄷ': 20, 'ㄹ': 21, 'ㄺ': 22, 'ㄻ': 23, 'ㄼ': 24, 'ㄽ': 25, 'ㄾ': 26, 'ㄿ': 27, 'ㅖ': 28, 'ㄱ': 29, 'ㅗ': 30, 'ㅘ': 31, 'ㅙ': 32, 'ㅚ': 33, 'ㅛ': 34, 'ㅟ': 35, 'ㅠ': 36, 'ㅡ': 37, 'ㅢ': 38, 'ㅣ': 39, 'ㅏ': 40, 'ㅐ': 41, 'ㅃ': 42, 'ㅑ': 43, 'ㅒ': 44, 'ㅓ': 45, 'ㅝ': 46, 'ㅞ': 47, 'ㄸ': 48, 'ㅜ': 49, 'ㅉ': 50, '<sos>': 51, '<eos>': 52}


In [55]:
class jamo_to_index:
  def __init__(self, jamo_index = jamoindex):
    self.jamo_list = jamo_index

  def process(self, label):
    label_indices = [self.jamo_list[char] for char in label]

    src_target = [self.jamo_list['<sos>']] + label_indices
    src_target_tensor = torch.tensor(src_target, dtype=torch.long)

    tgt_target = label_indices + [self.jamo_list['<eos>']]
    tgt_target_tensor = torch.tensor(tgt_target, dtype=torch.long)

    return src_target_tensor, tgt_target_tensor


      

In [56]:
print(jamo_to_index().process('ㅎㅔㄹ'))

(tensor([51,  0,  1, 21]), tensor([ 0,  1, 21, 52]))


In [57]:
class HangulOCRDataset(Dataset):
  def __init__(self, img_dir, labels, transform = None, max_seq_len = 16, vocabulary = None):
    self.img_dir = img_dir
    self.labels = labels
    self.transform = transform
    self.vocabulary = jamo_to_index()

  def __len__(self):
    return len(self.labels)
  
  def __getitem__(self, idx):
    # 데이터셋에서 인덱스에 해당하는 이미지와 레이블을 불러옴
    img_name = os.path.join(self.img_dir, self.labels.iloc[idx, 0])
    image = Image.open(img_name)
    label = self.labels.iloc[idx, 1]

    src_label, tgt_label = self.vocabulary.process(label)

    if self.transform:
      image = self.transform(image)
    
    return image, src_label, tgt_label
  


In [58]:
class JamoEmbedding(nn.Module):
  def __init__(self, vocab_size = 53, embedding_dim = 512, pad_idx = None):
    super(JamoEmbedding, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
    
  def forward(self, x):
    return self.embedding(x)

In [None]:
class preprocess_label

In [5]:
class HangulViT(nn.Module):
    def __init__(self, encoder, decoder, emb_size, 

FileNotFoundError: [Errno 2] No such file or directory: 'path/to/your/file.json'

In [6]:
def display_image_batch(image_batch, rows=4, cols=8):
  fig, axes = plt.subplots(rows, cols, figsize=(cols * 2, rows * 2))
  for i, ax in enumerate(axes.flat):
    if i < len(image_batch):
      ax.imshow(np.asarray(image_batch[i].squeeze(0)))
      # ax.set_title(image_labels[i])
      ax.axis("off")
    else:
      ax.axis('off')

  plt.tight_layout()
  plt.show()

In [59]:
transform = transforms.Compose([
    # transforms.Resize((360, 360)),  # 이미지 크기 조정
    transforms.ToTensor(),  # 텐서로 변환
    transforms.Normalize((0.5,), (0.5,))  # 정규화
])

# CSV 파일 경로 및 이미지 폴더 설정
img_dir = 'D:/dataset/13.한국어글자체/01.손글씨/image'
label_file = 'D:/dataset/13.한국어글자체/01.손글씨/new_labels.csv'

# 데이터셋 분할
train_labels, test_labels = split_dataset(label_file, test_size=0.2, shuffle=True)

# train과 test 데이터셋 초기화
train_dataset = HangulOCRDataset(img_dir=img_dir, labels=train_labels, transform=transform)
test_dataset = HangulOCRDataset(img_dir=img_dir, labels=test_labels, transform=transform)

# DataLoader로 셔플 및 배치 설정
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 예시: 첫 번째 배치 확인
images, src_labels, tgt_labels = next(iter(train_loader))
# display_image_batch(images)

# images, _ = next(iter(test_loader))
# display_image_batch(images)

print(images.shape)
# print(labels[0]) # label은 string 타입
print(src_labels)


RuntimeError: stack expects each tensor to be equal size, but got [4] at entry 0 and [6] at entry 3

In [8]:
print(labels)

('ㄲㅖㅄ', 'ㅇㅑㅅ', 'ㅂㅜㄱ', 'ㅁㅏㄵ', 'ㅋㅝㅍ', 'ㅎㅛㄼ', 'ㅍㅗㄽ', 'ㅊㅑㅅ', 'ㄱㅓㅈㅓㄹㅎㅏㄷㅏ', 'ㅈㅔㄷ', 'ㅃㅏㄲ', 'ㅎㅐㄺ', 'ㅎㅐㅀ', 'ㄷㅖㄹ', 'ㄷㅠㅍ', 'ㄱㅓㄹㅇㅓㄱㅏㄷㅏ', 'ㅈㅝㄱ', 'ㅂㅏㄴㅈㅣ', 'ㅁㅝㅎ', 'ㄴㅟㅈ', 'ㅃㅑㅎ', 'ㅊㅘㄹ', 'ㄲㅚㄶ', 'ㅋㅡ', 'ㅁㅕㅇ', 'ㅉㅣㅅ', 'ㄷㅜㄹㅡㄷㅏ', 'ㅁㅓㄱㄱㅗㅅㅏㄹㄷㅏ', 'ㄷㅜㄲ', 'ㅍㅕㄷ', 'ㄲㅕㅇ', 'ㅇㅗㄹㅣ')
