In [2]:
import orjson
import glob
import pandas as pd
import os
import torch
import pickle
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [3]:

train_path = './data/Train/*.json' 
test_path = './data/Valid/*.json'
# 경로 생성
os.makedirs('./data/Train', exist_ok=True)
os.makedirs('./data/Valid', exist_ok=True)

In [4]:
table = {'train':[], 'test':[]}

# train data
for x in glob.glob(train_path):
    with open(x,'rb')as f:
        data = orjson.loads(f.read())
    data = pd.json_normalize(data['Conversation'])
    table['train'].append(data)

# test data
for x in glob.glob(test_path):
    with open(x, 'rb') as f:
        data = orjson.loads(f.read())
    data = pd.json_normalize(data['Conversation'])
    table['test'].append(data)


In [5]:
# dataframe
train_df = pd.concat(table['train'], ignore_index = True)
test_df = pd.concat(table['test'], ignore_index = True)

trainDF = train_df[['Text', 'VerifyEmotionTarget']]
testDF = test_df[['Text','VerifyEmotionTarget']]

In [6]:
minitrain = trainDF[:50000]
minitest = test_df[:10000]

In [7]:
import string
punctuation = string.punctuation

krpattern = "[^ㄱ-ㅎㅏ-ㅣ가-힣]"
trainDF['Text'] = trainDF['Text'].str.replace(krpattern,'',regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trainDF['Text'] = trainDF['Text'].str.replace(krpattern,'',regex=True)


In [8]:
#불용어 사전 불러오기
from urllib.request import urlretrieve

filename = '../datas/kr_stopwords.txt'
url = "https://gist.githubusercontent.com/chulgil/d10b18575a73778da4bc83853385465c/raw/a1a451421097fa9a93179cb1f1f0dc392f1f9da9/stopwords.txt"
ret = urlretrieve(url, filename)

with open(filename, 'r', encoding='utf-8') as f:
    stopwords = set(f.read().splitlines())

In [9]:
#토크나이저
from konlpy.tag import Okt
okt = Okt()

In [10]:
for sentence in trainDF['Text']:
    print(type(sentence))
    print(okt.morphs(sentence))
    break

<class 'str'>
['음식', '을', '잘', '표현', '해', '야', '돼요']


In [11]:
# 토크나이즈 이후 불용어 제거하기
train_tokens = [okt.morphs(sentence)for sentence in minitrain['Text']] # tokenize
train_tokens = [[token for token in sent if token not in stopwords]for sent in train_tokens] # extract stopwords
test_tokens = [okt.morphs(sentence)for sentence in minitest['Text']] #tokenize
test_tokens = [[token for token in sent if token not in stopwords]for sent in test_tokens] #extract stopwords

In [12]:
backup_train = train_tokens
backup_test = test_tokens

In [13]:
train_tokens

[['음식', '잘', '표현', '해야', '돼요'],
 ['돼요', '후회'],
 ['조심한', '뭐랄지'],
 ['벌써', '시작', '했네'],
 ['안녕'],
 ['안녕'],
 [],
 ['하이루'],
 ['잘', '지냈어'],
 ['당연하지'],
 ['음', '이러고', '싶었나'],
 ['내', '보고', '싶었나'],
 ['요즘', '많이', '없어', '보이더라'],
 ['서울', '문화'],
 ['엄마', '한테', '대답', '도', '안', '하고'],
 ['는', '요즘', '취미', '생활', '즐기며', '살', '고', '있어'],
 ['죠'],
 [],
 ['신생아', '뭔', '데'],
 ['연애', '는'],
 ['예전', '에는', '취미', '생활', '잠자는', '거', '좋아했잖아'],
 ['요즘', '은', '잠자는', '걸', '줄였고', '뭘', '따른', '걸', '해야', '해'],
 ['추석', '보험', '나누게'],
 ['뭐', '잠도', '뭐', '자긴', '요즘', '에는', '독서'],
 ['뭐'],
 ['인사', '요즘'],
 ['하고', '있어'],
 ['웬일', '가을', '이라서', '독서'],
 ['원인'],
 ['아무리', '일어나서'],
 ['요'],
 ['라는', '거야'],
 ['머리', '도', '식힐', '겸', '내', '요즘', '자주', '깜빡깜빡', '해서'],
 ['멀드'],
 [],
 ['뇌', '활동', '위해', '서'],
 ['독서', '시작', '했는데', '막상', '해보니까'],
 ['감정', '적', '많은', '도움', '되었어'],
 [],
 [],
 ['요즘', '책', '읽고', '있는데'],
 ['음', '실리콘밸리', '위대한', '코치'],
 ['대해', '서', '읽고', '있으면요'],
 [],
 ['제품', '들어오니까'],
 ['제목', '들으니까'],
 ['어려운', '책', '인', '거', '같은데'],
 [],
 ['음', '실리콘

In [14]:
vocab= {'<PAD>' : 800000, '<UNK>':799999}
for sen in train_tokens:
    for word in sen:
        if word in vocab:
            vocab[word] += 1
        else:
            vocab[word] = 1
        
with open('vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)

In [15]:
sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10]

[('<PAD>', 800000),
 ('<UNK>', 799999),
 ('는', 4115),
 ('도', 3293),
 ('거', 3102),
 ('은', 2317),
 ('뭐', 2276),
 ('.', 2007),
 ('음', 1960),
 ('안', 1693)]

In [16]:
clean = {token:value for token,value in vocab.items() if token not in stopwords}
sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:10]

[('<PAD>', 800000),
 ('<UNK>', 799999),
 ('는', 4115),
 ('도', 3293),
 ('거', 3102),
 ('은', 2317),
 ('뭐', 2276),
 ('.', 2007),
 ('음', 1960),
 ('안', 1693)]

In [17]:
# 인코딩 / 디코딩 딕셔너리 생성하기

encode = {token: idx for idx, token in enumerate(vocab)}
decode = {idx: token for idx, token in enumerate(vocab)}

UNK = encode.get('<UNK>')

In [18]:
# encode
UNK = encode.get('')

# trainid 생성
trainid = [[encode.get(token, UNK) for token in sen] for sen in train_tokens]

# testid 생성
testid = [[encode.get(token, UNK) for token in sen] for sen in test_tokens]


In [19]:
#PAD

def pad_sequences(sequences, max_len, pad_token):
    padded = []
    for seq in sequences:
        if seq is None: # 시퀀스가 none이라면 패딩하지 않고 건너뛰기
            continue
        if len(seq)<max_len:
            seq = seq+[pad_token] * (max_len - len(seq))
        else:
            seq = seq[:max_len]
        padded.append(seq)
    return padded

In [20]:
MAX_LEN = max([len(sen) for sen in trainid + testid])
PAD_ID = encode.get('<PAD>')

trainid = pad_sequences(trainid, MAX_LEN, PAD_ID)
testid = pad_sequences(testid, MAX_LEN, PAD_ID)

print(f'TRAIN ID{trainid[0]}\n\nTEST ID{testid[0]}')

TRAIN ID[2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

TEST ID[2768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [21]:
# RNN MODEL

import numpy as np

xtrain = np.array(trainid)
ytrain = np.array(minitrain['VerifyEmotionTarget'])
xtest = np.array(testid)
ytest = np.array(minitest['VerifyEmotionTarget'])

In [22]:
# label to int
# labels: 기 쁨, 놀라움, 없음, 사랑스러움, 화남, 슬픔, 두려움
import torch

label_to_index = {'기쁨':1, '놀라움':2, '사랑스러움':3, '화남':4, '슬픔':5, '두려움':6, '없음':7}

def encode_label(label):
    try:
        return label_to_index[label]
    except KeyError:
        print(f"Unknown label: {label}")
        return label_to_index['없음']  # 레이블이 없는 경우 '없음'으로 처리

ytrain = [encode_label(label) for label in minitrain['VerifyEmotionTarget']]
ytrain = torch.LongTensor(ytrain)

ytest = [encode_label(label) for label in minitest['VerifyEmotionTarget']]
ytest = torch.LongTensor(ytest)




In [23]:
# 단어사전 크기 계산
vocab_num = len(encode)+1

print(vocab_num)

22561


In [24]:
import torch.nn as nn

# 모델 정의하기
class RNNClassifier(nn.Module):
    def __init__(self, n_vocab, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.embedding =nn.Embedding(n_vocab, embedding_dim)
        self.rnn = nn.RNN(embedding_dim,hidden_dim, n_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, 8) #출력 크기 = 7: 7중 클래스 분류
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        output = self.fc(output[:,-1,:])
        output = self.sigmoid(output) #sigmoid
        return output
    
#instance
model = RNNClassifier(vocab_num,embedding_dim=64, hidden_dim=128, n_layers=2, dropout=0.2)

#lossfn, optimizer
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0005)

In [25]:
from torch.utils.data import TensorDataset, DataLoader

# ytrain의 형태를 수정하여 일치시키기
ytrain = ytrain.view(-1)
print("ytrain shape after view:", ytrain.shape)

# xtrain을 torch Tensor로 변환
xtrain_tensor = torch.LongTensor(xtrain)

# TensorDataset 다시 초기화
batch_size_value = 32
train_dataset = TensorDataset(xtrain_tensor, ytrain)
train_loader = DataLoader(train_dataset, batch_size=batch_size_value, shuffle=False)


ytrain shape after view: torch.Size([50000])


In [26]:
# xtest의 내부 데이터 타입 확인
types_set = set(type(token) for seq in xtest for token in seq)

print("Unique data types in xtest:", types_set)

# 문자열을 정수로 변환하고 None 값을 제거
xtest_np = [[encode.get(token, UNK) for token in seq] for seq in xtest if seq is not None and all(isinstance(token, str) for token in seq)]


# numpy.ndarray를 torch.Tensor로 변환
xtest_tensor = torch.LongTensor(xtest_np)

# testdataset 다시 초기화하기
#test_dataset = TensorDataset(xtest_tensor, ytest)
#test_loader = DataLoader(test_dataset, batch_size = batch_size_value, shuffle = False)


Unique data types in xtest: {<class 'NoneType'>, <class 'int'>}


In [27]:
train_loss = []
train_accs = []

In [29]:
n_epochs = 30
for ep in range(n_epochs):
    # train mode
    model.train()
    total_loss = 0
    correct = 0
    total =0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        #acc
        _, predicted= torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
    
    # average loss & acc
    avg_train_loss = total_loss / len(train_loader)
    train_loss.append(avg_train_loss)
    avg_train_acc = correct / total * 100
    train_accs.append(avg_train_acc)

    print(f' EP [{ep+1}/{n_epochs}]\nLOSS {avg_train_loss}\n ACC {avg_train_acc}%')

 EP [1/30]
LOSS 1.7598275930089822
 ACC 39.147999999999996%
 EP [2/30]
LOSS 1.7623666278536474
 ACC 38.574000000000005%
 EP [3/30]
LOSS 1.7644658948241787
 ACC 39.147999999999996%
 EP [4/30]
LOSS 1.7615611100913773
 ACC 39.147999999999996%
 EP [5/30]
LOSS 1.7732620162225579
 ACC 38.986%
 EP [6/30]
LOSS 1.7615603014054546
 ACC 39.147999999999996%
 EP [7/30]
LOSS 1.7661781664163123
 ACC 37.962%
 EP [8/30]
LOSS 1.762148365406981
 ACC 39.147999999999996%
 EP [9/30]
LOSS 1.7613221006521564
 ACC 39.147999999999996%
 EP [10/30]
LOSS 1.7664524112385356
 ACC 38.896%
 EP [11/30]
LOSS 1.7620076216418852
 ACC 39.147999999999996%
 EP [12/30]
LOSS 1.7612017377858276
 ACC 39.147999999999996%
 EP [13/30]
LOSS 1.7615977655369277
 ACC 39.147999999999996%
 EP [14/30]
LOSS 1.761456758336844
 ACC 39.147999999999996%
 EP [15/30]
LOSS 1.7529024842909644
 ACC 39.112%
 EP [16/30]
LOSS 1.7619871962017077
 ACC 39.15%
 EP [17/30]
LOSS 1.762586829377075
 ACC 39.147999999999996%
 EP [18/30]
LOSS 1.762132209688139
 

In [33]:
torch.save(model, 'model2.pth')