In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

import pandas as pd

# 예제

In [3]:
sentences = [
    ['한국어', '자연어', '처리를', '쉽게', '시작해봅시다'],
    ['KoNLPy를', '사용하여', '한국어', '텍스트를', '처리합니다']
]


In [4]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Get the word index (index assigned to each morpheme)
word_index = tokenizer.word_index
print("Word Index:", word_index)

# Convert sentences to sequences of indices
sequences = tokenizer.texts_to_sequences(sentences)
print("Sequences:", sequences)

# One-hot encode the sequences
one_hot_encoded = to_categorical(sequences)
print("One-hot encoded data:")
print(one_hot_encoded)

Word Index: {'한국어': 1, '자연어': 2, '처리를': 3, '쉽게': 4, '시작해봅시다': 5, 'konlpy를': 6, '사용하여': 7, '텍스트를': 8, '처리합니다': 9}
Sequences: [[1, 2, 3, 4, 5], [6, 7, 1, 8, 9]]
One-hot encoded data:
[[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]]


# Data Load

In [5]:
df = pd.read_pickle('NLP_df.pkl')
df = df[df['QorA'] == 'Q']
df

Unnamed: 0,intent,sentence,QorA,NLP
0,인사,안녕하쇼,Q,"[안녕, 하, 쇼]"
1,인사,안녕,Q,[안녕]
2,인사,안녕하세요,Q,[안녕하세요]
3,인사,ㅎㅇ,Q,[ㅎㅇ]
4,인사,ㅎㅇㅇ,Q,[ㅎㅇㅇ]
...,...,...,...,...
79,대건고등학교의 교화,대건고 꽃,Q,"[대, 건고, 꽃]"
80,대건고등학교의 교화,대건고의 꽃은?,Q,"[대, 건고, 의, 꽃, 은, ?]"
81,대건고등학교의 교화,대건고등학교의 꽃은?,Q,"[대건고등학교, 의, 꽃, 은, ?]"
82,대건고등학교의 교화,대건고등학교의 꽃은 무엇이니?,Q,"[대건고등학교, 의, 꽃, 은, 무엇, 이, 니, ?]"


# Create_Word_Index


In [6]:
data = []
for s in df['NLP'].iloc[:] :
    data.append(s)
data

[['안녕', '하', '쇼'],
 ['안녕'],
 ['안녕하세요'],
 ['ㅎㅇ'],
 ['ㅎㅇㅇ'],
 ['방', '가', '링'],
 ['방', '가방', '가'],
 ['헬로우'],
 ['하이'],
 ['잘', '있', '어'],
 ['다음', '에', '보', '자'],
 ['잘', '가', '아'],
 ['나', '갈', '게'],
 ['좋은 하루', '보내', '어'],
 ['ㅂㅂ'],
 ['바이바이'],
 ['ㅂㅇ'],
 ['대건고등학교', '는', '어디', '에', '있', '니', '?'],
 ['대건고등학교', '는', '어디', '에', '위치', '하', '니', '?'],
 ['대건고등학교', '는', '어디', '에', '위치', '하', '아', '?'],
 ['대건고등학교', '는', '어디', '에', '있', '어', '?'],
 ['대건고등학교', '위치'],
 ['대건고등학교', '의', '위치'],
 ['대', '건고', '위치'],
 ['대', '건고', '의', '위치'],
 ['대', '건고', '는', '어디', '에', '위치', '하', '니', '?'],
 ['대', '건고', '는', '어디', '에', '위치', '하', '아', '있', '어', '?'],
 ['대', '건고', '의', '위치', '는', '?'],
 ['대건고등학교', '의', '교훈', '이', '뭐', '야', '?'],
 ['대건고등학교', '교훈'],
 ['대건고등학교', '는', '교훈', '이', '뭐', '야', '?'],
 ['대건고등학교', '의', '교훈', '은', '?'],
 ['대', '건고', '교훈'],
 ['대', '건고', '는', '교훈', '이', '뭐', '야', '?'],
 ['대', '건고', '의', '교훈', '이', '뭐', '야', '?'],
 ['대', '건고', '의', '교훈', '은', '?'],
 ['대건고등학교', '의', '교목', '이', '뭐', '야', '?'],
 [

In [7]:
# Create a Tokenizer and fit on the sentences
tokenizer = Tokenizer()

In [8]:
tokenizer.fit_on_texts(data)

In [9]:
# Get the word index (index assigned to each morpheme)
word_index = tokenizer.word_index
print("Word Index:", word_index)

Word Index: {'의': 1, '?': 2, '대건고등학교': 3, '대': 4, '건고': 5, '는': 6, '이': 7, '교목': 8, '무엇': 9, '교화': 10, '뭐': 11, '은': 12, '니': 13, '야': 14, 'ㄹ까': 15, '위치': 16, '교훈': 17, '에': 18, '가': 19, '어디': 20, '나무': 21, '꽃': 22, '하': 23, '있': 24, '어': 25, '아': 26, '안녕': 27, '방': 28, '잘': 29, '대건': 30, '교의': 31, '쇼': 32, '안녕하세요': 33, 'ㅎㅇ': 34, 'ㅎㅇㅇ': 35, '링': 36, '가방': 37, '헬로우': 38, '하이': 39, '다음': 40, '보': 41, '자': 42, '나': 43, '갈': 44, '게': 45, '좋은 하루': 46, '보내': 47, 'ㅂㅂ': 48, '바이바이': 49, 'ㅂㅇ': 50}


In [10]:

# Convert sentences to sequences of indices
sequences = tokenizer.texts_to_sequences(data)
print("Sequences:", sequences)

Sequences: [[27, 23, 32], [27], [33], [34], [35], [28, 19, 36], [28, 37, 19], [38], [39], [29, 24, 25], [40, 18, 41, 42], [29, 19, 26], [43, 44, 45], [46, 47, 25], [48], [49], [50], [3, 6, 20, 18, 24, 13, 2], [3, 6, 20, 18, 16, 23, 13, 2], [3, 6, 20, 18, 16, 23, 26, 2], [3, 6, 20, 18, 24, 25, 2], [3, 16], [3, 1, 16], [4, 5, 16], [4, 5, 1, 16], [4, 5, 6, 20, 18, 16, 23, 13, 2], [4, 5, 6, 20, 18, 16, 23, 26, 24, 25, 2], [4, 5, 1, 16, 6, 2], [3, 1, 17, 7, 11, 14, 2], [3, 17], [3, 6, 17, 7, 11, 14, 2], [3, 1, 17, 12, 2], [4, 5, 17], [4, 5, 6, 17, 7, 11, 14, 2], [4, 5, 1, 17, 7, 11, 14, 2], [4, 5, 1, 17, 12, 2], [3, 1, 8, 7, 11, 14, 2], [3, 1, 8], [3, 1, 8], [4, 5, 8], [4, 5, 1, 8, 7, 11, 14, 2], [4, 5, 1, 8, 12, 2], [4, 5, 6, 8, 7, 11, 14, 2], [3, 6, 8, 7, 11, 14, 2], [30, 31, 8, 12, 2], [3, 1, 8, 12, 2], [3, 1, 8, 12, 9, 7, 13, 2], [4, 5, 1, 8, 12, 9, 7, 13, 2], [3, 1, 8, 7, 11, 15, 2], [4, 5, 1, 8, 7, 11, 15], [4, 5, 1, 8, 12, 9, 7, 13], [4, 5, 1, 8, 12, 9, 7, 15], [3, 1, 8, 12, 9, 7, 13

In [11]:

# Pad sequences to the same length (optional step)
max_sequence_length = 10
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')
print("Padded Sequences:", padded_sequences)

Padded Sequences: [[27 23 32  0  0  0  0  0  0  0]
 [27  0  0  0  0  0  0  0  0  0]
 [33  0  0  0  0  0  0  0  0  0]
 [34  0  0  0  0  0  0  0  0  0]
 [35  0  0  0  0  0  0  0  0  0]
 [28 19 36  0  0  0  0  0  0  0]
 [28 37 19  0  0  0  0  0  0  0]
 [38  0  0  0  0  0  0  0  0  0]
 [39  0  0  0  0  0  0  0  0  0]
 [29 24 25  0  0  0  0  0  0  0]
 [40 18 41 42  0  0  0  0  0  0]
 [29 19 26  0  0  0  0  0  0  0]
 [43 44 45  0  0  0  0  0  0  0]
 [46 47 25  0  0  0  0  0  0  0]
 [48  0  0  0  0  0  0  0  0  0]
 [49  0  0  0  0  0  0  0  0  0]
 [50  0  0  0  0  0  0  0  0  0]
 [ 3  6 20 18 24 13  2  0  0  0]
 [ 3  6 20 18 16 23 13  2  0  0]
 [ 3  6 20 18 16 23 26  2  0  0]
 [ 3  6 20 18 24 25  2  0  0  0]
 [ 3 16  0  0  0  0  0  0  0  0]
 [ 3  1 16  0  0  0  0  0  0  0]
 [ 4  5 16  0  0  0  0  0  0  0]
 [ 4  5  1 16  0  0  0  0  0  0]
 [ 4  5  6 20 18 16 23 13  2  0]
 [ 5  6 20 18 16 23 26 24 25  2]
 [ 4  5  1 16  6  2  0  0  0  0]
 [ 3  1 17  7 11 14  2  0  0  0]
 [ 3 17  0  0  0  0  0  0

In [12]:

# One-hot encode the sequences
one_hot_encoded = to_categorical(padded_sequences, num_classes=len(word_index) + 1)
print("One-hot encoded data:")
print(one_hot_encoded)

One-hot encoded data:
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 ...

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 1. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 

In [13]:
import pickle

In [14]:
with open('One-Hot.pkl','wb') as f:
   pickle.dump(one_hot_encoded,f)

In [15]:
with open('Word_Dict.pkl','wb') as f:
   pickle.dump(word_index,f)