#### 자연어 처리 ~
- RNN , LSTM , GRU , BERT
- 텍스트 기반(단어 , 문장)
- TensorFlow - Embedding
- 인코딩 - Sequence(WordToVec) - 토큰 - 패딩


In [38]:
import tensorflow as tf
import numpy      as np
import pandas     as pd

from   tensorflow.keras.preprocessing.text import Tokenizer
from   tensorflow.keras.preprocessing.sequence import pad_sequences

In [39]:


print('문자기반 인코딩 - ASCII code')
print('LISTEN - SILENT : NN 입력으로 사용하기 부적합하다 - ')
print('단어기반 인코딩이 필요하다 - ')

train_sentences = [
    'I love my dog' ,
    'I love my cat' ,
    'You love my dog?' , 
    'Do you think my dog is pretty?'
]

# 1. 시퀀스 작업

# 토큰 생성
tokenizer = Tokenizer(num_words = 100)
print('type - ' , type(tokenizer))

# tokenizer로 인코딩
# 단어사전 만들기
tokenizer.fit_on_texts(train_sentences)

# tokenizer.word_index
# Word to Vec
# 각 단어에 인코딩 값을 넣어주고있다.(아스키 코드 값이 아니다. 임의적인 단어에 라벨링을 붙이는 것)
# 특수문자는 인덱싱이 되지 않으며 대문자는 모두 소문자로 바뀌게 된다.
print('encoding - ',tokenizer.word_index)

# 인코딩된 값을 조합하여 문장별로 보여주는 함수
train_sequence = tokenizer.texts_to_sequences(train_sentences)
print('word_sequence' , train_sequence)

문자기반 인코딩 - ASCII code
LISTEN - SILENT : NN 입력으로 사용하기 부적합하다 - 
단어기반 인코딩이 필요하다 - 
type -  <class 'keras.preprocessing.text.Tokenizer'>
encoding -  {'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'pretty': 10}
word_sequence [[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]


In [40]:
# 2 토큰

print('단어사전에 등록되지 않은 단어가 테스트 데이터로 들어온다면?')
print('미리 토큰화되지 않은 단어를 만나면? - oov_token 인자를 사용해서 특수한 값으로 처리할 수 있다.')
print()

tokenizer = Tokenizer(num_words = 100 , oov_token='<OOV>')

test_sentences = [
    'I really love my dog' ,
    'my dog loves my friend'
]

tokenizer.fit_on_texts(test_sentences)
word_index = tokenizer.word_index

test_sequences = tokenizer.texts_to_sequences(test_sentences)
print('word - encoding index ' , word_index)

print('train - ' , train_sequence)
print('test  - ' , test_sequences)

단어사전에 등록되지 않은 단어가 테스트 데이터로 들어온다면?
미리 토큰화되지 않은 단어를 만나면? - oov_token 인자를 사용해서 특수한 값으로 처리할 수 있다.

word - encoding index  {'<OOV>': 1, 'my': 2, 'dog': 3, 'i': 4, 'really': 5, 'love': 6, 'loves': 7, 'friend': 8}
train -  [[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]
test  -  [[4, 5, 6, 2, 3], [2, 3, 7, 2, 8]]


In [41]:
# 3 패딩

print('패딩 설정하기')
# padding : pre / post
# maxlen  : 최대 길이 -> sequence의 길이를 제한할 수 있다.
# truncating : maxlen의 길이에 따라 잘라낼 부분을 설정한다.
train_padded = pad_sequences(train_sequence , padding = 'pre' , maxlen=6 , truncating='pre')
print('type - ' , type(train_padded))
print('train padding - \n' , train_padded)

패딩 설정하기
type -  <class 'numpy.ndarray'>
train padding - 
 [[ 0  0  4  2  1  3]
 [ 0  0  4  2  1  6]
 [ 0  0  5  2  1  3]
 [ 5  8  1  3  9 10]]


- 데이터 세트를 이요한 실습
!wget --no-check-certificate \
  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
  -O /tmp/sarcasm.json


In [42]:
!wget --no-check-certificate \
  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
  -O /tmp/sarcasm.json


--2022-12-09 01:36:57--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.45.16, 172.217.0.48, 142.251.16.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.45.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘/tmp/sarcasm.json’


2022-12-09 01:36:57 (111 MB/s) - ‘/tmp/sarcasm.json’ saved [5643545/5643545]



In [43]:
print('데이터세트 준비 - ')
print('headline , is_sarcastic , article_link - 각각의 리스트로')
print()

데이터세트 준비 - 
headline , is_sarcastic , article_link - 각각의 리스트로



In [50]:
import pandas as pd
import json

sarcasm_frm = pd.read_json('/tmp/sarcasm.json')

In [None]:
display(sarcasm_frm)
sarcasm_frm.info()

In [None]:
data = sarcasm_frm.drop('article_link' , axis = 1)
data

In [48]:
with open('/tmp/sarcasm.json') as f:
    js = json.loads(f.read())
df = pd.DataFrame(js)

In [70]:
# 데이터 세트 준비하기

with open('/tmp/sarcasm.json') as file:
    datasets = json.loads(file.read())

X_sentesnces = []
y_labels     = []
url          = []
print(datasets[0].keys())
for data in datasets :
  X_sentesnces.append(data['headline'])
  y_labels.append(data['is_sarcastic'])
  url.append(data['article_link'])

print('sentence - ' , X_sentesnces[0])
print('target   - ' , y_labels[0])




dict_keys(['article_link', 'headline', 'is_sarcastic'])
sentence -  former versace store clerk sues over secret 'black code' for minority shoppers
target   -  0


In [88]:
print('Tokenizer를 이용해서 단어사전 제작하기')


# 1. 시퀀스 작업

# 토큰 생성
tokenizer = Tokenizer(num_words = 10000 , oov_token='<OOV>')
print('type - ' , type(tokenizer))

# 단어사전 만들기
tokenizer.fit_on_texts(X_sentesnces)
print('encoding - ',tokenizer.word_index)

# 인코딩된 값을 조합하여 문장별로 보여주는 함수
X_sequence = tokenizer.texts_to_sequences(X_sentesnces)
print('word_sequence \n' , X_sequence)


X_padded = pad_sequences(X_sequence , padding = 'post')
print('shape' , X_padded.shape)
print('train padding - \n' , X_padded)

Tokenizer를 이용해서 단어사전 제작하기
type -  <class 'keras.preprocessing.text.Tokenizer'>
word_sequence 
 [[308, 1, 679, 3337, 2298, 48, 382, 2576, 1, 6, 2577, 8434], [4, 8435, 3338, 2746, 22, 2, 166, 8436, 416, 3112, 6, 258, 9, 1002], [145, 838, 2, 907, 1749, 2093, 582, 4719, 221, 143, 39, 46, 2, 1], [1485, 36, 224, 400, 2, 1832, 29, 319, 22, 10, 2924, 1393, 6969, 968], [767, 719, 4720, 908, 1, 623, 594, 5, 4, 95, 1309, 92], [1, 4, 365, 73], [4, 6970, 351, 6, 461, 4274, 2195, 1486], [19, 479, 39, 1168, 31, 155, 2, 99, 83, 18, 158, 6, 32, 352], [249, 3623, 6971, 555, 5274, 1995, 141], [2094, 326, 347, 401, 60, 1, 6, 4, 3896], [2925, 1680, 4721, 14, 37, 4275, 6972, 5, 2095, 1103], [286, 782, 462, 8, 1556, 1911, 9, 3624], [234, 514, 2926, 13, 9, 929, 226, 369, 2, 4276, 1, 8437], [238, 3897, 8438, 3339, 38, 235, 1, 6, 7, 173], [1, 1394, 665, 651, 5, 327, 3, 1031], [534, 2095, 1, 123, 1, 6, 1, 4722, 1912], [2578, 1395, 383, 45, 3898, 348, 319, 1032, 2, 24, 1, 20, 1104, 387, 103, 1310], [1681, 8439, 3

In [86]:
import tensorflow as tf
from tensorflow.keras.models   import Sequential
# 자연어 : Embedding
from tensorflow.keras.layers   import Dense , Activation , Embedding , Flatten
from tensorflow.keras.datasets import boston_housing , mnist
from tensorflow.keras          import optimizers
from keras.utils.np_utils      import to_categorical

import matplotlib.pyplot as plt
import numpy as np



In [87]:
# 자연어에서 입력층은 Dense 대신 Embedding을 사용한다.

print('Embedding 입력층 - \n 1. 단어사전의 크기 \n 2. 출력의 차원 \n 3. 문장의 길이')

# 단어 사전의 크기
len(tokenizer.word_index)

# 출력의 차원 : 임의로 지정

# 문장의 길이
# X_padded.shape 의 features 개수 40개


# 입력층 생성
model = Sequential()
model.add(Embedding(29657 , 128 , input_length = 40))
model.add( Flatten())

# 은닉층 생성
model.add(Dense(units = 10 , activation = 'relu'))
model.add(Dense(units = 6 ,  activation = 'relu'))
model.add(Dense(units = 4 ,  activation = 'relu'))
model.add(Dense(units = 1 ,  activation = 'sigmoid'))




Embedding 입력층 - 
 1. 단어사전의 크기 
 2. 출력의 차원 
 3. 문장의 길이


In [89]:
# 'binary_crossentropy' 이진분류 때 사용
model.compile(loss = 'binary_crossentropy' , optimizer='adam' , metrics = ['accuracy'])


In [92]:
print('type - ' , type(y_labels))
y_ary = np.array(y_labels)
print('type - ' , type(y_ary) , y_ary.shape)

type -  <class 'list'>
type -  <class 'numpy.ndarray'> (26709,)


In [93]:
model.fit(X_padded , y_ary , epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f12c3541100>

- imbd 영화리뷰실습(긍정 , 부정)
- if error, plz import ) !pip install -q tensorflow-datasets


In [94]:
import tensorflow_datasets as tfds


In [95]:
imdb, info = tfds.load('imdb_reviews' , with_info = True , as_supervised=True)

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteK72ISF/imdb_reviews-train.tfrecord*...…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteK72ISF/imdb_reviews-test.tfrecord*...:…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteK72ISF/imdb_reviews-unsupervised.tfrec…

Dataset imdb_reviews downloaded and prepared to ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [96]:
print('type - ' , type(imdb) , imdb.keys())

type -  <class 'dict'> dict_keys([Split('train'), Split('test'), Split('unsupervised')])


In [98]:
train_data , test_data = imdb['train'] , imdb['test']
print('train - ' , type(train_data))
print('test  - ' , type(test_data))

train -  <class 'tensorflow.python.data.ops.dataset_ops.PrefetchDataset'>
test  -  <class 'tensorflow.python.data.ops.dataset_ops.PrefetchDataset'>


In [103]:
imdb_train_sentences = []
imdb_train_labels    = []

for s , i in train_data :
  imdb_train_sentences.append(str(s.numpy()))
  imdb_train_labels.append(str(i.numpy()))

imdb_test_sentences = []
imdb_test_labels    = []

for s , i in test_data :
  imdb_test_sentences.append(str(s.numpy()))
  imdb_test_labels.append(str(i.numpy()))

print(imdb_train_sentences[0])
print(imdb_train_labels[0])
print()
print(imdb_test_sentences[0])
print(imdb_test_labels[0])
# b means : binary

b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
0

b"There are films that make careers. For George Romero, it was NIGHT OF THE LIVING DEAD; for Kevin Smith, CLERKS; for Robert Rodriguez, EL MARIACHI. Add to that list Onur Tukel's absolutely amazing DING-A-LING-LESS. Flawless film-making, and as assured and as professional as any of t

In [96]:
print('문장의 단어사전 , 토큰 , 시퀀스 ,패딩')
print('모델 구성')
print('모델 컴파일')
print('모델 훈련')
print('모델 평가 (test_data 이용)')
print('모델 예측')
print('시각화')

In [104]:
print('Tokenizer를 이용해서 단어사전 제작하기')


# 1. 시퀀스 작업


tokenizer = Tokenizer(num_words = 10000 , oov_token='<OOV>')
print('type - ' , type(tokenizer))

# 단어사전 만들기
tokenizer.fit_on_texts(imdb_train_sentences)
print('encoding - ',tokenizer.word_index)

# 인코딩된 값을 조합하여 문장별로 보여주는 함수
imdb_train_sequence = tokenizer.texts_to_sequences(imdb_train_sentences)
print('word_sequence \n' , imdb_train_sequence)


imdb_train_padded = pad_sequences(imdb_train_sequence , padding = 'post')
print('shape' , imdb_train_padded.shape)
print('train padding - \n' , imdb_train_padded)

Tokenizer를 이용해서 단어사전 제작하기
type -  <class 'keras.preprocessing.text.Tokenizer'>


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



shape (25000, 2527)
train padding - 
 [[  59   12   14 ...    0    0    0]
 [ 256   28   78 ...    0    0    0]
 [   1 6175    2 ...    0    0    0]
 ...
 [7629   37   11 ...    0    0    0]
 [2677   11  216 ...    0    0    0]
 [3875    5   31 ...    0    0    0]]


In [116]:
print(imdb_train_padded[0])

[59 12 14 ...  0  0  0]


In [106]:
# 자연어에서 입력층은 Dense 대신 Embedding을 사용한다.

print('Embedding 입력층 - \n 1. 단어사전의 크기 \n 2. 출력의 차원 \n 3. 문장의 길이')

# 단어 사전의 크기
len(tokenizer.word_index)

# 출력의 차원 : 임의로 지정

# 문장의 길이
# shape 의 features 개수


# 입력층 생성
model = Sequential()
model.add(Embedding(86539 , 128 , input_length = 2527))
model.add( Flatten())

# 은닉층 생성
# units : 임의적으로 정한다.

model.add(Dense(units = 10 , activation = 'relu'))
model.add(Dense(units = 6 ,  activation = 'relu'))
model.add(Dense(units = 4 ,  activation = 'relu'))

# 출력층 생성
# 이진분류의 결과이기에 units 1개이다.

model.add(Dense(units = 1 ,  activation = 'sigmoid'))




Embedding 입력층 - 
 1. 단어사전의 크기 
 2. 출력의 차원 
 3. 문장의 길이


In [117]:
# 'binary_crossentropy' 이진분류 때 사용
model.compile(loss = 'binary_crossentropy' , optimizer='adam' , metrics = ['accuracy'])


In [112]:
print('type - ' , type(imdb_train_labels))
imdb_train_labels_ary = np.array(imdb_train_labels)
print('type - ' , type(imdb_train_labels_ary) , imdb_train_labels_ary.shape)

# label의 변수형태를 조심할 것.
imdb_train_labels_ary.astype(int)

type -  <class 'list'>
type -  <class 'numpy.ndarray'> (25000,)


In [125]:
model.fit(imdb_train_padded , imdb_train_labels_ary.astype(int) , epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f12b7c7b7c0>

In [126]:
#예측하기 위해 넣을 데이터 test. 시퀀스 작업


tokenizer = Tokenizer(num_words = 10000 , oov_token='<OOV>')
print('type - ' , type(tokenizer))

# 단어사전 만들기
tokenizer.fit_on_texts(imdb_test_sentences)
print('encoding - ',tokenizer.word_index)

# 인코딩된 값을 조합하여 문장별로 보여주는 함수
imdb_test_sequence = tokenizer.texts_to_sequences(imdb_test_sentences)
print('word_sequence \n' , imdb_test_sequence)


imdb_test_padded = pad_sequences(imdb_test_sequence , padding = 'post')
print('shape' , imdb_test_padded.shape)
print('train padding - \n' , imdb_test_padded)

type -  <class 'keras.preprocessing.text.Tokenizer'>


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



shape (25000, 2397)
train padding - 
 [[ 58  43  25 ...   0   0   0]
 [ 58   4   1 ...   0   0   0]
 [  1  16 319 ...   0   0   0]
 ...
 [ 58  91   1 ...   0   0   0]
 [ 58 105 635 ...   0   0   0]
 [ 58  34  42 ...   0   0   0]]


In [135]:
imdb_test_padded.shape

(25000, 2397)

In [127]:
model.predict(imdb_test_padded)

ValueError: ignored

In [133]:
imdb_train_padded.T

array([[  59,  256,    1, ..., 7629, 2677, 3875],
       [  12,   28, 6175, ...,   37,   11,    5],
       [  14,   78,    2, ...,   11,  216,   31],
       ...,
       [   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0]], dtype=int32)