In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense 
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split 
df = pd.read_csv("./dataset/pima-indians-diabetes.csv", header=None) 
x = df.values[:,0:8] 
y = df.values[:,8]
np.unique(y, return_counts=True)

(array([0., 1.]), array([500, 268], dtype=int64))

In [2]:
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y,
                                                    test_size=0.3, 
                                                    random_state=789)
print(x_train.shape, y_train.shape)
print(np.unique(y_train, return_counts=True))
print(x_train[:2,:])

(537, 8) (537,)
(array([0., 1.]), array([350, 187], dtype=int64))
[[  2.     71.     70.     27.      0.     28.      0.586  22.   ]
 [  1.    199.     76.     43.      0.     42.9     1.394  22.   ]]


In [3]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x_train)
x_tr_std = scaler.transform(x_train)
x_te_std = scaler.transform(x_test)

In [4]:
print(x_train[:2,:], x_tr_std[:2,:])

[[  2.     71.     70.     27.      0.     28.      0.586  22.   ]
 [  1.    199.     76.     43.      0.     42.9     1.394  22.   ]] [[-0.56142403 -1.59847063  0.07592301  0.40609133 -0.69103384 -0.47265572
   0.295433   -0.94333269]
 [-0.85187172  2.45870884  0.38440211  1.4260085  -0.69103384  1.30612243
   2.63222749 -0.94333269]]


In [5]:
model = Sequential()
model.add(Dense(20, input_dim=8, activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 20)                180       
_________________________________________________________________
dense_1 (Dense)              (None, 20)                420       
_________________________________________________________________
dense_2 (Dense)              (None, 20)                420       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 21        
Total params: 1,041
Trainable params: 1,041
Non-trainable params: 0
_________________________________________________________________


In [6]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

early_stopping_cb = EarlyStopping(monitor='val_loss', patience=10)
check_pt = ModelCheckpoint(filepath="./pima-std-model.hdf5",
                           monitor='val_loss', verbose=1, save_best_only=True)
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=1000, 
          batch_size=200, callbacks=[early_stopping_cb, check_pt])

Epoch 1/1000
Epoch 00001: val_loss improved from inf to 1.11819, saving model to .\pima-std-model.hdf5
Epoch 2/1000
Epoch 00002: val_loss improved from 1.11819 to 0.96589, saving model to .\pima-std-model.hdf5
Epoch 3/1000
Epoch 00003: val_loss improved from 0.96589 to 0.93131, saving model to .\pima-std-model.hdf5
Epoch 4/1000
Epoch 00004: val_loss improved from 0.93131 to 0.90228, saving model to .\pima-std-model.hdf5
Epoch 5/1000
Epoch 00005: val_loss improved from 0.90228 to 0.86102, saving model to .\pima-std-model.hdf5
Epoch 6/1000
Epoch 00006: val_loss improved from 0.86102 to 0.79058, saving model to .\pima-std-model.hdf5
Epoch 7/1000
Epoch 00007: val_loss improved from 0.79058 to 0.76205, saving model to .\pima-std-model.hdf5
Epoch 8/1000
Epoch 00008: val_loss improved from 0.76205 to 0.75648, saving model to .\pima-std-model.hdf5
Epoch 9/1000
Epoch 00009: val_loss improved from 0.75648 to 0.71173, saving model to .\pima-std-model.hdf5
Epoch 10/1000
Epoch 00010: val_loss impro

Epoch 28/1000
Epoch 00028: val_loss did not improve from 0.61182
Epoch 29/1000
Epoch 00029: val_loss did not improve from 0.61182
Epoch 30/1000
Epoch 00030: val_loss improved from 0.61182 to 0.60961, saving model to .\pima-std-model.hdf5
Epoch 31/1000
Epoch 00031: val_loss did not improve from 0.60961
Epoch 32/1000
Epoch 00032: val_loss improved from 0.60961 to 0.60537, saving model to .\pima-std-model.hdf5
Epoch 33/1000
Epoch 00033: val_loss did not improve from 0.60537
Epoch 34/1000
Epoch 00034: val_loss did not improve from 0.60537
Epoch 35/1000
Epoch 00035: val_loss did not improve from 0.60537
Epoch 36/1000
Epoch 00036: val_loss did not improve from 0.60537
Epoch 37/1000
Epoch 00037: val_loss did not improve from 0.60537
Epoch 38/1000
Epoch 00038: val_loss did not improve from 0.60537
Epoch 39/1000
Epoch 00039: val_loss did not improve from 0.60537
Epoch 40/1000
Epoch 00040: val_loss did not improve from 0.60537
Epoch 41/1000
Epoch 00041: val_loss did not improve from 0.60537
Epoc

<tensorflow.python.keras.callbacks.History at 0x142d7b54748>

In [7]:
from tensorflow.keras.models import load_model

ld_model = load_model("./pima-std-model.hdf5")
ld_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 20)                180       
_________________________________________________________________
dense_1 (Dense)              (None, 20)                420       
_________________________________________________________________
dense_2 (Dense)              (None, 20)                420       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 21        
Total params: 1,041
Trainable params: 1,041
Non-trainable params: 0
_________________________________________________________________


In [8]:
ld_model.evaluate(x_test, y_test)



[0.6053667664527893, 0.6753246784210205]

In [9]:
from konlpy.tag import Okt

In [10]:
okt = Okt()
print(okt.morphs("열심히 코딩한 당신. 연휴에는 여행을 가봐요~"))

['열심히', '코딩', '한', '당신', '.', '연휴', '에는', '여행', '을', '가봐요', '~']


In [11]:
text = "You say goodbay, and I say hello." 
text = text.lower() 
text = text.replace(',', '')
text = text.replace('.', '')
text

'you say goodbay and i say hello'

In [12]:
words = text.split(' ')
words

['you', 'say', 'goodbay', 'and', 'i', 'say', 'hello']

In [13]:
word_to_id = {w:i for i, w in enumerate(set(words))}
word_to_id

{'i': 0, 'hello': 1, 'goodbay': 2, 'say': 3, 'you': 4, 'and': 5}

In [14]:
corpus = [word_to_id[w] for w in words]
corpus

[4, 3, 2, 5, 0, 3, 1]

In [15]:
import nltk
nltk.download('all-corpora')
nltk.download('punkt')

[nltk_data] Downloading collection 'all-corpora'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\etriai08\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\etriai08\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     C:\Users\etriai08\AppData\Roaming\nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     C:\Users\etriai08\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     C:\Users\etriai08\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to
[nltk_da

[nltk_data]    |   Package ptb is already up-to-date!
[nltk_data]    | Downloading package qc to
[nltk_data]    |     C:\Users\etriai08\AppData\Roaming\nltk_data...
[nltk_data]    |   Package qc is already up-to-date!
[nltk_data]    | Downloading package reuters to
[nltk_data]    |     C:\Users\etriai08\AppData\Roaming\nltk_data...
[nltk_data]    |   Package reuters is already up-to-date!
[nltk_data]    | Downloading package rte to
[nltk_data]    |     C:\Users\etriai08\AppData\Roaming\nltk_data...
[nltk_data]    |   Package rte is already up-to-date!
[nltk_data]    | Downloading package semcor to
[nltk_data]    |     C:\Users\etriai08\AppData\Roaming\nltk_data...
[nltk_data]    |   Package semcor is already up-to-date!
[nltk_data]    | Downloading package senseval to
[nltk_data]    |     C:\Users\etriai08\AppData\Roaming\nltk_data...
[nltk_data]    |   Package senseval is already up-to-date!
[nltk_data]    | Downloading package sentence_polarity to
[nltk_data]    |     C:\Users\etriai

True

In [16]:
from nltk.tokenize import word_tokenize
sentence = "Natural language processing (NLP) is a subfield of computer science. So now can I have lbububu"
print(word_tokenize(sentence))

['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'computer', 'science', '.', 'So', 'now', 'can', 'I', 'have', 'lbububu']


In [17]:
text2 = "Ph.D. Rock n  roll New York AT&T Don't $4.55 04/05/22 I'm Mr. Jone 129.254.23.13 "
print(word_tokenize(text2))

['Ph.D.', 'Rock', 'n', 'roll', 'New', 'York', 'AT', '&', 'T', 'Do', "n't", '$', '4.55', '04/05/22', 'I', "'m", 'Mr.', 'Jone', '129.254.23.13']


In [18]:
from nltk.tokenize import word_tokenize
sentence = "I am a boy! 나는 소년이다."
print(word_tokenize(sentence))


['I', 'am', 'a', 'boy', '!', '나는', '소년이다', '.']


In [19]:
import spacy

In [20]:
!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0 MB)
[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


In [21]:
spacy_nlp = spacy.load("en_core_web_sm")
sentence = "Natural language processing (NLP) is a subfield of computer science. So now can I have lbububu"
text = "Ph.D. Rock n roll New York AT&T Don't $4.55 04/05/22 I'm Mr. Jone 129.254.23.13 "

In [22]:
doc = spacy_nlp(sentence)
print([token.text for token in doc])

['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'computer', 'science', '.', 'So', 'now', 'can', 'I', 'have', 'lbububu']


In [23]:
doc = spacy_nlp(text)
print([token.text for token in doc])

['Ph.D.', 'Rock', 'n', 'roll', 'New', 'York', 'AT&T', 'Do', "n't", '$', '4.55', '04/05/22', 'I', "'m", 'Mr.', 'Jone', '129.254.23.13']


In [24]:
print(word_tokenize(sentence))

['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'computer', 'science', '.', 'So', 'now', 'can', 'I', 'have', 'lbububu']


In [25]:
print(word_tokenize(text))

['Ph.D.', 'Rock', 'n', 'roll', 'New', 'York', 'AT', '&', 'T', 'Do', "n't", '$', '4.55', '04/05/22', 'I', "'m", 'Mr.', 'Jone', '129.254.23.13']


In [26]:
hangul = "제가이렇게스페이스를사용하지않아도이해되죠?"
eng = "tobeornottobethatisaquestion"

In [27]:
text = "한글 자연어 처리는 재밌다. 이제부터 열심히 해봐야지~ ㅎㅎㅎ"
print(okt.morphs(text)) #형태소 분석해서 나누기 (tokenize)
print(okt.morphs(text, stem=True)) #어근 True
print(okt.nouns(text)) #명사
print(okt.phrases(text)) #명사구
print(okt.pos(text)) #tokenize 뿐만 아니라 품사까지 튜플형태로 찍어줌 (particle은 완전한 문장 아닌거. 분자???)

['한글', '자연어', '처리', '는', '재밌다', '.', '이제', '부터', '열심히', '해봐야지', '~', 'ㅎㅎㅎ']
['한글', '자연어', '처리', '는', '재밌다', '.', '이제', '부터', '열심히', '해보다', '~', 'ㅎㅎㅎ']
['한글', '자연어', '처리', '이제']
['한글', '한글 자연어', '한글 자연어 처리', '이제', '자연어', '처리']
[('한글', 'Noun'), ('자연어', 'Noun'), ('처리', 'Noun'), ('는', 'Josa'), ('재밌다', 'Adjective'), ('.', 'Punctuation'), ('이제', 'Noun'), ('부터', 'Josa'), ('열심히', 'Adverb'), ('해봐야지', 'Verb'), ('~', 'Punctuation'), ('ㅎㅎㅎ', 'KoreanParticle')]


In [34]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\etriai08\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [35]:
from nltk.tag import pos_tag
sentence = "Natural language processing (NLP) is a subfield of computer science. So now can I have lbububu"
words = word_tokenize(sentence)
pos_tag(words)

[('Natural', 'JJ'),
 ('language', 'NN'),
 ('processing', 'NN'),
 ('(', '('),
 ('NLP', 'NNP'),
 (')', ')'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('subfield', 'NN'),
 ('of', 'IN'),
 ('computer', 'NN'),
 ('science', 'NN'),
 ('.', '.'),
 ('So', 'RB'),
 ('now', 'RB'),
 ('can', 'MD'),
 ('I', 'PRP'),
 ('have', 'VBP'),
 ('lbububu', 'VBN')]

In [36]:
import re # regular expression

In [37]:
re.split("(\w+)", "Wow, it is awesome")

['', 'Wow', ', ', 'it', ' ', 'is', ' ', 'awesome', '']

In [41]:
r = re.compile("a.c") # . 은 반드시 단어 하나 있어야 함
r.search("kkk ac")

In [46]:
r = re.compile("a?c") # ? 는 앞에 문자가 하나 있거나 없거나. 즉, a가 0개 OR 1개.
r.search("kkk bc")

<re.Match object; span=(5, 6), match='c'>

In [49]:
r = re.compile("a*c") # *은 a가 0개 이상
r.search("kkk aaaaaaaac")

<re.Match object; span=(4, 13), match='aaaaaaaac'>

In [51]:
text = """이름: 김철수
전화번호: 010 - 1234 - 5678
나이: 30
성별: 남"""

In [55]:
r = re.compile("[0-9]+")
r.findall(text)

['010', '1234', '5678', '30']

In [58]:
print(re.findall("\d+", text)) #숫자
print(re.findall("\D+", text)) #숫자가 아닌거
print(re.findall("\w+", text)) #문자
print(re.findall("\W+", text)) #문자가 아닌거


['010', '1234', '5678', '30']
['이름: 김철수\n전화번호: ', ' - ', ' - ', '\n나이: ', '\n성별: 남']
['이름', '김철수', '전화번호', '010', '1234', '5678', '나이', '30', '성별', '남']
[': ', '\n', ': ', ' - ', ' - ', '\n', ': ', '\n', ': ']


In [62]:
text = "Regular expression : A regular expression, regex or regexp[1] (someties called a rational expression[2][3] is.)"
pred_text = re.sub('[^a-zA-Z]',' ',text)
pred_text

'Regular expression   A regular expression  regex or regexp     someties called a rational expression       is  '

In [63]:
pred_text.split(" ")

['Regular',
 'expression',
 '',
 '',
 'A',
 'regular',
 'expression',
 '',
 'regex',
 'or',
 'regexp',
 '',
 '',
 '',
 '',
 'someties',
 'called',
 'a',
 'rational',
 'expression',
 '',
 '',
 '',
 '',
 '',
 '',
 'is',
 '',
 '']

In [64]:
re.split("\s+", pred_text)

['Regular',
 'expression',
 'A',
 'regular',
 'expression',
 'regex',
 'or',
 'regexp',
 'someties',
 'called',
 'a',
 'rational',
 'expression',
 'is',
 '']

In [94]:
from sklearn.feature_extraction.text import CountVectorizer

text_data = ["나는 배가 고프다",  # 이거 하나가 도큐먼트
             "내일 점심 뭐먹지",
             "내일 공부 해야겠다",
             "점심 먹고 공부 해야지"]
count_vectorizer = CountVectorizer()
count_vectorizer.fit(text_data)
print(count_vectorizer.vocabulary_)

{'나는': 2, '배가': 6, '고프다': 0, '내일': 3, '점심': 7, '뭐먹지': 5, '공부': 1, '해야겠다': 8, '먹고': 4, '해야지': 9}


In [95]:
sentence = [text_data[0]]
print(count_vectorizer.transform(sentence).toarray())

[[1 0 1 0 0 0 1 0 0 0]]


In [96]:
sentence = ["나는 나는 나는 배가 배가 고프다"]
print(count_vectorizer.transform(sentence).toarray())

[[1 0 3 0 0 0 2 0 0 0]]


In [97]:
print(count_vectorizer.transform(text_data).toarray())

[[1 0 1 0 0 0 1 0 0 0]
 [0 0 0 1 0 1 0 1 0 0]
 [0 1 0 1 0 0 0 0 1 0]
 [0 1 0 0 1 0 0 1 0 1]]


In [98]:
x = np.array(count_vectorizer.transform(text_data).toarray())
y = np.array([0,0,1,1]) #먹는거, 먹는거, 공부하는거, 공부하는거 (문장의 어감이)

model = Sequential()
model.add(Dense(10, input_dim=10, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_20 (Dense)             (None, 10)                110       
_________________________________________________________________
dense_21 (Dense)             (None, 10)                110       
_________________________________________________________________
dense_22 (Dense)             (None, 10)                110       
_________________________________________________________________
dense_23 (Dense)             (None, 1)                 11        
Total params: 341
Trainable params: 341
Non-trainable params: 0
_________________________________________________________________


In [117]:
model.fit(x, y, epochs=1000)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

Epoch 84/1000
Epoch 85/1000
Epoch 86/1000
Epoch 87/1000
Epoch 88/1000
Epoch 89/1000
Epoch 90/1000
Epoch 91/1000
Epoch 92/1000
Epoch 93/1000
Epoch 94/1000
Epoch 95/1000
Epoch 96/1000
Epoch 97/1000
Epoch 98/1000
Epoch 99/1000
Epoch 100/1000
Epoch 101/1000
Epoch 102/1000
Epoch 103/1000
Epoch 104/1000
Epoch 105/1000
Epoch 106/1000
Epoch 107/1000
Epoch 108/1000
Epoch 109/1000
Epoch 110/1000
Epoch 111/1000
Epoch 112/1000
Epoch 113/1000
Epoch 114/1000
Epoch 115/1000
Epoch 116/1000
Epoch 117/1000
Epoch 118/1000
Epoch 119/1000
Epoch 120/1000
Epoch 121/1000
Epoch 122/1000
Epoch 123/1000
Epoch 124/1000
Epoch 125/1000
Epoch 126/1000
Epoch 127/1000
Epoch 128/1000
Epoch 129/1000
Epoch 130/1000
Epoch 131/1000
Epoch 132/1000
Epoch 133/1000
Epoch 134/1000
Epoch 135/1000
Epoch 136/1000
Epoch 137/1000
Epoch 138/1000
Epoch 139/1000
Epoch 140/1000
Epoch 141/1000
Epoch 142/1000
Epoch 143/1000
Epoch 144/1000
Epoch 145/1000
Epoch 146/1000
Epoch 147/1000
Epoch 148/1000
Epoch 149/1000
Epoch 150/1000
Epoch 151/1

Epoch 164/1000
Epoch 165/1000
Epoch 166/1000
Epoch 167/1000
Epoch 168/1000
Epoch 169/1000
Epoch 170/1000
Epoch 171/1000
Epoch 172/1000
Epoch 173/1000
Epoch 174/1000
Epoch 175/1000
Epoch 176/1000
Epoch 177/1000
Epoch 178/1000
Epoch 179/1000
Epoch 180/1000
Epoch 181/1000
Epoch 182/1000
Epoch 183/1000
Epoch 184/1000
Epoch 185/1000
Epoch 186/1000
Epoch 187/1000
Epoch 188/1000
Epoch 189/1000
Epoch 190/1000
Epoch 191/1000
Epoch 192/1000
Epoch 193/1000
Epoch 194/1000
Epoch 195/1000
Epoch 196/1000
Epoch 197/1000
Epoch 198/1000
Epoch 199/1000
Epoch 200/1000
Epoch 201/1000
Epoch 202/1000
Epoch 203/1000
Epoch 204/1000
Epoch 205/1000
Epoch 206/1000
Epoch 207/1000
Epoch 208/1000
Epoch 209/1000
Epoch 210/1000
Epoch 211/1000
Epoch 212/1000
Epoch 213/1000
Epoch 214/1000
Epoch 215/1000
Epoch 216/1000
Epoch 217/1000
Epoch 218/1000
Epoch 219/1000
Epoch 220/1000
Epoch 221/1000
Epoch 222/1000
Epoch 223/1000
Epoch 224/1000
Epoch 225/1000
Epoch 226/1000
Epoch 227/1000
Epoch 228/1000
Epoch 229/1000
Epoch 230/

Epoch 242/1000
Epoch 243/1000
Epoch 244/1000
Epoch 245/1000
Epoch 246/1000
Epoch 247/1000
Epoch 248/1000
Epoch 249/1000
Epoch 250/1000
Epoch 251/1000
Epoch 252/1000
Epoch 253/1000
Epoch 254/1000
Epoch 255/1000
Epoch 256/1000
Epoch 257/1000
Epoch 258/1000
Epoch 259/1000
Epoch 260/1000
Epoch 261/1000
Epoch 262/1000
Epoch 263/1000
Epoch 264/1000
Epoch 265/1000
Epoch 266/1000
Epoch 267/1000
Epoch 268/1000
Epoch 269/1000
Epoch 270/1000
Epoch 271/1000
Epoch 272/1000
Epoch 273/1000
Epoch 274/1000
Epoch 275/1000
Epoch 276/1000
Epoch 277/1000
Epoch 278/1000
Epoch 279/1000
Epoch 280/1000
Epoch 281/1000
Epoch 282/1000
Epoch 283/1000
Epoch 284/1000
Epoch 285/1000
Epoch 286/1000
Epoch 287/1000
Epoch 288/1000
Epoch 289/1000
Epoch 290/1000
Epoch 291/1000
Epoch 292/1000
Epoch 293/1000
Epoch 294/1000
Epoch 295/1000
Epoch 296/1000
Epoch 297/1000
Epoch 298/1000
Epoch 299/1000
Epoch 300/1000
Epoch 301/1000
Epoch 302/1000
Epoch 303/1000
Epoch 304/1000
Epoch 305/1000
Epoch 306/1000
Epoch 307/1000
Epoch 308/

Epoch 320/1000
Epoch 321/1000
Epoch 322/1000
Epoch 323/1000
Epoch 324/1000
Epoch 325/1000
Epoch 326/1000
Epoch 327/1000
Epoch 328/1000
Epoch 329/1000
Epoch 330/1000
Epoch 331/1000
Epoch 332/1000
Epoch 333/1000
Epoch 334/1000
Epoch 335/1000
Epoch 336/1000
Epoch 337/1000
Epoch 338/1000
Epoch 339/1000
Epoch 340/1000
Epoch 341/1000
Epoch 342/1000
Epoch 343/1000
Epoch 344/1000
Epoch 345/1000
Epoch 346/1000
Epoch 347/1000
Epoch 348/1000
Epoch 349/1000
Epoch 350/1000
Epoch 351/1000
Epoch 352/1000
Epoch 353/1000
Epoch 354/1000
Epoch 355/1000
Epoch 356/1000
Epoch 357/1000
Epoch 358/1000
Epoch 359/1000
Epoch 360/1000
Epoch 361/1000
Epoch 362/1000
Epoch 363/1000
Epoch 364/1000
Epoch 365/1000
Epoch 366/1000
Epoch 367/1000
Epoch 368/1000
Epoch 369/1000
Epoch 370/1000
Epoch 371/1000
Epoch 372/1000
Epoch 373/1000
Epoch 374/1000
Epoch 375/1000
Epoch 376/1000
Epoch 377/1000
Epoch 378/1000
Epoch 379/1000
Epoch 380/1000
Epoch 381/1000
Epoch 382/1000
Epoch 383/1000
Epoch 384/1000
Epoch 385/1000
Epoch 386/

Epoch 398/1000
Epoch 399/1000
Epoch 400/1000
Epoch 401/1000
Epoch 402/1000
Epoch 403/1000
Epoch 404/1000
Epoch 405/1000
Epoch 406/1000
Epoch 407/1000
Epoch 408/1000
Epoch 409/1000
Epoch 410/1000
Epoch 411/1000
Epoch 412/1000
Epoch 413/1000
Epoch 414/1000
Epoch 415/1000
Epoch 416/1000
Epoch 417/1000
Epoch 418/1000
Epoch 419/1000
Epoch 420/1000
Epoch 421/1000
Epoch 422/1000
Epoch 423/1000
Epoch 424/1000
Epoch 425/1000
Epoch 426/1000
Epoch 427/1000
Epoch 428/1000
Epoch 429/1000
Epoch 430/1000
Epoch 431/1000
Epoch 432/1000
Epoch 433/1000
Epoch 434/1000
Epoch 435/1000
Epoch 436/1000
Epoch 437/1000
Epoch 438/1000
Epoch 439/1000
Epoch 440/1000
Epoch 441/1000
Epoch 442/1000
Epoch 443/1000
Epoch 444/1000
Epoch 445/1000
Epoch 446/1000
Epoch 447/1000
Epoch 448/1000
Epoch 449/1000
Epoch 450/1000
Epoch 451/1000
Epoch 452/1000
Epoch 453/1000
Epoch 454/1000
Epoch 455/1000
Epoch 456/1000
Epoch 457/1000
Epoch 458/1000
Epoch 459/1000
Epoch 460/1000
Epoch 461/1000
Epoch 462/1000
Epoch 463/1000
Epoch 464/

Epoch 476/1000
Epoch 477/1000
Epoch 478/1000
Epoch 479/1000
Epoch 480/1000
Epoch 481/1000
Epoch 482/1000
Epoch 483/1000
Epoch 484/1000
Epoch 485/1000
Epoch 486/1000
Epoch 487/1000
Epoch 488/1000
Epoch 489/1000
Epoch 490/1000
Epoch 491/1000
Epoch 492/1000
Epoch 493/1000
Epoch 494/1000
Epoch 495/1000
Epoch 496/1000
Epoch 497/1000
Epoch 498/1000
Epoch 499/1000
Epoch 500/1000
Epoch 501/1000
Epoch 502/1000
Epoch 503/1000
Epoch 504/1000
Epoch 505/1000
Epoch 506/1000
Epoch 507/1000
Epoch 508/1000
Epoch 509/1000
Epoch 510/1000
Epoch 511/1000
Epoch 512/1000
Epoch 513/1000
Epoch 514/1000
Epoch 515/1000
Epoch 516/1000
Epoch 517/1000
Epoch 518/1000
Epoch 519/1000
Epoch 520/1000
Epoch 521/1000
Epoch 522/1000
Epoch 523/1000
Epoch 524/1000
Epoch 525/1000
Epoch 526/1000
Epoch 527/1000
Epoch 528/1000
Epoch 529/1000
Epoch 530/1000
Epoch 531/1000
Epoch 532/1000
Epoch 533/1000
Epoch 534/1000
Epoch 535/1000
Epoch 536/1000
Epoch 537/1000
Epoch 538/1000
Epoch 539/1000
Epoch 540/1000
Epoch 541/1000
Epoch 542/

Epoch 554/1000
Epoch 555/1000
Epoch 556/1000
Epoch 557/1000
Epoch 558/1000
Epoch 559/1000
Epoch 560/1000
Epoch 561/1000
Epoch 562/1000
Epoch 563/1000
Epoch 564/1000
Epoch 565/1000
Epoch 566/1000
Epoch 567/1000
Epoch 568/1000
Epoch 569/1000
Epoch 570/1000
Epoch 571/1000
Epoch 572/1000
Epoch 573/1000
Epoch 574/1000
Epoch 575/1000
Epoch 576/1000
Epoch 577/1000
Epoch 578/1000
Epoch 579/1000
Epoch 580/1000
Epoch 581/1000
Epoch 582/1000
Epoch 583/1000
Epoch 584/1000
Epoch 585/1000
Epoch 586/1000
Epoch 587/1000
Epoch 588/1000
Epoch 589/1000
Epoch 590/1000
Epoch 591/1000
Epoch 592/1000
Epoch 593/1000
Epoch 594/1000
Epoch 595/1000
Epoch 596/1000
Epoch 597/1000
Epoch 598/1000
Epoch 599/1000
Epoch 600/1000
Epoch 601/1000
Epoch 602/1000
Epoch 603/1000
Epoch 604/1000
Epoch 605/1000
Epoch 606/1000
Epoch 607/1000
Epoch 608/1000
Epoch 609/1000
Epoch 610/1000
Epoch 611/1000
Epoch 612/1000
Epoch 613/1000
Epoch 614/1000
Epoch 615/1000
Epoch 616/1000
Epoch 617/1000
Epoch 618/1000
Epoch 619/1000
Epoch 620/

Epoch 632/1000
Epoch 633/1000
Epoch 634/1000
Epoch 635/1000
Epoch 636/1000
Epoch 637/1000
Epoch 638/1000
Epoch 639/1000
Epoch 640/1000
Epoch 641/1000
Epoch 642/1000
Epoch 643/1000
Epoch 644/1000
Epoch 645/1000
Epoch 646/1000
Epoch 647/1000
Epoch 648/1000
Epoch 649/1000
Epoch 650/1000
Epoch 651/1000
Epoch 652/1000
Epoch 653/1000
Epoch 654/1000
Epoch 655/1000
Epoch 656/1000
Epoch 657/1000
Epoch 658/1000
Epoch 659/1000
Epoch 660/1000
Epoch 661/1000
Epoch 662/1000
Epoch 663/1000
Epoch 664/1000
Epoch 665/1000
Epoch 666/1000
Epoch 667/1000
Epoch 668/1000
Epoch 669/1000
Epoch 670/1000
Epoch 671/1000
Epoch 672/1000
Epoch 673/1000
Epoch 674/1000
Epoch 675/1000
Epoch 676/1000
Epoch 677/1000
Epoch 678/1000
Epoch 679/1000
Epoch 680/1000
Epoch 681/1000
Epoch 682/1000
Epoch 683/1000
Epoch 684/1000
Epoch 685/1000
Epoch 686/1000
Epoch 687/1000
Epoch 688/1000
Epoch 689/1000
Epoch 690/1000
Epoch 691/1000
Epoch 692/1000
Epoch 693/1000
Epoch 694/1000
Epoch 695/1000
Epoch 696/1000
Epoch 697/1000
Epoch 698/

Epoch 710/1000
Epoch 711/1000
Epoch 712/1000
Epoch 713/1000
Epoch 714/1000
Epoch 715/1000
Epoch 716/1000
Epoch 717/1000
Epoch 718/1000
Epoch 719/1000
Epoch 720/1000
Epoch 721/1000
Epoch 722/1000
Epoch 723/1000
Epoch 724/1000
Epoch 725/1000
Epoch 726/1000
Epoch 727/1000
Epoch 728/1000
Epoch 729/1000
Epoch 730/1000
Epoch 731/1000
Epoch 732/1000
Epoch 733/1000
Epoch 734/1000
Epoch 735/1000
Epoch 736/1000
Epoch 737/1000
Epoch 738/1000
Epoch 739/1000
Epoch 740/1000
Epoch 741/1000
Epoch 742/1000
Epoch 743/1000
Epoch 744/1000
Epoch 745/1000
Epoch 746/1000
Epoch 747/1000
Epoch 748/1000
Epoch 749/1000
Epoch 750/1000
Epoch 751/1000
Epoch 752/1000
Epoch 753/1000
Epoch 754/1000
Epoch 755/1000
Epoch 756/1000
Epoch 757/1000
Epoch 758/1000
Epoch 759/1000
Epoch 760/1000
Epoch 761/1000
Epoch 762/1000
Epoch 763/1000
Epoch 764/1000
Epoch 765/1000
Epoch 766/1000
Epoch 767/1000
Epoch 768/1000
Epoch 769/1000
Epoch 770/1000
Epoch 771/1000
Epoch 772/1000
Epoch 773/1000
Epoch 774/1000
Epoch 775/1000
Epoch 776/

Epoch 788/1000
Epoch 789/1000
Epoch 790/1000
Epoch 791/1000
Epoch 792/1000
Epoch 793/1000
Epoch 794/1000
Epoch 795/1000
Epoch 796/1000
Epoch 797/1000
Epoch 798/1000
Epoch 799/1000
Epoch 800/1000
Epoch 801/1000
Epoch 802/1000
Epoch 803/1000
Epoch 804/1000
Epoch 805/1000
Epoch 806/1000
Epoch 807/1000
Epoch 808/1000
Epoch 809/1000
Epoch 810/1000
Epoch 811/1000
Epoch 812/1000
Epoch 813/1000
Epoch 814/1000
Epoch 815/1000
Epoch 816/1000
Epoch 817/1000
Epoch 818/1000
Epoch 819/1000
Epoch 820/1000
Epoch 821/1000
Epoch 822/1000
Epoch 823/1000
Epoch 824/1000
Epoch 825/1000
Epoch 826/1000
Epoch 827/1000
Epoch 828/1000
Epoch 829/1000
Epoch 830/1000
Epoch 831/1000
Epoch 832/1000
Epoch 833/1000
Epoch 834/1000
Epoch 835/1000
Epoch 836/1000
Epoch 837/1000
Epoch 838/1000
Epoch 839/1000
Epoch 840/1000
Epoch 841/1000
Epoch 842/1000
Epoch 843/1000
Epoch 844/1000
Epoch 845/1000
Epoch 846/1000
Epoch 847/1000
Epoch 848/1000
Epoch 849/1000
Epoch 850/1000
Epoch 851/1000
Epoch 852/1000
Epoch 853/1000
Epoch 854/

Epoch 866/1000
Epoch 867/1000
Epoch 868/1000
Epoch 869/1000
Epoch 870/1000
Epoch 871/1000
Epoch 872/1000
Epoch 873/1000
Epoch 874/1000
Epoch 875/1000
Epoch 876/1000
Epoch 877/1000
Epoch 878/1000
Epoch 879/1000
Epoch 880/1000
Epoch 881/1000
Epoch 882/1000
Epoch 883/1000
Epoch 884/1000
Epoch 885/1000
Epoch 886/1000
Epoch 887/1000
Epoch 888/1000
Epoch 889/1000
Epoch 890/1000
Epoch 891/1000
Epoch 892/1000
Epoch 893/1000
Epoch 894/1000
Epoch 895/1000
Epoch 896/1000
Epoch 897/1000
Epoch 898/1000
Epoch 899/1000
Epoch 900/1000
Epoch 901/1000
Epoch 902/1000
Epoch 903/1000
Epoch 904/1000
Epoch 905/1000
Epoch 906/1000
Epoch 907/1000
Epoch 908/1000
Epoch 909/1000
Epoch 910/1000
Epoch 911/1000
Epoch 912/1000
Epoch 913/1000
Epoch 914/1000
Epoch 915/1000
Epoch 916/1000
Epoch 917/1000
Epoch 918/1000
Epoch 919/1000
Epoch 920/1000
Epoch 921/1000
Epoch 922/1000
Epoch 923/1000
Epoch 924/1000
Epoch 925/1000
Epoch 926/1000
Epoch 927/1000
Epoch 928/1000
Epoch 929/1000
Epoch 930/1000
Epoch 931/1000
Epoch 932/

Epoch 944/1000
Epoch 945/1000
Epoch 946/1000
Epoch 947/1000
Epoch 948/1000
Epoch 949/1000
Epoch 950/1000
Epoch 951/1000
Epoch 952/1000
Epoch 953/1000
Epoch 954/1000
Epoch 955/1000
Epoch 956/1000
Epoch 957/1000
Epoch 958/1000
Epoch 959/1000
Epoch 960/1000
Epoch 961/1000
Epoch 962/1000
Epoch 963/1000
Epoch 964/1000
Epoch 965/1000
Epoch 966/1000
Epoch 967/1000
Epoch 968/1000
Epoch 969/1000
Epoch 970/1000
Epoch 971/1000
Epoch 972/1000
Epoch 973/1000
Epoch 974/1000
Epoch 975/1000
Epoch 976/1000
Epoch 977/1000
Epoch 978/1000
Epoch 979/1000
Epoch 980/1000
Epoch 981/1000
Epoch 982/1000
Epoch 983/1000
Epoch 984/1000
Epoch 985/1000
Epoch 986/1000
Epoch 987/1000
Epoch 988/1000
Epoch 989/1000
Epoch 990/1000
Epoch 991/1000
Epoch 992/1000
Epoch 993/1000
Epoch 994/1000
Epoch 995/1000
Epoch 996/1000
Epoch 997/1000
Epoch 998/1000
Epoch 999/1000
Epoch 1000/1000


<tensorflow.python.keras.callbacks.History at 0x142fef2c508>

In [118]:
test = np.array(count_vectorizer.transform(["나는 뭐먹지"]).toarray())
test

array([[0, 0, 1, 0, 0, 1, 0, 0, 0, 0]], dtype=int64)

In [119]:
model.predict(test)

array([[0.00570694]], dtype=float32)

In [130]:
from sklearn.feature_extraction.text import CountVectorizer

text_data = ["나는 배가 고프다 너는 배가 고프냐",  # 이거 하나가 도큐먼트
             "내일 점심 뭐먹지 너는 뭐먹지",
             "내일 공부 해야겠다 내일 공부 해야지",
             "점심 먹고 공부 해야지 너는 공부 해야지"]
count_vectorizer = CountVectorizer()
count_vectorizer.fit(text_data)
print(count_vectorizer.vocabulary_)

{'나는': 3, '배가': 8, '고프다': 1, '너는': 5, '고프냐': 0, '내일': 4, '점심': 9, '뭐먹지': 7, '공부': 2, '해야겠다': 10, '해야지': 11, '먹고': 6}


In [131]:
print(count_vectorizer.transform(text_data).toarray())

[[1 1 0 1 0 1 0 0 2 0 0 0]
 [0 0 0 0 1 1 0 2 0 1 0 0]
 [0 0 2 0 2 0 0 0 0 0 1 1]
 [0 0 2 0 0 1 1 0 0 1 0 2]]


In [133]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidt_vectorizer = TfidfVectorizer()
tfidt_vectorizer.fit(text_data)
print(tfidt_vectorizer.transform(text_data).toarray())
print(tfidt_vectorizer.vocabulary_)

[[0.36742339 0.36742339 0.         0.36742339 0.         0.23452159
  0.         0.         0.73484678 0.         0.         0.        ]
 [0.         0.         0.         0.         0.33166972 0.26851522
  0.         0.84136197 0.         0.33166972 0.         0.        ]
 [0.         0.         0.61404114 0.         0.61404114 0.
  0.         0.         0.         0.         0.38941666 0.30702057]
 [0.         0.         0.59590882 0.         0.         0.24121977
  0.37791739 0.         0.         0.29795441 0.         0.59590882]]
{'나는': 3, '배가': 8, '고프다': 1, '너는': 5, '고프냐': 0, '내일': 4, '점심': 9, '뭐먹지': 7, '공부': 2, '해야겠다': 10, '해야지': 11, '먹고': 6}


In [134]:
cv = count_vectorizer.transform(text_data).toarray()
tv = tfidt_vectorizer.transform(text_data).toarray() # cv, tv의 벡터 공간은 동일해야한다

In [135]:
print(cv, tv)

[[1 1 0 1 0 1 0 0 2 0 0 0]
 [0 0 0 0 1 1 0 2 0 1 0 0]
 [0 0 2 0 2 0 0 0 0 0 1 1]
 [0 0 2 0 0 1 1 0 0 1 0 2]] [[0.36742339 0.36742339 0.         0.36742339 0.         0.23452159
  0.         0.         0.73484678 0.         0.         0.        ]
 [0.         0.         0.         0.         0.33166972 0.26851522
  0.         0.84136197 0.         0.33166972 0.         0.        ]
 [0.         0.         0.61404114 0.         0.61404114 0.
  0.         0.         0.         0.         0.38941666 0.30702057]
 [0.         0.         0.59590882 0.         0.         0.24121977
  0.37791739 0.         0.         0.29795441 0.         0.59590882]]


In [143]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances

points = np.array([[1,1], [5,5]])
euclidean_distances(points[0:1], points[1:2]) # 멀먼 멀 수록 숫자가 큼

array([[5.65685425]])

In [144]:
cosine_similarity(points[0:1], points[1:2]) # 유사하면 1, 다르면 0 에 가깝게 나옴

array([[1.]])

In [146]:
manhattan_distances(points[0:1], points[1:2]) # 제곱에 루트 씌우는게 아니고. x속성 차, y속성 차 를 각각 절대값을 더함 (4+4)

array([[8.]])

In [151]:
print(euclidean_distances(cv[0:1], cv[1:2])) # 1번 문장과 2번 문장
print(euclidean_distances(cv[0:1], cv[2:3])) # 1번 문장과 3번 문장
print(euclidean_distances(tv[0:1], tv[1:2]))
print(euclidean_distances(tv[0:1], tv[2:3]))
print(cosine_similarity(cv[0:1], cv[1:2])) # 1번 문장과 2번 문장
print(cosine_similarity(cv[0:1], cv[2:3])) # 1번 문장과 3번 문장
print(cosine_similarity(tv[0:1], tv[1:2]))
print(cosine_similarity(tv[0:1], tv[2:3]))

[[3.60555128]]
[[4.24264069]]
[[1.3689612]]
[[1.41421356]]
[[0.13363062]]
[[0.]]
[[0.06297262]]
[[0.]]


In [148]:
print(cv[0:1])

[[1 1 0 1 0 1 0 0 2 0 0 0]]
