In [None]:
import pandas as pd
import urllib.request
import matplotlib.pyplot as plt
import re
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import Counter
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [None]:
# Install KoNLPy and fetch the repository that ships the Mecab-ko install script for Colab.
!pip install konlpy
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git 
# Run the install script from inside the cloned repository, then return to the parent directory.
%cd Mecab-ko-for-Google-Colab/
!bash install_mecab-ko_on_colab190912.sh
%cd ../

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.5 MB/s 
Collecting JPype1>=0.7.0
  Downloading JPype1-1.4.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (453 kB)
[K     |████████████████████████████████| 453 kB 51.9 MB/s 
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.0 konlpy-0.6.0
Cloning into 'Mecab-ko-for-Google-Colab'...
remote: Enumerating objects: 115, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 115 (delta 11), reused 10 (delta 3), pack-reused 91[K
Receiving objects: 100% (115/115), 1.27 MiB | 10.16 MiB/s, done.
Resolving deltas: 100% (50/50), done.
/content/Mecab-ko-for-Google-Colab
Installing konlpy.....
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/cola

In [None]:
# Download the three NSMC rating files into the working directory.
nsmc_downloads = [
    ("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", "ratings_train.txt"),  # train
    ("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", "ratings_test.txt"),  # test
    ("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", "ratings.txt"),  # train + test
]
download_results = [urllib.request.urlretrieve(url, filename=name) for url, name in nsmc_downloads]
download_results[-1]  # the cell still displays the result of the last download

('ratings.txt', <http.client.HTTPMessage at 0x7fef89ad89d0>)

In [None]:
# Load the tab-separated train / test files downloaded above.
train_data, test_data = (
    pd.read_table(path) for path in ('ratings_train.txt', 'ratings_test.txt')
)

In [None]:
train_data.head(10)

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1
5,5403919,막 걸음마 뗀 3세부터 초등학교 1학년생인 8살용영화.ㅋㅋㅋ...별반개도 아까움.,0
6,7797314,원작의 긴장감을 제대로 살려내지못했다.,0
7,9443947,별 반개도 아깝다 욕나온다 이응경 길용우 연기생활이몇년인지..정말 발로해도 그것보단...,0
8,7156791,액션이 없는데도 재미 있는 몇안되는 영화,1
9,5912145,왜케 평점이 낮은건데? 꽤 볼만한데.. 헐리우드식 화려함에만 너무 길들여져 있나?,1


# Mecab

In [None]:
from konlpy.tag import Mecab
mecab = Mecab()

In [None]:
def tokenize(data, tokenizer):
    """Split every sentence in ``data`` into tokens.

    Args:
        data: Iterable of sentences.
        tokenizer: Object exposing ``morphs(sentence)``; the notebook passes
            a Mecab morphological analyzer.

    Returns:
        A list with one entry per sentence, each entry being whatever
        ``tokenizer.morphs`` returned for it. No stopword filtering is applied.
    """
    return [tokenizer.morphs(sentence) for sentence in data]

In [None]:
def load_data(train_data, test_data, num_words=10000):
    """Clean, tokenize and index-encode the train / test review frames.

    Relies on the notebook-level ``tokenize`` helper and ``mecab`` analyzer.

    Args:
        train_data: DataFrame with ``document`` and ``label`` columns.
        test_data: DataFrame with ``document`` and ``label`` columns.
        num_words: Total vocabulary size, including the four special tokens
            ``<PAD>``, ``<BOS>``, ``<UNK>`` and ``<UNUSED>``.

    Returns:
        ``(x_train, y_train, x_test, y_test, word_to_index)`` where the ``x``
        values are lists of index lists (one per sentence), the ``y`` values
        are NumPy label arrays and ``word_to_index`` maps token -> index.
        Tokens outside the vocabulary are encoded as ``<UNK>``.
    """
    # Drop duplicated reviews and rows with missing values. New frames are
    # built instead of mutating the caller's DataFrames in place.
    train_data = train_data.drop_duplicates(subset=['document']).dropna(how='any')
    test_data = test_data.drop_duplicates(subset=['document']).dropna(how='any')

    # Morphological tokenization (no stopword removal is performed).
    x_train = tokenize(train_data['document'], mecab)
    x_test = tokenize(test_data['document'], mecab)

    # Build the vocabulary from the training split only, honouring num_words.
    special_tokens = ['<PAD>', '<BOS>', '<UNK>', '<UNUSED>']
    counter = Counter(word for sentence in x_train for word in sentence)
    most_common = counter.most_common(max(num_words - len(special_tokens), 0))
    vocab = special_tokens + [key for key, _ in most_common]
    word_to_index = {word: index for index, word in enumerate(vocab)}
    unk_index = word_to_index['<UNK>']

    def wordlist_to_indexlist(wordlist):
        # Unknown tokens fall back to the <UNK> index.
        return [word_to_index.get(word, unk_index) for word in wordlist]

    x_train = list(map(wordlist_to_indexlist, x_train))
    x_test = list(map(wordlist_to_indexlist, x_test))

    return x_train, np.array(list(train_data['label'])), x_test, np.array(list(test_data['label'])), word_to_index

In [None]:
X_train, y_train, X_test, y_test, word_to_index = load_data(train_data, test_data)

In [None]:
index_to_word = {index:word for word, index in word_to_index.items()}

In [None]:
# Convert a single sentence into a list of word indices using the given dictionary.
# Note: every encoded sentence starts with <BOS>.

def get_encoded_sentence(sentence, word_to_index): ##### text -> numbers
    encoded = [word_to_index['<BOS>']]
    for word in sentence.split():
        if word in word_to_index:
            encoded.append(word_to_index[word])
        else:
            # Words missing from the dictionary map to <UNK>.
            encoded.append(word_to_index['<UNK>'])
    return encoded

# Encode a whole list of sentences into lists of word indices in one call.
def get_encoded_sentences(sentences, word_to_index):
    encoded = []
    for sentence in sentences:
        encoded.append(get_encoded_sentence(sentence, word_to_index))
    return encoded

# Decode an index-encoded sentence back into text. ##### numbers -> text
def get_decoded_sentence(encoded_sentence, index_to_word):
    """Turn a sequence of word indices back into a space-joined sentence.

    Args:
        encoded_sentence: Sequence of integer word indices.
        index_to_word: Mapping index -> token.

    Returns:
        The decoded tokens joined by single spaces. Indices missing from
        ``index_to_word`` are rendered as ``'<UNK>'``.
    """
    tokens = list(encoded_sentence)
    # Strip the leading marker only when it really is <BOS>. Sequences that
    # were encoded without a <BOS> prefix keep their first real token.
    if tokens and index_to_word.get(tokens[0]) == '<BOS>':
        tokens = tokens[1:]
    return ' '.join(index_to_word.get(index, '<UNK>') for index in tokens)

# Decode a whole list of index-encoded sentences back into text in one call.
def get_decoded_sentences(encoded_sentences, index_to_word):
    decoded = []
    for encoded_sentence in encoded_sentences:
        decoded.append(get_decoded_sentence(encoded_sentence, index_to_word))
    return decoded

In [None]:
get_decoded_sentence(X_train[5], index_to_word)

'<UNK> <UNK> 3 세 부터 초등 학교 1 학년 생 인 8 살 용 영화 . ㅋㅋㅋ . .. 별반 개 도 아까움 .'

In [None]:
# Sentence-length distribution over the whole dataset (train + test)
total_data_text = list(X_train) + list(X_test)

# Number of tokens in every sentence
num_tokens = np.array([len(tokens) for tokens in total_data_text])

# Mean, maximum and standard deviation of the sentence length
print('문장길이 평균 :', np.mean(num_tokens))
print('문장길이 최대 :', np.max(num_tokens))
print('문장길이 표준편차 : ', np.std(num_tokens))

# Use (mean + 2 * std) as the maximum sequence length
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)

maxlen = int(max_tokens)
print('pad_sequences maxlen : ', maxlen)
# The message prints a '%' sign, so scale the covered fraction to a percentage.
coverage_pct = np.sum(num_tokens < max_tokens) / len(num_tokens) * 100
print('전체 문장의 {}%가 maxlen 설정값 이내에 포함됩니다. '.format(coverage_pct))

문장길이 평균 : 18.722963668289488
문장길이 최대 : 116
문장길이 표준편차 :  15.329504488772837
pad_sequences maxlen :  49
전체 문장의 0.9346725436292804%가 maxlen 설정값 이내에 포함됩니다. 


In [None]:
# Add padding: left-pad every sequence with the <PAD> index up to `maxlen`.
pad_options = dict(value=word_to_index['<PAD>'], padding='pre', maxlen=maxlen)
X_train = pad_sequences(X_train, **pad_options)
X_test = pad_sequences(X_test, **pad_options)

In [None]:
print(X_train.shape)
print(X_test.shape)

(146182, 49)
(49157, 49)


In [None]:
vocab_size = 10000
word_vector_dim = 200  # multiple of 2

# Embedding -> LSTM(8) -> Dense(8, relu) -> single sigmoid output.
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)),
    keras.layers.LSTM(8),
    keras.layers.Dense(8, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 200)         2000000   
                                                                 
 lstm (LSTM)                 (None, 8)                 6688      
                                                                 
 dense (Dense)               (None, 8)                 72        
                                                                 
 dense_1 (Dense)             (None, 1)                 9         
                                                                 
Total params: 2,006,769
Trainable params: 2,006,769
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Hold out the first 50,000 samples for validation and train on the rest.
X_val, partial_X_train = X_train[:50000], X_train[50000:]
y_val, partial_y_train = y_train[:50000], y_train[50000:]

In [None]:
# Early stopping: monitor val_loss with a patience of 20 epochs and restore the best weights.
es = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights = True)

In [None]:
epochs = 100
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train on the non-validation part; `es` stops training early.
history = model.fit(
    partial_X_train,
    partial_y_train,
    epochs=epochs,
    batch_size=512,
    validation_data=(X_val, y_val),
    callbacks=[es],
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100


In [None]:
results = model.evaluate(X_test, y_test, verbose=2)

1537/1537 - 4s - loss: 0.3532 - accuracy: 0.8460 - 4s/epoch - 3ms/step


In [None]:
print(results)

[0.35317349433898926, 0.8460239768028259]


# Sentencepiece
# 커스텀 함수 선언

In [None]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 4.7 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97


In [None]:
import sentencepiece as spm
import csv

In [None]:
def spm_write_and_tokenize(data, file_name, model = 'bpe', vocab_size = 8000):
    """Train a SentencePiece model on ``data`` and return its vocabulary table.

    The sentences are written one per line to ``<file_name>.txt``, which is
    then used as the training corpus. The trainer writes
    ``<file_name>_vocab_<vocab_size>_<model>.model`` and ``.vocab``.

    Args:
        data: Iterable of sentences (strings).
        file_name: Base name for the corpus file and the model prefix.
        model: SentencePiece model type (unigram, bpe, char or word).
        vocab_size: Target vocabulary size.

    Returns:
        DataFrame read from the generated ``.vocab`` file (tab-separated,
        no header row).
    """
    corpus = file_name + '.txt'
    with open(corpus, 'w', encoding='utf8') as f:
        f.write('\n'.join(data))

    prefix = file_name+'_vocab_'+str(vocab_size)+"_"+model
    # Build the flags as a list and join with spaces so that no two flags
    # fuse together; fused flags are not applied by the trainer.
    train_args = [
        f"--input={corpus}",
        f"--model_prefix={prefix}",
        f"--vocab_size={vocab_size}",
        f"--model_type={model}",         # (unigram(default), bpe, char, word)
        "--max_sentence_length=999999",  # maximum sentence length
    ]
    spm.SentencePieceTrainer.Train(" ".join(train_args))

    # QUOTE_NONE: quote characters can be vocabulary pieces themselves.
    result = pd.read_csv(prefix+'.vocab', sep='\t', header = None,
                         quoting=csv.QUOTE_NONE)

    return result


In [None]:
def sp_tokenize(s, corpus):
    """Encode ``corpus`` with a SentencePiece processor and pad the result.

    Args:
        s: Loaded ``SentencePieceProcessor``.
        corpus: Iterable of sentences (strings).

    Returns:
        ``(tensor, word_index, index_word)`` where ``tensor`` is the
        pre-padded id matrix produced by ``pad_sequences``. For backward
        compatibility the second value maps id -> piece and the third maps
        piece -> id, despite what the names suggest.
    """
    # Encode every sentence into a list of piece ids.
    tensor = [s.EncodeAsIds(sen) for sen in corpus]

    # Build the lookups from the processor itself rather than from a
    # notebook-level vocab file name, so they always match the ids above.
    word_index = {}
    index_word = {}
    for idx in range(s.GetPieceSize()):
        word = s.IdToPiece(idx)
        word_index[idx] = word
        index_word[word] = idx

    # Pad at the front of each sequence.
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='pre')

    return tensor, word_index, index_word

In [None]:
# Drop rows whose `document` text duplicates an earlier row.
# inplace=True mutates the existing train_data / test_data frames.
train_data.drop_duplicates(subset=['document'], inplace=True)
test_data.drop_duplicates(subset=['document'], inplace=True)

In [None]:
# Remove every row that contains a missing value.
train_data_t, test_data_t = (
    frame.dropna(how='any') for frame in (train_data, test_data)
)

# vocab_size = 8000    
# model_type = bpe

In [None]:
vs = 8000
md = 'bpe'
result_df = spm_write_and_tokenize(train_data_t['document'], 'naver', model = md, vocab_size=vs)

In [None]:
result_df

Unnamed: 0,0,1
0,<unk>,0.00000
1,<s>,0.00000
2,</s>,0.00000
3,▁,-3.23816
4,.,-3.48155
...,...,...
7995,떻,-13.81930
7996,렸,-13.81940
7997,렇,-13.81950
7998,봤,-13.81960


In [None]:
# File names are built from the current `vs` / `md` settings.
model_name = f'naver_vocab_{vs}_{md}.model'
vocab_name = f'naver_vocab_{vs}_{md}.vocab'
sp = spm.SentencePieceProcessor()
sp.load(model_name)

True

In [None]:
tensor, word_index, index_word = sp_tokenize(sp, train_data_t['document'])
tensor_test, word_index_test, index_word_test = sp_tokenize(sp, test_data_t['document'])

In [None]:
print(tensor)

[[   0    0    0 ... 2193   66 1850]
 [   0    0    0 ... 7039  823  407]
 [   0    0    0 ... 2266 1658  325]
 ...
 [   0    0    0 ... 3790   92   20]
 [   0    0    0 ...  299  163  135]
 [   0    0    0 ...  446 5882    7]]


In [None]:
print(tensor_test)

[[   0    0    0 ...    0 2855  317]
 [   0    0    0 ... 4466 4602 1911]
 [   0    0    0 ...  734 7264 6496]
 ...
 [   0    0    0 ... 5555   85 2885]
 [   0    0    0 ...  345  865   44]
 [   0    0    0 ...   12  194 4302]]


In [None]:
vocab_size = 10000
word_vector_dim = 200  # multiple of 2

# Embedding -> LSTM(8) -> Dense(8, relu) -> single sigmoid output.
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)),
    keras.layers.LSTM(8),
    keras.layers.Dense(8, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 200)         2000000   
                                                                 
 lstm_1 (LSTM)               (None, 8)                 6688      
                                                                 
 dense_2 (Dense)             (None, 8)                 72        
                                                                 
 dense_3 (Dense)             (None, 1)                 9         
                                                                 
Total params: 2,006,769
Trainable params: 2,006,769
Non-trainable params: 0
_________________________________________________________________


In [None]:
# First 50,000 labels are for validation, the rest for training.
y_val, y_tr = y_train[:50000], y_train[50000:]

In [None]:
# First 50,000 encoded sentences are for validation, the rest for training.
X_val, X_train = tensor[:50000], tensor[50000:]

In [None]:
X_train

array([[   0,    0,    0, ..., 3858, 1762,    5],
       [   0,    0,    0, ...,    3, 1195,    7],
       [   0,    0,    0, ..., 2857, 6133,    4],
       ...,
       [   0,    0,    0, ..., 3790,   92,   20],
       [   0,    0,    0, ...,  299,  163,  135],
       [   0,    0,    0, ...,  446, 5882,    7]], dtype=int32)

In [None]:
y_train

array([0, 1, 0, ..., 0, 1, 0])

In [None]:
X_test = tensor_test

In [None]:
print(X_train.shape)
print(y_train.shape)

(96182, 134)
(146182,)


In [None]:
epochs = 100
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(
    X_train,
    y_tr,
    epochs=epochs,
    batch_size=512,
    validation_data=(X_val, y_val),
    callbacks=[es],
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100


In [None]:
results = model.evaluate(X_test, y_test, verbose=2)

1537/1537 - 6s - loss: 0.3507 - accuracy: 0.8474 - 6s/epoch - 4ms/step


In [None]:
print(results)

[0.35067451000213623, 0.8474276065826416]


# vocab_size = 5000
# model_type = bpe

In [None]:
vs = 5000
md = 'bpe'
result_df = spm_write_and_tokenize(train_data_t['document'], 'naver', vocab_size = vs, model=md)

In [None]:
# File names are built from the current `vs` / `md` settings.
model_name = f'naver_vocab_{vs}_{md}.model'
vocab_name = f'naver_vocab_{vs}_{md}.vocab'
sp = spm.SentencePieceProcessor()
sp.load(model_name)

True

In [None]:
tensor, word_index, index_word = sp_tokenize(sp, train_data_t['document'])
tensor_test, word_index_test, index_word_test = sp_tokenize(sp, test_data_t['document'])

In [None]:
vocab_size = 10000
word_vector_dim = 200  # multiple of 2

# Embedding -> LSTM(8) -> Dense(8, relu) -> single sigmoid output.
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)),
    keras.layers.LSTM(8),
    keras.layers.Dense(8, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 200)         2000000   
                                                                 
 lstm_2 (LSTM)               (None, 8)                 6688      
                                                                 
 dense_4 (Dense)             (None, 8)                 72        
                                                                 
 dense_5 (Dense)             (None, 1)                 9         
                                                                 
Total params: 2,006,769
Trainable params: 2,006,769
Non-trainable params: 0
_________________________________________________________________


In [None]:
# First 50,000 samples are for validation, the rest for training.
y_val, y_tr = y_train[:50000], y_train[50000:]
X_val, X_train = tensor[:50000], tensor[50000:]

X_test = tensor_test

In [None]:
epochs = 100
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(
    X_train,
    y_tr,
    epochs=epochs,
    batch_size=512,
    validation_data=(X_val, y_val),
    callbacks=[es],
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100


In [None]:
results = model.evaluate(X_test, y_test, verbose=2)
print(results)

1537/1537 - 7s - loss: 0.3560 - accuracy: 0.8435 - 7s/epoch - 4ms/step
[0.3559766411781311, 0.8435014486312866]


# vocab_size = 5000
# model_type = unigram

In [None]:
vs = 5000
md = 'unigram'
result_df = spm_write_and_tokenize(train_data_t['document'], 'naver', vocab_size = vs, model=md)

In [None]:
# File names are built from the current `vs` / `md` settings.
model_name = f'naver_vocab_{vs}_{md}.model'
vocab_name = f'naver_vocab_{vs}_{md}.vocab'
sp = spm.SentencePieceProcessor()
sp.load(model_name)

True

In [None]:
tensor, word_index, index_word = sp_tokenize(sp, train_data_t['document'])
tensor_test, word_index_test, index_word_test = sp_tokenize(sp, test_data_t['document'])

In [None]:
vocab_size = 10000
word_vector_dim = 200  # multiple of 2

# Embedding -> LSTM(8) -> Dense(8, relu) -> single sigmoid output.
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)),
    keras.layers.LSTM(8),
    keras.layers.Dense(8, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 200)         2000000   
                                                                 
 lstm_3 (LSTM)               (None, 8)                 6688      
                                                                 
 dense_6 (Dense)             (None, 8)                 72        
                                                                 
 dense_7 (Dense)             (None, 1)                 9         
                                                                 
Total params: 2,006,769
Trainable params: 2,006,769
Non-trainable params: 0
_________________________________________________________________


In [None]:
# First 50,000 samples are for validation, the rest for training.
y_val, y_tr = y_train[:50000], y_train[50000:]
X_val, X_train = tensor[:50000], tensor[50000:]

X_test = tensor_test

In [None]:
epochs = 100
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(
    X_train,
    y_tr,
    epochs=epochs,
    batch_size=512,
    validation_data=(X_val, y_val),
    callbacks=[es],
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100


In [None]:
results = model.evaluate(X_test, y_test, verbose=2)
print(results)

1537/1537 - 7s - loss: 0.3550 - accuracy: 0.8454 - 7s/epoch - 4ms/step
[0.3550426661968231, 0.8454136848449707]


# -----
# Mecab으로 토큰화 후 sentencepiece

In [None]:
# Re-read the raw tab-separated files so this section starts from unmodified frames.
train_data, test_data = (
    pd.read_table(path) for path in ('ratings_train.txt', 'ratings_test.txt')
)

In [None]:
X_train, y_train, X_test, y_test, word_to_index = load_data(train_data, test_data)

In [None]:
index_to_word = {index:word for word, index in word_to_index.items()}

In [None]:
sen = get_decoded_sentences(X_train, index_to_word)

In [None]:
# (unigram(default), bpe, char, word)
vs = 5000
md = 'bpe'
result_df = spm_write_and_tokenize(sen, 'naver', vocab_size = vs, model=md)

In [None]:
# Load the model trained in the previous cell. Without this, `sp` is still
# the processor loaded in an earlier section and the new model is never used.
model_name = 'naver_vocab_'+str(vs)+'_'+md+'.model'
vocab_name = 'naver_vocab_'+str(vs)+'_'+md+'.vocab'
sp = spm.SentencePieceProcessor()
sp.load(model_name)

# Encode the Mecab-segmented text (the same form the model was trained on)
# rather than the raw documents. The test split is decoded the same way.
sen_test = get_decoded_sentences(X_test, index_to_word)
tensor, word_index, index_word = sp_tokenize(sp, sen)
tensor_test, word_index_test, index_word_test = sp_tokenize(sp, sen_test)

In [None]:
vocab_size = 5000
word_vector_dim = 32  # multiple of 2

# Embedding -> LSTM(128) -> Dense(8, relu) -> single sigmoid output.
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)),
    keras.layers.LSTM(128),
    keras.layers.Dense(8, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_15 (Embedding)    (None, None, 32)          160000    
                                                                 
 lstm_19 (LSTM)              (None, 128)               82432     
                                                                 
 dense_30 (Dense)            (None, 8)                 1032      
                                                                 
 dense_31 (Dense)            (None, 1)                 9         
                                                                 
Total params: 243,473
Trainable params: 243,473
Non-trainable params: 0
_________________________________________________________________


In [None]:
# First 50,000 samples are for validation, the rest for training.
y_val, y_tr = y_train[:50000], y_train[50000:]
X_val, X_train = tensor[:50000], tensor[50000:]

X_test = tensor_test

In [None]:
epochs = 100
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(
    X_train,
    y_tr,
    epochs=epochs,
    batch_size=512,
    validation_data=(X_val, y_val),
    callbacks=[es],
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100


In [None]:
results = model.evaluate(X_test, y_test, verbose=2)
print(results)

1537/1537 - 8s - loss: 0.3605 - accuracy: 0.8409 - 8s/epoch - 5ms/step
[0.360478013753891, 0.8409178853034973]
