# Ma LSTM

In [1]:
import sys
import tensorflow as tf
import numpy as np
import os
import pandas as pd

from sklearn.model_selection import train_test_split

import json

# 시각화

In [None]:
def plot_graphs(history, string):
    """Plot one metric and its validation counterpart against the epoch index.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value sequences.
        string: Metric name to plot; ``'val_' + string`` must also be a key
            of ``history.history``.
    """
    # NOTE(review): `plt` is not imported in the visible part of this file --
    # presumably matplotlib.pyplot; confirm it is in scope before calling.
    val_key = 'val_'+string
    curves = history.history

    plt.plot(curves[string])
    plt.plot(curves[val_key], '')

    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()

# 학습 데이터 파일 로드

In [None]:
## Global settings are declared up front: data directories, file names and
## the parameters used for training.

# Data directories.
DATA_OUT_PATH = './data_out/'
DATA_IN_PATH = './data_in/'

# Preprocessed input files.
DATA_CONFIGS = 'data_configs.json'
TRAIN_LABEL_DATA_FILE = 'train_label.npy'
TRAIN_Q1_DATA_FILE, TRAIN_Q2_DATA_FILE = 'train_q1.npy', 'train_q2.npy'

## Parameters needed for training.

# Batching / iteration.
EPOCH, BATCH_SIZE = 2, 16
BUFFER_SIZE = 10000

# Network dimensions.
MAX_SEQ_LEN = 31
EMBEDDING_DIM, HIDDEN = 128, 64
NUM_LAYERS, DROPOUT_RATIO = 3, 0.3

# Train/test split.
RNG_SEED = 13371447
TEST_SPLIT = 0.1

In [None]:
## Load the training data. The arrays were saved in NumPy format beforehand
## so that they can be loaded efficiently.

# Pass the path rather than an open() handle so np.load opens and closes the
# file itself and no file handle is left dangling.
q1_data = np.load(DATA_IN_PATH + TRAIN_Q1_DATA_FILE)
q2_data = np.load(DATA_IN_PATH + TRAIN_Q2_DATA_FILE)
labels = np.load(DATA_IN_PATH + TRAIN_LABEL_DATA_FILE)

# Preprocessing metadata (read below for its 'vocab_size' entry).
with open(DATA_IN_PATH + DATA_CONFIGS, 'r', encoding='utf-8') as f:
    prepro_configs = json.load(f)

In [None]:
# Training-data file names (same values as in the global settings cell).
TRAIN_Q1_DATA_FILE = 'train_q1.npy'
TRAIN_Q2_DATA_FILE = 'train_q2.npy'
TRAIN_LABEL_DATA_FILE = 'train_label.npy'
DATA_CONFIGS = 'data_configs.json'

# The input directory constant is DATA_IN_PATH. Paths are handed to np.load
# directly so it opens and closes each file itself.
q1_data = np.load(DATA_IN_PATH + TRAIN_Q1_DATA_FILE)
q2_data = np.load(DATA_IN_PATH + TRAIN_Q2_DATA_FILE)
labels = np.load(DATA_IN_PATH + TRAIN_LABEL_DATA_FILE)

# Preprocessing metadata (read below for its 'vocab_size' entry).
with open(DATA_IN_PATH + DATA_CONFIGS, 'r', encoding='utf-8') as f:
    prepro_configs = json.load(f)

In [None]:
# Vocabulary size as stored under 'vocab_size' in the preprocessing config.
VOCAB_SIZE = prepro_configs['vocab_size']

In [None]:
# Length of every question, capped at MAX_SEQ_LEN.
q1_data_len = np.minimum([len(question) for question in q1_data], MAX_SEQ_LEN)
q2_data_len = np.minimum([len(question) for question in q2_data], MAX_SEQ_LEN)

In [None]:
## Split the data and keep the parts. sklearn's train_test_split is handy for
## this, but the Quora data has two inputs per sample rather than one, so the
## two question arrays are stacked into one with np.stack before splitting.

y = labels
X = np.stack((q1_data, q2_data), axis=1)
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=TEST_SPLIT, random_state=RNG_SEED)

# Undo the stacking: index 0 on axis 1 is question 1, index 1 is question 2.
train_Q1, train_Q2 = train_X[:, 0], train_X[:, 1]
test_Q1, test_Q2 = test_X[:, 0], test_X[:, 1]

In [None]:
def mapping_fn(base, hypothesis, labels=None):
    """Pack a pair of inputs into the feature dict used by the datasets.

    Args:
        base: Value stored under the ``"base"`` key.
        hypothesis: Value stored under the ``"hypothesis"`` key.
        labels: Optional targets; when given they are returned next to the
            features.

    Returns:
        The feature dict alone when ``labels`` is None, otherwise a
        ``(features, labels)`` tuple.
    """
    features = dict(base=base, hypothesis=hypothesis)
    if labels is None:
        return features
    return features, labels

# Training pipeline: shuffle over the whole training split, batch, then wrap
# each batch into the feature dict via mapping_fn.
# The split cell names its outputs train_Q1 / train_Q2 / train_y and
# test_Q1 / test_Q2 / test_y, and the batch-size constant is BATCH_SIZE.
dataset = tf.data.Dataset.from_tensor_slices((train_Q1, train_Q2, train_y))
dataset = dataset.shuffle(len(train_Q1))
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.map(mapping_fn)

# Validation pipeline: the held-out part of train_test_split, left unshuffled.
validation_dataset = tf.data.Dataset.from_tensor_slices((test_Q1, test_Q2, test_y))
validation_dataset = validation_dataset.batch(BATCH_SIZE)
validation_dataset = validation_dataset.map(mapping_fn)

# 모델 정의

In [14]:
from tensorflow.keras import layers

In [15]:
class Model(tf.keras.Model):
    """Siamese LSTM that scores how similar two input sequences are.

    Both inputs pass through the same embedding layer and the same LSTM; the
    score is ``exp(-L1 distance)`` between the two LSTM outputs.
    """

    # Model name used when the caller does not pass 'model_name'.
    DEFAULT_NAME = 'malstm_similarity'

    def __init__(self, **kargs):
        """Create the layers shared by both inputs.

        Keyword Args:
            vocab_size: ``input_dim`` of the embedding layer.
            embedding_size: ``output_dim`` of the embedding layer.
            lstm_dimension: ``units`` of the LSTM layer.
            model_name: Optional Keras model name; defaults to DEFAULT_NAME.

        Raises:
            KeyError: If one of the three required keywords is missing.
        """
        # Take the name from the keyword arguments instead of depending on a
        # module-level `model_name` global being defined.
        super().__init__(name=kargs.get('model_name', type(self).DEFAULT_NAME))
        self.embedding = layers.Embedding(input_dim=kargs['vocab_size'],
                                          output_dim=kargs['embedding_size'])
        self.lstm = layers.LSTM(units=kargs['lstm_dimension'])

    def call(self, x):
        """Return the similarity score for each pair in the batch.

        Args:
            x: Mapping with 'base' and 'hypothesis' entries.

        Returns:
            ``exp(-sum(|h_base - h_hypothesis|, axis=1))`` where ``h_*`` are
            the LSTM outputs of the two inputs.
        """
        base = self.lstm(self.embedding(x['base']))
        hypothesis = self.lstm(self.embedding(x['hypothesis']))

        # Identical encodings give exp(0) = 1; the score decays towards 0 as
        # the L1 distance between the encodings grows.
        return tf.exp(-tf.reduce_sum(tf.abs(base - hypothesis), axis=1))

    def build_graph(self, input_shape):
        """Build the model on symbolic inputs so that summary() can be used.

        Args:
            input_shape: Shape of one input including the batch dimension,
                e.g. ``(batch_size, sequence_length)``. Both inputs use it.
        """
        input_shape_wo_batch = input_shape[1:]

        # call() indexes its argument by 'base' / 'hypothesis', so the model
        # must be traced with a dict of two inputs, not a single tensor.
        inputs = {
            key: tf.keras.Input(shape=input_shape_wo_batch, dtype='int32', name=key)
            for key in ('base', 'hypothesis')
        }

        # Go through __call__ (not call) so Keras marks the model as built.
        _ = self(inputs)

In [16]:
# Size of q1_data's second axis; handed to build_graph as the per-sample length.
max_length = q1_data.shape[1]

# Keyword arguments for the Model constructor.
kargs = dict(
    vocab_size=prepro_configs['vocab_size'],
    embedding_size=300,
    lstm_dimension=300,
)

In [17]:
# Instantiate the model and configure training.
model = Model(**kargs)

model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['accuracy'])

# The batch-size constant defined in the settings cell is BATCH_SIZE.
model.build_graph(input_shape=(BATCH_SIZE, max_length))
model.summary()

In [None]:
# Checkpoints are written under DATA_OUT_PATH/<model name>/. The {epoch} and
# {val_loss} placeholders are filled in by the ModelCheckpoint callback.
# model.name is used so the directory always matches the model being trained.
checkpoint_path = DATA_OUT_PATH + model.name + '/weights.{epoch:02d}-{val_loss:.2f}'
checkpoint_dir = os.path.dirname(checkpoint_path)

if os.path.exists(checkpoint_dir):
    print("{} -- Folder already exists \n".format(checkpoint_dir))
else:
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder create complete \n".format(checkpoint_dir))

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path, verbose=1, save_weights_only=True)

# The epoch-count constant defined in the settings cell is EPOCH.
history = model.fit(dataset, epochs=EPOCH,
                    validation_data=validation_dataset,
                    callbacks=[cp_callback])

In [None]:
# Plot the 'accuracy' and 'val_accuracy' curves recorded by model.fit.
plot_graphs(history, 'accuracy')

In [28]:
# File names of the preprocessed test set.
TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'
TEST_ID_DATA_FILE = 'test_id.npy'

# The input directory constant is DATA_IN_PATH. Paths are handed to np.load
# directly so it opens and closes each file itself.
test_q1_data = np.load(DATA_IN_PATH + TEST_Q1_DATA_FILE)
test_q2_data = np.load(DATA_IN_PATH + TEST_Q2_DATA_FILE)
test_id_data = np.load(DATA_IN_PATH + TEST_ID_DATA_FILE)

In [35]:
# Test pipeline: no labels, so mapping_fn yields only the feature dict.
# The batch-size constant defined in the settings cell is BATCH_SIZE.
test_dataset = tf.data.Dataset.from_tensor_slices((test_q1_data, test_q2_data))
test_dataset = test_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.map(mapping_fn)

In [36]:
# Run the model over the batched test pipeline.
predictions = model.predict(test_dataset)

KeyboardInterrupt: 

In [0]:
print(len(predictions))  # 2345796

# One row per test id with its predicted score.
output = pd.DataFrame(data={"test_id":test_id_data, "is_duplicate": list(predictions)})
# The output directory constant is DATA_OUT_PATH; quoting=3 is csv.QUOTE_NONE.
output.to_csv(f"{DATA_OUT_PATH}rnn_predict.csv", index=False, quoting=3)

2345796
