## Data Preparation

In [None]:
import tensorflow as tf
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
import os
import params

# Load the pretrained word2vec vectors from the binary file at params.w2v_path.
embed_lookup = KeyedVectors.load_word2vec_format(params.w2v_path, 
                                                 binary=True)

# Shape of the embedding matrix before the padding row is inserted.
print('변경 전')
print(embed_lookup.vectors.shape)

# Add a <pad> entry: an all-zero vector becomes row 0 of the embedding matrix
# and '<pad>' becomes position 0 of index2word, shifting every original row
# down by one.
# NOTE(review): embed_lookup.vocab is not touched here, so the .index stored on
# each vocab entry presumably still refers to the pre-insert row (the tokenizer
# further down compensates with "+ 1"). Confirm that direct lookups such as
# embed_lookup[word] are not relied on for actual vector values.

pad_vectors = np.zeros_like(embed_lookup.vectors[0])
embed_lookup.vectors = np.insert(embed_lookup.vectors, 0, pad_vectors,axis=0)
embed_lookup.index2word.insert(0,'<pad>')

# Shape of the embedding matrix after the insert.
print('변경 후')
print(embed_lookup.vectors.shape)

In [2]:
# collect the pretrained vocabulary as a list, with '<pad>' at position 0
# followed by the vocab keys in their iteration order
pretrained_words = ['<pad>']
pretrained_words.extend(embed_lookup.vocab)

In [3]:
# sanity check: look at one vocabulary entry and the vector returned for it
row_idx = 1
word = pretrained_words[row_idx]   # word stored at that list position
embedding = embed_lookup[word]     # vector the lookup returns for that word

# report the vocabulary size, the selected word, and the embedding length
vocab_report = (
    ("Size of Vocab: {}\n", len(pretrained_words)),
    ('Word in vocab: {}\n', word),
    ('Length of embedding: {}\n', len(embedding)),
)
for template, value in vocab_report:
    print(template.format(value))

Size of Vocab: 41722

Word in vocab: g

Length of embedding: 200



In [4]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

# read the EUC-KR encoded product-name CSV, shuffle the rows with a fixed seed
# (the later train/val/test split is positional), and renumber the index
# NOTE(review): 'tarin' in the file name looks like a typo for 'train' — confirm
# against the file on disk before renaming anything.
train_df = (
    shuffle(pd.read_csv('data/tarin_prodNm.csv', encoding='euc-kr'), random_state=33)
    .reset_index(drop=True)
)
train_df.head()

Unnamed: 0,prodNm,cleaned_prodNm,label
0,"피아토스 감자칩 바베큐맛 85g 피아토스,감자칩,수입감자칩,수입과자",피아토스 감자칩 바베큐맛 g 피아토스 감자칩 수입감자칩 수입과자,0
1,청우 왕사탕 500g/사탕/왕사탕/캔디/왕캔디/간식,청우 왕사탕 g 사탕 왕사탕 캔디 왕캔디 간식,0
2,ACER Swift3 SF314-52G-59WM용 저반사필름,acer swift sf - g- wm용 저반사필름,1
3,"앤디스 크림 데 민트 띤 132g 고급초콜렛,수입초콜렛,수입사탕",앤디스 크림 데 민트 띤 g 고급초콜렛 수입초콜렛 수입사탕,0
4,맥심 오리지널 20T 24입 커피/차/꿀 무료배송,맥심 오리지널 t 입 커피 차 꿀 무료배송,0


In [5]:
# model inputs: the cleaned product-name strings as a plain Python list
cleaned_prodNm = list(train_df['cleaned_prodNm'])
# targets: the label column as a NumPy array
encoded_labels = np.asarray(train_df['label'])

In [6]:
# convert prodNm to tokens

def tokenize_all_prodNm(embed_lookup, cleaned_prodNm):
    """Convert cleaned product names into lists of embedding-row indices.

    Each name is split on whitespace. A word present in ``embed_lookup.vocab``
    maps to ``vocab[word].index + 1``; the +1 shift leaves index 0 free, and
    words missing from the vocabulary map to 0.

    Args:
        embed_lookup: object exposing a ``vocab`` mapping from word to an
            entry that has an integer ``index`` attribute.
        cleaned_prodNm: iterable of whitespace-separated strings.

    Returns:
        A list with one inner list of ints per product name.

    Raises:
        AttributeError: if ``embed_lookup`` has no ``vocab`` (or an entry has
            no ``index``). Previously a bare ``except`` hid this and produced
            all-zero tokens.
    """
    vocab = embed_lookup.vocab

    def word_to_idx(word):
        # Only an out-of-vocabulary word falls back to 0; any other failure
        # is a real error and must surface instead of being swallowed.
        try:
            return vocab[word].index + 1
        except KeyError:
            return 0

    return [[word_to_idx(word) for word in prodNm.split()]
            for prodNm in cleaned_prodNm]

# tokenize every cleaned product name and display the first result
tokenize_all_cleaned_prodNms = tokenize_all_prodNm(embed_lookup, cleaned_prodNm)
tokenize_all_cleaned_prodNms[0]

[4029, 480, 977, 1, 4029, 480, 6626, 183]

In [7]:
# quick check: print the first tokenized product name
print(tokenize_all_cleaned_prodNms[0])

[4029, 480, 977, 1, 4029, 480, 6626, 183]


In [8]:
# pad or truncate every tokenized prodNm to a fixed length

def pad_features(tokenize_all_cleaned_prodNms, seq_length):
    """Left-pad with 0's or truncate each token list to exactly ``seq_length``.

    Rows shorter than ``seq_length`` are right-aligned behind leading zeros;
    longer rows keep only their first ``seq_length`` tokens. An empty row
    stays all zeros.

    Args:
        tokenize_all_cleaned_prodNms: sequence of integer token lists, one per
            product name.
        seq_length: number of columns in the returned array.

    Returns:
        Integer ``np.ndarray`` of shape
        ``(len(tokenize_all_cleaned_prodNms), seq_length)``.
    """
    # start from an all-zero (rows x seq_length) matrix so padding is implicit
    features = np.zeros((len(tokenize_all_cleaned_prodNms), seq_length), dtype=int)

    for i, row in enumerate(tokenize_all_cleaned_prodNms):
        kept = np.array(row)[:seq_length]
        if len(kept) == 0:
            # `-0:` would select the whole row and the empty assignment would
            # fail to broadcast, so leave the zero row untouched
            continue
        features[i, -len(kept):] = kept

    return features

# pad/truncate every tokenized product name to 15 tokens and show the first row
# (the same 15 is hard-coded again as feature_size in the Training section)
features = pad_features(tokenize_all_cleaned_prodNms,15)
features[0]

array([   0,    0,    0,    0,    0,    0,    0, 4029,  480,  977,    1,
       4029,  480, 6626,  183])

## Data loader

In [9]:
split_frac = 0.8

## positional split: the first split_frac of the rows become the training set,
## the remainder is halved into validation and test (features and labels alike)
split_idx = int(len(features)*split_frac)

train_x = features[:split_idx]
train_y = encoded_labels[:split_idx]
remaining_x = features[split_idx:]
remaining_y = encoded_labels[split_idx:]

test_idx = int(len(remaining_x)*0.5)

val_x = remaining_x[:test_idx]
val_y = remaining_y[:test_idx]
test_x = remaining_x[test_idx:]
test_y = remaining_y[test_idx:]

## report the shape of each resulting feature array
print("\t\t\tFeature Shapes:")
shape_lines = [
    "Train set: \t\t{}".format(train_x.shape),
    "\nValidation set: \t{}".format(val_x.shape),
    "\nTest set: \t\t{}".format(test_x.shape),
]
print(*shape_lines)

			Feature Shapes:
Train set: 		(11411, 15) 
Validation set: 	(1426, 15) 
Test set: 		(1427, 15)


In [22]:
## persist the six split arrays together in a single .npz archive
split_arrays = {
    'train_x': train_x, 'train_y': train_y,
    'val_x': val_x, 'val_y': val_y,
    'test_x': test_x, 'test_y': test_y,
}
np.savez('data/food_cat_prdNm.npz', **split_arrays)

In [11]:
# Reload the split arrays from the archive written by the save cell.
# The save cell writes to 'data/food_cat_prdNm.npz'; the bare file name used
# here before only resolved when a copy happened to sit in the working
# directory, so read from the same location that is written.
path = 'data/food_cat_prdNm.npz'
with np.load(path) as data:
    train_x = data['train_x']
    train_y = data['train_y']
    val_x = data['val_x']
    val_y = data['val_y']
    test_x = data['test_x']
    test_y = data['test_y']

In [12]:
# wrap each (features, labels) pair in a tf.data.Dataset of per-example slices
train_dataset, val_dataset, test_dataset = (
    tf.data.Dataset.from_tensor_slices(xy_pair)
    for xy_pair in ((train_x, train_y), (val_x, val_y), (test_x, test_y))
)

In [13]:
BATCH_SIZE = 64
SHUFFLE_BUFFER_SIZE = 100

# shuffle (with a buffer of SHUFFLE_BUFFER_SIZE elements) and then batch the
# training and validation sets; the test set is only batched
# NOTE(review): these lines rebind the *_dataset names in place, so re-running
# this cell without re-running the previous one stacks a second shuffle/batch
# on top of the already-batched datasets.
train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
val_dataset = val_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)

## Define model

In [14]:
class TxtCNN(tf.keras.Model):
    """
    Embedding layer + multi-width CNN used to classify tokenized prodNm.

    Token ids are embedded with pretrained vectors, convolved with one
    Conv2D per kernel size (each spanning the full embedding width),
    max-pooled over the whole sequence, concatenated, passed through dropout
    and finally through a sigmoid-activated dense layer.
    """

    def __init__(self, embed_model, vocab_size, output_size, embedding_dim, feature_size,
                 num_filters=100, kernel_sizes=(3, 4, 5), freeze_embeddings=True, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.

        Args:
            embed_model: object whose ``vectors`` attribute holds the
                pretrained embedding matrix used to initialize the embedding
                layer (expected shape (vocab_size, embedding_dim)).
            vocab_size: number of rows of the embedding layer.
            output_size: number of units of the final dense layer.
            embedding_dim: width of each embedding vector.
            feature_size: length of every (padded) input sequence.
            num_filters: number of filters of each convolution.
            kernel_sizes: iterable of convolution heights (in tokens).
            freeze_embeddings: when True the embedding weights are not trained.
            drop_prob: dropout rate applied before the dense layer.
        """
        super().__init__()

        # set class vars
        self.num_filters = num_filters
        self.embedding_dim = embedding_dim
        # attribute name (typo included) is kept as-is for existing callers;
        # copying also decouples it from the caller's sequence
        self.kernerl_sizes = list(kernel_sizes)
        self.feature_size = feature_size

        # 1. embedding layer, initialized from the model that was passed in
        #    (not from a notebook global) and frozen when requested
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            embeddings_initializer=tf.keras.initializers.Constant(embed_model.vectors),
            trainable=not freeze_embeddings,
        )

        # 2. convolutional layers: one conv + max-pool block per kernel size
        self.conv_layers = []
        for kernel_size in kernel_sizes:
            # the kernel covers `kernel_size` tokens and the full embedding width
            conv = tf.keras.layers.Conv2D(filters=num_filters,
                                          kernel_size=(kernel_size, embedding_dim),
                                          padding='valid',
                                          strides=(1, 1),
                                          activation='relu',
                                          name='conv_layer_{0}'.format(kernel_size))

            # a 'valid' conv leaves feature_size - kernel_size + 1 positions;
            # pooling over all of them keeps one value per filter
            maxpool = tf.keras.layers.MaxPool2D(pool_size=(feature_size - kernel_size + 1, 1),
                                                padding='valid',
                                                strides=(1, 1),
                                                name='maxPool_layer_{0}'.format(kernel_size))

            self.conv_layers.append(tf.keras.Sequential([conv, maxpool]))

        # 3. final fully-connected layer for classification
        self.fc = tf.keras.layers.Dense(output_size,
                                        activation='sigmoid',
                                        name='predictions')

        # 4. reshape and flatten & dropout layers
        self.reshape = tf.keras.layers.Reshape((feature_size, embedding_dim, 1))
        self.flatten = tf.keras.layers.Flatten(name='flatten')
        self.dropout = tf.keras.layers.Dropout(drop_prob, name='dropout')

    def call(self, inputs, training=None):
        """
        Defines how a batch of inputs passes through the model layers.

        Args:
            inputs: batch of token-id sequences, each of length feature_size.
            training: forwarded to the dropout layer so it is active only in
                training mode.

        Returns:
            The sigmoid-activated output of the dense layer (probabilities,
            not raw logits), with ``output_size`` values per example.
        """
        embeds = self.embedding(inputs)
        # add a trailing channel axis so Conv2D can consume the embeddings
        embeds = self.reshape(embeds)
        pool_outputs = [conv_block(embeds) for conv_block in self.conv_layers]

        # join the pooled features of all kernel sizes along the filter axis
        pooled = tf.concat(pool_outputs, axis=-1)
        pooled = self.flatten(pooled)

        # let the layer decide from `training` instead of gating it by hand
        pooled = self.dropout(pooled, training=training)

        return self.fc(pooled)

## Training

In [15]:
# hyperparameters handed to TxtCNN
num_filters = 100
kernel_sizes = [3, 4, 5]
feature_size = 15                              # sequence length used when padding the features
output_size = 1                                # binary class (1 or 0)
vocab_size = len(pretrained_words)             # vocabulary entries plus '<pad>'
embedding_dim = embed_lookup.vectors.shape[1]  # 200-dim vectors

In [16]:
# multi gpu usage

# Build and compile the model inside the MirroredStrategy scope.
mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    net = TxtCNN(embed_lookup, vocab_size, output_size, embedding_dim,
                   feature_size, num_filters, kernel_sizes)
    
    # Specify the training configuration (optimizer, loss, metrics):
    # Adam optimizer, binary cross-entropy loss, accuracy metric

    net.compile(optimizer=tf.optimizers.Adam(),
                loss=tf.keras.losses.BinaryCrossentropy(),
                metrics=['accuracy'])

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


In [17]:
# train for 10 epochs on the batched training dataset and keep the returned
# history object
history = net.fit(train_dataset,
                    epochs=10,
                    # We pass some validation for
                    # monitoring validation loss and metrics
                    # at the end of each epoch
                    validation_data = val_dataset)

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
Train for 179 steps, validate for 23 steps
Epoch 1/10
INFO:tensorflow:batch_all_reduce: 8 all-reduces with algorithm = nccl, num_packs = 1, agg_small_grads_max_bytes = 0 and agg_small_grads_max_group = 10
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica

In [21]:
# print the layer / parameter-count summary of whichever model `net` currently is
net.summary()

Model: "txt_cnn_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  8344400   
_________________________________________________________________
sequential_3 (Sequential)    multiple                  60100     
_________________________________________________________________
sequential_4 (Sequential)    multiple                  80100     
_________________________________________________________________
sequential_5 (Sequential)    multiple                  100100    
_________________________________________________________________
predictions (Dense)          multiple                  301       
_________________________________________________________________
reshape_1 (Reshape)          multiple                  0         
_________________________________________________________________
flatten (Flatten)            multiple                  0 

In [18]:
# single gpu usage
# NOTE: this rebinds `net`, replacing the model built in the multi gpu cell
# with a freshly initialized one.

net = TxtCNN(embed_lookup, vocab_size, output_size, embedding_dim,
               feature_size, num_filters, kernel_sizes)

# Specify the training configuration (optimizer, loss, metrics):
# Adam optimizer, binary cross-entropy loss, accuracy metric

net.compile(optimizer=tf.optimizers.Adam(),
            loss=tf.keras.losses.BinaryCrossentropy(),
            metrics=['accuracy'])

In [19]:
# train for 1 epoch directly on the NumPy arrays (batch size 32); this
# overwrites the `history` produced by the earlier fit cell
history = net.fit(train_x, train_y,
                    batch_size=32,
                    epochs=1,
                    # We pass some validation for
                    # monitoring validation loss and metrics
                    # at the end of each epoch
                    validation_data=(val_x, val_y))

Train on 11411 samples, validate on 1426 samples


In [20]:
# clear the Keras backend session
tf.keras.backend.clear_session()