In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pp
import seaborn as sb
import tensorflow as tf
import re
from sklearn.preprocessing import StandardScaler
# Select matplotlib's "fivethirtyeight" style sheet for the plots drawn afterwards.
pp.style.use("fivethirtyeight")

In [None]:
# Load the tab-separated training set and shuffle its rows. The fixed seed makes the
# shuffle (and therefore any later positional train/test split) reproducible after a
# kernel restart.
train_df = pd.read_csv("/Users/jeanbaptiste/Downloads/train.tsv", sep = "\t")
train_df = train_df.sample(frac = 1.0, random_state = 0)
print(train_df.columns)
print(train_df.shape[0])
# Missing text is treated as an empty string in BOTH text columns: a NaN in either one
# would turn the concatenation below into NaN and break the string cleaning downstream.
train_df["item_description"] = train_df["item_description"].fillna("")
train_df["name"] = train_df["name"].fillna("")
train_df["total_text"] = train_df["item_description"] + " " + train_df["name"]
# Reassign instead of dropping in place so re-running the cell never half-mutates the frame.
train_df = train_df.drop(["item_description", "name"], axis = 1)
# Density of log(1 + price). pyplot.show() takes no positional arguments, so the plot is
# drawn first and shown in a separate call.
sb.kdeplot(np.log(1 + train_df["price"]))
pp.show()

In [None]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Adapted from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character outside ``A-Za-z0-9(),!?'`` and the backtick becomes a space
         (this already removes "."),
      2. the contraction suffixes 's, 've, n't, 're, 'd, 'll are split off with a space,
      3. ``, ! ( ) ?`` are surrounded by spaces so they become separate tokens,
      4. whitespace runs are collapsed, the result is stripped and lower-cased.

    Unlike the original port, the punctuation tokens are emitted WITHOUT a leading
    backslash: ``re.sub`` leaves unknown escapes such as ``\\(`` in a replacement template
    untouched, which produced tokens like ``\\(`` that cannot match a plain "(" in a
    vocabulary.

    :param string: raw text (``str``).
    :return: cleaned, lower-cased, space-separated text.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # Contractions are matched case-sensitively (lower-casing happens last).
    for suffix in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(suffix, " " + suffix)
    # One pass is enough: each punctuation character is padded independently of the others.
    string = re.sub(r"([,!()?])", r" \1 ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

# Run clean_str over every row's text and preview the first rows of the result.
train_df["total_text"] = train_df["total_text"].map(clean_str)
print(train_df["total_text"].head(5))

In [None]:
class GloveModel :
    """
    In-memory view of a GloVe text file (one ``word v1 v2 ... vn`` entry per line,
    separated by single spaces).

    Index 0 is reserved for padding, so after loading:
      * ``word_index[word]`` is the 1-based row of ``word``,
      * ``index_word[i]`` is the word stored at row ``i`` (``index_word[0] == ""``),
      * ``embeddings[i]`` is the float32 vector of row ``i`` (``embeddings[0]`` is all zeros).

    ``index_words`` is kept as an alias of ``index_word`` for backward compatibility.
    """
    def __init__(self, path) :
        """
        Read the embeddings stored at ``path``.

        :param path: path of the GloVe text file, read as UTF-8.
        :raises ValueError: if the file contains no embedding at all.
        """
        self.index_word = [""]
        self.embeddings = []
        self.word_index = {}
        with open(path, "r", encoding = "utf-8") as file :
            for line in file :
                # Only strip the line terminator and split on plain spaces, so tokens
                # containing other whitespace-like characters are left intact.
                row = line.rstrip("\n").split(" ")
                if len(row) < 2 :
                    # blank line or a token without any vector component
                    continue
                # A word that appears twice keeps its last row, as a plain dict assignment does.
                self.word_index[row[0]] = len(self.index_word)
                self.index_word.append(row[0])
                self.embeddings.append(np.array(row[1:], dtype = np.float32))
        if not self.embeddings :
            raise ValueError("no embedding found in " + str(path))
        # Row 0: all-zero vector matching the padding index.
        self.embeddings.insert(0, np.zeros_like(self.embeddings[0]))
        self.index_words = self.index_word

# Load the pretrained vectors from the local glove.6B.300d.txt file into a GloveModel.
stanford_glove = GloveModel("/Users/jeanbaptiste/Downloads/glove/glove.6B.300d.txt")

In [None]:
def text_to_sequence(string, dict_words) :
    """
    Convert a space-separated text into the list of values ``dict_words`` holds for its tokens.

    The text is split on single spaces; tokens for which the lookup raises ``KeyError``
    (including the empty tokens produced by consecutive spaces) are skipped silently.

    :param string: text to convert.
    :param dict_words: mapping from token to index.
    :return: list of looked-up values, in token order.
    """
    indices = []
    for token in string.split(" ") :
        try :
            index = dict_words[token]
        except KeyError :
            # unknown token: leave it out of the sequence
            continue
        indices.append(index)
    return indices

In [None]:
# Turn every cleaned text into its list of GloVe indices (tokens missing from the
# vocabulary are dropped by text_to_sequence).
sequences = train_df["total_text"].apply(
    lambda text : text_to_sequence(text, stanford_glove.word_index)
).tolist()
# Bring every sequence to length 150 using pad_sequences' default settings,
# then store the result as an int32 array.
sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen = 150)
sequences = np.asarray(sequences).astype(np.int32)
print(sequences[0])

In [None]:
# Regression target: log(1 + price).
target = np.log(train_df["price"].values + 1)
# Positional split: the first 1,000,000 rows are used for training, the rest for testing.
y_train, y_test = target[:1000000], target[1000000:]
X_train, X_test = sequences[:1000000, :], sequences[1000000:, :]
# Standardise the targets with statistics fitted on the training part only;
# the scaled training targets are additionally clipped to [-2.5, 2.5].
scaler = StandardScaler()
scaler.fit(y_train.reshape((-1, 1)))
y_train_scaled = np.clip(scaler.transform(y_train.reshape((-1, 1))).flatten(), -2.5, 2.5)
y_test_scaled = scaler.transform(y_test.reshape((-1, 1))).flatten()

In [None]:
class BasicTextCNN :
    """
    Convolutional regressor over sequences of pretrained word embeddings, written against
    the TensorFlow 1.x graph API.

    Intended call order: ``create_conv_model`` (builds the graph and the loss),
    ``set_optimizer`` (adds the training op), ``fit_with_early_stopping`` (training loop).
    """
    def __init__(self) :
        """Create an empty model; all graph nodes are added by ``create_conv_model``."""
        pass

    @staticmethod
    def _batch_bounds(nb_obs, batch_size) :
        """Yield ``(start, stop)`` slices covering ``range(nb_obs)``; the last one may be shorter."""
        for start in range(0, nb_obs, batch_size) :
            yield start, min(start + batch_size, nb_obs)

    @staticmethod
    def _crossed_step_boundary(start, stop, step) :
        """Tell whether consuming observations ``start..stop`` passed a multiple of ``step``."""
        return stop // step > start // step

    def _evaluate(self, sess, x_test, y_test, batch_size) :
        """Return the loss averaged over all of ``x_test`` / ``y_test``, weighted by batch size."""
        weighted_loss = 0.0
        nb_seen = 0
        for start, stop in self._batch_bounds(len(x_test), batch_size) :
            # keep_prob is not fed here, so its default (1.0, no dropout) applies
            batch_loss = sess.run(self.loss,
                                  feed_dict = {self.input_x : x_test[start : stop, :],
                                               self.input_y : y_test[start : stop]})
            weighted_loss += batch_loss * (stop - start)
            nb_seen += stop - start
        if nb_seen == 0 :
            return float("nan")
        return weighted_loss / nb_seen

    def create_conv_model(self, embedding_model, filter_shape, nb_filters, keep_prob) :
        """
        Build the graph: frozen embedding lookup -> conv / max-pool (3x3) -> conv /
        max-pool (1x2) -> conv -> two leaky-ReLU dense layers with dropout -> linear output.

        :param embedding_model: object whose ``embeddings`` attribute is a list of
            equally sized vectors (row ``i`` is the vector of index ``i``).
        :param filter_shape: kernel size shared by the three convolutions.
        :param nb_filters: filters of the first two convolutions (the third uses twice as many).
        :param keep_prob: dropout keep probability used while training. Dropout is
            disabled (keep probability 1.0) whenever ``self.keep_prob`` is not fed.
        """
        self.train_keep_prob = keep_prob

        self.input_x = tf.placeholder(dtype = tf.int32, shape = [None, 150], name = "input_x")
        self.input_y = tf.placeholder(dtype = tf.float32, shape = [None], name = "input_y")
        # Fed with train_keep_prob during training only, so evaluation runs without dropout.
        self.keep_prob = tf.placeholder_with_default(1.0, shape = [], name = "keep_prob")

        with tf.device("/gpu:0") :
            #Create embedding with pretrained model weights
            self.embedding = tf.Variable(np.vstack(embedding_model.embeddings),
                                         dtype = tf.float32,
                                         trainable = False,
                                         name = "sf_embedding"
                                         )
            self.embedding_lookup = tf.nn.embedding_lookup(self.embedding, self.input_x, name = "embedding_lookup")
            #Embedding expansion (trailing channel axis) for compatibility with conv layer
            self.expanded_embedding = tf.expand_dims(self.embedding_lookup, -1, name = "expanded_embedding")

            #convolution layer
            self.conv1 = tf.layers.conv2d(inputs = self.expanded_embedding,
                                          padding = "same",
                                          filters = nb_filters,
                                          kernel_size = filter_shape,
                                          name = "conv_layer1",
                                          activation = tf.nn.leaky_relu
                                          )
            #first max pooling layer
            self.max_pool1 = tf.layers.max_pooling2d(self.conv1,
                                                     pool_size = [3, 3],
                                                     strides = [3, 3],
                                                     name = "max_pool_layer1"
                                                    )
            #second convolution layer
            self.conv2 = tf.layers.conv2d(inputs = self.max_pool1,
                                          padding = "same",
                                          filters = nb_filters,
                                          kernel_size = filter_shape,
                                          name = "conv_layer2",
                                          activation = tf.nn.leaky_relu
                                         )
            #second max pooling layer
            self.max_pool2 = tf.layers.max_pooling2d(self.conv2,
                                                     pool_size = [1, 2],
                                                     strides = [1, 2],
                                                     name = "max_pool_layer2"
                                                     )
            #third conv layer
            self.conv3 = tf.layers.conv2d(inputs = self.max_pool2,
                                          padding = "same",
                                          filters = nb_filters * 2,
                                          kernel_size = filter_shape,
                                          name = "conv_layer3",
                                          activation = tf.nn.leaky_relu
                                         )
            #flattening to enable dense connexion; the size is read from the static shape
            #(50*50*nb_filters*2 for 150 tokens of 300-d embeddings) so other embedding
            #widths work as well
            flat_dim = int(np.prod(self.conv3.get_shape().as_list()[1:]))
            self.flat_pool = tf.reshape(self.conv3, [-1, flat_dim])
            #first leaky relu activation with dropout
            self.W1 = tf.Variable(tf.random_normal(stddev = 0.1, shape = [flat_dim, 2500]),
                                  dtype = tf.float32,
                                  name = "W1"
                                  )
            self.b1 = tf.Variable(tf.random_normal(stddev = 0.1, shape = [2500]), dtype = tf.float32, name = "b1")
            self.relu_activation1 = tf.nn.leaky_relu(tf.matmul(self.flat_pool, self.W1) + self.b1)
            self.relu_activation1_regularized = tf.nn.dropout(self.relu_activation1,
                                                              keep_prob = self.keep_prob,
                                                              name = "relu1_regularized"
                                                             )
            #second leaky relu activation with dropout
            self.W2 = tf.Variable(tf.random_normal(stddev = 0.1, shape = [2500, 500]),
                                  dtype = tf.float32,
                                  name = "W2"
                                  )
            self.b2 = tf.Variable(tf.random_normal(stddev = 0.1, shape = [500]),
                                  dtype = tf.float32,
                                  name = "b2")
            self.relu_activation2 = tf.nn.leaky_relu(tf.matmul(self.relu_activation1_regularized, self.W2) + self.b2,
                                                     name = "relu_activation2")
            self.relu_activation2_regularized = tf.nn.dropout(self.relu_activation2,
                                                              keep_prob = self.keep_prob, name = "relu2_regularized")
            #output vector definition ([batch, 1])
            self.W3 = tf.Variable(tf.random_normal(stddev = 0.1, shape = [500, 1]), name = "output_weights")
            self.b3 = tf.Variable(tf.random_normal(stddev = 0.1 , shape = [1]), name = "output_biases")
            self.output = tf.matmul(self.relu_activation2_regularized, self.W3) + self.b3
            #loss definition: the output is squeezed to [batch] first, otherwise
            #[batch] - [batch, 1] broadcasts to a [batch, batch] matrix of all pairwise errors
            self.predictions = tf.squeeze(self.output, axis = 1, name = "predictions")
            self.loss = tf.reduce_mean(tf.square(self.input_y - self.predictions), name = "loss")

    def set_optimizer(self, learning_rate = 0.0001, decay_rate = 0.99999) :
        """
        Add an Adam training op whose learning rate is multiplied by ``decay_rate`` at every step.

        :param learning_rate: initial learning rate.
        :param decay_rate: per-step multiplicative decay of the learning rate.
        """
        self.global_step = tf.Variable(0, trainable = False)

        evolutive_lr = tf.train.exponential_decay(learning_rate = learning_rate,
                                                  global_step = self.global_step,
                                                  decay_rate = decay_rate,
                                                  decay_steps = 1
                                                 )

        self.optimizer = tf.train.AdamOptimizer(evolutive_lr,
                                                epsilon = 1.0,
                                                name = "optimizer"
                                               )

        # minimize() already increments global_step once per training step.
        self.train_op = self.optimizer.minimize(self.loss, global_step = self.global_step)

        # Kept only so existing references keep working. It must NOT be run together with
        # train_op, or the step counter (and so the decay) advances twice per batch.
        self.global_step_inc = tf.assign(self.global_step, self.global_step + 1)

    def fit_with_early_stopping(self, x, y, x_test, y_test, nb_epoch, early_stopping = 10, batch_size = 20,
                                step = 50000, init = True, save_path = "/Users/jeanbaptiste/TextCNN") :
        """
        Train on ``x`` / ``y`` and evaluate on ``x_test`` / ``y_test`` roughly every ``step``
        training observations. The model is checkpointed whenever the test loss is the
        best seen so far; training stops after ``early_stopping`` consecutive evaluations
        without a new best.

        :param x: training inputs, indexable as ``x[a : b, :]``.
        :param y: training targets, indexable as ``y[a : b]``.
        :param x_test: evaluation inputs.
        :param y_test: evaluation targets.
        :param nb_epoch: maximum number of passes over the training data.
        :param early_stopping: number of non-improving evaluations tolerated.
        :param batch_size: observations per batch (the final batch may be smaller).
        :param step: number of training observations between two evaluations.
        :param init: if True, initialise all variables; otherwise restore them from ``save_path``
            (a fresh session has no variable values, so one of the two is required).
        :param save_path: checkpoint path used for saving and, when ``init`` is False, restoring.
        :return: None. Results are stored in ``self.test_errors_log`` and ``self.train_error_log``.
        :raises ValueError: if ``batch_size`` or ``step`` is not positive.
        """
        if batch_size <= 0 or step <= 0 :
            raise ValueError("batch_size and step must be positive")

        self.test_errors_log = []
        self.train_error_log = []
        early_stopping_count = 0
        saver = tf.train.Saver()
        # Soft placement lets ops pinned to /gpu:0 fall back to the CPU when needed.
        session_config = tf.ConfigProto(allow_soft_placement = True)

        with tf.Session(config = session_config) as sess :
            if init :
                sess.run(tf.global_variables_initializer())
            else :
                saver.restore(sess, save_path)
            for epoch in range(nb_epoch) :
                batch_errors = []
                # fitting on train sample
                for start, stop in self._batch_bounds(len(x), batch_size) :
                    _, c = sess.run([self.train_op, self.loss],
                                    feed_dict = {self.input_x : x[start : stop, :],
                                                 self.input_y : y[start : stop],
                                                 self.keep_prob : self.train_keep_prob})
                    #saving training loss
                    batch_errors.append(c)
                    if not self._crossed_step_boundary(start, stop, step) :
                        continue
                    #testing performance on test sample
                    test_error_mean = self._evaluate(sess, x_test, y_test, batch_size)
                    self.test_errors_log.append(test_error_mean)
                    batch_errors_mean = np.mean(batch_errors)
                    self.train_error_log.append(batch_errors_mean)
                    #reset batch errors log
                    batch_errors = []
                    print("train error : " + str(batch_errors_mean) +
                          " test error : " + str(test_error_mean))
                    #saving model if loss is the best on test set
                    if test_error_mean == min(self.test_errors_log) :
                        saver.save(sess, save_path)
                        early_stopping_count = 0
                    else :
                        early_stopping_count += 1
                    #stop if early stopping limit has been reached
                    if early_stopping_count == early_stopping :
                        print("early stopping reached, best test value : " + str(min(self.test_errors_log)))
                        return None

In [None]:
# Build the network on top of the GloVe embeddings (3x3 kernels, 3 filters, keep probability 0.8),
# attach the optimizer with its default settings, then train for at most 25 epochs.
first_text_cnn = BasicTextCNN()
first_text_cnn.create_conv_model(embedding_model = stanford_glove,
                                 filter_shape = (3, 3),
                                 nb_filters = 3,
                                 keep_prob = 0.8)
first_text_cnn.set_optimizer()
# NOTE(review): training uses the unscaled log-price targets (y_train / y_test); the
# y_train_scaled / y_test_scaled arrays computed earlier are not used here — confirm
# this is intended.
first_text_cnn.fit_with_early_stopping(x = X_train,
                                       y = y_train,
                                       x_test = X_test,
                                       y_test = y_test,
                                       nb_epoch = 25,
                                       batch_size = 20)