In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np

Read the training and test datasets.

In [2]:
# Training split: path first, then the parsed DataFrame.
train_file = './data/train.csv'
train = pd.read_csv(train_file)

# Test split, loaded the same way.
test_file = './data/test.csv'
test = pd.read_csv(test_file)

In [3]:
train.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
test.head(5)

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [5]:

# Work on the raw value arrays; columns are addressed by position, so this
# relies on the column order shown in the previews above.
train_values = train.values
train_data = train_values[:, 3:-1]   # every column from index 3 up to (not including) the last
train_label = train_values[:, -1]    # last column
print ("train dataset size: %d" % len(train_data))

test_values = test.values
test_data = test_values[:, 1:]       # everything after the first column
print ("test dataset size: %d" % len(test_data))

train dataset size: 404290
test dataset size: 2345796


In [6]:
# Flatten each (n_pairs, 2) array into a single row of 2 * n_pairs entries,
# so that index [0] of the result iterates over every individual question.
n_train_questions = 2 * len(train_data)
n_test_questions = 2 * len(test_data)
train_questions = train_data.reshape((1, n_train_questions))
test_questions = test_data.reshape((1, n_test_questions))

In [7]:
def clean_words(words):
    """
    Lazily clean an iterable of raw tokens.

    Each token has leading/trailing punctuation (quotes, brackets, ``?``,
    ``!``, ``:``, ``,``, ``.``, ``=``, ``/`` and backslash) stripped, then a
    trailing possessive ``'s`` removed. Tokens that end up empty are dropped.

    :param words: an iterable of raw word strings.
    :return: a generator yielding the non-empty cleaned words, in input order.
    """
    for w in words:
        w = w.strip('".\'?)(:,!\\[]=/')
        if w.endswith('\'s'):
            w = w[:-2]
        # Truthiness test instead of `w is not ''`: identity comparison with a
        # string literal depends on interpreter interning and triggers a
        # SyntaxWarning on Python 3.8+.
        if w:
            yield w

In [8]:
# Collect every distinct cleaned, lower-cased token from both datasets, then
# freeze the vocabulary as a sorted list so word positions are deterministic.
unique_words = set()
for question_row in (train_questions[0], test_questions[0]):
    for q in question_row:
        # str() is applied first, so non-string entries are tokenised via their text form.
        tokens = str(q).strip().lower().split(' ')
        unique_words.update(clean_words(tokens))
words = sorted(unique_words)
print (len(words))

175283


In [9]:
import itertools
# Map each vocabulary word to its position in the sorted `words` list.
word2index = {word: index for index, word in enumerate(words)}

In [10]:
# Split the question pairs by label: 1 goes to the positive list, 0 to the negative list.
positive_train_data = [pair for pair, label in zip(train_data, train_label) if label == 1]
negative_train_data = [pair for pair, label in zip(train_data, train_label) if label == 0]
print ("positive train data: ",len(positive_train_data))
print ("negative train data: ",len(negative_train_data))

positive train data:  149263
negative train data:  255027


In [11]:
import scipy.sparse as sparse

def _build_pair_matrices(question_pairs, vocab_index):
    """Encode question pairs as two row-aligned sparse bag-of-words count matrices.

    Row ``r`` of the first matrix holds the word counts of the first question
    of the ``r``-th *valid* pair, and row ``r`` of the second matrix holds the
    counts of its second question. A pair is valid when both entries are
    ``str``; invalid pairs are printed and skipped, so the matrices can have
    fewer rows than ``len(question_pairs)``.

    :param question_pairs: iterable of 2-element sequences (question1, question2).
    :param vocab_index: dict mapping each cleaned word to its column index.
    :return: ``(first_matrix, second_matrix, n_rows)`` where both matrices are
        ``scipy.sparse.csr_matrix`` of shape ``(n_rows, len(vocab_index))``.
    :raises KeyError: if a cleaned word is missing from ``vocab_index``.
    """
    first_rows, first_cols = [], []
    second_rows, second_cols = [], []
    n_rows = 0
    for pair in question_pairs:
        first, second = pair[0], pair[1]
        if not (isinstance(first, str) and isinstance(second, str)):
            # Non-string entries (the notebook output shows `nan` for these)
            # cannot be tokenised: report the pair and leave it out.
            print (pair)
            continue
        # strip() (not rstrip()) so tokenisation matches exactly how the
        # vocabulary was built; otherwise leading non-space whitespace could
        # produce a token that is absent from vocab_index.
        for w in clean_words(first.strip().lower().split(' ')):
            first_rows.append(n_rows)
            first_cols.append(vocab_index[w])
        for w in clean_words(second.strip().lower().split(' ')):
            second_rows.append(n_rows)
            second_cols.append(vocab_index[w])
        n_rows += 1

    shape = (n_rows, len(vocab_index))

    def to_csr(rows, cols):
        # One entry of value 1 per token occurrence; repeated (row, col)
        # coordinates are summed on construction, yielding per-word counts.
        return sparse.csr_matrix(([1] * len(rows), (rows, cols)), shape=shape)

    return to_csr(first_rows, first_cols), to_csr(second_rows, second_cols), n_rows


# Duplicate pairs (label 1).
y_positive, y_pair_positive, M = _build_pair_matrices(positive_train_data, word2index)

# Non-duplicate pairs (label 0). M is left holding the negative row count,
# as in the original cell.
y_negative, y_pair_negative, M = _build_pair_matrices(negative_train_data, word2index)

# Vocabulary size = number of matrix columns; read by later cells.
O = len(word2index)



['How can I develop android app?' nan]
['How can I create an Android app?' nan]


In [12]:
def add_gradient_noise(t, stddev=1e-3, name=None):
    """
    Return `t` perturbed by Gaussian noise, following the gradient-noise
    technique of http://arxiv.org/abs/1511.06807 [2].

    `t` is expected to be a gradient (anything convertible to a Tensor).
    `stddev` is the standard deviation of the added noise; 0.001 was said to
    be a good fixed value for memory networks [2].
    `name` is an optional name for the op scope and the resulting add op.
    """
    with tf.name_scope(name, "add_gradient_noise", [t, stddev]) as scope:
        gradient = tf.convert_to_tensor(t, name="t")
        # Noise tensor with the same (dynamic) shape as the gradient.
        noise = tf.random_normal(tf.shape(gradient), stddev=stddev)
        return tf.add(gradient, noise, name=scope)

def normalize_vector(vector_to_normalize, epsilon=1e-12):
    """
    Scale every row of a tensor to unit L2 norm.

    The norm is taken along axis 1 (kept as a size-1 dimension so the division
    broadcasts across each row).

    :param vector_to_normalize: tensor whose rows are normalised
        (assumed rank 2, e.g. (batch, features) -- TODO confirm with callers).
    :param epsilon: lower bound applied to the squared norm before the square
        root. An all-zero row then normalises to zeros instead of 0/0 = NaN,
        and the gradient of the square root stays finite.
    :return: tensor of the same shape with rows divided by their L2 norm.
    """
    squared_norm = tf.reduce_sum(tf.square(vector_to_normalize), 1, keep_dims=True)
    # Clamp *before* sqrt: clamping after would still evaluate sqrt'(0) = inf
    # in the backward pass.
    norm = tf.sqrt(tf.maximum(squared_norm, epsilon))
    normalized_vector = vector_to_normalize/norm
    return normalized_vector

def cosine(labels, predictions):
    """
    Pairwise cosine similarity between the rows of two tensors.

    Both inputs are row-normalised with `normalize_vector` and multiplied as
    `labels_unit @ predictions_unit^T`, so entry (i, j) of the result compares
    row i of `labels` with row j of `predictions`.
    Assumes both inputs are rank 2 with the same number of columns -- TODO confirm.
    """
    unit_labels = normalize_vector(labels)
    unit_predictions = normalize_vector(predictions)
    return tf.matmul(unit_labels, tf.transpose(unit_predictions))

In [13]:
class quora_embedding(object):
    """
    Linear bag-of-words embedding scored with cosine similarity.

    A single weight matrix `Wv` of shape (embedding_size, vocabulary_size)
    projects each question's bag-of-words vector into the embedding space.
    The per-row loss is `lambda - cos(positive pair) + cos(negative pair)`,
    and the training objective is the sum of that loss over the batch.

    The whole graph is built in the constructor and all variables are
    initialised in the session that is passed in.
    """

    def __init__(self, embedding_size, batch_size, vocabulary_size,sess):
        """
        :param embedding_size: number of rows of `Wv` (embedding dimension).
        :param batch_size: stored on the instance; not otherwise used by the graph.
        :param vocabulary_size: width of the bag-of-words inputs / columns of `Wv`.
        :param sess: a `tf.Session` used for initialisation, training and prediction.
        """
        self._embedding_size = embedding_size
        self._batch_size = batch_size
        self._vocabulary_size = vocabulary_size
        self._lambda = 1
        self._sess = sess

        # NOTE(review): a 1e-9 learning rate is extremely small; the logged cost
        # in this notebook barely changes between batches -- confirm it is intended.
        self._opt = tf.train.AdadeltaOptimizer(1e-9)

        # Build the weights from the constructor arguments rather than from the
        # hard-coded 15 and the module globals `O` / `initializer`, so the class
        # no longer depends on a later cell having been executed first.
        weight_initializer = tf.random_normal_initializer(mean=0, stddev=1.0/self._embedding_size)
        self.Wv = tf.Variable(weight_initializer([self._embedding_size, self._vocabulary_size]), name="Wv")

        # Bag-of-words inputs: one row per question, one column per vocabulary word.
        self.y_positive = tf.placeholder(tf.float32, [None, self._vocabulary_size], name="y_positive")
        self.y_pair_positive = tf.placeholder(tf.float32, [None, self._vocabulary_size], name="y_pair_positive")
        self.y_negative = tf.placeholder(tf.float32, [None, self._vocabulary_size], name="y_negative")
        self.y_pair_negative = tf.placeholder(tf.float32, [None, self._vocabulary_size], name="y_pair_negative")

        cosine_pair_positive_questions = self.cosine_similarity(self.y_positive, self.y_pair_positive)
        cosine_pair_negative_questions = self.cosine_similarity(self.y_negative, self.y_pair_negative)
        # Element-wise: row i of the positive batch is combined with row i of
        # the negative batch, so both batches must have the same number of rows.
        loss = tf.add(tf.subtract(self._lambda, cosine_pair_positive_questions), cosine_pair_negative_questions)

        self.cosine_pair_positive_questions = cosine_pair_positive_questions
        self.cosine_pair_negative_questions = cosine_pair_negative_questions

        loss_sum = tf.reduce_sum(loss, name="loss_sum")
        self.loss= loss
        self.loss_op = loss_sum
        self.train_op = self._opt.minimize(self.loss_op)

        # sign(relu(cos)) is 1 where the similarity is strictly positive, else 0.
        prediction = tf.sign(tf.nn.relu(cosine_pair_positive_questions))
        self.prediction_op = prediction

        init_op = tf.global_variables_initializer()
        self._sess.run(init_op)

    def cosine_similarity(self, question1, question2):
        """
        Cosine similarity between row i of `question1` and row i of `question2`
        after projecting both through `Wv`.

        :return: rank-1 tensor with one similarity per row.
        """
        q1 = tf.matmul(self.Wv,tf.transpose(question1))
        q2 = tf.matmul(self.Wv,tf.transpose(question2))
        # `cosine` yields the full rows-by-rows similarity matrix; only the
        # diagonal (matching rows) is needed. NOTE(review): this is quadratic in
        # the number of rows, which matters if `predict` is fed large batches.
        cosine_sim =  cosine(tf.transpose(q1), tf.transpose(q2))
        return tf.diag_part(cosine_sim)

    def batch_fit(self, y_positive, y_pair_positive, y_negative, y_pair_negative):
        """
        Run one optimisation step on a batch.

        :return: (summed loss, per-row loss, positive-pair cosines, negative-pair cosines).
        """
        feed_dict = {self.y_positive: y_positive, self.y_pair_positive: y_pair_positive, self.y_negative:y_negative, self.y_pair_negative: y_pair_negative}
        loss, loss_list, cosine_positive, cosine_negative, _ = self._sess.run([self.loss_op, self.loss, self.cosine_pair_positive_questions, self.cosine_pair_negative_questions, self.train_op], feed_dict=feed_dict)
        return loss, loss_list, cosine_positive, cosine_negative

    def predict(self, y_positive, y_pair_positive):
        """
        Predict duplicate (1) / non-duplicate (0) for each question pair.

        :return: a one-element list holding the prediction array (the op is
            fetched inside a list).
        """
        feed_dict = {self.y_positive: y_positive, self.y_pair_positive: y_pair_positive}
        pair = self._sess.run([self.prediction_op], feed_dict=feed_dict)
        return pair

In [14]:
embedding_size = 15
batch_size = 64
epochs = 100

# Batch boundaries are derived from the sparse matrices themselves: pairs with
# a non-string question were skipped when the matrices were built, so the raw
# Python lists can be longer than the matrices.
n_positive = y_positive.shape[0]
n_negative = y_negative.shape[0]

# Full batches only; a trailing partial batch is dropped.
batches = [(start, start + batch_size) for start in range(0, n_positive - batch_size, batch_size)]
negative_batches = [(start, start + batch_size) for start in range(0, n_negative - batch_size, batch_size)]

# Kept as a module-level name because code outside this cell may reference it.
initializer = tf.random_normal_initializer(mean=0, stddev=1/embedding_size)

with tf.Session() as sess:
    model = quora_embedding(embedding_size, batch_size, O, sess)

    # range(1, epochs + 1) so that exactly `epochs` passes are run.
    for i in range(1, epochs + 1):
        np.random.shuffle(batches)
        np.random.shuffle(negative_batches)

        samples_seen = 0
        total_cost = 0.0
        # Pair each positive batch with its own shuffled negative batch.
        # Previously the negative matrices were sliced with the *positive*
        # indices, so the shuffled negative batches were never used and every
        # negative row beyond the positive row count was never seen. zip()
        # stops at the shorter of the two lists.
        for (start, end), (neg_start, neg_end) in zip(batches, negative_batches):
            samples_seen += end-start
            q1 = y_positive[start: end].astype(np.float32)
            q2 = y_pair_positive[start: end].astype(np.float32)
            q3 = y_negative[neg_start: neg_end].astype(np.float32)
            q4 = y_pair_negative[neg_start: neg_end].astype(np.float32)
            cost, cost_list, cosine_positive, cosine_negative = model.batch_fit(q1.todense(), q2.todense(), q3.todense(), q4.todense())
            total_cost += cost
            if cost == 0:
                # Diagnostic dump: print the batch details and abandon the epoch.
                print (start, '/', end)
                print (cost_list)
                print (cosine_positive)
                print (cosine_negative)
                break
            # Only fires when the running sample count is a multiple of 100.
            if samples_seen%100 == 0:
                print ("Epoch: ", i, "percentage: ", samples_seen,"/", n_positive, " average cost: ", cost/batch_size)

        print('-----------------------')
        print('Epoch', i)
        print('Total Cost:', total_cost)
        print('average cost:', total_cost/samples_seen)
        print('-----------------------')
        
    
        

Epoch:  1 percentage:  1600 / 149263  average cost:  1.7050511837
Epoch:  1 percentage:  3200 / 149263  average cost:  1.71260595322
Epoch:  1 percentage:  4800 / 149263  average cost:  1.73577737808
Epoch:  1 percentage:  6400 / 149263  average cost:  1.65698385239
Epoch:  1 percentage:  8000 / 149263  average cost:  1.73009967804
Epoch:  1 percentage:  9600 / 149263  average cost:  1.68857681751
Epoch:  1 percentage:  11200 / 149263  average cost:  1.74583721161
Epoch:  1 percentage:  12800 / 149263  average cost:  1.73630142212
Epoch:  1 percentage:  14400 / 149263  average cost:  1.80946075916
Epoch:  1 percentage:  16000 / 149263  average cost:  1.6277089119
Epoch:  1 percentage:  17600 / 149263  average cost:  1.70625591278
Epoch:  1 percentage:  19200 / 149263  average cost:  1.74169957638
Epoch:  1 percentage:  20800 / 149263  average cost:  1.73946344852
Epoch:  1 percentage:  22400 / 149263  average cost:  1.73744702339
Epoch:  1 percentage:  24000 / 149263  average cost:  1.

KeyboardInterrupt: 