# Detecting Duplicate Questions in Quora

### Obtaining Data

Dataset:
    https://data.quora.com/First-Quora-Dataset-Release-Question-Pairs

In [None]:
!wget http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv

In [None]:
!mkdir data

In [None]:
!mv quora_duplicate_questions.tsv data

In [None]:
!wget http://nlp.stanford.edu/data/glove.840B.300d.zip

In [None]:
!mv glove.840B.300d.zip data

### Feature Engineering

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('data/quora_duplicate_questions.tsv', sep='\t')
data = data.drop(['id', 'qid1', 'qid2'], axis=1)

In [3]:
# length based features
data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
data['len_q2'] = data.question2.apply(lambda x: len(str(x)))

# difference in lengths of two questions
data['diff_len'] = data.len_q1 - data.len_q2

# character length based features
data['len_char_q1'] = data.question1.apply(lambda x: 
                    len(''.join(set(str(x).replace(' ', '')))))
data['len_char_q2'] = data.question2.apply(lambda x: 
                    len(''.join(set(str(x).replace(' ', '')))))

# word length based features
data['len_word_q1'] = data.question1.apply(lambda x: 
                                           len(str(x).split()))
data['len_word_q2'] = data.question2.apply(lambda x: 
                                           len(str(x).split()))

# common words in the two questions
data['common_words'] = data.apply(lambda x: 
                           len(set(str(x['question1'])
                           .lower().split())
                           .intersection(set(str(x['question2'])
                           .lower().split()))), axis=1)

In [4]:
fs_1 = ['len_q1', 'len_q2', 'diff_len', 'len_char_q1', 
        'len_char_q2', 'len_word_q1', 'len_word_q2', 'common_words']

### Fuzzy features

In [5]:
from fuzzywuzzy import fuzz

fuzz.QRatio("Why did Trump win the Presidency?", 
            "How did Donald Trump win the 2016 Presidential Election")

67

In [6]:
fuzz.QRatio("How can I start an online shopping (e-commerce) website?", 
            "Which web technology is best suitable for building a big E-Commerce website?")

60

In [7]:
from fuzzywuzzy import fuzz

fuzz.partial_ratio("Why did Trump win the Presidency?", 
   "How did Donald Trump win the 2016 Presidential Election")

73

In [8]:
fuzz.partial_ratio("How can I start an online shopping (e-commerce) website?", 
                   "Which web technology is best suitable for building a big E-Commerce website?")

57

In [9]:
data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(
    str(x['question1']), str(x['question2'])), axis=1)

data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(
    str(x['question1']), str(x['question2'])), axis=1)

data['fuzz_partial_ratio'] = data.apply(lambda x: 
                    fuzz.partial_ratio(str(x['question1']), 
                    str(x['question2'])), axis=1)

data['fuzz_partial_token_set_ratio'] = data.apply(lambda x: 
                    fuzz.partial_token_set_ratio(str(x['question1']), 
                    str(x['question2'])), axis=1)

data['fuzz_partial_token_sort_ratio'] = data.apply(lambda x: 
                    fuzz.partial_token_sort_ratio(str(x['question1']), 
                    str(x['question2'])), axis=1)

data['fuzz_token_set_ratio'] = data.apply(lambda x: 
                    fuzz.token_set_ratio(str(x['question1']), 
                    str(x['question2'])), axis=1)

data['fuzz_token_sort_ratio'] = data.apply(lambda x: 
                    fuzz.token_sort_ratio(str(x['question1']), 
                    str(x['question2'])), axis=1)

In [10]:
fs_2 = ['fuzz_qratio', 'fuzz_WRatio', 'fuzz_partial_ratio', 
       'fuzz_partial_token_set_ratio', 'fuzz_partial_token_sort_ratio',
       'fuzz_token_set_ratio', 'fuzz_token_sort_ratio']

### TF-IDF and SVD features

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from copy import deepcopy

In [12]:
tfv_q1 = TfidfVectorizer(min_df=3, 
                         max_features=None, 
                         strip_accents='unicode', 
                         analyzer='word', 
                         token_pattern=r'\w{1,}',
                         ngram_range=(1, 2), 
                         use_idf=1, 
                         smooth_idf=1, 
                         sublinear_tf=1,
                         stop_words='english')

tfv_q2 = deepcopy(tfv_q1)

In [13]:
q1_tfidf = tfv_q1.fit_transform(data.question1.fillna(""))
q2_tfidf = tfv_q2.fit_transform(data.question2.fillna(""))

In [14]:
from sklearn.decomposition import TruncatedSVD
svd_q1 = TruncatedSVD(n_components=180)
svd_q2 = TruncatedSVD(n_components=180)

In [15]:
question1_vectors = svd_q1.fit_transform(q1_tfidf)
question2_vectors = svd_q2.fit_transform(q2_tfidf)

In [16]:
from scipy.stats import skew, kurtosis
data['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
data['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
data['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
data['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]

In [17]:
from scipy import sparse

# obtain features by stacking the sparse matrices together
fs3_1 = sparse.hstack((q1_tfidf, q2_tfidf))

In [18]:
tfv = TfidfVectorizer(min_df=3, 
                      max_features=None, 
                      strip_accents='unicode', 
                      analyzer='word', 
                      token_pattern=r'\w{1,}',
                      ngram_range=(1, 2), 
                      use_idf=1, 
                      smooth_idf=1, 
                      sublinear_tf=1,
                      stop_words='english')

# combine questions and calculate tf-idf
q1q2 = data.question1.fillna("") 
q1q2 += " " + data.question2.fillna("")
fs3_2 = tfv.fit_transform(q1q2)

In [19]:
# obtain features by stacking the matrices together
fs3_3 = np.hstack((question1_vectors, question2_vectors))

In [20]:
fs3_4 = ['skew_q1vec', 'skew_q2vec', 'kur_q1vec', 'kur_q2vec']

In [21]:
del([tfv_q1, tfv_q2, tfv, q1q2, question1_vectors, question2_vectors, svd_q1, svd_q2, q1_tfidf, q2_tfidf])

In [22]:
import gc
gc.collect()

7

### Word2Vec embeddings

In [None]:
!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz

In [None]:
!mv GoogleNews-vectors-negative300.bin.gz data

In [23]:
import gensim

model = gensim.models.KeyedVectors.load_word2vec_format(
        'data/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [24]:
import nltk
try:
    nltk.download('punkt')
    nltk.download('stopwords')
except:
    pass

from nltk.corpus import stopwords
from nltk import word_tokenize

stop_words = set(stopwords.words('english'))

def sent2vec(s, model):  
    M = []
    words = word_tokenize(str(s).lower())
    for word in words:
        #It shouldn't be a stopword
        if word not in stop_words:
            #nor contain numbers
            if word.isalpha():
                #and be part of Word2Vec
                if word in model:
                    M.append(model[word])
    M = np.array(M)
    if len(M) > 0:
        v = M.sum(axis=0)
        return v / np.sqrt((v ** 2).sum())
    else:
        return model.get_vector('null')

[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


In [25]:
w2v_q1 = np.array([sent2vec(q, model) 
                   for q in data.question1])
w2v_q2 = np.array([sent2vec(q, model) 
                   for q in data.question2])

In [26]:
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis

data['cosine_distance'] = [cosine(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
data['cityblock_distance'] = [cityblock(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
data['jaccard_distance'] = [jaccard(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
data['canberra_distance'] = [canberra(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
data['euclidean_distance'] = [euclidean(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]
data['minkowski_distance'] = [minkowski(x,y,3) for (x,y) in zip(w2v_q1, w2v_q2)]
data['braycurtis_distance'] = [braycurtis(x,y) for (x,y) in zip(w2v_q1, w2v_q2)]

In [27]:
fs4_1 = ['cosine_distance', 'cityblock_distance', 
         'jaccard_distance', 'canberra_distance', 
         'euclidean_distance', 'minkowski_distance',
         'braycurtis_distance']

In [28]:
w2v = np.hstack((w2v_q1, w2v_q2))

In [29]:
del([w2v_q1, w2v_q2])
gc.collect()

49

In [30]:
def wmd(s1, s2):
    s1 = str(s1).lower().split()
    s2 = str(s2).lower().split()
    stop_words = stopwords.words('english')
    s1 = [w for w in s1 if w not in stop_words]
    s2 = [w for w in s2 if w not in stop_words]
    return model.wmdistance(s1, s2)

In [31]:
data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)

In [32]:
model.init_sims(replace=True) # Precomputes L2-normalized vectors.
data['norm_wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)

In [33]:
fs4_2 = ['wmd', 'norm_wmd']

In [34]:
del([model])
gc.collect()

217

### Building ML Models

In [35]:
import psutil
psutil.virtual_memory()

svmem(total=16713633792, available=11904913408, percent=28.8, used=4407255040, free=6221172736, active=4979257344, inactive=5065949184, buffers=122793984, cached=5962412032, shared=50638848)

In [36]:
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [37]:
scaler = StandardScaler()

In [38]:
y = data.is_duplicate.values
y = y.astype('float32').reshape(-1, 1)

In [39]:
X = data[fs_1+fs_2+fs3_4+fs4_1+fs4_2].replace([np.inf, -np.inf], np.nan).fillna(0).values
X = scaler.fit_transform(X)
X = np.hstack((X, fs3_3))

In [40]:
np.random.seed(42)
 
n_all, _ = y.shape
idx = np.arange(n_all)
np.random.shuffle(idx)
 
n_split = n_all // 10
idx_val = idx[:n_split]
idx_train = idx[n_split:]
 
x_train = X[idx_train]
y_train = np.ravel(y[idx_train])
 
x_val = X[idx_val]
y_val = np.ravel(y[idx_val])

In [41]:
logres = linear_model.LogisticRegression(C=0.1, solver='sag', max_iter=1000)

In [42]:
logres.fit(x_train, y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False)

In [43]:
lr_preds = logres.predict(x_val)

In [44]:
log_res_accuracy = np.sum(lr_preds == y_val) / len(y_val)

In [45]:
print("Logistic regression accuracy: %0.3f" % log_res_accuracy)

Logistic regression accuracy: 0.746


In [46]:
params = dict()
params['objective'] = 'binary:logistic'
params['eval_metric'] = ['logloss', 'error']
params['eta'] = 0.02
params['max_depth'] = 4

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_val, label=y_val)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 5000, watchlist, early_stopping_rounds=50, verbose_eval=100)

[0]	train-logloss:0.687516	train-error:0.297339	valid-logloss:0.687545	valid-error:0.297583
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 50 rounds.
[100]	train-logloss:0.501644	train-error:0.26063	valid-logloss:0.503612	valid-error:0.263029
[200]	train-logloss:0.467449	train-error:0.244195	valid-logloss:0.470249	valid-error:0.246531
[300]	train-logloss:0.451133	train-error:0.234026	valid-logloss:0.454439	valid-error:0.236909
[400]	train-logloss:0.440901	train-error:0.227532	valid-logloss:0.44492	valid-error:0.231171
[500]	train-logloss:0.433247	train-error:0.222181	valid-logloss:0.437893	valid-error:0.225828
[600]	train-logloss:0.427306	train-error:0.218281	valid-logloss:0.432537	valid-error:0.222217
[700]	train-logloss:0.422431	train-error:0.215096	valid-logloss:0.428285	valid-error:0.221203
[800]	train-logloss:0.418332	train-error:0.212108	valid-logloss:0.424808	valid-error:0.219397
[900]	train

In [47]:
xgb_preds = (bst.predict(d_valid) >= 0.5).astype(int)
xgb_accuracy = np.sum(xgb_preds == y_val) / len(y_val)
print(xgb_accuracy)

0.7996240322540751


### Building an LSTM Model

In [48]:
import zipfile

from tqdm import tqdm_notebook as tqdm
tqdm.monitor_interval = 0
import tensorflow as tf

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


In [50]:
Tokenizer = tf.keras.preprocessing.text.Tokenizer
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

In [51]:
try:
    df = data[['question1', 'question2', 'is_duplicate']]
except:
    df = pd.read_csv('data/quora_duplicate_questions.tsv', sep='\t')
    df = df.drop(['id', 'qid1', 'qid2'], axis=1)
    
df = df.fillna('')
y = df.is_duplicate.values
y = y.astype('float32').reshape(-1, 1)

In [52]:
tk = Tokenizer(num_words=200000)
 
max_len = 40
tk.fit_on_texts(list(df.question1) + list(df.question2))
x1 = tk.texts_to_sequences(df.question1)
x1 = pad_sequences(x1, maxlen=max_len)
 
x2 = tk.texts_to_sequences(df.question2)
x2 = pad_sequences(x2, maxlen=max_len)
 
word_index = tk.word_index

In [53]:
embedding_matrix = np.zeros((len(word_index) + 1, 300), dtype='float32')
 
glove_zip = zipfile.ZipFile('data/glove.840B.300d.zip')
glove_file = glove_zip.filelist[0]
 
f_in = glove_zip.open(glove_file)
for line in tqdm(f_in):
    values = line.split(b' ')
    word = values[0].decode()
    if word not in word_index:
        continue
    i = word_index[word]
    coefs = np.asarray(values[1:], dtype='float32')
    embedding_matrix[i, :] = coefs
 
f_in.close()
glove_zip.close()




In [54]:
def prepare_batches(seq, step):
    n = len(seq)
    res = []
    for i in range(0, n, step):
        res.append(seq[i:i+step])
    return res

In [55]:
def dense(X, size, activation=None):
    he_std = np.sqrt(2 / int(X.shape[1]))
    out = tf.layers.dense(X, units=size, activation=activation,
                     kernel_initializer=tf.random_normal_initializer(stddev=he_std))
    return out
 
def conv1d(inputs, num_filters, filter_size, padding='same'):
    he_std = np.sqrt(2 / (filter_size * num_filters))
    out = tf.layers.conv1d(
        inputs=inputs, filters=num_filters, padding=padding,
        kernel_size=filter_size,
        activation=tf.nn.relu,
        kernel_initializer=tf.random_normal_initializer(stddev=he_std))
    return out
 
def maxpool1d_global(X):
    out = tf.reduce_max(X, axis=1)
    return out
 
def time_distributed_dense(X, dense_size):
    shape = X.shape.as_list()
    assert len(shape) == 3
    _, w, d = shape
 
    X_reshaped = tf.reshape(X, [-1, d])
    H = dense(X_reshaped, dense_size, tf.nn.relu)
 
    return tf.reshape(H, [-1, w, dense_size])

def lstm(X, size_hidden, size_out):
    with tf.variable_scope('lstm_%d' % np.random.randint(0, 100)):
        he_std = np.sqrt(2 / (size_hidden * size_out))
        W = tf.Variable(tf.random_normal([size_hidden, size_out], stddev=he_std))
        b = tf.Variable(tf.zeros([size_out]))
 
        size_time = int(X.shape[1])
        X = tf.unstack(X, size_time, axis=1)
 
        lstm_cell = tf.contrib.rnn.BasicLSTMCell(size_hidden, forget_bias=1.0)
        outputs, states = tf.contrib.rnn.static_rnn(lstm_cell, X, dtype='float32')
        out = tf.matmul(outputs[-1], W) + b
 
        return out

In [56]:
max_features = 200000
filter_length = 5
nb_filter = 64
pool_length = 4
learning_rate = 0.001

In [57]:
graph = tf.Graph()
graph.seed = 1
 
with graph.as_default():
    place_q1 = tf.placeholder(tf.int32, shape=(None, max_len))
    place_q2 = tf.placeholder(tf.int32, shape=(None, max_len))
    place_y = tf.placeholder(tf.float32, shape=(None, 1))
    place_training = tf.placeholder(tf.bool, shape=())
 
    glove = tf.Variable(embedding_matrix, trainable=False)
    q1_glove_lookup = tf.nn.embedding_lookup(glove, place_q1)
    q2_glove_lookup = tf.nn.embedding_lookup(glove, place_q2)
 
    emb_size = len(word_index) + 1
    emb_dim = 300
    emb_std = np.sqrt(2 / emb_dim)
    emb = tf.Variable(tf.random_uniform([emb_size, emb_dim], -emb_std, emb_std))
    q1_emb_lookup = tf.nn.embedding_lookup(emb, place_q1)
    q2_emb_lookup = tf.nn.embedding_lookup(emb, place_q2)
   
    model1 = q1_glove_lookup
    model1 = time_distributed_dense(model1, 300)
    model1 = tf.reduce_sum(model1, axis=1)
 
    model2 = q2_glove_lookup
    model2 = time_distributed_dense(model2, 300)
    model2 = tf.reduce_sum(model2, axis=1)
 
    model3 = q1_glove_lookup
    model3 = conv1d(model3, nb_filter, filter_length, padding='valid')
    model3 = tf.layers.dropout(model3, rate=0.2, training=place_training)
    model3 = conv1d(model3, nb_filter, filter_length, padding='valid')
    model3 = maxpool1d_global(model3)
    model3 = tf.layers.dropout(model3, rate=0.2, training=place_training)
    model3 = dense(model3, 300)
    model3 = tf.layers.dropout(model3, rate=0.2, training=place_training)
    model3 = tf.layers.batch_normalization(model3, training=place_training)
 
    model4 = q2_glove_lookup
    model4 = conv1d(model4, nb_filter, filter_length, padding='valid')
    model4 = tf.layers.dropout(model4, rate=0.2, training=place_training)
    model4 = conv1d(model4, nb_filter, filter_length, padding='valid')
    model4 = maxpool1d_global(model4)
    model4 = tf.layers.dropout(model4, rate=0.2, training=place_training)
    model4 = dense(model4, 300)
    model4 = tf.layers.dropout(model4, rate=0.2, training=place_training)
    model4 = tf.layers.batch_normalization(model4, training=place_training)
 
    model5 = q1_emb_lookup
    model5 = tf.layers.dropout(model5, rate=0.2, training=place_training)
    model5 = lstm(model5, size_hidden=300, size_out=300)
 
    model6 = q2_emb_lookup
    model6 = tf.layers.dropout(model6, rate=0.2, training=place_training)
    model6 = lstm(model6, size_hidden=300, size_out=300)
 
    merged = tf.concat([model1, model2, model3, model4, model5, model6], axis=1)
    #merged = tf.concat([model1, model2], axis=1)
    merged = tf.layers.batch_normalization(merged, training=place_training)
 
    for i in range(5):
        merged = dense(merged, 300, activation=tf.nn.relu)
        merged = tf.layers.dropout(merged, rate=0.2, training=place_training)
        merged = tf.layers.batch_normalization(merged, training=place_training)
 
    merged = dense(merged, 1, activation=tf.nn.sigmoid)
   
    loss = tf.losses.log_loss(place_y, merged)
 
    prediction = tf.round(merged)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(place_y, prediction), 'float32'))
    opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
 
    # for batchnorm
    extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(extra_update_ops):
        step = opt.minimize(loss)
 
    init = tf.global_variables_initializer()
 
session = tf.Session(config=None, graph=graph)
session.run(init)

In [58]:
np.random.seed(1)
 
n_all, _ = y.shape
idx = np.arange(n_all)
np.random.shuffle(idx)
 
n_split = n_all // 10
idx_val = idx[:n_split]
idx_train = idx[n_split:]
 
x1_train = x1[idx_train]
x2_train = x2[idx_train]
y_train = y[idx_train]
 
x1_val = x1[idx_val]
x2_val = x2[idx_val]
y_val = y[idx_val]

In [61]:
val_idx = np.arange(y_val.shape[0])
val_batches = prepare_batches(val_idx, 5000)

no_batches = 50 # ideally we should run it with 200 batches 
tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481

for i in range(no_batches):
    np.random.seed(i)
    train_idx_shuffle = np.arange(y_train.shape[0])
    np.random.shuffle(train_idx_shuffle)
    batches = prepare_batches(train_idx_shuffle, 384)
   
    progress = tqdm(total=len(batches))
    for idx in batches:
        feed_dict = {
            place_q1: x1_train[idx],
            place_q2: x2_train[idx],
            place_y: y_train[idx],
            place_training: True,
        }
        _, acc, l = session.run([step, accuracy, loss], feed_dict)
        progress.update(1)
        progress.set_description('%.3f / %.3f' % (acc, l))
 
 
    y_pred = np.zeros_like(y_val)
    for idx in val_batches:
        feed_dict = {
            place_q1: x1_val[idx],
            place_q2: x2_val[idx],
            place_y: y_val[idx],
            place_training: False,
        }
        y_pred[idx, :] = session.run(prediction, feed_dict)
 
    print('batch %02d, accuracy: %0.3f' % (i, np.mean(y_val == y_pred)))


batch 00, accuracy: 0.816



batch 01, accuracy: 0.816



batch 02, accuracy: 0.820



batch 03, accuracy: 0.818



batch 04, accuracy: 0.822



batch 05, accuracy: 0.821



batch 06, accuracy: 0.821



batch 07, accuracy: 0.822



batch 08, accuracy: 0.822



batch 09, accuracy: 0.822



batch 10, accuracy: 0.820



batch 11, accuracy: 0.823



batch 12, accuracy: 0.820



batch 13, accuracy: 0.818



batch 14, accuracy: 0.817



batch 15, accuracy: 0.820



batch 16, accuracy: 0.821



batch 17, accuracy: 0.822



batch 18, accuracy: 0.817



batch 19, accuracy: 0.823



batch 20, accuracy: 0.820



batch 21, accuracy: 0.821



batch 22, accuracy: 0.821



batch 23, accuracy: 0.821



batch 24, accuracy: 0.824



batch 25, accuracy: 0.823



batch 26, accuracy: 0.821



batch 27, accuracy: 0.823



batch 28, accuracy: 0.823



batch 29, accuracy: 0.823



batch 30, accuracy: 0.824



batch 31, accuracy: 0.823



batch 32, accuracy: 0.826



batch 33, accuracy: 0.821



batch 34, accuracy: 0.821



batch 35, accuracy: 0.824



batch 36, accuracy: 0.822



batch 37, accuracy: 0.825



batch 38, accuracy: 0.824



batch 39, accuracy: 0.825



batch 40, accuracy: 0.826



batch 41, accuracy: 0.825



batch 42, accuracy: 0.822



batch 43, accuracy: 0.822



batch 44, accuracy: 0.825



batch 45, accuracy: 0.825



batch 46, accuracy: 0.825



batch 47, accuracy: 0.825



batch 48, accuracy: 0.822



batch 49, accuracy: 0.823


### Assessing Model

In [62]:
def convert_text(txt, tokenizer, padder):
    x = tokenizer.texts_to_sequences(txt)
    x = padder(x, maxlen=max_len)
    return x  

def evaluate_questions(a, b, tokenizer, padder, pred):
    feed_dict = {
            place_q1: convert_text([a], tk, pad_sequences),
            place_q2: convert_text([b], tk, pad_sequences),
            place_y: np.zeros((1,1)),
            place_training: False,
        }
    return session.run(pred, feed_dict)
    
isduplicated = lambda a, b: evaluate_questions(a, b, tk, pad_sequences, prediction)

a = "Why are there so many duplicated questions on Quora?"
b = "Why do people ask similar questions on Quora multiple times?"

print("Answer: %0.2f" % isduplicated(a, b))

Answer: 1.00
