In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import spacy # for NLP
import en_core_web_md #word2vec model from spacy
import time
import pickle

In [2]:
# Read the training CSV; the path is relative to the notebook's working directory.
# NOTE(review): the recorded preview shows an "Unnamed: 0" column, which suggests
# the CSV was written with its index — consider `index_col=0` (TODO confirm).
df_train = pd.read_csv("data/train.csv")

In [3]:
# Display the loaded frame to check its columns and a few sample rows.
df_train

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0


In [4]:
# Split the frame into the two question columns and the duplicate label.
# Missing questions are read by pandas as NaN; fill them with an empty string
# before the string cast so they do not become the literal word "nan".
question_1 = df_train['question1'].fillna('').astype(str)
question_2 = df_train['question2'].fillna('').astype(str)
labels = df_train['is_duplicate'].astype(int)

In [5]:
# Display the first-question series after the string cast.
question_1

0         What is the step by step guide to invest in sh...
1         What is the story of Kohinoor (Koh-i-Noor) Dia...
2         How can I increase the speed of my internet co...
3         Why am I mentally very lonely? How can I solve...
4         Which one dissolve in water quikly sugar, salt...
5         Astrology: I am a Capricorn Sun Cap moon and c...
6                                       Should I buy tiago?
7                            How can I be a good geologist?
8                           When do you use シ instead of し?
9         Motorola (company): Can I hack my Charter Moto...
10        Method to find separation of slits using fresn...
11              How do I read and find my YouTube comments?
12                     What can make Physics easy to learn?
13              What was your first sexual experience like?
14        What are the laws to change your status from a...
15        What would a Trump presidency mean for current...
16                             What does

In [6]:

# Load the medium English spaCy model; its vocabulary (`nlp.vocab`) supplies the
# word vectors used further down to build the embedding matrix.
nlp = en_core_web_md.load()  # load best-matching version for Glove

In [12]:
def load_glove_embeddings(vocab, num_unknown=100):
    """Build an embedding matrix from the word vectors of a spaCy vocabulary.

    Every lexeme that has a vector is written, scaled to unit length, at row
    ``lex.rank + 1``. All other rows stay zero. In addition to one row per
    rank, ``num_unknown + 2`` extra rows are allocated: ``num_unknown`` for
    unknown words plus 2 for <PAD> and <EOS>.

    Args:
        vocab: A ``spacy.vocab.Vocab``; its lexemes are read for ``rank``,
            ``has_vector``, ``vector`` and ``vector_norm``.
        num_unknown: Number of rows to reserve for unknown words.

    Returns:
        A float32 ``numpy`` array of shape
        ``(max_rank + 1 + num_unknown + 2, vocab.vectors_length)``.

    Raises:
        TypeError: If ``vocab`` is not a ``spacy.vocab.Vocab``.
    """
    if not isinstance(vocab, spacy.vocab.Vocab):
        raise TypeError("The input `vocab` must be type of 'spacy.vocab.Vocab', not %s." % type(vocab))

    # Rows are addressed by rank + 1, so the table needs max_rank + 1 rows before
    # the reserved ones. `default=-1` keeps an empty vocabulary from crashing max().
    max_vector_length = max((lex.rank for lex in vocab), default=-1) + 1
    matrix = np.zeros((max_vector_length + num_unknown + 2, vocab.vectors_length), dtype='float32')  # 2 for <PAD> and <EOS>

    # Normalization
    for lex in vocab:
        if not lex.has_vector:
            continue
        norm = lex.vector_norm
        # A stored all-zero vector has norm 0; dividing would fill the row with
        # NaN/inf, so such rows are left as zeros instead.
        if norm > 0:
            matrix[lex.rank + 1] = lex.vector / norm

    return matrix


In [8]:
# Report the length of the word vectors stored in the model's vocabulary.
print('Vector dimension is: ;', nlp.vocab.vectors_length)

Vector dimension is: ; 300


In [9]:
# Look up vocabulary entry 81; the recorded output shows it is a spaCy Lexeme object.
nlp.vocab[81]

<spacy.lexeme.Lexeme at 0x7f709f3d65a0>

In [10]:
# The index 81 is hard-coded here (it is not computed in this notebook), so the
# printed statement only holds for the model version that was inspected.
print('Non zero Vector index starts at : 81 ')
# Show the raw (un-normalised) vector stored for that entry.
nlp.vocab[81].vector

Non zero Vector index starts at : 81 


array([-0.36815   , -0.13229001, -0.13530999, -0.54198003, -0.26120999,
       -0.17507   ,  0.25560001, -0.86040998, -0.002097  ,  0.64282   ,
       -0.27179   ,  0.15383001,  0.29422   , -0.17186999, -0.11058   ,
       -0.19616   , -0.25121999,  0.67374998,  0.23431   ,  0.02363   ,
        0.18122   , -0.11498   , -0.10616   , -0.27720001,  0.19796   ,
       -0.69883001, -0.027885  , -0.20543   ,  0.23604999,  0.022083  ,
       -0.061039  ,  0.37228   ,  0.35822999, -0.060372  , -0.19607   ,
        0.20029999,  0.62234002,  0.58771998,  0.054737  , -0.17061   ,
       -0.079896  , -0.21762   , -0.13936999,  0.088916  ,  0.15880001,
        0.31191   ,  0.52108997, -0.40202999, -0.19809   ,  0.16421001,
        0.14969   ,  0.12556   ,  0.51266998, -0.0027256 , -0.30496001,
        0.16698   , -0.1534    , -0.14580999,  0.59897   ,  0.15702   ,
       -0.10578   , -0.18849   , -0.16946   ,  0.30807999,  0.0050163 ,
       -0.31298   , -0.16359   , -0.13022999,  0.12756   ,  0.12

In [13]:
# Build the embedding matrix from the model vocabulary, using the default of
# 100 reserved unknown-word rows.
embedding_matrix = load_glove_embeddings(nlp.vocab) 

In [14]:
# Check the matrix dimensions as (rows, vector length).
embedding_matrix.shape

(1340396, 300)

In [None]:
# Serialise the embedding matrix to disk. The context manager ensures the file
# handle is flushed and closed even if pickling fails part-way.
with open('spacy_embedding_matrix.p', 'wb') as matrix_file:
    pickle.dump(embedding_matrix, matrix_file)

In [15]:
def _get_word_ids(docs, rnn_encode=False, tree_truncate=False, max_length=100, nr_unk=100):
    Xs = np.zeros((len(docs), max_length), dtype='int32')

    for i, doc in enumerate(docs):
        if tree_truncate:
            if isinstance(doc, Span):
                queue = [doc.root]
            else:
                queue = [sent.root for sent in doc.sents]
        else:
            queue = list(doc)
        words = []
        while len(words) <= max_length and queue:
            word = queue.pop(0)
            if rnn_encode or (not word.is_punct and not word.is_space):
                words.append(word)
            if tree_truncate:
                queue.extend(list(word.lefts))
                queue.extend(list(word.rights))
        words.sort()
        for j, token in enumerate(words):
            if token.has_vector:
                Xs[i, j] = token.rank + 1
            else:
                Xs[i, j] = (token.shape % (nr_unk - 1)) + 2
            j += 1
            if j >= max_length:
                break
        else:
            Xs[i, len(words)] = 1
    return Xs


def convert_questions_to_word_ids(question_1, question_2, nlp, max_length=50, n_threads=10, batch_size=128, encode=False, tree_truncate=False):
    """Parse both question collections with spaCy and encode them as word-id rows.

    Args:
        question_1: Iterable of texts for the first question of each pair.
        question_2: Iterable of texts for the second question of each pair.
        nlp: spaCy pipeline; its ``pipe`` method is used to parse the texts.
        max_length: Row width passed on to ``_get_word_ids``.
        n_threads: Forwarded to ``nlp.pipe``.
        batch_size: Forwarded to ``nlp.pipe``.
        encode: Passed to ``_get_word_ids`` as ``rnn_encode``.
        tree_truncate: Passed to ``_get_word_ids`` unchanged.

    Returns:
        A pair ``(ids_1, ids_2)`` with the encoded rows for ``question_1`` and
        ``question_2`` respectively.
    """
    def _encode(texts):
        # Materialise the parsed documents first; the encoder needs len(docs).
        parsed = list(nlp.pipe(texts, n_threads=n_threads, batch_size=batch_size))
        return _get_word_ids(parsed,
                             max_length=max_length,
                             rnn_encode=encode,
                             tree_truncate=tree_truncate)

    ids_1 = _encode(question_1)
    ids_2 = _encode(question_2)
    return ids_1, ids_2

In [33]:
# Time the full conversion. perf_counter() is a monotonic, high-resolution clock
# intended for measuring intervals, unlike time.time(), which follows the wall
# clock and can jump if the system time is adjusted.
t1 = time.perf_counter()
q1_train, q2_train = convert_questions_to_word_ids(question_1, question_2, nlp)
t2 = time.perf_counter()

print('Elapsed tiem is: ', t2-t1)

Elapsed tiem is:  284.2206566333771


In [35]:
# Spot-check: show the first raw question next to its encoded id row.
question_1[0], q1_train[0]

('What is the step by step guide to invest in share market in india?',
 array([  169,    12,     4,  1049,    84,  1049,  2490,     6,  3509,
           15,   977,   813,    15, 20120,     1,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0], dtype=int32))

In [36]:
# Same spot-check for the second question of the first pair.
question_2[0], q2_train[0]

('What is the step by step guide to invest in share market?',
 array([ 169,   12,    4, 1049,   84, 1049, 2490,    6, 3509,   15,  977,
         813,    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0], dtype=int32))

In [22]:
import dhira.input_preprocessor as ip

In [28]:
# Create the project's Quora input processor and request a training batch iterator.
# The recorded output reports 5686 batches per epoch for 363861 rows, which is
# consistent with the argument 64 being the batch size.
inp = ip.QuoraInputProcessor()
batches = inp.train_batch_iter(64)

In [30]:
# for batch in batches:
#     print(batch)
    

Generating batches for data of shape (363861, 3).
Number of batches per epoch :  5686
[[ array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,   65,   15,    4,   27,    6, 1119, 1086,
         10,  366,   14, 5716,  405,    9,  463,    1], dtype=int32)
  array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,    65,    15,     4,
          27,     6, 10946,  5515,    10,   366,    22,  5716,   405,
           9,   463,     1], dtype=int32)
  1]
 [ array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    3,   35,    4,
         11,   52,   44,    5,   32,  236, 4190,    1], dtype=int32)
  array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,    60,     4,    26,
         177, 12423,     9,     6,   236,  4190,    47,    35,     4,
       

KeyboardInterrupt: 

In [39]:
# NOTE(review): this reads 'embedding_matrix.p', not the
# 'spacy_embedding_matrix.p' file written earlier in this notebook — confirm
# which file is intended.
# pickle can execute arbitrary code while loading; only load files you trust.
# The context manager ensures the file handle is closed after loading.
with open('embedding_matrix.p', "rb") as matrix_file:
    embedding_matrix = pickle.load(matrix_file)

In [40]:
# Check the dimensions of the matrix that was just loaded from disk.
embedding_matrix.shape

(167999, 300)