In [5]:
import pandas as pd
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import os
from os.path import expanduser

import json

In [6]:
MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 25

Q1_TRAINING_DATA_FILE = 'train_q1.npy'
Q2_TRAINING_DATA_FILE = 'train_q2.npy'
LABEL_TRAINING_DATA_FILE = 'train_label.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'

In [7]:
quora_path = expanduser('~/.kaggle/competitions/quora-question-pairs/')
quora_train = os.path.join(quora_path, 'train.csv')
quora_test = os.path.join(quora_path, 'test.csv')

In [8]:
df_train = pd.read_csv(quora_train, encoding='utf-8')
df_train = df_train.dropna() #drop empty

In [9]:
questions = list(df_train.question1) + list(df_train.question2)
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(questions)
word_index = tokenizer.word_index

question1_word_sequences = tokenizer.texts_to_sequences(df_train.question1)
question2_word_sequences = tokenizer.texts_to_sequences(df_train.question2)

print("Words in index: %d" % len(word_index))

Words in index: 95595


In [10]:
question1_word_sequences

[[2, 3, 1, 1222, 57, 1222, 2581, 7, 576, 8, 763, 383, 8, 35],
 [2, 3, 1, 559, 10, 14300, 13598, 5, 21311, 4565],
 [4, 13, 5, 217, 1, 440, 10, 17, 361, 1827, 200, 146, 6, 2773],
 [16, 72, 5, 2774, 312, 2757, 4, 13, 5, 649, 19],
 [23, 49, 7131, 8, 231, 35496, 1891, 2047, 10570, 12, 1928, 10924, 6456],
 [2371,
  5,
  72,
  6,
  9925,
  940,
  4451,
  813,
  12,
  4451,
  5037,
  2,
  21,
  28,
  238,
  46,
  60],
 [31, 5, 126, 31238],
 [4, 13, 5, 24, 6, 42, 25874],
 [37, 9, 15, 74, 42131, 482, 10, 42132],
 [6939, 186, 13, 5, 445, 17, 7896, 55635, 42133],
 [959, 7, 87, 4663, 10, 31239, 146, 31240, 55636],
 [4, 9, 5, 223, 12, 87, 17, 286, 1727],
 [2, 13, 52, 609, 632, 7, 71],
 [2, 59, 34, 94, 1403, 299, 39],
 [2,
  11,
  1,
  888,
  7,
  179,
  34,
  1161,
  33,
  6,
  234,
  517,
  7,
  6,
  653,
  233,
  8,
  1,
  105,
  4,
  9,
  66,
  168,
  7,
  1,
  2287,
  888,
  8,
  523],
 [2, 43, 6, 92, 1137, 101, 14, 475, 495, 8504, 288, 20, 32, 3502, 517],
 [2, 21, 11495, 101],
 [16, 9, 307, 99,

In [11]:
is_duplicate = df_train.is_duplicate

In [12]:
q1_data = pad_sequences(question1_word_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
q2_data = pad_sequences(question2_word_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
labels = np.array(is_duplicate, dtype=int)
print('Shape of question1 data tensor:', q1_data.shape)
print('Shape of question2 data tensor:', q2_data.shape)
print('Shape of label tensor:', labels.shape)

Shape of question1 data tensor: (404287, 25)
Shape of question2 data tensor: (404287, 25)
Shape of label tensor: (404287,)


In [13]:
data_prepro_configs = {'vocab': tokenizer.word_index, 'vocab_size': len(tokenizer.word_counts)}

In [14]:
np.save(open(Q1_TRAINING_DATA_FILE, 'wb'), q1_data)
np.save(open(Q2_TRAINING_DATA_FILE, 'wb'), q2_data)
np.save(open(LABEL_TRAINING_DATA_FILE, 'wb'), labels)

In [15]:
with open(NB_WORDS_DATA_FILE, 'w') as f:
    json.dump(data_prepro_configs, f)