#  Paraphrase Detection Data Preprocessing
MSRP Corpus

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# Later cells read the corpus/GloVe files from, and write the .npy outputs to,
# paths under /content/drive/MyDrive/, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
''' This script is to preprocess data from the MSRP dataset for paraphrase detection
Adapted from Keras example at https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py
'''
import os
import numpy as np
import datetime, time, json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
!pip install simplejson as json

Collecting simplejson
[?25l  Downloading https://files.pythonhosted.org/packages/a8/04/377418ac1e530ce2a196b54c6552c018fdf1fe776718053efb1f216bffcd/simplejson-3.17.2-cp37-cp37m-manylinux2010_x86_64.whl (128kB)
[K     |██▌                             | 10kB 11.5MB/s eta 0:00:01[K     |█████                           | 20kB 16.7MB/s eta 0:00:01[K     |███████▋                        | 30kB 10.9MB/s eta 0:00:01[K     |██████████▏                     | 40kB 8.6MB/s eta 0:00:01[K     |████████████▊                   | 51kB 4.4MB/s eta 0:00:01[K     |███████████████▎                | 61kB 4.9MB/s eta 0:00:01[K     |█████████████████▉              | 71kB 5.0MB/s eta 0:00:01[K     |████████████████████▍           | 81kB 5.3MB/s eta 0:00:01[K     |███████████████████████         | 92kB 5.7MB/s eta 0:00:01[K     |█████████████████████████▌      | 102kB 5.8MB/s eta 0:00:01[K     |████████████████████████████    | 112kB 5.8MB/s eta 0:00:01[K     |██████████████████████████

In [None]:
# Wall-clock start; the last cell subtracts this to report elapsed minutes.
t0 = time.time()

# Root folder on the mounted Drive holding both the MSRP corpus and GloVe.
# os.path.join() throws away every earlier component as soon as it meets an
# absolute one, so the former BASE_DIR = '' joined with an absolute path was
# a no-op and editing BASE_DIR had no effect. Keeping the root here makes
# BASE_DIR the single place to change the data location.
BASE_DIR = '/content/drive/MyDrive/ParaphraseDetection'
# Joining with '' appends a trailing separator, so both directories keep
# exactly the value they had before ('.../ParaphraseDetection/').
GLOVE_DIR = os.path.join(BASE_DIR, '')
MSR_DIR = os.path.join(BASE_DIR, '')
MSR_FILE = 'msr_paraphrase_train_test.txt'
GLOVE_FILE = 'glove.6B.200d.txt'

MAX_SEQUENCE_LENGTH = 30   # length every token sequence is padded to
MAX_NUM_WORDS = 20000      # vocabulary cap passed to the Tokenizer
EMBEDDING_DIM = 200        # width of the embedding matrix rows
VALIDATION_SPLIT = 0.2     # not referenced by the cells visible in this notebook

# Extract sentence pairs

In [None]:
# Process sentence pairs from the MSRP corpus.
#
# The file is read as tab-separated text with one header row. Column 0 is
# parsed as the integer label; columns 3 and 4 are taken as the two sentences.

print("Processing", MSR_FILE)

sentence1 = []
sentence2 = []
label = []

# os.path.join (instead of string concatenation) keeps the path valid even if
# MSR_DIR is written without a trailing separator, and matches how the GloVe
# file path is built further down.
with open(os.path.join(MSR_DIR, MSR_FILE), 'r', encoding='utf8') as f:
    f.readline()  # skipping the header of the file
    for line in f:
        if not line.strip():
            # A blank line (e.g. an extra newline at end of file) has no
            # columns; skip it rather than fail with IndexError below.
            continue
        text = line.strip().split('\t')
        sentence1.append(text[3])
        sentence2.append(text[4])
        label.append(int(text[0]))

print('Sentence pairs: %d' % len(sentence1))

Processing msr_paraphrase_train_test.txt
Sentence pairs: 5800


# Build tokenized word index

In [None]:
# Fit a single vocabulary over both sentence columns, then encode each column
# as lists of integer word ids with that shared tokenizer.

sentences = [*sentence1, *sentence2]
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(sentences)
sentence1_word_sequences, sentence2_word_sequences = (
    tokenizer.texts_to_sequences(column) for column in (sentence1, sentence2)
)
word_index = tokenizer.word_index

print("Words in index: %d" % len(word_index))

Words in index: 16537


# Load and process GloVe embeddings

In [None]:

print("Processing", GLOVE_FILE)

# Each line of the GloVe file is "<word> <v1> <v2> ..." separated by single
# spaces; build a word -> float32 vector lookup from it.
glove_path = os.path.join(GLOVE_DIR, GLOVE_FILE)
embeddings_index = {}
with open(glove_path, encoding="utf8") as glove_file:
    for row in glove_file:
        token, *vector_fields = row.split(' ')
        embeddings_index[token] = np.asarray(vector_fields, dtype='float32')

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Processing glove.6B.200d.txt
Found 400000 word vectors.


# Prepare word embedding matrix

In [None]:
print('Preparing embedding matrix.')

# Row i of the matrix holds the GloVe vector of the word whose tokenizer
# index is i. The matrix has num_words + 1 rows so that index num_words is
# still addressable (word_index comes from the Keras Tokenizer, which is
# documented to number words from 1, leaving row 0 unused by any word).
num_words = min(MAX_NUM_WORDS, len(word_index))
word_embedding_matrix = np.zeros((num_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        # Beyond the vocabulary cap, hence beyond the last matrix row.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector
    # Otherwise the word is missing from embeddings_index and its row simply
    # stays at the all-zero initial value.

# A row counts as "null" only when every component is zero. Testing the row
# sum against 0 (as before) would also flag a genuine vector whose positive
# and negative components happen to cancel.
null_rows = ~word_embedding_matrix.any(axis=1)
print('Null word embeddings: %d' % np.count_nonzero(null_rows))

Preparing embedding matrix.
Null word embeddings: 1483


# Prepare training data tensors

In [None]:
# Bring both sentence columns to a fixed width of MAX_SEQUENCE_LENGTH ids
# and turn the Python list of labels into an integer array.
s1_data, s2_data = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (sentence1_word_sequences, sentence2_word_sequences)
)
labels = np.asarray(label, dtype=int)
print('Shape of sentence1 data tensor:', s1_data.shape)
print('Shape of label tensor:', labels.shape)

Shape of sentence1 data tensor: (5800, 30)
Shape of label tensor: (5800,)


# Save data to files

In [None]:
# Output locations for the preprocessed arrays and the vocabulary size.
S1_TRAINING_DATA_FILE = '/content/drive/MyDrive/ParaphraseDetection/paraphrase4/s1_train.npy'
S2_TRAINING_DATA_FILE = '/content/drive/MyDrive/ParaphraseDetection/paraphrase4/s2_train.npy'
LABEL_TRAINING_DATA_FILE = '/content/drive/MyDrive/ParaphraseDetection/paraphrase4/label_train.npy'
WORD_EMBEDDING_MATRIX_FILE = '/content/drive/MyDrive/ParaphraseDetection/paraphrase4/word_embedding_matrix.npy'
NUM_WORDS_DATA_FILE = '/content/drive/MyDrive/ParaphraseDetection/paraphrase4/num_words.json'

# All five outputs live in the same folder; create it if it is not there yet
# so a fresh Drive does not fail with FileNotFoundError on the first write.
os.makedirs(os.path.dirname(S1_TRAINING_DATA_FILE), exist_ok=True)

# Write each array through a context manager so the file handle is flushed
# and closed deterministically. The earlier np.save(open(...), ...) form
# never closed the handles it opened.
for path, array in (
    (S1_TRAINING_DATA_FILE, s1_data),
    (S2_TRAINING_DATA_FILE, s2_data),
    (LABEL_TRAINING_DATA_FILE, labels),
    (WORD_EMBEDDING_MATRIX_FILE, word_embedding_matrix),
):
    with open(path, 'wb') as f:
        np.save(f, array)

with open(NUM_WORDS_DATA_FILE, 'w') as f:
    json.dump({'num_words': num_words}, f)

In [None]:
# Quick look at the padded tensors: container type, the shape of one encoded
# sentence, and the shape of the whole second-sentence tensor.
for shown in (type(s1_data), s1_data[0].shape, s2_data.shape):
    print(shown)

<class 'numpy.ndarray'>
(30,)
(5800, 30)


In [None]:
# Report when preprocessing finished and how long it took, measured from the
# t0 timestamp taken in the configuration cell.
t1 = time.time()
finished_at = datetime.datetime.now()
elapsed_minutes = (t1 - t0) / 60.
print("Preprocessing ended at", finished_at)
print("Minutes elapsed: %f" % elapsed_minutes)

Preprocessing ended at 2021-04-12 08:23:04.175777
Minutes elapsed: 22.460537
