In [1]:
import logging
import numpy as np

from gensim.models import KeyedVectors
from nltk.tokenize.stanford import StanfordTokenizer
from sklearn.preprocessing import LabelEncoder

from utils import data_reader
from utils import processing
from utils import opf_helper

# Configure the root logger at INFO level so that INFO records are shown in
# the notebook output (the "INFO:root:..." and "INFO:gensim..." lines printed
# under the cells below).
logging.basicConfig(level=logging.INFO)

In [2]:
%%capture
# /\ hidding Stanford Parser warning messages

TOK_PATH    = '../tokenizer/stanford-corenlp-3.9.0.jar'
MODEL_PATH  = '../vsms/wglove.840B.300d.bin'

DATASET_T = '../datasets/clean/maptask_train.tsv'
DATASET_D = '../datasets/clean/maptask_dev.tsv'
DATASET_E = '../datasets/clean/maptask_test.tsv'

FEATURES_FILE = './maptask_opf/maptask_samples.txt'
MIN_WORD_FREQ = 2

X_train, y_train = data_reader.read_dataset(DATASET_T)
X_dev,   y_dev   = data_reader.read_dataset(DATASET_D)
X_test,  y_test  = data_reader.read_dataset(DATASET_E)

model     = KeyedVectors.load(MODEL_PATH)
tokenizer = StanfordTokenizer(TOK_PATH)

X_tok_t, word_freq = processing.tokenize_stanford(X_train, tokenizer)
X_tok_d, _         = processing.tokenize_stanford(X_dev, tokenizer)
X_tok_e, _         = processing.tokenize_stanford(X_test, tokenizer)

INFO:root:Total of 18450 samples
INFO:root:acknowledge - 3946
INFO:root:instruct - 2934
INFO:root:reply_y - 2225
INFO:root:check - 1503
INFO:root:explain - 1475
INFO:root:align - 1283
INFO:root:ready - 1281
INFO:root:query_yn - 1173
INFO:root:clarify - 840
INFO:root:reply_n - 646
INFO:root:reply_w - 607
INFO:root:query_w - 537
INFO:root:Total of 4426 samples
INFO:root:acknowledge - 913
INFO:root:instruct - 686
INFO:root:reply_y - 490
INFO:root:ready - 426
INFO:root:explain - 414
INFO:root:check - 375
INFO:root:query_yn - 264
INFO:root:align - 241
INFO:root:clarify - 228
INFO:root:reply_w - 163
INFO:root:reply_n - 117
INFO:root:query_w - 109
INFO:root:Total of 3282 samples
INFO:root:acknowledge - 625
INFO:root:instruct - 566
INFO:root:reply_y - 440
INFO:root:ready - 291
INFO:root:query_yn - 283
INFO:root:align - 235
INFO:root:explain - 210
INFO:root:check - 199
INFO:root:reply_w - 121
INFO:root:clarify - 111
INFO:root:query_w - 101
INFO:root:reply_n - 100
INFO:gensim.utils:loading Eucli

In [3]:
# Prune the vocabulary using the word frequencies collected from the
# training split.
# NOTE(review): presumably keeps words seen at least MIN_WORD_FREQ times —
# confirm against utils.processing.keep_common_words.
pruned_vocab = processing.keep_common_words(word_freq, MIN_WORD_FREQ)

# The same embedding options are applied to every split.
# NOTE(review): the meaning of normalize_sentence=4 is defined by
# utils.processing.tok_sentence_to_vec — confirm there.
embed_options = dict(normalize_sentence=4, normalize_word=False, show_logs=1)

X_emb_t = processing.tok_sentence_to_vec(X_tok_t, pruned_vocab, model, **embed_options)
X_emb_d = processing.tok_sentence_to_vec(X_tok_d, pruned_vocab, model, **embed_options)
X_emb_e = processing.tok_sentence_to_vec(X_tok_e, pruned_vocab, model, **embed_options)

# Fit the label encoder on the training labels only, then reuse that mapping
# for the dev and test labels.
encoder = LabelEncoder()
y_emb_t = encoder.fit_transform(y_train)
y_emb_d, y_emb_e = encoder.transform(y_dev), encoder.transform(y_test)

INFO:root:Vocab size:   1790
INFO:root:Keeping:      1052 (58.77%)
INFO:gensim.models.keyedvectors:precomputing L2-norms of word weight vectors


In [4]:
# Stack the train, dev and test splits (in that order) into a single feature
# matrix and a single label vector.
X = np.vstack([X_emb_t, X_emb_d, X_emb_e])
# The encoded labels are 1-D arrays, so concatenating them directly yields the
# flat label vector without the column-vector / flatten round trip.
y = np.concatenate([y_emb_t, y_emb_d, y_emb_e])

# Fail fast if features and labels are out of step: a misaligned file would
# otherwise be written silently.
assert X.shape[0] == y.shape[0], "feature rows and labels are misaligned"

# writing all features file, so distances can be computed
opf_helper.write_opf_format(
    X,
    y + 1,  # OPF indices start at 1
    FEATURES_FILE,
)