In [1]:
import logging
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from gensim.models import KeyedVectors
from nltk.tokenize.stanford import StanfordTokenizer
from sklearn.preprocessing import LabelEncoder

from utils import data_reader
from utils import processing
from utils import opf_helper

# Configure the root logger at INFO level so INFO records (the INFO:root and
# INFO:gensim lines shown in the cell outputs) are emitted.
logging.basicConfig(level=logging.INFO)


In [2]:
# Stanford CoreNLP jar handed to StanfordTokenizer.
TOK_PATH    = '../tokenizer/stanford-corenlp-3.9.0.jar'
# Word-vector model loaded with gensim's KeyedVectors.load.
MODEL_PATH  = '../vsms/wglove.840B.300d.bin'

# ICSI train / dev / test splits read by data_reader.read_dataset.
# NOTE: make sure these files contain punctuation.
ICSI_EVEN_T = '../datasets/clean/icsi_train.tsv'
ICSI_EVEN_D = '../datasets/clean/icsi_dev.tsv'
ICSI_EVEN_E = '../datasets/clean/icsi_test.tsv'

# Destination of the features file written by opf_helper.write_opf_format.
FEATURES_FILE = './icsi_opf/icsi_samples.txt'
# Not referenced in the cells visible here -- presumably consumed by a later
# clustering step (TODO confirm).
MIN_CLUSTERS = 2
# Threshold passed to processing.keep_common_words when pruning the vocabulary.
MIN_WORD_FREQ = 3

In [3]:
%%capture
# /\ %%capture hides the Stanford Parser warning messages

# Read inputs and labels for every split, in train -> dev -> test order.
(X_train, y_train), (X_dev, y_dev), (X_test, y_test) = (
    data_reader.read_dataset(split_path)
    for split_path in (ICSI_EVEN_T, ICSI_EVEN_D, ICSI_EVEN_E)
)

# Word-vector model first, then the tokenizer built from the CoreNLP jar.
model = KeyedVectors.load(MODEL_PATH)
tokenizer = StanfordTokenizer(TOK_PATH)

# Tokenize each split. Only the training split's second return value is kept
# (as word_freq); for dev and test it is discarded into `_`.
(X_tok_t, word_freq), (X_tok_d, _), (X_tok_e, _) = (
    processing.tokenize_stanford(samples, tokenizer)
    for samples in (X_train, X_dev, X_test)
)

INFO:root:Total of 75434 samples
INFO:root:S - 45088
INFO:root:D - 10201
INFO:root:B - 10178
INFO:root:F - 5328
INFO:root:Q - 4639
INFO:root:Total of 15436 samples
INFO:root:S - 8936
INFO:root:D - 2385
INFO:root:B - 2033
INFO:root:Q - 1043
INFO:root:F - 1039
INFO:root:Total of 15177 samples
INFO:root:S - 8565
INFO:root:D - 2228
INFO:root:B - 1953
INFO:root:F - 1316
INFO:root:Q - 1115
INFO:gensim.utils:loading EuclideanKeyedVectors object from ../vsms/wglove.840B.300d.bin
INFO:gensim.utils:loading syn0 from ../vsms/wglove.840B.300d.bin.syn0.npy with mmap=None
INFO:gensim.utils:setting ignored attribute syn0norm to None
INFO:gensim.utils:loaded ../vsms/wglove.840B.300d.bin


In [4]:
# Vocabulary filtered with the MIN_WORD_FREQ threshold.
# (To skip pruning, use instead: pruned_vocab = word_freq.keys())
pruned_vocab = processing.keep_common_words(word_freq, MIN_WORD_FREQ)

# Every split is embedded with the same options, so they are spelled out once.
emb_kwargs = dict(normalize_sentence=4, normalize_word=False, show_logs=1)
X_emb_t, X_emb_d, X_emb_e = (
    processing.tok_sentence_to_vec(tokens, pruned_vocab, model, **emb_kwargs)
    for tokens in (X_tok_t, X_tok_d, X_tok_e)
)

# The encoder is fitted on the training labels only; dev and test labels are
# mapped with the already-fitted encoder.
encoder = LabelEncoder()
y_emb_t = encoder.fit_transform(y_train)
y_emb_d, y_emb_e = (encoder.transform(labels) for labels in (y_dev, y_test))

INFO:root:Vocab size:  10425
INFO:root:Keeping:      5004 (48.0%)
INFO:gensim.models.keyedvectors:precomputing L2-norms of word weight vectors


In [5]:
# Stack the train, dev and test splits (in that order) into one feature matrix
# and one matching label vector.
X = np.vstack([X_emb_t, X_emb_d, X_emb_e])
# The encoded label arrays are 1-D, so they can be joined directly; this gives
# the same result as reshaping to columns, vstack-ing and flattening.
y = np.concatenate([y_emb_t, y_emb_d, y_emb_e])

# Fail here rather than write a features file whose rows and labels are
# misaligned.
assert X.shape[0] == y.shape[0], (
    'feature/label count mismatch: %d samples vs %d labels'
    % (X.shape[0], y.shape[0])
)

# writing all features file, so distances can be computed
# LabelEncoder produces classes starting at 0; labels are shifted by one before
# writing (presumably the OPF format expects labels starting at 1 -- confirm
# against opf_helper).
opf_helper.write_opf_format(X, y + 1, FEATURES_FILE)