In [1]:
import logging
import numpy as np

from gensim.models import KeyedVectors
from nltk.tokenize.stanford import StanfordTokenizer
from sklearn.preprocessing import LabelEncoder

from utils import data_reader
from utils import processing
from utils import opf_helper

# Emit INFO-level records so the statistics/progress messages logged through
# the root logger and by gensim show up in the cell outputs.
logging.basicConfig(level=logging.INFO)

In [2]:
%%capture
# /\ hiding Stanford Parser warning messages (cell magic, must stay on the first line)

# Locations of the Stanford CoreNLP jar used for tokenizing and of the
# word-vector model.
TOK_PATH    = '../tokenizer/stanford-corenlp-3.9.0.jar'
MODEL_PATH  = '../vsms/wglove.840B.300d.bin'

# Train / dev / test splits of the dataset.
DATASET_T = '../datasets/clean/nps_train.tsv'
DATASET_D = '../datasets/clean/nps_dev.tsv'
DATASET_E = '../datasets/clean/nps_test.tsv'

# Output file that receives the feature vectors of all samples, and the
# frequency threshold handed to the vocabulary pruning step.
FEATURES_FILE = './nps_opf/nps_samples.txt'
MIN_WORD_FREQ = 1

# Each call yields a pair that is unpacked as (samples, labels).
X_train, y_train = data_reader.read_dataset(DATASET_T)
X_dev,   y_dev   = data_reader.read_dataset(DATASET_D)
X_test,  y_test  = data_reader.read_dataset(DATASET_E)

# NOTE(review): KeyedVectors.load reads files written by gensim's own save();
# if this .bin is a raw word2vec-format binary, load_word2vec_format(...,
# binary=True) would be required instead -- confirm how the file was produced.
model     = KeyedVectors.load(MODEL_PATH)
tokenizer = StanfordTokenizer(TOK_PATH)

# Only the word frequencies of the training split are kept; the second return
# value is discarded for dev and test.
X_tok_t, word_freq = processing.tokenize_stanford(X_train, tokenizer)
X_tok_d, _         = processing.tokenize_stanford(X_dev, tokenizer)
X_tok_e, _         = processing.tokenize_stanford(X_test, tokenizer)

INFO:root:Total of 7743 samples
INFO:root:Statement - 2338
INFO:root:System - 1714
INFO:root:Greet - 1202
INFO:root:Emotion - 849
INFO:root:ynQuestion - 402
INFO:root:whQuestion - 379
INFO:root:Bye - 162
INFO:root:Accept - 155
INFO:root:Continuer - 124
INFO:root:Emphasis - 122
INFO:root:Reject - 104
INFO:root:yAnswer - 80
INFO:root:nAnswer - 55
INFO:root:Clarify - 30
INFO:root:Other - 27
INFO:root:Total of 1412 samples
INFO:root:System - 535
INFO:root:Statement - 442
INFO:root:Emotion - 101
INFO:root:Greet - 88
INFO:root:whQuestion - 57
INFO:root:ynQuestion - 53
INFO:root:Accept - 35
INFO:root:Emphasis - 23
INFO:root:Bye - 22
INFO:root:Reject - 19
INFO:root:yAnswer - 12
INFO:root:nAnswer - 10
INFO:root:Continuer - 8
INFO:root:Clarify - 5
INFO:root:Other - 2
INFO:root:Total of 1412 samples
INFO:root:Statement - 405
INFO:root:System - 383
INFO:root:Emotion - 156
INFO:root:whQuestion - 97
INFO:root:ynQuestion - 95
INFO:root:Greet - 73
INFO:root:Emphasis - 45
INFO:root:Accept - 43
INFO:roo

In [3]:
def replace_rbp_rpar(tokenized):
    """Turn the ``-RRB-`` bracket marker back into a literal ``)``.

    Args:
        tokenized: iterable of sentences, each one an iterable of string
            tokens.

    Returns:
        A new list of lists with the same layout as the input in which every
        occurrence of ``-RRB-`` inside a token is replaced by ``)``. The
        input is not modified. Only the right bracket marker is handled;
        any other marker (e.g. ``-LRB-``) is passed through unchanged.
    """
    return [
        [token.replace('-RRB-', ')') for token in sentence]
        for sentence in tokenized
    ]

# Restore the literal ')' in the train, dev and test token lists (same order
# as before: train first, then dev, then test).
X_tok_t, X_tok_d, X_tok_e = (
    replace_rbp_rpar(split) for split in (X_tok_t, X_tok_d, X_tok_e)
)

In [4]:
# Prune the training vocabulary using the MIN_WORD_FREQ threshold.
pruned_vocab = processing.keep_common_words(word_freq, MIN_WORD_FREQ)

# Embed the three splits with identical settings; the generator is consumed
# in order, so train is processed first, then dev, then test.
X_emb_t, X_emb_d, X_emb_e = (
    processing.tok_sentence_to_vec(
        tokens,
        pruned_vocab,
        model,
        normalize_sentence=4,
        normalize_word=False,
        show_logs=1
    )
    for tokens in (X_tok_t, X_tok_d, X_tok_e)
)

# The label encoder is fitted on the training labels only and then reused
# for the dev and test labels.
encoder = LabelEncoder()
y_emb_t = encoder.fit_transform(y_train)
y_emb_d, y_emb_e = encoder.transform(y_dev), encoder.transform(y_test)

INFO:root:Vocab size:   4024
INFO:root:Keeping:      3885 (96.55%)
INFO:gensim.models.keyedvectors:precomputing L2-norms of word weight vectors


In [5]:
# Stack the splits (train, dev, test -- in that order) into one feature
# matrix and one flat label vector.
X = np.vstack((X_emb_t, X_emb_d, X_emb_e))
y = np.concatenate((y_emb_t, y_emb_d, y_emb_e))

# writing all features file, so distances can be computed
opf_labels = y + 1  # OPF indices start at 1
opf_helper.write_opf_format(X, opf_labels, FEATURES_FILE)