In [159]:
import fasttext
from gensim.models.wrappers import FastText
import gensim.models.fasttext
import pandas as pd
import numpy as np
import string

from os import listdir
from os.path import isfile, join

## Check Embedding Performance

In [35]:
# Load one trained fastText model from its binary file and keep a handle on its
# word vectors; `wv` is what the embedding helper cells below take as input.
# NOTE(review): the file name suggests skip-gram, 300 dimensions — confirm
# against the training configuration.
model = FastText.load_fasttext_format('models/fasttext_skipgram_300_10.bin')
wv = model.wv

In [31]:
# Qualitative sanity check: the 25 nearest neighbours of "slow".
# Query the keyed vectors (`wv`) directly, as the rest of the notebook does,
# instead of going through the wrapper model object.
wv.most_similar('slow', topn=25)

[('slowslow', 0.9461556673049927),
 ('slowww', 0.9150160551071167),
 ('sloww', 0.914790689945221),
 ('slowy', 0.9127247333526611),
 ('slowthe', 0.9061232209205627),
 ('slowwww', 0.9044946432113647),
 ('slowfast', 0.9032155275344849),
 ('slownot', 0.9016188383102417),
 ('slowno', 0.9011406898498535),
 ('slowand', 0.8921894431114197),
 ('slowmo', 0.8902242183685303),
 ('slowit', 0.8754894733428955),
 ('slowely', 0.8689318299293518),
 ('slowi', 0.8646904230117798),
 ('slowwwww', 0.8630250096321106),
 ('slowe', 0.8573843836784363),
 ('slowring', 0.8450390696525574),
 ('slowed', 0.8330438137054443),
 ('slower', 0.830303430557251),
 ('slowly', 0.8275282382965088),
 ('fastslow', 0.8224830627441406),
 ('slowest', 0.8189854621887207),
 ('laggyslow', 0.8144257068634033),
 ('slows', 0.8049894571304321),
 ('slugish', 0.8032950162887573)]

In [100]:
def get_feedback_vector(text, wv):
    """Embed a piece of text as the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    text : str
        Text to embed. It is tokenised on any run of whitespace.
    wv : keyed-vectors object
        Must support ``token in wv.vocab``, ``wv[token]`` and expose
        ``wv.vector_size``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``wv.vector_size``. When none of the tokens are in
        the vocabulary (including empty text) the array is filled with NaN, so
        "no embedding available" stays distinguishable from a real vector
        while the output shape stays consistent.
    """
    # split() with no argument also handles tabs, newlines and repeated spaces.
    tokens = [token for token in text.split() if token in wv.vocab]
    if not tokens:
        # np.mean over an empty array would return a scalar NaN and emit a
        # RuntimeWarning; return an explicitly shaped NaN vector instead.
        return np.full(wv.vector_size, np.nan)
    vectors = np.array([wv[token] for token in tokens])
    return np.mean(vectors, axis=0)

In [101]:
# Sample sentence for a quick sanity check of the sentence-embedding helper.
# NOTE(review): "jfiena" is presumably a deliberate out-of-vocabulary token — confirm.
test_text = "hello this is printer jfiena"
test_text_tokens = str.split(test_text, " ")

In [103]:
# Embed the sample sentence and print the shape of the resulting vector.
fb_vec = get_feedback_vector(text=test_text, wv=wv)
print(np.shape(fb_vec))

(300,)


In [105]:
%%timeit
# Time a single sentence embedding of the sample text.
fb_vec = get_feedback_vector(test_text, wv)

35.9 µs ± 6.62 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## Generate Feedback Embeddings

In [149]:
# Load the labelled feedback table from a tab-separated text file.
fb = pd.read_csv(
    'data/stack_labeled_feedback.txt',
    sep='\t',
)

In [150]:
# Text preprocessing: keep only the id and text columns, then normalise the
# text (string type, lower-case, ASCII punctuation removed).
# .copy() makes the two-column frame independent of the loaded one, so the
# column assignment below does not trigger SettingWithCopyWarning.
fb = fb[['ConformedFeedbackId', 'verbatim']].copy()

# Missing feedback becomes an empty string. A plain str() cast would turn NaN
# into the literal word "nan", which could then be embedded as a real token.
verbatim = fb['verbatim']
verbatim = verbatim.astype(str).where(verbatim.notna(), '')

# Build the punctuation-stripping table once rather than once per row.
punctuation_table = str.maketrans('', '', string.punctuation)
fb['verbatim'] = verbatim.str.lower().str.translate(punctuation_table)

In [164]:
def get_embeddings(fb, wv):
    """Build a table with one embedding row per feedback entry.

    Parameters
    ----------
    fb : pandas.DataFrame
        Must contain the columns ``'ConformedFeedbackId'`` and ``'verbatim'``.
        The frame is not modified.
    wv : keyed-vectors object
        Passed through to ``get_feedback_vector``. If it exposes
        ``vector_size`` that is used as the embedding width; otherwise the
        width is inferred from the longest vector produced.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``fb``, with the column ``'ConformedFeedbackId'`` followed
        by ``x_0 … x_{d-1}``. Rows whose text produced no usable vector are NaN
        in every ``x_i`` column.
    """
    # atleast_1d lets a scalar result (e.g. a bare NaN) be handled uniformly.
    vectors = [
        np.atleast_1d(np.asarray(get_feedback_vector(text, wv), dtype=float))
        for text in fb['verbatim']
    ]

    # Do not hard-code the width: models of different dimensionality exist.
    dim = getattr(wv, 'vector_size', None)
    if dim is None:
        dim = max((vec.shape[0] for vec in vectors), default=0)

    # Start from all-NaN and fill in only the rows with a full-width vector.
    matrix = np.full((len(vectors), dim), np.nan)
    for row, vec in enumerate(vectors):
        if vec.shape[0] == dim:
            matrix[row] = vec

    columns = ['x_{}'.format(i) for i in range(dim)]
    result = pd.DataFrame(matrix, columns=columns, index=fb.index)
    result.insert(0, 'ConformedFeedbackId', fb['ConformedFeedbackId'])
    return result

In [186]:
# Generate and save one embeddings table per trained fastText model.
# Only the binary model files are considered, in sorted order so that runs are
# deterministic (listdir order is arbitrary).
model_files = sorted(
    f for f in listdir('models/')
    if isfile(join('models/', f)) and f.endswith('.bin')
)

for model_file in model_files:
    # 'fasttext_<name>.bin' -> '<name>': drop the first '_'-separated part
    # and the 4-character '.bin' extension.
    model_name = "_".join(model_file.split("_")[1:])[:-4]

    # Load this file's model so each CSV reflects its own vectors. A loop-local
    # name is used so the notebook-level `wv` is not overwritten.
    model_wv = FastText.load_fasttext_format(join('models/', model_file)).wv

    embeddings = get_embeddings(fb, model_wv)
    embeddings_save_path = 'embeddings/embeddings_{}.csv'.format(model_name)
    # Save the frame computed just above, not a leftover notebook global.
    embeddings.to_csv(embeddings_save_path, index=False)
    print('Saved model {}'.format(embeddings_save_path))

100k_cbow_300_10


  out=out, **kwargs)


Saved model embeddings/embeddings_100k_cbow_300_10.csv
100k_cbow_600_10
Saved model embeddings/embeddings_100k_cbow_600_10.csv
100k_cbow_600_5
Saved model embeddings/embeddings_100k_cbow_600_5.csv
100k_skipgram_300_10
Saved model embeddings/embeddings_100k_skipgram_300_10.csv
100k_skipgram_300_5
Saved model embeddings/embeddings_100k_skipgram_300_5.csv
100k_skipgram_600_10
Saved model embeddings/embeddings_100k_skipgram_600_10.csv
100k_skipgram_600_5
Saved model embeddings/embeddings_100k_skipgram_600_5.csv
cbow_300_10
Saved model embeddings/embeddings_cbow_300_10.csv
cbow_300_5
Saved model embeddings/embeddings_cbow_300_5.csv
cbow_600_10
Saved model embeddings/embeddings_cbow_600_10.csv
cbow_600_5
Saved model embeddings/embeddings_cbow_600_5.csv
skipgram_300_10
Saved model embeddings/embeddings_skipgram_300_10.csv
skipgram_300_5
Saved model embeddings/embeddings_skipgram_300_5.csv
skipgram_600_10
Saved model embeddings/embeddings_skipgram_600_10.csv
skipgram_600_5
Saved model embeddin