In [1]:
import sim
import score

In [None]:
# Path to the training corpus; handed unchanged to the corpus-based models.
train_texts = "Data/train_corpus.txt"
# Each line of the token file is stripped and split on commas into a list of words.
# The files are read inside `with` blocks so the handles are closed right after loading.
with open("Data/test_tokens.txt", encoding='utf8') as tokens_file:
    test_words = [x.strip().split(",") for x in tokens_file]
# Gold values are kept as the stripped text of each line (no numeric conversion here).
with open("Data/test_values.txt", encoding='utf8') as values_file:
    test_values = [x.strip() for x in values_file]

In [3]:
# Build the four similarity models compared in this notebook.
# Term_document and Window receive the training-corpus path; Baseline and
# Word2Vec are constructed without arguments.
WINDOW_CONTEXT_SIZE = 1  # value passed as context_size to sim.Window
baseline_model = sim.Baseline()
term_doc_model = sim.Term_document(corpus=train_texts)
window_model = sim.Window(corpus=train_texts, context_size=WINDOW_CONTEXT_SIZE)
word2vec_model = sim.Word2Vec()

In [4]:
# Call the baseline model on every test row; the first two entries of a row
# are the two words being compared.
baseline_results = [
    baseline_model.calc_sim(word_1=pair[0], word_2=pair[1])
    for pair in test_words
]

In [5]:
# Call the term-document model on every test row; the first two entries of a
# row are the two words being compared.
term_doc_results = [
    term_doc_model.calc_sim(word_1=pair[0], word_2=pair[1])
    for pair in test_words
]

In [6]:
# Call the window model on every test row; the first two entries of a row are
# the two words being compared.
window_results = [
    window_model.calc_sim(word_1=pair[0], word_2=pair[1])
    for pair in test_words
]

In [7]:
# Call the Word2Vec model on every test row; the first two entries of a row
# are the two words being compared.
word2vec_results = [
    word2vec_model.calc_sim(word_1=pair[0], word_2=pair[1])
    for pair in test_words
]

In [9]:
# Report score.MSE for each model against the same gold values.
# Each score is computed immediately before it is printed, one model per line.
print("Mean Squared Errors")
baseline_mse = score.MSE(preds=baseline_results, golds=test_values)
print(f"Baseline: {baseline_mse}")
term_doc_mse = score.MSE(preds=term_doc_results, golds=test_values)
print(f"Term-Doc: {term_doc_mse}")
window_mse = score.MSE(preds=window_results, golds=test_values)
print(f"Window: {window_mse}")
word2vec_mse = score.MSE(preds=word2vec_results, golds=test_values)
print(f"Word2Vec: {word2vec_mse}")

Mean Squared Errors
Baseline: 7.02184094094094
Term-Doc: 22.831604704704702
Window: 13.556392992992993
Word2Vec: 6.417409509509505


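The results are somewhat surprising: the baseline model, which simply guesses the middle value, is one of the best models tested. Only Word2Vec achieves a lower mean squared error, while the term-document and window models both do considerably worse.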

A few changes that might improve these results:
- A better tokenizer. The one currently in use is very basic; punctuation could be causing issues, as could text such as URLs.
- A weighting scheme (such as TF-IDF) in which more frequent words have less impact, since frequent words tend to receive higher similarity scores than less frequent words.
- Stop words (and filler words) could also be removed so that only more 'complex' words are used, but that could cause problems for test words that should be considered similar to the removed words.