# Imports

In [None]:
# Imports
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import nltk
# word_tokenize is part of nltk's public tokenizer package; import it from there
# rather than from a corpus-reader module, which is not its documented home.
from nltk.tokenize import word_tokenize
from scipy import spatial

from post_parser_record import PostParserRecord

# fetch the 'punkt' tokenizer data used by word_tokenize (no-op if already present)
nltk.download('punkt')

# parse the posts file; later cells read the questions from post_reader.map_questions
post_reader = PostParserRecord("Posts_Coffee.xml")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Testing

In [None]:
# Sanity check: the training set must be a list of token lists, whether the
# tokens are written by hand or produced by word_tokenize
question_titles_test = [
    ["this", "is", "a", "test"],
    ["hello", "world"],
    word_tokenize("I am currently, testing the tokenizer"),
]

# show the result (swap in `common_texts` here to compare with gensim's sample corpus)
question_titles_test

[['this', 'is', 'a', 'test'],
 ['hello', 'world'],
 ['I', 'am', 'currently', ',', 'testing', 'the', 'tokenizer']]

# Initialize and train the word2vec model

In [None]:
# Tokenize the title of every question; each entry of question_titles is the
# token list of one title, in the iteration order of post_reader.map_questions
question_titles = [
  word_tokenize(post_reader.map_questions[question_id].title)
  for question_id in post_reader.map_questions
]

In [None]:
# Create the word2vec model from the tokenized titles. Because the sentences are
# passed to the constructor, gensim builds the vocabulary and runs its initial
# training pass here. min_count=1 keeps every token, even ones seen only once.
model = Word2Vec(
    question_titles,
    window=5,
    min_count=1,
    workers=4,
)

# persist the model to disk so the next cell can reload it
model.save("word2vec.model")



In [None]:
# Reload the saved model and continue training it on the same question titles
model = Word2Vec.load("word2vec.model")
title_count = len(question_titles)
# kept as the last expression so the notebook displays the value train() returns
model.train(question_titles, epochs=10, total_examples=title_count)



(95218, 137140)

### Testing

In [None]:
# Scratch check: word vectors can be added and scaled element-wise, and the
# cosine similarity of two words is 1 minus scipy's cosine distance
vector1, vector2 = (model.wv[word] for word in ('caffeine', 'tool'))
vector3 = vector1 + vector2

# average of the two word vectors
print(vector3 / 2)
# similarity between the two words
print(1 - spatial.distance.cosine(vector1, vector2))

[ 1.76101387e-01  1.73281178e-01 -5.34324467e-01  2.20231235e-01
  1.88666880e-01 -4.25964519e-02  1.45008668e-01 -4.84522730e-01
 -1.56262755e-01 -8.76702890e-02 -2.76686698e-01 -4.42019552e-01
  3.17836478e-02  1.62685721e-03 -1.54981360e-01  2.46341735e-01
  1.98340714e-02  2.52772689e-01  7.99055770e-02  1.34656399e-01
  1.46367162e-01 -7.20187128e-02  1.09330401e-01  5.03382348e-02
  3.73441309e-01  2.68004090e-01  4.13864553e-01 -2.08852828e-01
 -2.92208344e-01  3.87656838e-01 -5.43877520e-02 -5.13146259e-02
  7.32969418e-02  2.29930431e-01 -1.14152461e-01  4.86652330e-02
 -1.27297223e-01  3.28291357e-01  1.51310498e-02 -1.49643183e-01
 -1.87796071e-01 -3.42664689e-01 -1.25121042e-01  1.56036705e-01
 -1.19946383e-01 -3.36952716e-01 -7.23981708e-02  4.06094342e-01
 -3.81843358e-01 -2.65016854e-01  2.73881763e-01 -1.53535903e-01
 -9.72105041e-02 -9.63442475e-02  1.39155209e-01 -1.50697052e-01
 -1.14740498e-01  1.15297966e-01 -6.28181621e-02  3.22430849e-01
  1.82407558e-01  8.05336

# Getting and Storing the Vectors of Questions

In [None]:
# initialize a dictionary that will store, for each question id, the average
# of the word vectors of its title
question_vectors = {}

for question_id in post_reader.map_questions:
  question = post_reader.map_questions[question_id]
  title = word_tokenize(question.title)

  # keep only the words the model has a vector for; looking up any other word
  # in model.wv raises KeyError
  known_words = [word for word in title if word in model.wv]

  # a title with no usable words has no average vector (there is nothing to sum
  # and the division below would be by zero), so leave that question out
  if not known_words:
    continue

  # element-wise sum of the word vectors divided by the number of words summed;
  # the result is a new array, so no extra copy is needed before storing it
  question_vectors[question_id] = sum(model.wv[word] for word in known_words) / len(known_words)

# Calculate Cosine Similarity Based on a Query

In [None]:
# store a query to compare in a string and find the vector representation of it
query = "When does coffee go off?"
query_tokens = word_tokenize(query)

# only words the model knows have a vector; looking up any other word in
# model.wv raises KeyError, so unknown query words are skipped
known_query_tokens = [word for word in query_tokens if word in model.wv]
if not known_query_tokens:
  raise ValueError("none of the query words are in the model vocabulary: {!r}".format(query))

# average the word vectors: divide by the number of QUERY words that were summed,
# not by the length of `title`, which is just the last title tokenized by the
# question-vector loop
query_vector = sum(model.wv[word] for word in known_query_tokens) / len(known_query_tokens)

In [None]:
# Map every question id to the cosine similarity between the query vector and
# that question's title vector (similarity = 1 - scipy's cosine distance)
query_cosine = {
  question_id: 1 - spatial.distance.cosine(query_vector, title_vector)
  for question_id, title_vector in question_vectors.items()
}

# Results

In [None]:
# Order the similarities from highest to lowest, then report the five best matches
ranked_pairs = sorted(query_cosine.items(), key=lambda item: item[1], reverse=True)
cosine_sorted = dict(ranked_pairs)
for doc_id, score in ranked_pairs[:5]:
    print("docid {}, value {} ".format(doc_id, score))

docid 123, value 1 
docid 473, value 0.9999468922615051 
docid 3582, value 0.9999434947967529 
docid 4358, value 0.9999416470527649 
docid 5713, value 0.999941349029541 
