In [11]:
# read in some helpful libraries
# -- standard library --
import io
import logging
import re                         # regular expression
from pathlib import Path

# -- third-party --
import gensim
import numpy as np                # numerical arrays (needed by cosine_sim)
import pandas as pd               # pandas dataframe
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.phrases import Phrases, Phraser
from gensim.models.word2vec import Text8Corpus
from gensim.test.utils import get_tmpfile
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [12]:
# Bigram is the model to detect and group words into phrases
def prepare_bigram(corpus_file_name):
    """Train a phrase (bigram) detector on a plain-text corpus file.

    Parameters
    ----------
    corpus_file_name : str
        Path of the text file to learn phrases from. It is handed to
        gensim's ``Text8Corpus`` reader.

    Returns
    -------
    Phraser
        A frozen phrase model; indexing it with a list of tokens
        (``bigram[tokens]``) returns the tokens with detected phrases merged.
    """
    # Use the path supplied by the caller (it used to be ignored in favour of
    # a hard-coded '../corpus/merged.txt').
    # NOTE(review): per the gensim docs, Text8Corpus yields fixed-length token
    # chunks rather than one sentence per line -- confirm that is intended.
    sentences = Text8Corpus(corpus_file_name)

    # Detect and group phrases; 'in' and 'of' are passed as common terms so
    # they may appear inside a phrase.
    phrases = Phrases(sentences, min_count=1, threshold=5, common_terms=['in', 'of'])

    # Phraser is the compact, read-only form of the trained Phrases model.
    return Phraser(phrases)

In [13]:
def intrinsic_evaluate(data, model):
    """Correlate model similarity with expert-judged similarity.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Term1``, ``Term2`` and ``score``, where
        ``score`` is the relatedness/similarity of the two terms as judged by
        clinical experts. The frame is not modified.
    model : trained embedding model
        Must expose ``model.wv.vocab`` (the known words) and be usable by
        ``cosine_sim``.

    Returns
    -------
    float
        Correlation coefficient (pandas ``Series.corr`` default) between the
        model's cosine similarity and the expert score, computed only over
        rows whose two terms are both in the model vocabulary. NaN when fewer
        than two such rows exist.
    """
    vocab = model.wv.vocab

    # Build the mask from the *argument*, not from a notebook-level global, so
    # the function works for any evaluation frame and on a fresh kernel.
    in_vocab = data['Term1'].isin(vocab) & data['Term2'].isin(vocab)
    scored = data.loc[in_vocab]

    # Compute predictions into a separate Series instead of assigning a new
    # column on a filtered slice (avoids SettingWithCopyWarning and leaves the
    # caller's frame untouched). Also well-defined when no row survives.
    predictions = pd.Series(
        [cosine_sim(t1, t2, model) for t1, t2 in zip(scored['Term1'], scored['Term2'])],
        index=scored.index,
        dtype=float,
    )
    return predictions.corr(scored['score'])

In [14]:
def cosine_sim(w1, w2, model):
    """Return the cosine similarity between the vectors of two keys.

    Parameters
    ----------
    w1, w2 : hashable
        Keys looked up as ``model[w1]`` and ``model[w2]``.
    model : indexable
        Any object for which ``model[key]`` returns a 1-D numeric vector
        (in this notebook a trained Doc2Vec model is passed).

    Returns
    -------
    float
        ``a . b / (|a| * |b|)``. NaN when either vector has zero length,
        because the cosine is undefined there.
    """
    # Cast to float so integer inputs cannot overflow in the dot product.
    vect_a = np.asarray(model[w1], dtype=float)
    vect_b = np.asarray(model[w2], dtype=float)

    denominator = np.linalg.norm(vect_a) * np.linalg.norm(vect_b)
    if denominator == 0:
        # Explicit NaN instead of a 0/0 division and its RuntimeWarning.
        return float('nan')
    return vect_a.dot(vect_b) / denominator

In [19]:
def read_corpus(fname, bigram, tokens_only=False):
    """Lazily yield one phrase-merged document per line of a text file.

    Parameters
    ----------
    fname : str or path-like
        File to read; opened as UTF-8 text.
    bigram : indexable phrase model
        Applied as ``bigram[tokens]`` to every tokenised line.
    tokens_only : bool, default False
        If True, yield the bare token sequence. Otherwise yield a
        ``TaggedDocument`` whose single tag is the zero-based line number.

    Yields
    ------
    Token sequence or TaggedDocument, one per line, in file order.
    """
    with open(fname, encoding='utf-8') as corpus:
        for line_no, raw_line in enumerate(corpus):
            # Hyphens become underscores before tokenising -- presumably so
            # hyphenated terms survive as a single token; confirm.
            normalized = re.sub("-", "_", str(raw_line))
            tokens = bigram[gensim.utils.simple_preprocess(normalized)]

            if not tokens_only:
                # For training data, tag each document with its line number
                yield gensim.models.doc2vec.TaggedDocument(tokens, [line_no])
            else:
                yield tokens

In [None]:
path_to_model = get_tmpfile("doc2vec_model_phr")
if(not Path(path_to_model).is_file()): 
    bigram = prepare_bigram("../corpus/merged.txt")
    documents = list(read_corpus("../corpus/merged.txt",bigram))
    model = gensim.models.doc2vec.Doc2Vec(vector_size=250, min_count=2, epochs=500)
    model.build_vocab(documents)
    %time model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
    model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
    model.save(path_to_model)
    
else:
    model = Doc2Vec.load(path_to_model)
    

2018-11-25 01:29:40,869 : INFO : collecting all words and their counts
2018-11-25 01:29:40,871 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2018-11-25 01:29:47,507 : INFO : collected 1615674 word types from a corpus of 4752594 words (unigram + bigrams) and 476 sentences
2018-11-25 01:29:47,508 : INFO : using 1615674 counts as vocab in Phrases<0 vocab, min_count=1, threshold=5, max_vocab_size=40000000>
2018-11-25 01:29:47,508 : INFO : source_vocab length 1615674
2018-11-25 01:29:49,476 : INFO : Phraser added 50000 phrasegrams
2018-11-25 01:29:52,432 : INFO : Phraser added 100000 phrasegrams
2018-11-25 01:29:58,715 : INFO : Phraser added 150000 phrasegrams
2018-11-25 01:29:59,973 : INFO : Phraser built with 155098 155098 phrasegrams
2018-11-25 01:30:14,479 : INFO : collecting all words and their counts
2018-11-25 01:30:14,480 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-11-25 01:30:14,507 : INFO : PROGRESS: at example #1000

In [42]:
# Load the expert-rated term pairs and report how well the model's cosine
# similarity correlates with the expert scores.
umn_sim = pd.read_csv('UMNSRS_sim.csv')
intrinsic_evaluate(data=umn_sim, model=model)

2018-11-24 23:24:00,202 : INFO : saving Doc2Vec object under C:\Users\AD\AppData\Local\Temp\doc2vec_model_phr, separately None
2018-11-24 23:24:00,203 : INFO : storing np array 'syn1neg' to C:\Users\AD\AppData\Local\Temp\doc2vec_model_phr.trainables.syn1neg.npy
2018-11-24 23:24:01,130 : INFO : storing np array 'vectors' to C:\Users\AD\AppData\Local\Temp\doc2vec_model_phr.wv.vectors.npy
2018-11-24 23:24:02,105 : INFO : storing np array 'vectors_docs' to C:\Users\AD\AppData\Local\Temp\doc2vec_model_phr.docvecs.vectors_docs.npy
2018-11-24 23:24:04,416 : INFO : saved C:\Users\AD\AppData\Local\Temp\doc2vec_model_phr
