# Doc2Vec

In [1]:
from gensim import corpora
from gensim.models import LsiModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
import joblib
nltk.download('punkt')

unable to import 'smart_open.gcs', disabling that module
[nltk_data] Downloading package punkt to C:\Users\Juan
[nltk_data]     José\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def saveModel(clf, name):
    path = "../../Models/" + name + ".pkl"
    joblib.dump(clf, path) 

In [3]:
def loadModel(name):    
    path = "../../Models/" + name + ".pkl"
    clf = joblib.load(path)
    
    return clf;

In [4]:
def load_data(data):
    """
    Input  : path and file_name
    Purpose: loading text file
    Output : list of paragraphs/documents and
             title(initial 100 words considred as title of document)
    """
    titles = []
    titles.append(data['Diagnoses'][0:min(len(data['Diagnoses']),2)] )
    return data['Diagnoses'],titles

In [5]:
def tokenize(df, column):
    return df[column].fillna('').apply(lambda x: x.split())

In [6]:
def prepare_corpus(doc_clean):
    """
    Input  : clean document
    Purpose: create term dictionary of our courpus and Converting list of documents (corpus) into Document Term Matrix
    Output : term dictionary and Document Term Matrix
    """
    # Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)
    dictionary = corpora.Dictionary(doc_clean)
    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    # generate LDA model
    
    dictionary.filter_extremes(no_below=10)
    dictionary.filter_tokens(bad_ids=[3,2])
        
    return dictionary,doc_term_matrix

In [7]:
def create_Doc2vec_Matrix(model, tagged_data):
    vectors = []
    
    for i in range(len(tagged_data)):
        vectors.append(model.docvecs[str(i)])
    
    return np.asarray(vectors)    

In [8]:
def saveMatrix(df, name):
    path = '../../Tables/'
    path += name + '.h5'
    df.to_hdf(path, key='df', index=False)
    return 

In [9]:
data = pd.read_table("../../Dataset/10k_1Col_NoCarEsp_LSA.csv", sep=',')
tok = tokenize(data, 'Diagnoses')

In [10]:
a, b = load_data(data)

In [11]:
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(a)]
tagged_data

[TaggedDocument(words=['adjuvant', 'therapy', 'breast', 'cancer', 'diagnosis', 'ecog', 'performance', 'status', 'zero', 'two'], tags=['0']),
 TaggedDocument(words=['dacarbazine', 'childhood', 'central', 'nervous', 'system', 'mixed', 'germ', 'cell', 'tumor', 'diagnosis', 'alt', 'lessthan', 'one', 'hundred', 'ten', 'l'], tags=['1']),
 TaggedDocument(words=['bortezomib', 'unspecified', 'adult', 'solid', 'tumor', 'protocol', 'specific', 'diagnosis', 'concurrent', 'prophylactic', 'colony', 'stimulating', 'factor'], tags=['2']),
 TaggedDocument(words=['antibodies', 'monoclonal', 'unspecified', 'adult', 'solid', 'tumor', 'protocol', 'specific', 'diagnosis', 'creatinine', 'ratio', 'greaterthan', 'doc', 'must', 'proteinuria', 'lessthan', 'one', 'zero', 'mg', 'twentyfour', 'hour', 'urine', 'collect'], tags=['3']),
 TaggedDocument(words=['isophosphamide', 'mustard', 'fibrohistiocytic', 'neoplasm', 'diagnosis', 'ineligible', 'sit'], tags=['4']),
 TaggedDocument(words=['etoposide', 'multiple', 'mye

In [12]:
len(tagged_data)

10000

In [10]:
max_epochs = 50 #100
vec_size = 300  #20
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

#model.save("d2v.model")
#print("Model Saved")

iteration 0




iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49


In [42]:
model.infer_vector(['folic', 'acid', 'stage', 'iiic', 'breast', 'cancer', 'diagnosis', 'serum', 'albumin', 'greaterthan', 'equalthan', 'three', 'dl'])

array([-0.13923925,  0.04857475, -0.02482549, -0.03641296, -0.01884176,
       -0.09419191,  0.00599955,  0.00265694, -0.05290237, -0.04125157,
        0.02664829,  0.0611458 ,  0.09083222, -0.01179402,  0.02340947,
       -0.03302941,  0.04541457, -0.03641178, -0.00650875,  0.13285163,
        0.01286417,  0.02290238,  0.02883044,  0.04364819, -0.02127558,
        0.02322309,  0.02344325,  0.11655007, -0.00195627,  0.0979571 ,
        0.12985852, -0.03864945,  0.00691147, -0.12520194,  0.12916368,
       -0.05534489,  0.02487529, -0.10502327,  0.01917484, -0.15935956,
       -0.09735978, -0.06395842,  0.02183324,  0.02989723, -0.01113326,
       -0.00701946, -0.02685608, -0.09346429, -0.07368178,  0.03943922,
       -0.04664085, -0.02292855, -0.01936883, -0.0165473 , -0.07201749,
        0.04138156,  0.0768683 ,  0.00199254,  0.04892664,  0.08188631,
       -0.02141121,  0.00624092, -0.0593432 , -0.01915173,  0.05204141,
        0.01142365,  0.10751498,  0.01787897, -0.06837559, -0.07

In [15]:
print(model['cancer'])

[-0.27721253 -0.41162303 -1.3845915   0.92542744 -0.5732816   0.6783094
  0.3241714  -0.5596067   0.19555813 -0.60451484 -0.53941613  0.37112114
  0.44883534 -1.045212    0.41064802  0.39049485 -1.3845297   0.02294517
  0.48506227  0.8074566   0.92427504 -0.6645168   0.01823809  0.7868746
 -0.43717915 -0.42167807 -1.5902789   0.4350175   0.20823684  0.11194181
  1.1406487   0.812162    0.5279389   0.8170727   0.5230697  -0.4903664
 -0.18622768 -0.81620747 -0.2616431   0.348867    0.35234982 -1.0479448
  0.34009084 -1.9269276  -0.1363759   0.15681045 -0.4220864   0.7788224
  0.8430343  -0.16369763 -1.1575655  -1.1746949  -0.66763765 -0.28098592
  0.22356239  1.3631684   0.72394973  0.27796987 -1.3306179   0.8287773
 -0.2048613  -0.11961751 -0.09585119 -0.2521633  -0.91793185 -0.1908757
 -0.0393174   1.0752643  -1.8571606  -1.2377589  -0.448239   -0.5212446
 -0.09750924 -0.08874225 -0.05625968 -0.15219377 -0.2121923  -0.37623522
 -0.54359066  0.46414548 -0.17655008  0.61477596 -1.6180047

In [49]:
model.docvecs['1']

array([-1.36819303e-01, -1.59721479e-01,  3.49936858e-02, -2.50243872e-01,
        7.21697211e-01, -4.07734007e-01,  4.37046885e-01, -3.74466687e-01,
       -8.92055511e-01,  7.93188393e-01,  6.27538040e-02,  6.29400194e-01,
        9.31877673e-01,  6.92616850e-02, -4.30757940e-01,  1.96703136e-01,
       -2.47208402e-01, -6.99074090e-01, -4.96689975e-03,  2.19885960e-01,
        6.80234194e-01, -1.25181198e-01, -1.50629652e+00,  2.35087186e-01,
       -9.94187221e-02, -8.73813570e-01, -2.58035660e-01,  5.41164652e-02,
       -2.86417037e-01,  5.06331623e-01,  1.38742352e+00, -1.78927362e-01,
       -1.17338169e+00, -8.07606876e-01, -2.23709509e-01, -4.26489234e-01,
       -4.35059071e-01,  3.78100127e-02,  6.81862295e-01, -4.38375026e-01,
       -4.20103759e-01, -3.24061871e-01,  4.12666589e-01,  7.57902384e-01,
       -2.66584277e-01,  9.10824895e-01,  6.92859218e-02,  4.50721264e-01,
       -1.25781012e+00,  2.06159368e-01,  3.48079056e-01,  2.55267382e-01,
        1.39182433e-01,  

In [48]:
model.docvecs()

TypeError: 'Doc2VecKeyedVectors' object is not callable

In [25]:
model['cancer'] + model['breast']

array([-1.7431778e+00, -9.7518957e-01, -5.3901839e-01,  1.1354862e+00,
       -2.7368793e+00,  7.1701908e-01, -8.5084009e-01, -4.5731136e-01,
        2.0793037e+00, -7.6736808e-01, -4.8883960e-01,  2.8568465e-01,
        1.4373602e+00, -9.0072602e-01,  1.8773235e+00, -1.4903188e+00,
       -3.0328012e+00,  5.5269164e-01,  7.7006638e-01,  1.6925793e+00,
       -3.7967563e-01,  1.4765680e-02,  5.5024821e-01, -1.0125059e-01,
       -1.2471163e+00,  3.2680404e-01, -6.1675709e-01,  1.6456482e+00,
        6.2006092e-01, -6.0114458e-02,  9.8866498e-01,  5.5363536e-01,
        1.6756654e-03,  1.9343295e+00, -7.0377642e-01,  7.0865029e-01,
        3.9226955e-01, -1.5705400e+00, -8.9389104e-01, -1.1632524e+00,
        9.4988614e-01, -8.9902884e-01,  2.4980971e-01, -2.4566884e+00,
        1.3540927e-01,  2.3567626e-01, -2.5060624e-01,  4.1347963e-01,
       -8.0509502e-01,  1.8063062e-01, -2.2856698e+00, -1.7439750e+00,
       -4.0818384e-01, -1.2211739e+00, -1.3020699e+00,  2.0401649e+00,
      

In [43]:
model.similar_by_word('cancer')

  """Entry point for launching an IPython kernel.


[('lymphoma', 0.6247161030769348),
 ('carcinoma', 0.5886520147323608),
 ('neoplasms', 0.48419052362442017),
 ('adenocarcinoma', 0.4071921408176422),
 ('myeloma', 0.37583407759666443),
 ('sarcoma', 0.36255356669425964),
 ('canc', 0.32438212633132935),
 ('glioblastoma', 0.29906120896339417),
 ('neoplasm', 0.2948371171951294),
 ('tumors', 0.2940630316734314)]

In [19]:
model.similarity('cancer', 'lymphoma')

  """Entry point for launching an IPython kernel.


0.62471616

In [57]:
X = create_Doc2vec_Matrix(model, tagged_data)
X.shape

(10000, 300)

In [58]:
saveMatrix(pd.DataFrame(X), 'Doc2Vec')

In [62]:
saveModel(model, 'Doc2vec')

In [15]:
from tqdm import tqdm
max_epochs = 500 #100
vec_size = 300  #20
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_data)

for epoch in tqdm(range(max_epochs)):
    #print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

#model.save("d2v.model")
#print("Model Saved")

100%|██████████| 500/500 [22:14<00:00,  2.67s/it]


In [16]:
X = create_Doc2vec_Matrix(model, tagged_data)
X.shape
saveMatrix(pd.DataFrame(X), 'testD2V')

# TDS

https://towardsdatascience.com/multi-class-text-classification-with-doc2vec-logistic-regression-9da9947b43f4

#### Distributed Bag of Words (DBOW)

"DBOW is the doc2vec model analogous to Skip-gram model in word2vec. The paragraph vectors are obtained by training a neural network on the task of predicting a probability distribution of words in a paragraph given a randomly-sampled word from the paragraph."

dm = 0

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt
import multiprocessing
cores = multiprocessing.cpu_count()

  from pandas import Panel


In [13]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(tagged_data)])

100%|██████████| 10000/10000 [00:00<00:00, 1671037.45it/s]


In [14]:
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(tagged_data)]), total_examples=len(tagged_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 10000/10000 [00:00<00:00, 435903.18it/s]
100%|██████████| 10000/10000 [00:00<00:00, 2005884.27it/s]
100%|██████████| 10000/10000 [00:00<00:00, 274874.11it/s]
100%|██████████| 10000/10000 [00:00<00:00, 1671370.39it/s]
100%|██████████| 10000/10000 [00:00<00:00, 2507355.33it/s]
100%|██████████| 10000/10000 [00:00<00:00, 1437587.06it/s]
100%|██████████| 10000/10000 [00:00<00:00, 2026919.25it/s]
100%|██████████| 10000/10000 [00:00<00:00, 3342341.22it/s]
100%|██████████| 10000/10000 [00:00<00:00, 3331721.34it/s]
100%|██████████| 10000/10000 [00:00<00:00, 856225.04it/s]
100%|██████████| 10000/10000 [00:00<00:00, 1671104.03it/s]
100%|██████████| 10000/10000 [00:00<00:00, 1671104.03it/s]
100%|██████████| 10000/10000 [00:00<00:00, 2524560.01it/s]
100%|██████████| 10000/10000 [00:00<00:00, 2507955.03it/s]
100%|██████████| 10000/10000 [00:00<00:00, 2493048.03it/s]
100%|██████████| 10000/10000 [00:00<00:00, 2015910.79it/s]
100%|██████████| 10000/10000 [00:00<00:00, 2511860.10it/s]


Wall time: 14.9 s


In [19]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [38]:
Y, X = vec_for_learning(model_dbow, tagged_data)

In [26]:
X = np.asarray(X)
X.shape

(10000, 300)

In [39]:
saveMatrix(pd.DataFrame(X), "DOC2VEC_DBOW")

In [29]:
Y

('0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '126',
 '127',
 '128',
 '129',
 '130',
 '131',
 '132',
 '133',
 '134',
 '135',
 '136',
 '137',
 '138'

#### Distributed Memory (DM)

"Distributed Memory (DM) acts as a memory that remembers what is missing from the current context — or as the topic of the paragraph. While the word vectors represent the concept of a word, the document vector intends to represent the concept of a document. We again instantiate a Doc2Vec model with a vector size with 300 words and iterating over the training corpus 30 times."

dm = 1

In [30]:
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(tagged_data)])

100%|██████████| 10000/10000 [00:00<00:00, 345765.14it/s]


In [31]:
%%time
for epoch in range(30):
    model_dmm.train(utils.shuffle([x for x in tqdm(tagged_data)]), total_examples=len(tagged_data), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

100%|██████████| 10000/10000 [00:00<00:00, 1253602.73it/s]
100%|██████████| 10000/10000 [00:00<00:00, 1253003.53it/s]
100%|██████████| 10000/10000 [00:00<00:00, 2514872.29it/s]
100%|██████████| 10000/10000 [00:00<00:00, 714958.49it/s]
100%|██████████| 10000/10000 [00:00<00:00, 641330.89it/s]
100%|██████████| 10000/10000 [00:00<00:00, 589800.04it/s]
100%|██████████| 10000/10000 [00:00<00:00, 3341542.38it/s]
100%|██████████| 10000/10000 [00:00<00:00, 2507505.23it/s]
100%|██████████| 10000/10000 [00:00<00:00, 2002914.86it/s]
100%|██████████| 10000/10000 [00:00<00:00, 2005308.85it/s]
100%|██████████| 10000/10000 [00:00<00:00, 1649936.67it/s]
100%|██████████| 10000/10000 [00:00<00:00, 2506755.92it/s]
100%|██████████| 10000/10000 [00:00<00:00, 1253190.71it/s]
100%|██████████| 10000/10000 [00:00<00:00, 2506606.11it/s]
100%|██████████| 10000/10000 [00:00<00:00, 1984154.41it/s]
100%|██████████| 10000/10000 [00:00<00:00, 3310421.47it/s]
100%|██████████| 10000/10000 [00:00<00:00, 3342607.59it/s]


Wall time: 29.1 s


In [41]:
Y, X = vec_for_learning(model_dmm, tagged_data)

In [42]:
X = np.asarray(X)
X.shape

(10000, 300)

In [44]:
saveMatrix(pd.DataFrame(X), "DOC2VEC_DM")

In [43]:
Y

('0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '126',
 '127',
 '128',
 '129',
 '130',
 '131',
 '132',
 '133',
 '134',
 '135',
 '136',
 '137',
 '138'