In [87]:
%matplotlib inline

# Standard imports
import os
import string
import sys

In [2]:
# Setup path: make the parent directory and the services package importable.
# These are relative entries, so they are resolved against the kernel's
# current working directory.
# Guarded so that re-running this cell does not pile up duplicate sys.path entries.
for _extra_path in ("../", "../contraxsuite_services/"):
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)

In [3]:
# Django imports
import django
from django.db import IntegrityError

# Configure Django: fall back to the local settings module only when the
# environment does not already name one, then initialise the framework.
if "DJANGO_SETTINGS_MODULE" not in os.environ:
    os.environ["DJANGO_SETTINGS_MODULE"] = "config.settings.local"
django.setup()

In [40]:
# Import document model
from contraxsuite_services.apps.document import *
from contraxsuite_services.apps.task.utils.nlp import *
from contraxsuite_services.apps.task.utils.text import *
from contraxsuite_services.apps.task.tasks import normalize, stem_tokens, stemmer

In [8]:
# Data science imports
import gensim
import gensim.corpora
import gensim.models
import gensim.models.word2vec

## Generate sentence sample list

In [88]:
# Build training sample: one normalized entry per sentence-type text unit,
# taken from a fixed-size set of documents.
SAMPLE_DOCUMENT_COUNT = 50  # number of documents to draw sentences from

# Order by primary key before slicing: slicing an unordered queryset does not
# guarantee which rows come back, so the sample could differ between runs.
sample_documents = Document.objects.order_by("pk")[:SAMPLE_DOCUMENT_COUNT]

# Collect sentences in id order within each document.
# NOTE(review): normalize() output is passed straight to Word2Vec further down,
# so it is presumably a list of tokens per sentence -- confirm against its definition.
sentence_list = [
    normalize(text_unit.text)
    for document in sample_documents
    for text_unit in document.textunit_set.filter(unit_type="sentence").order_by("id")
]

## Build word2vec model

In [89]:
# Train a word2vec model on the sentence sample, leaving every
# hyperparameter at the library default.
word2vec_model = gensim.models.word2vec.Word2Vec(sentences=sentence_list)

2017-08-01 18:53:48,510 [MainThread  ] [INFO ]  collecting all words and their counts
2017-08-01 18:53:48,512 [MainThread  ] [INFO ]  PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-08-01 18:53:48,555 [MainThread  ] [INFO ]  PROGRESS: at sentence #10000, processed 240306 words, keeping 5640 word types
2017-08-01 18:53:48,573 [MainThread  ] [INFO ]  collected 6887 word types from a corpus of 337241 raw words and 14126 sentences
2017-08-01 18:53:48,574 [MainThread  ] [INFO ]  Loading a fresh vocabulary
2017-08-01 18:53:48,582 [MainThread  ] [INFO ]  min_count=5 retains 2560 unique words (37% of original 6887, drops 4327)
2017-08-01 18:53:48,583 [MainThread  ] [INFO ]  min_count=5 leaves 329967 word corpus (97% of original 337241, drops 7274)
2017-08-01 18:53:48,590 [MainThread  ] [INFO ]  deleting the raw counts dictionary of 6887 items
2017-08-01 18:53:48,591 [MainThread  ] [INFO ]  sample=0.001 downsamples 48 most-common words
2017-08-01 18:53:48,592 [MainThread 

In [90]:
# Vocabulary size: how many distinct tokens the trained model kept
vocabulary = word2vec_model.wv.vocab
len(vocabulary)

2560

## Check the vector for the concept "assign"

In [91]:
# Look up the learned vector for the stemmed form of "assign"
assign_stem = stemmer.stem("assign")
word2vec_model.wv[assign_stem]

array([ 0.26161176, -0.16407771, -0.40077955, -0.12600055,  0.41745022,
       -0.07047732,  0.64792246, -0.71008027, -0.08653647, -0.46014813,
       -0.18735853,  0.31150085,  0.12557301, -0.01342526,  0.00387105,
       -0.14358574,  0.26174042,  0.11262165, -0.12912995,  0.75031877,
        0.1266973 ,  0.56346273,  0.16052081,  1.20694351,  0.22862051,
        0.5383721 ,  0.86124098,  0.69743252, -0.14519298, -0.27886057,
        0.06332136, -0.02382321, -0.1628231 ,  0.44388682, -0.5663746 ,
        0.26292726, -0.83798271, -0.45327497,  0.08511386, -0.56426793,
        0.36550534, -0.57639825, -0.44245222, -0.42558497,  1.03601384,
       -0.47760651,  0.25984782, -0.29440579,  0.91382223,  0.0812201 ,
        0.16682558,  0.95292634,  0.59663928,  0.86263126, -0.3458392 ,
       -1.18008697, -0.17823279,  0.35147128,  0.23613958,  0.35853672,
        0.12772545,  0.13320085, -0.24835874, -0.45698234, -0.21994391,
        0.05583423,  0.64813834,  0.12949063, -0.10010085, -0.68

In [92]:
# Ask the model which of these stemmed terms fits least well with the others
candidate_terms = ("purchase", "sell", "transfer", "memorandum")
word2vec_model.wv.doesnt_match([stemmer.stem(term) for term in candidate_terms])

2017-08-01 18:53:49,653 [MainThread  ] [INFO ]  precomputing L2-norms of word weight vectors


'memorandum'

In [95]:
# Similarity query: terms near stock + option + corporation and away from partnership
positive_terms = [stemmer.stem(term) for term in ("stock", "option", "corporation")]
negative_terms = [stemmer.stem(term) for term in ("partnership",)]
word2vec_model.wv.most_similar(positive=positive_terms, negative=negative_terms)

[('paid', 0.8569889068603516),
 ('escrow', 0.7999917268753052),
 ('purchas', 0.7718213796615601),
 ('us', 0.7551891803741455),
 ('issu', 0.7367774844169617),
 ('prefer', 0.7355434894561768),
 ('common', 0.7233516573905945),
 ('citizen', 0.7217422723770142),
 ('agent', 0.7140426635742188),
 ('proportion', 0.7114315032958984)]