In [1]:
import logging
# Show INFO-level log records, each prefixed with a timestamp and its severity.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
# Tutorial: download a pretrained model with gensim's downloader and inspect it.
import os

# The data directory must be configured BEFORE gensim.downloader is imported,
# because the downloader resolves its base directory at import time (hence the
# import order below). setdefault keeps this notebook's default location while
# letting an already-exported GENSIM_DATA_DIR take precedence, instead of
# silently overriding it with a machine-specific path.
os.environ.setdefault("GENSIM_DATA_DIR", 'D:/gensim-data')
import gensim.downloader as api

# With return_path=True only the location of the model file is returned.
wv_path = api.load('word2vec-google-news-300', return_path=True)
print(wv_path)
# Without return_path the vectors themselves are loaded.
wv = api.load('word2vec-google-news-300')

# Preview the first ten vocabulary entries.
vocab_size = len(wv.index_to_key)
for index, word in enumerate(wv.index_to_key[:10]):
    print(f"word #{index}/{vocab_size} is {word}")

# Word pairs ordered from closely related to unrelated.
pairs = [
    ('car', 'minivan'),   # a minivan is a kind of car
    ('car', 'bicycle'),   # still a wheeled vehicle
    ('car', 'airplane'),  # ok, no wheels, but still a vehicle
    ('car', 'cereal'),    # ... and so on
    ('car', 'communism'),
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))



2021-10-07 11:53:39,850 : INFO : word2vec-google-news-300 downloaded
2021-10-07 11:53:41,739 : INFO : loading projection weights from D:/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz


D:/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz


2021-10-07 11:55:12,969 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from D:/gensim-data\\word2vec-google-news-300\\word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2021-10-07T11:55:12.966151', 'gensim': '4.1.2', 'python': '3.8.12 | packaged by conda-forge | (default, Sep 29 2021, 19:14:40) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19043-SP0', 'event': 'load_word2vec_format'}


word #0/3000000 is </s>
word #1/3000000 is in
word #2/3000000 is for
word #3/3000000 is that
word #4/3000000 is is
word #5/3000000 is on
word #6/3000000 is ##
word #7/3000000 is The
word #8/3000000 is with
word #9/3000000 is said
'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'car'	'communism'	0.06


In [14]:
# Loading my own corpus
import json
from gensim import utils
import gensim.models

In [15]:
class DevignCorpus:
    """Restartable iterable that yields one token list per Devign record.

    The JSON file is re-read and re-parsed on every call to ``__iter__``, so
    the same object can be iterated more than once.

    Args:
        path: JSON file to read. It is expected to hold a sequence of objects
            that each carry the text to tokenize under the ``"func"`` key.
    """

    def __init__(self, path='data/devign/function.json'):
        self.path = path

    def __iter__(self):
        # Read as UTF-8 explicitly: without an encoding, open() falls back to
        # the platform locale, which is not UTF-8 on every system.
        with open(self.path, encoding='utf-8') as f:
            data = json.load(f)
        for item in data:
            # The whole file is a single JSON document; each record's "func"
            # text is tokenized by gensim's simple_preprocess.
            yield utils.simple_preprocess(item["func"])

In [23]:
class ReVealCorpus:
    """Restartable iterable that yields one token list per ReVeal record.

    Every call to ``__iter__`` re-reads the JSON files in order, so the same
    object can be iterated more than once.

    Args:
        filenames: JSON files to read, in order. Defaults to the vulnerable
            and non-vulnerable ReVeal files under ``data/reveal``. Each file is
            expected to hold a sequence of objects that carry the text to
            tokenize under the ``"code"`` key.
    """

    def __init__(self, filenames=None):
        if filenames is None:
            filenames = ['data/reveal/vulnerables.json', 'data/reveal/non-vulnerables.json']
        self.filenames = list(filenames)

    def __iter__(self):
        for fname in self.filenames:
            # Read as UTF-8 explicitly: without an encoding, open() falls back
            # to the platform locale, which is not UTF-8 on every system.
            with open(fname, encoding='utf-8') as f:
                data = json.load(f)
            for item in data:
                # Each file is a single JSON document; each record's "code"
                # text is tokenized by gensim's simple_preprocess.
                yield utils.simple_preprocess(item["code"])

In [25]:
def evaluate_words(wv, pairs=None):
    """Print the similarity score of each word pair according to ``wv``.

    One tab-separated line (both words in repr form, then the score with two
    decimals) is printed per pair. If ``wv.similarity`` raises ``KeyError``
    for a pair, the exception is printed instead and evaluation continues
    with the next pair.

    Args:
        wv: object exposing ``similarity(w1, w2)``, e.g. a model's ``.wv``.
        pairs: iterable of ``(word1, word2)`` tuples to score. When omitted,
            a small built-in list of C-language tokens is used.
    """
    if pairs is None:
        pairs = [
            ('int', 'long'),
            ('int', 'static'),
            ('long', 'static'),
            ('char', '*'),
            ('if', 'else'),
        ]
    for w1, w2 in pairs:
        try:
            score = wv.similarity(w1, w2)
        except KeyError as e:
            # A word missing from the vocabulary is reported, not fatal.
            print(e)
        else:
            print('%r\t%r\t%.2f' % (w1, w2, score))

# Corpus name -> class whose instances stream that corpus' tokenized sentences.
corpuses = {
    "devign": DevignCorpus,
    "reveal": ReVealCorpus,
}
for corpus, corpus_cls in corpuses.items():
    # Cache file is "<corpus name>.wv"; despite the suffix it stores the whole
    # Word2Vec model (written by model.save), not only the vectors.
    corpus_pretrained = corpus + '.wv'
    if not os.path.exists(corpus_pretrained):
        # No cached model yet: train one and save it for later runs.
        sentences = corpus_cls()
        model = gensim.models.Word2Vec(sentences=sentences, workers=8, vector_size=100)
        model.save(corpus_pretrained)
    else:
        model = gensim.models.Word2Vec.load(corpus_pretrained)
    evaluate_words(model.wv)
    # Preview the first ten vocabulary entries.
    vocabulary = model.wv.index_to_key
    for index, word in enumerate(vocabulary[:10]):
        print(f"word #{index}/{len(vocabulary)} is {word}")

2021-10-07 12:48:41,577 : INFO : loading Word2Vec object from devign.wv
2021-10-07 12:48:41,619 : INFO : loading wv recursively from devign.wv.wv.* with mmap=None
2021-10-07 12:48:41,621 : INFO : setting ignored attribute cum_table to None
2021-10-07 12:48:42,222 : INFO : Word2Vec lifecycle event {'fname': 'devign.wv', 'datetime': '2021-10-07T12:48:42.222356', 'gensim': '4.1.2', 'python': '3.8.12 | packaged by conda-forge | (default, Sep 29 2021, 19:14:40) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19043-SP0', 'event': 'loaded'}
2021-10-07 12:48:42,228 : INFO : loading Word2Vec object from reveal.wv
2021-10-07 12:48:42,267 : INFO : loading wv recursively from reveal.wv.wv.* with mmap=None
2021-10-07 12:48:42,268 : INFO : setting ignored attribute cum_table to None


'int'	'long'	0.15
'int'	'static'	0.55
'long'	'static'	0.08
"Key '*' not present"
'if'	'else'	0.68
word #0/40397 is if
word #1/40397 is int
word #2/40397 is return
word #3/40397 is case
word #4/40397 is break
word #5/40397 is else
word #6/40397 is ret
word #7/40397 is avctx
word #8/40397 is for
word #9/40397 is uint


2021-10-07 12:48:42,657 : INFO : Word2Vec lifecycle event {'fname': 'reveal.wv', 'datetime': '2021-10-07T12:48:42.657991', 'gensim': '4.1.2', 'python': '3.8.12 | packaged by conda-forge | (default, Sep 29 2021, 19:14:40) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19043-SP0', 'event': 'loaded'}


'int'	'long'	0.28
'int'	'static'	0.49
'long'	'static'	0.14
"Key '*' not present"
'if'	'else'	0.61
word #0/30612 is if
word #1/30612 is int
word #2/30612 is null
word #3/30612 is return
word #4/30612 is const
word #5/30612 is offset
word #6/30612 is else
word #7/30612 is char
word #8/30612 is void
word #9/30612 is static
