In [1]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# Tutorial for downloading pretrained model
import os
os.environ["GENSIM_DATA_DIR"] = 'D:/gensim-data'
import gensim.downloader as api

wv_path = api.load('word2vec-google-news-300', return_path=True)
print(wv_path)
wv = api.load('word2vec-google-news-300')

for index, word in enumerate(wv.index_to_key):
    if index == 10:
        break
    print(f"word #{index}/{len(wv.index_to_key)} is {word}")

pairs = [
    ('car', 'minivan'),   # a minivan is a kind of car
    ('car', 'bicycle'),   # still a wheeled vehicle
    ('car', 'airplane'),  # ok, no wheels, but still a vehicle
    ('car', 'cereal'),    # ... and so on
    ('car', 'communism'),
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))



2021-10-07 11:53:39,850 : INFO : word2vec-google-news-300 downloaded
2021-10-07 11:53:41,739 : INFO : loading projection weights from D:/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz


D:/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz


2021-10-07 11:55:12,969 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from D:/gensim-data\\word2vec-google-news-300\\word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2021-10-07T11:55:12.966151', 'gensim': '4.1.2', 'python': '3.8.12 | packaged by conda-forge | (default, Sep 29 2021, 19:14:40) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19043-SP0', 'event': 'load_word2vec_format'}


word #0/3000000 is </s>
word #1/3000000 is in
word #2/3000000 is for
word #3/3000000 is that
word #4/3000000 is is
word #5/3000000 is on
word #6/3000000 is ##
word #7/3000000 is The
word #8/3000000 is with
word #9/3000000 is said
'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'car'	'communism'	0.06


In [6]:
# Loading my own corpus
import json
import os
from gensim import utils
import gensim.models

In [2]:
class DevignCorpus:
    """An iterator that yields sentences (lists of str)."""

    def __iter__(self):
        with open('data/devign/function.json') as f:
            data = json.load(f)
        for item in data:
            # assume there's one document per line, tokens separated by whitespace
            yield utils.simple_preprocess(item["func"])

In [23]:
class ReVealCorpus:
    """An iterator that yields sentences (lists of str)."""

    def __iter__(self):
        filenames = ['data/reveal/vulnerables.json', 'data/reveal/non-vulnerables.json']
        for fname in filenames:
            with open(fname) as f:
                data = json.load(f)
            for item in data:
                # assume there's one document per line, tokens separated by whitespace
                yield utils.simple_preprocess(item["code"])

In [8]:
def evaluate_words(wv):
    pairs = [
        ('int', 'long'),
        ('int', 'static'),
        ('long', 'static'),
        ('char', '*'),
        ('if', 'else'),
    ]
    for w1, w2 in pairs:
        try:
            print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))
        except KeyError as e:
            print(e)

corpuses = {
    "devign": DevignCorpus,
    # "reveal": ReVealCorpus,
}
for corpus in corpuses:
    corpus_pretrained = corpus + '.wv'
    if os.path.exists(corpus_pretrained):
        model = gensim.models.Word2Vec.load(corpus_pretrained)
    else:
        sentences = corpuses[corpus]()
        model = gensim.models.Word2Vec(sentences=sentences, workers=8, window=10, vector_size=100)
        model.save(os.path.join('../word2vec', corpus_pretrained))
    evaluate_words(model.wv)
    for index, word in enumerate(model.wv.index_to_key):
        if index == 10:
            break
        print(f"word #{index}/{len(model.wv.index_to_key)} is {word}")

2021-10-14 23:36:07,743 : INFO : collecting all words and their counts
2021-10-14 23:36:08,545 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-10-14 23:36:12,636 : INFO : PROGRESS: at sentence #10000, processed 1375441 words, keeping 54592 word types
2021-10-14 23:36:16,188 : INFO : PROGRESS: at sentence #20000, processed 2708309 words, keeping 73958 word types
2021-10-14 23:36:18,766 : INFO : collected 82130 word types from a corpus of 3737441 raw words and 27318 sentences
2021-10-14 23:36:18,767 : INFO : Creating a fresh vocabulary
2021-10-14 23:36:18,953 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 40397 unique words (49.1866553025691%% of original 82130, drops 41733)', 'datetime': '2021-10-14T23:36:18.953667', 'gensim': '4.1.2', 'python': '3.8.12 | packaged by conda-forge | (default, Sep 29 2021, 19:14:40) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19043-SP0', 'event': 'prepare_vocab'}
2021-10-14 23:36:18,9

FileNotFoundError: [Errno 2] No such file or directory: 'word2vec\\devign.wv'

In [9]:

model.save(os.path.join('../word2vec', corpus_pretrained))

2021-10-14 23:37:55,405 : INFO : Word2Vec lifecycle event {'fname_or_handle': '../word2vec\\devign.wv', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-10-14T23:37:55.405328', 'gensim': '4.1.2', 'python': '3.8.12 | packaged by conda-forge | (default, Sep 29 2021, 19:14:40) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19043-SP0', 'event': 'saving'}
2021-10-14 23:37:55,406 : INFO : not storing attribute cum_table
2021-10-14 23:37:55,446 : INFO : saved ../word2vec\devign.wv
