In [ ]:
import os
import collections
import numpy as np
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import sequence
import pickle as pickle
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 

In [ ]:
# Fetch the NLTK resources this notebook relies on. nltk.download() is a
# no-op when the package is already installed and up to date.
nltk.download('punkt')      # tokenizer models used by word_tokenize
# The stop-word lists live in their own NLTK data package; without it the
# stopwords.words() call below raises LookupError on a fresh environment.
nltk.download('stopwords')
# NOTE(review): recent NLTK releases may additionally require the
# 'punkt_tab' package for word_tokenize -- confirm against the installed version.
stopWords = stopwords.words('english')
ps = PorterStemmer()

In [ ]:
def removeStopWords(words):
    """Return the tokens of ``words`` that are not stop words.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list with every token found in the module-level ``stopWords``
        collection removed; the order of the remaining tokens is preserved.
    """
    # Build the lookup set once per call: membership tests against the raw
    # list are linear in the number of stop words for every single token.
    stop_set = set(stopWords)
    return [word for word in words if word not in stop_set]

In [ ]:
def stemWords(words):
    """Stem every token in ``words`` with the module-level stemmer ``ps``.

    Args:
        words: iterable of string tokens.

    Returns:
        A list holding ``ps.stem(token)`` for each token, in input order.
    """
    stem = ps.stem
    return list(map(stem, words))

In [ ]:
def read_data(filename):
    """Load a corpus file and return its normalised tokens.

    The file is read as UTF-8 text from ``../binaries/data/<filename>``
    (resolved against the current working directory). Its content is
    tokenised with ``word_tokenize`` and lower-cased, tokens containing any
    digit are dropped, and the rest is passed through ``removeStopWords``
    and ``stemWords``.

    Args:
        filename: name of the corpus file inside ``../binaries/data``.

    Returns:
        List of processed tokens in document order.

    Raises:
        FileNotFoundError: if the corpus file does not exist. Any other
            failure (decoding, tokenizer resources, ...) propagates unchanged
            instead of being reported as a missing file.
    """
    path = os.path.join(os.path.abspath('../binaries'), "data", filename)
    try:
        # The context manager closes the handle even when reading fails.
        with open(path, "r", encoding="utf-8") as corpus:
            text = corpus.read()
    except FileNotFoundError as err:
        raise FileNotFoundError("../binaries/data/"+filename+" does not exist.") from err

    tokens = [word.lower() for word in word_tokenize(text)]
    # Discard anything that contains a digit (numbers, dates, "3rd", ...).
    tokens = [x for x in tokens if not any(c.isdigit() for c in x)]
    tokens = removeStopWords(tokens)
    tokens = stemWords(tokens)
    return tokens

In [ ]:
def build_dataset(words, n_words):
    """Map a token sequence to integer ids over a frequency-capped vocabulary.

    Id 0 is reserved for the placeholder ``'UNK'``. The ``n_words - 1`` most
    frequent tokens receive ids 1, 2, ... in descending frequency order; every
    other token is encoded as 0.

    Args:
        words: sequence of hashable tokens (iterated twice).
        n_words: maximum vocabulary size, including the ``'UNK'`` entry.

    Returns:
        A ``(data, dictionary)`` tuple where ``data`` is the list of ids, one
        per input token, and ``dictionary`` maps each kept token to its id.
    """
    frequencies = collections.Counter(words)
    # A literal 'UNK' token must not compete for a vocabulary slot: it would
    # overwrite the reserved entry and make the next word reuse the same id.
    # It is simply encoded as the unknown id 0 instead.
    frequencies.pop('UNK', None)

    dictionary = {'UNK': 0}
    for word, _ in frequencies.most_common(n_words - 1):
        dictionary[word] = len(dictionary)

    # Out-of-vocabulary tokens fall back to the 'UNK' id.
    data = [dictionary.get(word, 0) for word in words]
    return data, dictionary

In [ ]:
def collect_data(vocabulary_size=25000):
    """Read the ``wikisent2.txt`` corpus and encode it as integer ids.

    Args:
        vocabulary_size: vocabulary cap handed to ``build_dataset``.

    Returns:
        The ``(data, dictionary)`` pair produced by ``build_dataset`` for the
        tokens that ``read_data`` extracts from the corpus.
    """
    tokens = read_data("wikisent2.txt")
    encoded, word_to_id = build_dataset(tokens, vocabulary_size)
    # Release the raw token list before handing the results back.
    del tokens
    return encoded, word_to_id

In [ ]:
# Window size handed to skipgrams() when the training couples are generated.
window_size = 3
# Vocabulary cap; used both for building the dictionary and for sampling.
vocab_size = 100000
# Encode the corpus: `data` holds the word ids, `dictionary` maps word -> id.
data, dictionary = collect_data(vocab_size)

In [ ]:
# Per-rank sampling probabilities passed to skipgrams() below.
sampling_table = sequence.make_sampling_table(vocab_size)
# `couples` is a list of [target, context] id pairs, `labels` the matching
# list of labels (per the Keras docs: 1 = true pair, 0 = negative sample).
couples, labels = skipgrams(data, vocab_size, window_size=window_size, sampling_table=sampling_table)
# Convert to one (N, 2) int32 array and slice the columns. Unlike
# zip(*couples) this also works when no couples were produced, and it avoids
# materialising two huge intermediate Python tuples.
pairs = np.array(couples, dtype="int32").reshape(-1, 2)
word_target = pairs[:, 0].copy()   # copies give standalone 1-D arrays
word_context = pairs[:, 1].copy()
del pairs

In [ ]:
# Persist the preprocessing results, one pickle per variable, under
# ../binaries/pickles/<name>.pickle.
names=["window_size","vocab_size","dictionary","labels","word_target","word_context"]
pickle_dir = os.path.join(os.path.abspath('../binaries'), "pickles")
# Create the output directory up front so the first dump does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(pickle_dir, exist_ok=True)
for name in names:
    with open(os.path.join(pickle_dir, name+".pickle"), "wb") as f:
        # Each entry of `names` is looked up as a module-level variable.
        pickle.dump(globals()[name], f)