In [1]:
import gc
from gensim.models.word2vec import Word2Vec



In [2]:
def load_char_samples(train_data_file, test_data_file):
    """Load the character sequences of every sample in the train and test files.

    Each file is read as UTF-8 text. The first line is treated as a header and
    skipped. For every remaining non-blank line, the second comma-separated
    field is taken and split on whitespace into a list of tokens.

    Args:
        train_data_file: Path to the training CSV file.
        test_data_file: Path to the testing CSV file.

    Returns:
        A list of token lists: all training samples (in file order) followed
        by all testing samples (in file order).

    Raises:
        IndexError: If a non-blank data line contains no comma.
    """
    char_samples = []
    for data_file in (train_data_file, test_data_file):
        # The context manager closes the handle deterministically instead of
        # leaving it to the garbage collector.
        with open(data_file, 'r', encoding='utf-8') as data_handle:
            lines = data_handle.read().splitlines()[1:]

        # Blank lines carry no sample; skip them rather than failing on the
        # missing second column.
        char_samples.extend(
            line.split(',')[1].split() for line in lines if line.strip()
        )

    return char_samples

In [3]:
def batch_iter(data, batch_size=5000):
    """Yield consecutive slices of `data`, each holding at most `batch_size` items.

    Args:
        data: A sized, sliceable sequence.
        batch_size: Maximum number of items per yielded slice.

    Yields:
        `data[start:end]` for each consecutive window; the last slice may be
        shorter than `batch_size`.
    """
    total = len(data)
    # Ceiling division: number of windows needed to cover `total` items.
    window_count = (total - 1) // batch_size + 1

    lower = 0
    for _ in range(window_count):
        # Clamp the upper bound so the final window stops at the end of data.
        upper = min(lower + batch_size, total)
        yield data[lower:upper]
        lower += batch_size

In [None]:
# Read both raw CSV files and tokenize every sample into a list of tokens.
train_data_file, test_data_file = "../raw_data/train_set.csv", "../raw_data/test_set.csv"
sentences = load_char_samples(
    train_data_file=train_data_file,
    test_data_file=test_data_file,
)
print("The number of samples in both training and testing dataset is: %d" % len(sentences))

In [None]:
# Calculate the size of vocabulary
# Accumulate distinct tokens directly into a set: only the unique tokens are
# kept in memory, instead of first materializing one flat list holding every
# token of every sample.
vocabulary = set()
for sentence in sentences:
    vocabulary.update(sentence)
# `sentences` holds the samples returned by load_char_samples, so this count
# covers everything in that list.
print("The total number of words in training set is: %d" % len(vocabulary))

In [8]:
# Train model
# 300-dimensional vectors; min_count=1 keeps every token, including those that
# occur only once.
model = Word2Vec(size=300, min_count=1)
model.build_vocab(sentences)
print(model)

# Train with ONE call over the full corpus. `total_examples=model.corpus_count`
# describes the whole corpus, so it has to be paired with the whole corpus:
# passing 5000-sample slices with that total gives the learning-rate schedule
# a wrong notion of progress, restarts the schedule on every call, and runs
# all epochs on one slice before the next slice is ever seen.
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

Word2Vec(vocab=8303, size=300, alpha=0.025)


In [12]:
# Persist the trained word vectors (only `model.wv`, not the full model) twice:
# once via `save` and once as a plain-text word2vec-format file.
# NOTE(review): `save` presumably writes gensim's own serialization rather than
# the word2vec binary format the ".bin" suffix suggests — confirm how
# downstream code loads this file before renaming anything.
word_vectors = model.wv
word_vectors.save("datagrand-char-300d.bin")
word_vectors.save_word2vec_format("datagrand-char-300d.txt", binary=False)