In [105]:
import torch
import torch.nn as nn
import torch.utils.data as data

In [106]:
class Config:
    """Hyperparameters and paths shared by the training and evaluation cells.

    Only ``vocab_size`` and ``bag_size`` are required; every other setting is
    a keyword argument whose default equals the value that used to be
    hard-coded, so ``Config(vocab_size, bag_size)`` behaves as before.

    Args:
        vocab_size: Number of entries in the vocabulary.
        bag_size: Bag size; stored as-is and handed to the model constructor.
        mode: ``True`` selects skip-gram mode.
        embedding_dim: Width of the embedding vectors.
        batch_size: Number of samples per training batch.
        lr: Learning rate passed to the optimizer.
        epochs: Number of passes over the training data.
        save_path: Location passed to the model-saving helper.
    """

    def __init__(self, vocab_size, bag_size, mode=True, embedding_dim=1024,
                 batch_size=512, lr=0.00001, epochs=30, save_path="./module"):
        # Model definition.
        self.mode = mode  # True is skip-gram mode
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.bag_size = bag_size

        # Optimisation schedule.
        self.lr = lr
        self.epochs = epochs

        # Where checkpoints are written.
        self.save_path = save_path

In [107]:
def train_step(model, training_loader, device, config):
    """Run one full pass over ``training_loader`` and return the mean batch loss.

    A new Adam optimizer (learning rate ``config.lr``) and a
    ``CrossEntropyLoss(ignore_index=0)`` criterion are created on every call,
    so optimizer state does not carry over from one call to the next.

    Args:
        model: ``nn.Module`` whose forward pass takes a batch of inputs and
            returns the scores fed to the cross-entropy loss.
        training_loader: Sized iterable yielding ``(inputs, targets)`` batches.
        device: Device both tensors of each batch are moved to.
        config: Object exposing an ``lr`` attribute.

    Returns:
        The running average of the per-batch losses after the last batch, or
        ``0`` if the loader yields no batches.
    """
    # Put the model in training mode and drop gradients left by earlier calls.
    model.train()
    model.zero_grad()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
    # Targets equal to 0 are excluded from the loss.
    loss_function = nn.CrossEntropyLoss(ignore_index=0)
    total_loss = 0
    avg_loss = 0
    num_batches = len(training_loader)

    for step, (inputs, targets) in enumerate(training_loader):
        optimizer.zero_grad()

        inputs = inputs.to(device)
        # Fix: move the *targets* to the device. The previous code assigned
        # ``inputs.to(device)`` here, so the loss never saw the real labels.
        targets = targets.to(device)
        outputs = model(inputs)
        loss = loss_function(outputs, targets)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        avg_loss = total_loss / (step + 1)
        # "\r" with no trailing newline rewrites the same console line, giving
        # a single in-place progress indicator instead of one line per batch.
        print("\rTrain step[{}/{}] loss:{:.6f}".format(step + 1, num_batches, avg_loss),
              end="", flush=True)

    if num_batches:
        # Terminate the in-place progress line.
        print()
    return avg_loss

In [112]:
def train(training_dataset, config):
    """Train a Word2Vec model on ``training_dataset`` and checkpoint every epoch.

    Picks CUDA when available (CPU otherwise), wraps the dataset in a
    shuffling ``DataLoader``, builds the model from the settings in
    ``config``, then runs ``config.epochs`` calls to ``train_step``, calling
    ``save_model`` after each one with that epoch's average loss and its
    zero-based index.
    """
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)

    loader = data.DataLoader(
        training_dataset,
        batch_size=config.batch_size,
        shuffle=True,
    )

    net = Word2VecModel(
        config.vocab_size,
        config.embedding_dim,
        config.batch_size,
        config.bag_size,
        config.mode,
    ).to(device)

    n_epochs = config.epochs
    for epoch_idx in range(n_epochs):
        print("Epoch [{}/{}]".format(epoch_idx + 1, config.epochs))
        mean_loss = train_step(net, loader, device, config)
        save_model(net, config.save_path, mean_loss, epoch_idx)

In [125]:
def test(word_idx=14, top_k=8, model_path="./module/model_29_0.095",
         data_path="./datasets/trainset.csv", window_size=2):
    """Load a saved model and print the words returned by ``find_nearest``.

    The dataset is rebuilt from ``data_path`` to recover the vocabulary
    mappings, and a model of matching size is constructed and populated via
    ``load_model``. All arguments default to the values that were previously
    hard-coded, so a bare ``test()`` call behaves as before.

    Args:
        word_idx: Vocabulary index of the query word.
        top_k: Second argument to ``find_nearest`` (presumably the number of
            neighbours to return; confirm against its definition).
        model_path: Checkpoint location passed to ``load_model``.
        data_path: CSV file passed to ``read_data``.
        window_size: Window size passed to ``Word2VecDataset``.
    """
    training_set = read_data(data_path)

    # The third argument mirrors the call in main(); presumably the
    # skip-gram flag. TODO confirm against Word2VecDataset.
    training_dataset = Word2VecDataset(training_set, window_size, True)
    config = Config(len(training_dataset.word2idx), training_dataset.bag_size)
    model = Word2VecModel(config.vocab_size, config.embedding_dim, config.batch_size, config.bag_size, config.mode)
    model = load_model(model, model_path)

    indices = find_nearest(word_idx, top_k, model)
    # Each returned element exposes .item(), which yields the key looked up in idx2word.
    nearest_words = [training_dataset.idx2word[i.item()] for i in indices]
    print(training_dataset.idx2word[word_idx])
    print(nearest_words)

In [114]:
def main():
    """Load the helper notebooks, build the training dataset and start training.

    The ``%run`` lines are IPython line magics, so this function only works
    inside an IPython/Jupyter kernel. The names used below that are not
    defined in this notebook (``read_data``, ``Word2VecDataset``, and those
    used by ``train``) are presumably provided by these notebooks; verify
    against their contents.
    """
    # Must run before anything below: executes the helper notebooks.
    %run utils.ipynb
    %run data_preprocess.ipynb
    %run modules.ipynb
    window_size = 2
    training_set = read_data("./datasets/trainset.csv")
    
    # Third argument matches the call in test(); presumably the skip-gram flag. TODO confirm.
    training_dataset = Word2VecDataset(training_set, window_size, True)
    # Vocabulary size and bag size are taken from the dataset that was just built.
    config = Config(len(training_dataset.word2idx), training_dataset.bag_size)
    train(training_dataset, config)

In [123]:
# Entry point: runs the helper notebooks, builds the dataset and trains the model.
main()

Reading data...
Makeing dictionary...
Generating pair word data...


KeyboardInterrupt: 

In [None]:
# Nearest-word check on a saved checkpoint. Uses helper names not defined in
# this notebook (presumably loaded by the %run lines in main(); run that first).
test()

Reading data...
Makeing dictionary...
Generating pair word data...
