In [40]:
import torch
import torch.utils.data as data
import torch.nn as nn
import word2vec.data_preprocess as data_preprocess
import word2vec.modules as modules
import word2vec.utils as utils

In [41]:
class Config():
    """This is config class that store all parameter about training model"""
    def __init__(self, vocab_size, bag_size):
        self.mode = True # True is skip gram mode
        self.vocab_size = vocab_size
        self.embedding_dim = 1024
        self.batch_size = 512
        self.bag_size = bag_size
        
        self.lr = 0.00001
        self.epochs = 30
        
        self.save_path = "./ckpt"

In [47]:
def train_step(model: modules.Word2VecModel, training_loader: data.DataLoader, device: torch.device, config: Config) -> float:
    """Train model with steps 
    
    Args:
        model: the model will be trained
        training_loader: the dataloader of training dataset
        device: the hardware of training
        config: the training config
    
    Returns:
        return avg loss in steps
    """
    model.train() # enter train mode to refresh gradient
    model.zero_grad()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
    loss_function = nn.CrossEntropyLoss(ignore_index=0)
    total_loss = 0
    avg_loss = 0
    
    for step, (inputs, targets) in enumerate(training_loader):
        optimizer.zero_grad()
        
        inputs = inputs.to(device)
        targets = inputs.to(device)
        outputs = model(inputs)
        loss = loss_function(outputs, targets)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        avg_loss = total_loss / (step + 1)
        print("\r", "Train step[{}/{}] loss:{}]".format(step + 1, len(training_loader), avg_loss), end="")
    return avg_loss

In [43]:
def train(training_dataset, config):
    """The training process"""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    training_loader = data.DataLoader(training_dataset, batch_size=config.batch_size, shuffle=True)
    
    model = modules.Word2VecModel(config.vocab_size, config.embedding_dim, config.batch_size, config.bag_size, config.mode)
    model = model.to(device)
    
    for epoch in range(config.epochs):
        print("Epoch [{}/{}]".format(epoch + 1, config.epochs))
        epoch_avg_loss = train_step(model, training_loader, device, config)
        save_model(model, config.save_path, epoch_avg_loss, epoch)

In [44]:
def test():
    """The test process"""
    window_size = 2
    training_set = read_data("./datasets/trainset.csv")
    
    training_dataset = Word2VecDataset(training_set, window_size, True)
    config = Config(len(training_dataset.word2idx), training_dataset.bag_size)
    model = Word2VecModel(config.vocab_size, config.embedding_dim, config.batch_size, config.bag_size, config.mode)
    model = load_model(model, "./module/model_29_0.095")

    #print(training_dataset.word2idx)
    word_idx = 14
    indices = find_nearest(word_idx, 8, model)
    nearest_words = [training_dataset.idx2word[i.item()] for i in indices]
    print(training_dataset.idx2word[word_idx])
    print(nearest_words)

In [45]:
def main():
    """The main function"""
    window_size = 2
    training_set = utils.read_data("./datasets/trainset.csv")
    
    training_dataset = data_preprocess.Word2VecDataset(training_set, window_size, True)
    config = Config(len(training_dataset.word2idx), training_dataset.bag_size)
    train(training_dataset, config)

In [None]:
if __name__ == "__main__":
    main()

Reading data...
Makeing dictionary...
Generating pair word data...
Epoch [1/30]
 Train step[478/2560] loss:9.286265049020615]]