In [1]:
#coding=utf-8
import os
import re
import sys
import yaml
import argparse
from datetime import datetime

import numpy as np
import pandas as pd

import torch
from torch.autograd import Variable
import torch.nn as nn
from torch.utils.data import DataLoader

# utils
from utils import get_embedding, load_embed, save_embed, data_preprocessing
# data
from data import myDS, mytestDS
# model
from model import Siamese_lstm

In [None]:
config = {
    'experiment_name': 'siamese-transfer-baseline-1',
    'task': 'train',
    'make_dict': False,
    'data_preprocessing': False,

    'ckpt_dir': 'ckpt/',

    'training':{
        'num_epochs': 20,
        'learning_rate': 0.01,
        'optimizer': 'sgd'
    },
    
    'embedding':{
        'full_embedding_path': 'input/wiki.es.vec',
        'cur_embedding_path': 'input/embedding.pkl',
    },
        
    'model':{
        'fc_dropout': 0.1,
        'fc_dim': 100,
        'name': 'siamese',
        'embed_size': 300,
        'batch_size': 1,
        'embedding_freeze': False,
        'encoder':{
            'hidden_size': 150,
            'num_layers': 1,
            'bidirectional': False,
            'dropout': 0.5,
        },  
    },   
    
    'result':{
        'filename':'result.txt',
        'filepath':'res/',
    }
}

In [22]:
en = pd.read_csv("input/cleaned_en.csv")
sp = pd.read_csv("input/cleaned_sp.csv")
test_data = pd.read_csv("input/cleaned_test.csv")

In [23]:
en.columns = ['s1', 's2', 'label']
# split dataset
msk = np.random.rand(len(en)) < 0.8
en_train = en[msk]
en_valid = en[~msk]
en_all_sents = en['s1'].tolist() + en['s2'].tolist()

# dataset
en_trainDS = myDS(en_train, en_all_sents)
en_validDS = myDS(en_valid, en_all_sents)

In [24]:
sp.columns = ['s1', 's2', 'label']
# split dataset
msk = np.random.rand(len(sp)) < 0.8
sp_train = sp[msk]
sp_valid = sp[~msk]
sp_all_sents = sp['s1'].tolist() + sp['s2'].tolist()

# dataset
sp_trainDS = myDS(sp_train, sp_all_sents)
sp_validDS = myDS(sp_valid, sp_all_sents)

## Embed

In [25]:
en_embed_dict = load_embed('input/en_embed.pkl')
sp_embed_dict = load_embed('input/sp_embed.pkl')

In [26]:
embed_size = 300
en_embed_list = []
for word in en_validDS.vocab._id2word:
    en_embed_list.append(en_embed_dict[word])
en_vocab_size = len(en_embed_list)
    

sp_embed_list = []
for word in sp_trainDS.vocab._id2word:
    sp_embed_list.append(sp_embed_dict[word])
sp_vocab_size = len(sp_embed_list)

In [27]:
"""
Input: English and Spanish embed list
Output: English and Spanish aligned Embedding weight
"""
def align_embeddings(en_embed_list, sp_embed_list, embed_size):
    print('English Vocab Size:{}, Spanish Vocab Size:{}'.format(len(en_embed_list), len(sp_embed_list)))
    dif = abs(len(en_embed_list) - len(sp_embed_list))
    compensate = []
    for i in range(dif):
        compensate.append(np.zeros(embed_size))
    # shorter one aligned to longer one
    if len(en_embed_list) < len(sp_embed_list):
        en_embed_list.extend(compensate)
    else: sp_embed_list.extend(compensate)
    
    if len(en_embed_list) == len(sp_embed_list):
        print('-> Aligned to', len(en_embed_list))
    
    en_weight = nn.Parameter(torch.from_numpy(np.array(en_embed_list)).type(torch.FloatTensor), requires_grad = False)
    sp_weight = nn.Parameter(torch.from_numpy(np.array(sp_embed_list)).type(torch.FloatTensor), requires_grad = False)

    return en_weight, sp_weight

In [28]:
aligned_size = max(en_vocab_size,sp_vocab_size)
en_embedding = nn.Embedding(aligned_size, embed_size)
sp_embedding = nn.Embedding(aligned_size, embed_size)

en_embedding.weight, sp_embedding.weight = align_embeddings(en_embed_list, sp_embed_list, config['model']['embed_size'])

English Vocab Size:2685, Spanish Vocab Size:4101
-> Aligned to 4101


In [29]:
config['embedding_matrix'] = en_embedding
# model
siamese_en = Siamese_lstm(config)
print(siamese_en)

Siamese_lstm(
  (encoder): LSTMEncoder(
    (embedding): Embedding(4101, 300)
    (lstm): LSTM(300, 150, dropout=0.5)
  )
  (classifier): Sequential(
    (0): Dropout(p=0.1)
    (1): Linear(in_features=600, out_features=100, bias=True)
    (2): Tanh()
    (3): Dropout(p=0.1)
    (4): Linear(in_features=100, out_features=2, bias=True)
  )
)


  "num_layers={}".format(dropout, num_layers))


In [33]:
# loss func
loss_weights = Variable(torch.FloatTensor([1, 3]))
if torch.cuda.is_available():
    loss_weights = loss_weights.cuda()
criterion = torch.nn.CrossEntropyLoss(loss_weights)

# optimizer
learning_rate = config['training']['learning_rate']
if config['training']['optimizer'] == 'sgd':
    optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad, siamese_en.parameters()), lr=learning_rate)
elif config['training']['optimizer'] == 'adam':
    optimizer = torch.optim.Adam(filter(lambda x: x.requires_grad, siamese_en.parameters()), lr=learning_rate)
elif config['training']['optimizer'] == 'adadelta':
    optimizer = torch.optim.Adadelta(filter(lambda x: x.requires_grad, siamese_en.parameters()), lr=learning_rate)
elif config['training']['optimizer'] == 'rmsprop':
    optimizer = torch.optim.RMSprop(filter(lambda x: x.requires_grad, siamese_en.parameters()), lr=learning_rate)
print('Optimizer:', config['training']['optimizer'])
print('Learning rate:', config['training']['learning_rate'])

# log info
train_log_string = '%s :: Epoch %i :: Iter %i / %i :: train loss: %0.4f'
valid_log_string = '%s :: Epoch %i :: valid loss: %0.4f\n'

# Restore saved model (if one exists).
ckpt_path = os.path.join(config['ckpt_dir'], config['experiment_name']+'.pt')
if os.path.exists(ckpt_path):
    print('Loading checkpoint: %s' % ckpt_path)
    ckpt = torch.load(ckpt_path)
    epoch = ckpt['epoch']
    siamese_en.load_state_dict(ckpt['siamese'])
    optimizer.load_state_dict(ckpt['optimizer'])
else:
    epoch = 1
    print('Fresh start!\n')

Optimizer: sgd
Learning rate: 0.01
Fresh start!



## English

In [34]:
""" Train """

if config['task'] == 'train':

    # save every epoch for visualization
    train_loss_record = []
    valid_loss_record = []
    best_record = 10.0

    # training
    print('Experiment: {}\n'.format(config['experiment_name']))

    while epoch < config['training']['num_epochs']:

        print('Start Epoch {} Training...'.format(epoch))

        # loss
        train_loss = []
        train_loss_sum = []
        # dataloader
        train_dataloader = DataLoader(dataset=en_trainDS, shuffle=True, num_workers=2, batch_size=1)

        for idx, data in enumerate(train_dataloader, 0):

            # get data
            s1, s2, label = data

            # clear gradients
            optimizer.zero_grad()

            # input
            output = siamese_en(s1, s2)
            output = output.squeeze(0)

            # label cuda
            label = Variable(label)
            if torch.cuda.is_available():
                label = label.cuda()

            # loss backward
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()
            train_loss.append(loss.data.cpu())
            train_loss_sum.append(loss.data.cpu())

            # Every once and a while check on the loss
            if ((idx + 1) % 5000) == 0:
                print(train_log_string % (datetime.now(), epoch, idx + 1, len(en_train), np.mean(train_loss)))
                train_loss = []

        # Record at every epoch
        print('Train Loss at epoch {}: {}\n'.format(epoch, np.mean(train_loss_sum)))
        train_loss_record.append(np.mean(train_loss_sum))

        # Valid
        print('Epoch {} Validating...'.format(epoch))

        # loss
        valid_loss = []
        # dataloader
        valid_dataloader = DataLoader(dataset=en_validDS, shuffle=True, num_workers=2, batch_size=1)

        for idx, data in enumerate(valid_dataloader, 0):
            # get data
            s1, s2, label = data

            # input
            output = siamese_en(s1, s2)
            output = output.squeeze(0)

            # label cuda
            label = Variable(label)
            if torch.cuda.is_available():
                label = label.cuda()

            # loss
            loss = criterion(output, label)
            valid_loss.append(loss.data.cpu())

        print(valid_log_string % (datetime.now(), epoch, np.mean(valid_loss)))
        # Record
        valid_loss_record.append(np.mean(valid_loss))
        epoch += 1

        if np.mean(valid_loss)-np.mean(train_loss_sum) > 0.02:
             print("Early Stopping!")
             break

        # Keep track of best record
        if np.mean(valid_loss) < best_record:
            best_record = np.mean(valid_loss)
            # save the best model
            state_dict = {
                'epoch': epoch,
                'siamese': siamese_en.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            torch.save(state_dict, ckpt_path)
            print('Model saved!\n')

Experiment: siamese-transfer-baseline-1

Start Epoch 1 Training...
2018-07-28 19:39:39.277722 :: Epoch 1 :: Iter 5000 / 16942 :: train loss: 0.5665
2018-07-28 19:40:52.057422 :: Epoch 1 :: Iter 10000 / 16942 :: train loss: 0.5506
2018-07-28 19:42:04.341837 :: Epoch 1 :: Iter 15000 / 16942 :: train loss: 0.5155
Train Loss at epoch 1: 0.5372381806373596

Epoch 1 Validating...
2018-07-28 19:42:58.845380 :: Epoch 1 :: valid loss: 0.4686

Model saved!

Start Epoch 2 Training...
2018-07-28 19:44:13.207159 :: Epoch 2 :: Iter 5000 / 16942 :: train loss: 0.4723
2018-07-28 19:45:27.205700 :: Epoch 2 :: Iter 10000 / 16942 :: train loss: 0.4560
2018-07-28 19:46:41.484500 :: Epoch 2 :: Iter 15000 / 16942 :: train loss: 0.4342
Train Loss at epoch 2: 0.4521958827972412

Epoch 2 Validating...
2018-07-28 19:47:36.285819 :: Epoch 2 :: valid loss: 0.4122

Model saved!

Start Epoch 3 Training...
2018-07-28 19:48:50.928896 :: Epoch 3 :: Iter 5000 / 16942 :: train loss: 0.4003
2018-07-28 19:50:05.003489 :: 

In [35]:
config['experiment_name'] = config['experiment_name'] + '-Spain'
config['embedding_matrix'] = sp_embedding
siamese_sp = Siamese_lstm(config)

  "num_layers={}".format(dropout, num_layers))


In [36]:
print(siamese_sp)

Siamese_lstm(
  (encoder): LSTMEncoder(
    (embedding): Embedding(4101, 300)
    (lstm): LSTM(300, 150, dropout=0.5)
  )
  (classifier): Sequential(
    (0): Dropout(p=0.1)
    (1): Linear(in_features=600, out_features=100, bias=True)
    (2): Tanh()
    (3): Dropout(p=0.1)
    (4): Linear(in_features=100, out_features=2, bias=True)
  )
)


In [37]:
config['training']['learning_rate'] = 0.01
config['training']['optimizer'] = 'sgd'

In [38]:
# loss func
loss_weights = Variable(torch.FloatTensor([1, 3]))
if torch.cuda.is_available():
    loss_weights = loss_weights.cuda()
criterion = torch.nn.CrossEntropyLoss(loss_weights)

# optimizer
learning_rate = config['training']['learning_rate']
if config['training']['optimizer'] == 'sgd':
    optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad, siamese_sp.parameters()), lr=learning_rate)
elif config['training']['optimizer'] == 'adam':
    optimizer = torch.optim.Adam(filter(lambda x: x.requires_grad, siamese_sp.parameters()), lr=learning_rate)
elif config['training']['optimizer'] == 'adadelta':
    optimizer = torch.optim.Adadelta(filter(lambda x: x.requires_grad, siamese_sp.parameters()), lr=learning_rate)
elif config['training']['optimizer'] == 'rmsprop':
    optimizer = torch.optim.RMSprop(filter(lambda x: x.requires_grad, siamese_sp.parameters()), lr=learning_rate)
print('Optimizer:', config['training']['optimizer'])
print('Learning rate:', config['training']['learning_rate'])


Optimizer: sgd
Learning rate: 0.01


In [19]:
# epoch = 1
# # log info
# train_log_string = '%s :: Epoch %i :: Iter %i / %i :: train loss: %0.4f'
# valid_log_string = '%s :: Epoch %i :: valid loss: %0.4f\n'
# best_record = 10.0

In [50]:
ckpt_path = 'ckpt/siamese-transfer-baseline.pt'
print('Transfering English Model from: %s' % ckpt_path)
ckpt = torch.load(ckpt_path)
siamese_sp.load_state_dict(ckpt['siamese'])
best_record = 10.0
epoch = 1

Transfering English Model from: ckpt/siamese-transfer-baseline.pt


In [51]:
""" Train """

if config['task'] == 'train':

    # save every epoch for visualization
    train_loss_record = []
    valid_loss_record = []
#     best_record = 10.0

    # training
    print('Experiment: {}\n'.format(config['experiment_name']))

    while epoch < config['training']['num_epochs']:

        print('Start Epoch {} Training...'.format(epoch))

        # loss
        train_loss = []
        train_loss_sum = []
        # dataloader
        train_dataloader = DataLoader(dataset=sp_trainDS, shuffle=True, num_workers=2, batch_size=1)

        for idx, data in enumerate(train_dataloader, 0):

            # get data
            s1, s2, label = data

            # clear gradients
            optimizer.zero_grad()

            # input
            output = siamese_sp(s1, s2)
            output = output.squeeze(0)

            # label cuda
            label = Variable(label)
            if torch.cuda.is_available():
                label = label.cuda()

            # loss backward
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()
            train_loss.append(loss.data.cpu())
            train_loss_sum.append(loss.data.cpu())

            # Every once and a while check on the loss
            if ((idx + 1) % 5000) == 0:
                print(train_log_string % (datetime.now(), epoch, idx + 1, len(sp_train), np.mean(train_loss)))
                train_loss = []

        # Record at every epoch
        print('Train Loss at epoch {}: {}\n'.format(epoch, np.mean(train_loss_sum)))
        train_loss_record.append(np.mean(train_loss_sum))

        # Valid
        print('Epoch {} Validating...'.format(epoch))

        # loss
        valid_loss = []
        # dataloader
        valid_dataloader = DataLoader(dataset=sp_validDS, shuffle=True, num_workers=2, batch_size=1)

        for idx, data in enumerate(valid_dataloader, 0):
            # get data
            s1, s2, label = data

            # input
            output = siamese_sp(s1, s2)
            output = output.squeeze(0)

            # label cuda
            label = Variable(label)
            if torch.cuda.is_available():
                label = label.cuda()

            # loss
            loss = criterion(output, label)
            valid_loss.append(loss.data.cpu())

        print(valid_log_string % (datetime.now(), epoch, np.mean(valid_loss)))
        # Record
        valid_loss_record.append(np.mean(valid_loss))
        epoch += 1

        if np.mean(valid_loss)-np.mean(train_loss_sum) > 0.02:
             print("Early Stopping!")
             break

        # Keep track of best record
        if np.mean(valid_loss) < best_record:
            best_record = np.mean(valid_loss)
            # save the best model
            state_dict = {
                'epoch': epoch,
                'siamese': siamese_sp.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            torch.save(state_dict, ckpt_path)
            print('Model saved!\n')

Experiment: siamese-transfer-baseline-1-Spain

Start Epoch 1 Training...
2018-07-29 09:30:08.443428 :: Epoch 1 :: Iter 5000 / 17058 :: train loss: 0.4356
2018-07-29 09:31:29.370992 :: Epoch 1 :: Iter 10000 / 17058 :: train loss: 0.4223
2018-07-29 09:32:51.388914 :: Epoch 1 :: Iter 15000 / 17058 :: train loss: 0.4143
Train Loss at epoch 1: 0.420628160238266

Epoch 1 Validating...
2018-07-29 09:33:53.864678 :: Epoch 1 :: valid loss: 0.4088

Model saved!

Start Epoch 2 Training...
2018-07-29 09:35:17.566606 :: Epoch 2 :: Iter 5000 / 17058 :: train loss: 0.3830
2018-07-29 09:36:44.782698 :: Epoch 2 :: Iter 10000 / 17058 :: train loss: 0.3876
2018-07-29 09:38:12.808182 :: Epoch 2 :: Iter 15000 / 17058 :: train loss: 0.3798
Train Loss at epoch 2: 0.3844834864139557

Epoch 2 Validating...
2018-07-29 09:39:22.079134 :: Epoch 2 :: valid loss: 0.4039

Model saved!

Start Epoch 3 Training...
2018-07-29 09:40:52.436446 :: Epoch 3 :: Iter 5000 / 17058 :: train loss: 0.3477
2018-07-29 09:42:24.41140

In [49]:
ckpt_path = 'ckpt/siamese-transfer-baseline-sp.pt'

In [43]:
assert('aa')