In [1]:
#coding=utf-8
import argparse
import os
from datetime import datetime

import numpy as np
import pandas as pd
import yaml
from nltk.corpus import stopwords

import torch
from torch.autograd import Variable
import torch.nn as nn
from torch.utils.data import DataLoader

# utils
from utils import get_embedding, load_embed, save_embed, data_preprocessing
from utils import removeSpanishStopWords, cleanSpanish, clean_sent
# data
from data import myDS, mytestDS
# model
from model import Siamese_lstm

In [94]:
# Experiment configuration: a single nested dict that the later cells read.
config = dict(
    # Run identity and switches.
    experiment_name='siamese-baseline-aug-1000-less',
    task='train',
    make_dict=True,
    data_preprocessing=False,

    # Directory that holds the '<experiment_name>.pt' checkpoint.
    ckpt_dir='ckpt/',

    # Optimisation settings.
    training=dict(
        num_epochs=20,
        learning_rate=0.01,
        optimizer='sgd',
    ),

    # Pretrained vector file and the cached subset built from it.
    embedding=dict(
        full_embedding_path='input/wiki.es.vec',
        cur_embedding_path='input/embedding_aug.pkl',
    ),

    # Model hyper-parameters.
    model=dict(
        fc_dropout=0.1,
        fc_dim=100,
        name='siamese',
        embed_size=300,
        batch_size=1,
        embedding_freeze=False,
        encoder=dict(
            hidden_size=150,
            num_layers=1,
            bidirectional=False,
            dropout=0.0,
        ),
    ),

    # Where prediction files are written.
    result=dict(
        filename='result.txt',
        filepath='res/',
    ),
)


In [110]:
""" Read Data """

# Load the three CSV inputs (train, test, augmented pairs) in one pass,
# reading them in the same order as the names on the left-hand side.
train_data, test_data, aug_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'input/cleaned_train.csv',
        'input/cleaned_test.csv',
        'input/aug_sp_1000.csv',
    )
)

In [111]:
# Clean the augmented sentence pairs and report how many rows end up empty.
# The helpers used here (cleanSpanish, removeSpanishStopWords, stopwords) are
# imported in the notebook's top import cell.

# Remove the 'Unnamed: 0' column if it is present. errors='ignore' makes the
# cell safe to re-run after the column has already been removed.
aug_data = aug_data.drop(columns=['Unnamed: 0'], errors='ignore')
aug_data.columns = ['spanish1', 'spanish2', 'result']

# Both helpers are called for their effect on aug_data; their return values are unused.
cleanSpanish(aug_data)
stops1 = set(stopwords.words("spanish"))
removeSpanishStopWords(aug_data, stops1)

# Turn empty strings into NaN so they show up in the null check below.
aug_data = aug_data.replace('', np.nan)

# Rows where at least one column is NaN.
# NOTE(review): these rows are only counted here, not removed - confirm that
# downstream code is meant to receive them.
dirty_data = aug_data[aug_data.isnull().any(axis=1)]
print('dirty sample count:', dirty_data.shape[0])
print('positive dirty training sample:', len(dirty_data[dirty_data['result'] == 1]))
print('negative dirty training sample:', len(dirty_data[dirty_data['result'] == 0]))

In [117]:
# Report how many augmented pairs carry each value of the 'result' column.
for caption, class_value in (('Aug Neg:', 0), ('Aug Pos:', 1)):
    print(caption, aug_data[aug_data['result'] == class_value].shape[0])

Aug Neg: 38345
Aug Pos: 28635


In [118]:
# Relabel the three augmented-data columns as s1 / s2 / label.
aug_data.columns = pd.Index(['s1', 's2', 'label'])

In [119]:
# Stack the augmented pairs under the original training rows. reset_index()
# renumbers the rows and keeps the previous index as a column.
stacked_frames = [train_data, aug_data]
train_data = pd.concat(stacked_frames, axis=0).reset_index(drop=False)

In [120]:
# Discard the 'index' column.
train_data = train_data.drop(columns=['index'])

In [121]:
# Relabel the two test-set columns as s1 / s2.
test_data.columns = pd.Index(['s1', 's2'])

In [122]:
# Preview the first five rows of the combined training frame.
train_data.head(n=5)

Unnamed: 0,s1,s2,label
0,hola hago clic producto recibido,compré producto recibido correo electrónico co...,0
1,hola cerré disputa 21 mayo 2017 dice realizará...,obtuve reembolso dinero pasado dos meses cuánd...,0
2,ordené españa españa ahora mandan pedido china,pedido llegó color diferente pedí,0
3,debo pagar impuestos personalizados,cómo pagar derechos aduana,1
4,recibí pedido,pedido muestra pagado hice,0


In [123]:
# Preview the first five rows of the test frame.
test_data.head(n=5)

Unnamed: 0,s1,s2
0,cómo puedo recibir reembolso mediante tarjeta,cómo puedo recibir reembolso
1,cómo puedo recibir reembolso mediante tarjeta,cómo puedo recibir reembolso si banco ido banc...
2,podido pagar tarjeta debo hacer,puedo pagar tarjeta débito puedo hacer
3,podido pagar tarjeta debo hacer,puedo pagar pedido tarjeta visa débito puedo h...
4,podido pagar tarjeta debo hacer,puedo pagar tarjeta


In [124]:
# split dataset
# A dedicated, seeded generator keeps the ~80/20 train/validation partition
# identical across kernel restarts; the unseeded global generator gave a
# different split (and different loss curves) on every run.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)
msk = split_rng.rand(len(train_data)) < 0.8
train = train_data[msk]
valid = train_data[~msk]

# Every sentence from both columns of the train and test frames; this list is
# passed to each dataset constructor below (and later to mytestDS).
all_sents = train_data['s1'].tolist() + train_data['s2'].tolist() + test_data['s1'].tolist() + test_data['s2'].tolist()

# dataset
trainDS = myDS(train, all_sents)
validDS = myDS(valid, all_sents)

print('Data size:',train_data.shape[0], test_data.shape[0])

Data size: 88307 5000


In [130]:
# (row count, column count) of the combined training frame.
(len(train_data.index), len(train_data.columns))

(88307, 3)

In [135]:
# Class balance of the combined training frame: positive count, negative
# count, and the positive share of all rows.
label_column = train_data['label']
po = len(train_data[label_column == 1])
ne = len(train_data[label_column == 0])
print(po, ne, po / len(train_data))

33933 54374 0.3842617233061932


In [125]:

# Paths of the full pretrained vector file and of the cached subset.
embedding_cfg = config['embedding']
full_embed_path = embedding_cfg['full_embedding_path']
cur_embed_path = embedding_cfg['cur_embedding_path']

# Build (and cache) the embedding dict when a rebuild is requested or no cache
# file exists; otherwise load the cached one.
if config['make_dict'] or not os.path.exists(cur_embed_path):
    print('Making embedding...')
    embed_dict = get_embedding(trainDS.vocab._id2word, full_embed_path)
    save_embed(embed_dict, cur_embed_path)
    print('Saved generated embedding.')
else:
    embed_dict = load_embed(cur_embed_path)
    print('Loaded existing embedding.')

vocab_size = len(embed_dict)

# Create the embedding layer, then replace its weights with one row per word,
# taken in the iteration order of trainDS.vocab._id2word.
embedding = nn.Embedding(vocab_size, config['model']['embed_size'])
embed_list = [embed_dict[word] for word in trainDS.vocab._id2word]
weight_matrix = np.array(embed_list)

# NOTE(review): requires_grad is hard-coded to False here while the config
# carries model.embedding_freeze = False - confirm which one is intended.
embedding.weight = nn.Parameter(torch.from_numpy(weight_matrix).float(), requires_grad=False)

Making embedding...
Found 5189/5819 words with embedding vectors
Missing Ratio: 10.83%
Filled missing words' embeddings.
Embedding Matrix Size:  5819
Embedding saved
Saved generated embedding.


In [126]:
# Build the model, loss and optimizer, then restore a checkpoint if one exists.

# embedding: hand the prepared embedding layer and vocabulary size to the model via config
config['embedding_matrix'] = embedding
config['vocab_size'] = len(embed_dict)

# model
siamese = Siamese_lstm(config)
print(siamese)

# loss func: class-weighted cross entropy (weight 1 for class 0, weight 3 for class 1)
loss_weights = Variable(torch.FloatTensor([1, 3]))
if torch.cuda.is_available():
    loss_weights = loss_weights.cuda()
criterion = torch.nn.CrossEntropyLoss(loss_weights)

# Move the model and the loss to the GPU *before* the optimizer is created and
# before any optimizer state is restored, so parameters and optimizer state
# always live on the same device (see the torch.optim notes on .cuda()).
if torch.cuda.is_available():
    criterion = criterion.cuda()
    siamese = siamese.cuda()

# optimizer: looked up by name; only parameters that require gradients are optimized
learning_rate = config['training']['learning_rate']
optimizer_classes = {
    'sgd': torch.optim.SGD,
    'adam': torch.optim.Adam,
    'adadelta': torch.optim.Adadelta,
    'rmsprop': torch.optim.RMSprop,
}
optimizer_name = config['training']['optimizer']
if optimizer_name not in optimizer_classes:
    # Fail here with a clear message instead of a NameError on `optimizer` later on.
    raise ValueError('Unknown optimizer %r, expected one of %s'
                     % (optimizer_name, sorted(optimizer_classes)))
trainable_params = [p for p in siamese.parameters() if p.requires_grad]
optimizer = optimizer_classes[optimizer_name](trainable_params, lr=learning_rate)
print('Optimizer:', config['training']['optimizer'])
print('Learning rate:', config['training']['learning_rate'])

# log info
train_log_string = '%s :: Epoch %i :: Iter %i / %i :: train loss: %0.4f'
valid_log_string = '%s :: Epoch %i :: valid loss: %0.4f\n'

# Restore saved model (if one exists).
ckpt_path = os.path.join(config['ckpt_dir'], config['experiment_name']+'.pt')

if os.path.exists(ckpt_path):
    print('Loading checkpoint: %s' % ckpt_path)
    # Deserialize onto the CPU first so a checkpoint written on a GPU machine
    # can also be opened on a CPU-only one; load_state_dict then copies the
    # values onto whatever device the model / optimizer already use.
    ckpt = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    epoch = ckpt['epoch']
    siamese.load_state_dict(ckpt['siamese'])
    optimizer.load_state_dict(ckpt['optimizer'])
else:
    epoch = 1
    print('Fresh start!\n')

Siamese_lstm(
  (encoder): LSTMEncoder(
    (embedding): Embedding(5819, 300)
    (lstm): LSTM(300, 150)
  )
  (classifier): Sequential(
    (0): Dropout(p=0.1)
    (1): Linear(in_features=600, out_features=100, bias=True)
    (2): Tanh()
    (3): Dropout(p=0.1)
    (4): Linear(in_features=100, out_features=2, bias=True)
  )
)
Optimizer: sgd
Learning rate: 0.01
Fresh start!



In [127]:
""" Train """

# Training driver: one pass over trainDS and one over validDS per epoch, with
# best-checkpoint saving and a simple early-stopping rule.
if config['task'] == 'train':

    # Per-epoch mean losses, kept for later visualization.
    train_loss_record = []
    valid_loss_record = []
    # Lowest validation loss seen so far; +inf guarantees the first epoch is saved.
    best_record = float('inf')

    # torch.save does not create missing directories; make sure the target exists
    # up front rather than failing after a full epoch of training.
    os.makedirs(os.path.dirname(ckpt_path) or '.', exist_ok=True)

    use_cuda = torch.cuda.is_available()

    # The loaders are reusable: a shuffling DataLoader draws a new order each
    # time it is iterated, so they are built once instead of once per epoch.
    train_dataloader = DataLoader(dataset=trainDS, shuffle=True, batch_size=1)
    valid_dataloader = DataLoader(dataset=validDS, shuffle=True, batch_size=1)

    # training
    print('Experiment: {}\n'.format(config['experiment_name']))

    # `epoch` is 1-based (a fresh start sets it to 1 and checkpoints store the
    # next epoch to run), so the bound is inclusive to train num_epochs epochs.
    while epoch <= config['training']['num_epochs']:

        print('Start Epoch {} Training...'.format(epoch))

        # train_loss is the running window for the periodic log line,
        # train_loss_sum collects every batch loss of the epoch.
        train_loss = []
        train_loss_sum = []

        # Training mode: enables the model's dropout layers.
        siamese.train()

        for idx, data in enumerate(train_dataloader, 0):

            # get data
            s1, s2, label = data

            # clear gradients
            optimizer.zero_grad()

            # input
            output = siamese(s1, s2)
            output = output.squeeze(0)

            # label cuda
            label = Variable(label)
            if use_cuda:
                label = label.cuda()

            # loss backward
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()

            # Store plain Python floats, not tensors.
            batch_loss = loss.item()
            train_loss.append(batch_loss)
            train_loss_sum.append(batch_loss)

            # Every once and a while check on the loss
            if ((idx + 1) % 5000) == 0:
                print(train_log_string % (datetime.now(), epoch, idx + 1, len(train_dataloader), np.mean(train_loss)))
                train_loss = []

        # Record at every epoch
        epoch_train_loss = np.mean(train_loss_sum)
        print('Train Loss at epoch {}: {}\n'.format(epoch, epoch_train_loss))
        train_loss_record.append(epoch_train_loss)

        # Valid
        print('Epoch {} Validating...'.format(epoch))

        valid_loss = []

        # Evaluation mode turns dropout off so the validation loss is
        # deterministic; no_grad skips building the autograd graph.
        siamese.eval()
        with torch.no_grad():
            for idx, data in enumerate(valid_dataloader, 0):
                # get data
                s1, s2, label = data

                # input
                output = siamese(s1, s2)
                output = output.squeeze(0)

                # label cuda
                label = Variable(label)
                if use_cuda:
                    label = label.cuda()

                # loss
                loss = criterion(output, label)
                valid_loss.append(loss.item())

        epoch_valid_loss = np.mean(valid_loss)
        print(valid_log_string % (datetime.now(), epoch, epoch_valid_loss))
        # Record
        valid_loss_record.append(epoch_valid_loss)
        epoch += 1

        # Keep track of best record. This runs before the early-stopping check
        # so that an epoch which is both the best so far and the stopping epoch
        # still gets its weights written to disk.
        if epoch_valid_loss < best_record:
            best_record = epoch_valid_loss
            # save the best model; 'epoch' is the next epoch to run on resume
            state_dict = {
                'epoch': epoch,
                'siamese': siamese.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            torch.save(state_dict, ckpt_path)
            print('Model saved!\n')

        # Stop once validation loss exceeds the epoch's training loss by more than 0.02.
        if epoch_valid_loss - epoch_train_loss > 0.02:
            print("Early Stopping!")
            break

Experiment: siamese-baseline-aug-1000

Start Epoch 1 Training...
2018-07-29 21:45:20.762607 :: Epoch 1 :: Iter 5000 / 70298 :: train loss: 0.4657
2018-07-29 21:47:02.258746 :: Epoch 1 :: Iter 10000 / 70298 :: train loss: 0.2669
2018-07-29 21:48:43.323065 :: Epoch 1 :: Iter 15000 / 70298 :: train loss: 0.2125
2018-07-29 21:50:25.156432 :: Epoch 1 :: Iter 20000 / 70298 :: train loss: 0.2049
2018-07-29 21:52:06.390792 :: Epoch 1 :: Iter 25000 / 70298 :: train loss: 0.1875
2018-07-29 21:53:48.405431 :: Epoch 1 :: Iter 30000 / 70298 :: train loss: 0.1781
2018-07-29 21:55:30.768276 :: Epoch 1 :: Iter 35000 / 70298 :: train loss: 0.1779
2018-07-29 21:57:12.911882 :: Epoch 1 :: Iter 40000 / 70298 :: train loss: 0.1559
2018-07-29 21:58:56.050493 :: Epoch 1 :: Iter 45000 / 70298 :: train loss: 0.1536
2018-07-29 22:00:39.907874 :: Epoch 1 :: Iter 50000 / 70298 :: train loss: 0.1529
2018-07-29 22:02:23.451565 :: Epoch 1 :: Iter 55000 / 70298 :: train loss: 0.1425
2018-07-29 22:04:11.152360 :: Epoc

KeyboardInterrupt: 

In [129]:
# Inference: score every test pair and write the probability of class 1, one value per line.
testDS = mytestDS(test_data, all_sents)
# Do not shuffle here
test_dataloader = DataLoader(dataset=testDS, num_workers=2, batch_size=1)

# Softmax over the class dimension; built once and reused for every batch.
sm = nn.Softmax(dim=1)

# Predict in evaluation mode so dropout is disabled, and without autograd
# bookkeeping. The previous mode is restored afterwards so that re-running the
# training cell is not silently affected.
was_training = siamese.training
siamese.eval()

result = []
with torch.no_grad():
    for s1, s2 in test_dataloader:

        # input
        output = siamese(s1, s2)
        output = output.squeeze(0)

        # feed output into softmax to get prob prediction (column 1 = class 1)
        res = sm(output)[:, 1]
        result += res.tolist()

siamese.train(was_training)

result = pd.DataFrame(result)
print(result.shape)
print('Inference Done.')

# to_csv does not create missing directories.
os.makedirs(config['result']['filepath'], exist_ok=True)
res_path = os.path.join(config['result']['filepath'], 'result_1000.txt')
result.to_csv(res_path, header=False, index=False)
print('Result has writtn to', res_path, ', Good Luck!')

(5000, 1)
Inference Done.
Result has writtn to res/result_1000.txt , Good Luck!
