In [1]:
#coding=utf-8
import numpy as np
import pandas as pd
import os
import torch
import argparse
from datetime import datetime
from torch.autograd import Variable
import yaml
import torch.nn as nn
from torch.utils.data import DataLoader

# utils
from utils import get_embedding,load_embed,save_embed,data_preprocessing

# data
from data import myDS

# model
from model import Siamese_lstm

In [58]:
# Run-wide settings, grouped by concern in nested dicts.
config = {
    # Combined with 'ckpt_dir' to build the checkpoint file name.
    'experiment_name': 'siamese-simple',
    'task': 'train',
    # When True the embedding cache is regenerated even if the cache file exists.
    'make_dict': True,
    'data_preprocessing': False,

    'ckpt_dir': 'ckpt/',

    'training': {'num_epochs': 20, 'learning_rate': 0.003},

    'embedding': {
        'full_embedding_path': 'input/wiki.es.vec',
        'cur_embedding_path': 'input/embedding.pkl',
    },

    'model': {
        'fc_dim': 100,
        'name': 'siamese',
        'embed_size': 300,
        'batch_size': 1,
        'embedding_freeze': False,
        # Encoder and decoder currently share the same recurrent settings.
        'encoder': {'hidden_size': 150, 'num_layers': 1, 'bidirectional': False, 'dropout': 0.0},
        'decoder': {'hidden_size': 150, 'num_layers': 1, 'bidirectional': False, 'dropout': 0.0},
    },

    # Directory and file name the predictions are written to.
    'result': {'filename': 'result.txt', 'filepath': 'res/'},
}

In [20]:
from torch.utils.data import Dataset
from collections import Counter


class mytestDS(Dataset):
    """Test-set sentence pairs, served as lists of vocabulary ids."""

    def __init__(self, df, all_sents):
        # Sentence pairs come from the 's1' / 's2' columns of `df`.
        self.s1 = df['s1'].tolist()
        self.s2 = df['s2'].tolist()
        # The vocabulary is built from `all_sents`, not only from `df`.
        self.vocab = Vocab(all_sents, sos_token='<sos>', eos_token='<eos>', unk_token='<unk>')

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.s1)

    def _encode(self, sentence):
        """Whitespace-tokenise `sentence`, wrap it in <sos>/<eos>, and map tokens to ids."""
        tokens = [self.vocab.sos_token] + sentence.split() + [self.vocab.eos_token]
        return [self.vocab.word2id(token) for token in tokens]

    def __getitem__(self, idx):
        """Return the pair at `idx` as a tuple `(s1_ids, s2_ids)` of id lists."""
        first, second = self.s1[idx], self.s2[idx]
        return self._encode(first), self._encode(second)



class Vocab(object):
    """Two-way lookup between words and integer ids, built from a list of sentences.

    Id 0 is the padding token '<pad>'. It is followed by whichever of the
    sos / eos / unk tokens were supplied (in that order), and then by the corpus
    words in order of decreasing frequency.
    """

    def __init__(self, all_sents, max_size=None, sos_token=None, eos_token=None, unk_token=None):
        """Initialize the vocabulary.
        Args:
            all_sents: An iterable of sentences (strings). Each sentence is
                split on whitespace and its tokens are counted.
            max_size: (Optional) Maximum number of corpus words to keep; the
                special tokens are added on top of this number.
            sos_token: (Optional) Token denoting the start of a sequence.
            eos_token: (Optional) Token denoting the end of a sequence.
            unk_token: (Optional) Token denoting an unknown element in a
                sequence.
        """
        self.max_size = max_size
        self.pad_token = '<pad>'
        self.sos_token = sos_token
        self.eos_token = eos_token
        self.unk_token = unk_token

        # Special tokens take the lowest ids; each one is listed only once even
        # if the same string is passed for several roles.
        id2word = [self.pad_token]
        for token in (sos_token, eos_token, unk_token):
            if token is not None and token not in id2word:
                id2word.append(token)

        # Count whitespace-separated tokens over the whole corpus.
        counter = Counter()
        for sentence in all_sents:
            counter.update(sentence.split())

        # A special token that also occurs in the corpus must not get a second
        # id: that would make id2word(word2id(token)) != token and inflate the
        # vocabulary size.
        for token in id2word:
            counter.pop(token, None)

        # most_common(None) returns every word sorted by decreasing count, so a
        # single call covers both the bounded and the unbounded case.
        id2word.extend(word for word, _ in counter.most_common(max_size))

        self._id2word = id2word
        self._word2id = {word: index for index, word in enumerate(id2word)}

    def __len__(self):
        """Return the number of entries, special tokens included."""
        return len(self._id2word)

    def word2id(self, word):
        """Map a word in the vocabulary to its unique integer id.
        Args:
            word: Word to lookup.
        Returns:
            id: The integer id of the word being looked up, or the id of the
                unknown token when the word is absent and an unknown token
                was configured.
        Raises:
            KeyError: If the word is absent and no unknown token was configured.
        """
        if word in self._word2id:
            return self._word2id[word]
        elif self.unk_token is not None:
            return self._word2id[self.unk_token]
        else:
            raise KeyError('Word "%s" not in vocabulary.' % word)

    def id2word(self, id):
        """Map an integer id to its corresponding word in the vocabulary.
        Args:
            id: Integer id of the word being looked up.
        Returns:
            word: The corresponding word.
        """
        return self._id2word[id]


### Test Data Loading

In [21]:
# Read the cleaned sentence-pair tables.
train_data = pd.read_csv('input/cleaned_train.csv')
test_data = pd.read_csv('input/cleaned_test.csv')

# Every sentence from both tables feeds the vocabulary: train before test,
# and column 's1' before 's2' within each table.
all_sents = [
    sentence
    for frame in (train_data, test_data)
    for column in ('s1', 's2')
    for sentence in frame[column].tolist()
]

# Only the test pairs are served by the dataset.
testDS = mytestDS(test_data, all_sents)

### Embedding

In [11]:
# Paths of the full pre-trained vectors and of the cached embedding file.
full_embed_path = config['embedding']['full_embedding_path']
cur_embed_path = config['embedding']['cur_embedding_path']

# Reuse the cache only when it exists and regeneration was not requested.
# The parenthesised single-argument print form behaves the same on Python 2
# and Python 3 and matches the style used when loading the checkpoint.
if os.path.exists(cur_embed_path) and not config['make_dict']:
    embed_dict = load_embed(cur_embed_path)
    print('Loaded existing embedding.')
else:
    embed_dict = get_embedding(testDS.vocab._id2word, full_embed_path)
    save_embed(embed_dict, cur_embed_path)
    print('Saved generated embedding.')


vocab_size = len(embed_dict)
# initialize nn embedding
embedding = nn.Embedding(vocab_size, config['model']['embed_size'])
# One vector per vocabulary entry, in id order, so that row i of the weight
# matrix belongs to the word with id i.
embed_list = [embed_dict[word] for word in testDS.vocab._id2word]
weight_matrix = np.array(embed_list)
# pass weights to nn embedding; the parameter is created with gradients disabled
embedding.weight = nn.Parameter(torch.from_numpy(weight_matrix).type(torch.FloatTensor), requires_grad=False)

Found 5142/5774 words with embedding vectors
Missing Ratio: 10.95%
Filled missing words' embeddings.
Embedding Matrix Size:  5774
Embedding saved
Saved generated embedding.


### Model Loading

In [12]:
# Expose the prepared embedding layer and its vocabulary size through config.
config.update({
    'embedding_matrix': embedding,
    'vocab_size': len(embed_dict),
})

In [15]:
 # model
# Build the network from the project-local `model` module. It receives the whole
# config dict; the 'embedding_matrix' and 'vocab_size' entries are expected to
# have been added to config before this cell runs.
siamese = Siamese_lstm(config)

In [35]:
# SGD over the parameters that require gradients; the others are left out.
learning_rate = config['training']['learning_rate']
optimizer = torch.optim.SGD(
    (param for param in siamese.parameters() if param.requires_grad),
    lr=learning_rate,
)

In [36]:
# Restore saved model (if one exists).
ckpt_path = os.path.join(config['ckpt_dir'], config['experiment_name'] + '.pt')

if os.path.exists(ckpt_path):
    print('Loading checkpoint: %s' % ckpt_path)
    # NOTE: torch.load unpickles the file, so only load checkpoints that come
    # from a trusted source.
    # map_location keeps every stored tensor on the CPU, so a checkpoint that
    # was written on a GPU machine can also be restored where no GPU exists;
    # load_state_dict then copies the values into the existing parameters.
    ckpt = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    epoch = ckpt['epoch']
    siamese.load_state_dict(ckpt['siamese'])
    optimizer.load_state_dict(ckpt['optimizer'])
else:
    # Without a checkpoint the weights are whatever the constructor produced;
    # say so explicitly instead of running inference silently.
    print('WARNING: no checkpoint found at %s; the model has not been restored.' % ckpt_path)
    epoch = 0

Loading checkpoint: ckpt/siamese-simple.pt


In [42]:
# Iterate over the test pairs in dataset order, one pair per batch (no shuffling).
test_dataloader = DataLoader(testDS, batch_size=1, shuffle=False, num_workers=2)

In [72]:
def inference(max_batches=None):
    """Run the Siamese model over the test set and collect one score per pair.

    Args:
        max_batches: (Optional) Stop after this many batches. Intended for quick
            smoke tests; the default of None processes the whole test set.

    Returns:
        A list of floats: for every processed pair, the softmax value at column
        index 1 of the model output (presumably the positive-class probability;
        confirm against the training labels).
    """
    test_dataloader = DataLoader(dataset=testDS, num_workers=2, batch_size=1)
    softmax = nn.Softmax(dim=1)  # built once instead of once per batch
    prob_res = []

    # Switch to evaluation mode for the duration of the loop and restore the
    # previous mode afterwards, so callers that continue training are unaffected.
    was_training = siamese.training
    siamese.eval()
    try:
        # No gradients are needed for prediction.
        with torch.no_grad():
            for idx, (s1, s2) in enumerate(test_dataloader):
                if max_batches is not None and idx >= max_batches:
                    break
                output = siamese(s1, s2).squeeze(0)
                prob_res += softmax(output)[:, 1].tolist()
    finally:
        siamese.train(was_training)
    return prob_res

In [73]:
# Score the test pairs and put the scores in a one-column frame.
result = inference()
result = pd.DataFrame(result)
print('Inference Done.')

res_dir = config['result']['filepath']
res_path = os.path.join(res_dir, config['result']['filename'])
# to_csv does not create missing directories, so make sure the target exists.
if res_dir and not os.path.isdir(res_dir):
    os.makedirs(res_dir)
# One score per line, without header or index column.
result.to_csv(res_path, header=False, index=False)
print('Result has been written to %s , Good Luck!' % res_path)

Inference Done.
Result has writtn to res/result.txt , Good Luck!
