In [15]:
#coding=utf-8
import numpy as np
import pandas as pd
import os
import torch
import argparse
from datetime import datetime
from torch.autograd import Variable
import yaml
import torch.nn as nn
from torch.utils.data import DataLoader

# utils
from utils import get_embedding,load_embed,save_embed,data_preprocessing

# data
from data import myDS, mytestDS

# model
from model import Siamese_lstm

In [16]:
# Central configuration for this notebook. The whole dict is later passed to
# Siamese_lstm(config); keys that no cell in this notebook reads are presumably
# consumed by the model/training code -- TODO confirm against model.py.
config = {
    # Also the checkpoint file stem: ckpt_dir + experiment_name + '.pt'.
    'experiment_name': 'siamese-dropout_0.5',
    'task': 'train',  # NOTE(review): not read by any cell visible in this notebook
    # When True, the embedding cache is rebuilt even if the cache file already exists.
    'make_dict': False,
    'data_preprocessing': False,

    # Directory searched for the checkpoint to restore.
    'ckpt_dir': 'ckpt/',

    'training':{
        'num_epochs': 20,
        'learning_rate': 0.01,  # passed as lr to torch.optim.SGD
        # NOTE(review): the optimizer cell hard-codes torch.optim.SGD and never reads this key.
        'optimizer': 'sgd'
    },

    'embedding':{
        # Pretrained vectors; only passed to get_embedding() when the cache is (re)built.
        'full_embedding_path': 'input/wiki.es.vec',
        # Cache file read by load_embed() / written by save_embed().
        'cur_embedding_path': 'input/embedding.pkl',
    },

    'model':{
        'fc_dim': 100,
        'name': 'siamese',
        'embed_size': 300,  # second argument of nn.Embedding(vocab_size, embed_size)
        # NOTE(review): the DataLoader cells hard-code batch_size = 1 and do not read this key.
        'batch_size': 1,
        # NOTE(review): the embedding cell sets requires_grad = False regardless of this flag.
        'embedding_freeze': False,
        'encoder':{
            'hidden_size': 150,
            'num_layers': 1,
            'bidirectional': False,
            'dropout': 0.5,
        },
    },

    # os.path.join(filepath, filename) is where the predictions are written.
    'result':{
        'filename':'result.txt',
        'filepath':'res/',
    }
}

In [30]:
import re
import nltk
import pickle
import numpy as np
from nltk.corpus import stopwords
# Spanish stop words as a set (fast membership tests); handed to
# removeSpanishStopWords() below. Requires the NLTK 'stopwords' corpus to be
# available locally.
stops1 = set(stopwords.words("spanish"))

def clean_sent(sent):
    """Lower-case a sentence and strip punctuation from it.

    After lower-casing, the listed ASCII punctuation characters and the
    inverted exclamation/question marks are each replaced by a single space,
    and the upper-case accented vowels are mapped to their lower-case forms.
    Whitespace is not collapsed, so runs of spaces may remain.

    NOTE(review): the accented-vowel rules presumably exist because
    ``lower()`` on a Python 2 byte string leaves non-ASCII bytes untouched.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (u'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' '),
        ('¡', ' '),
        ('¿', ' '),
        ('Á', 'á'),
        ('Ó', 'ó'),
        ('Ú', 'ú'),
        ('É', 'é'),
        ('Í', 'í'),
    )
    cleaned = sent.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
def cleanSpanish(df):
    """Normalise the ``spanish1`` and ``spanish2`` columns of ``df`` in place.

    Each value is passed through ``clean_sent``, decoded from UTF-8,
    tokenised with ``nltk.word_tokenize``, re-joined with single spaces and
    encoded back to UTF-8. The decode/encode round trip means values are
    expected to be UTF-8 byte strings (Python 2 ``str``).

    Returns None; ``df`` itself is modified.
    """
    def _normalise(raw):
        tokens = nltk.word_tokenize(clean_sent(raw).decode('utf-8'))
        return ' '.join(tokens).encode('utf-8')

    for column in ('spanish1', 'spanish2'):
        df[column] = df[column].map(_normalise)
def removeSpanishStopWords(df, stop):
    """Drop stop words from the ``spanish1`` and ``spanish2`` columns in place.

    Each value is decoded from UTF-8, tokenised with ``nltk.word_tokenize``,
    filtered so that tokens contained in ``stop`` are removed, re-joined with
    single spaces and encoded back to UTF-8. A sentence made up only of stop
    words becomes an empty string.

    Returns None; ``df`` itself is modified.
    """
    def _drop_stops(text):
        kept = [token for token in nltk.word_tokenize(text.decode('utf-8'))
                if token not in stop]
        return ' '.join(kept).encode('utf-8')

    for column in ('spanish1', 'spanish2'):
        df[column] = df[column].map(_drop_stops)

In [37]:

# Training data
# Both files are tab-separated and have no header row. error_bad_lines=False
# makes pandas skip malformed rows instead of raising.
df_train_en_sp = pd.read_csv('./input/cikm_english_train_20180516.txt', sep='	', header=None,
                             error_bad_lines=False)
df_train_sp_en = pd.read_csv('./input/cikm_spanish_train_20180516.txt', sep='	', header=None,
                             error_bad_lines=False)
# The two files list their language columns in opposite order.
df_train_en_sp.columns = ['english1', 'spanish1', 'english2', 'spanish2', 'result']
df_train_sp_en.columns = ['spanish1', 'english1', 'spanish2', 'english2', 'result']
# Keep only the Spanish side: stack spanish1 (resp. spanish2) of both files.
train1 = pd.DataFrame(pd.concat([df_train_en_sp['spanish1'], df_train_sp_en['spanish1']], axis=0))
train2 = pd.DataFrame(pd.concat([df_train_en_sp['spanish2'], df_train_sp_en['spanish2']], axis=0))
# Stacking repeats the per-file row labels, so rebuild a fresh 0..n-1 index.
train_data = pd.concat([train1, train2], axis=1).reset_index()
train_data = train_data.drop(['index'], axis=1)
# Labels are stacked in the same file order as the sentences above.
result = pd.DataFrame(pd.concat([df_train_en_sp['result'], df_train_sp_en['result']], axis=0)).reset_index()
result = result.drop(['index'], axis=1)
# pd.get_dummies(result['result']).head()
train_data['result'] = result
# NOTE(review): this train_data is never cleaned or saved in this notebook; a
# later cell overwrites the name with pd.read_csv('input/cleaned_train.csv').

# Evaluation data
# Unlabelled test pairs: two Spanish sentences per row.
test_data = pd.read_csv('./input/cikm_test_a_20180516.txt', sep='	', header=None, error_bad_lines=False)
test_data.columns = ['spanish1', 'spanish2']


In [38]:
# Raw test row 1712: its spanish1 ends up empty after cleaning and stop-word
# removal (it shows up in dirty_data further down).
test_data.iloc[1712,:]

spanish1                  Donde está eso
spanish2    Mi producto está defectuoso.
Name: 1712, dtype: object

s1: 那是哪里 ("Where is that?"); s2: 我的产品是有缺陷的 ("My product is defective."). Translations via Baidu Translate (百度翻译).

In [39]:
# Raw test row 2349: its spanish1 also ends up empty after cleaning and
# stop-word removal (it shows up in dirty_data further down).
test_data.iloc[2349,:]

spanish1                      no es eso
spanish2    ¿Qué es Denunciar artículo?
Name: 2349, dtype: object

s1: 不是这样的 ("It's not that."); s2: 什么是报案？ ("What is filing a report?"). Translations via Baidu Translate (百度翻译).

In [41]:
# Both helpers rewrite the spanish1/spanish2 columns of test_data in place:
# first punctuation/case normalisation, then removal of the stops1 words.
cleanSpanish(test_data)
removeSpanishStopWords(test_data, stops1)

In [42]:
# Sanity check: the cleaning helpers only map values, so the frame should
# still have one row per test pair and two columns.
test_data.shape

(5000, 2)

In [43]:
# Stop-word removal can leave a sentence completely empty. Turn empty strings
# into NaN so that isnull() can find them, then collect every row that has at
# least one empty field. Reassigning (instead of inplace=True) keeps the cell
# a plain "new value from old value" step.
test_data = test_data.replace('', np.nan)
dirty_data = test_data[test_data.isnull().any(axis=1)]
# Single-argument print(...) emits the same text under Python 2 and Python 3.
print('dirty sample count: %d' % dirty_data.shape[0])

dirty sample count: 2


## Handling dirty test_data

In [44]:
# Rows of test_data with at least one empty (NaN) field after cleaning.
dirty_data

Unnamed: 0,spanish1,spanish2
1712,,producto defectuoso
2349,,denunciar artículo


In [45]:
# Give every sentence that cleaning emptied (NaN) the placeholder word 'hola'
# so downstream code never receives an empty sentence. Filling by null-mask
# rather than by hard-coded row numbers (previously 1712 and 2349) produces
# the same frame for this test file, but cannot overwrite valid sentences or
# miss empty ones when the input file changes.
test_data = test_data.fillna('hola')
assert not test_data.isnull().any().any(), 'empty sentences remain in test_data'

In [48]:
# Rename to the s1/s2 column names that the later cells index by.
test_data.columns = ['s1', 's2']
# index=False: do not write the row labels as an extra CSV column.
test_data.to_csv("input/cleaned_test.csv", index=False)

### Test Data Loading

In [51]:
# NOTE(review): no cell in this notebook writes input/cleaned_train.csv; it is
# presumably produced by a separate preprocessing run -- confirm before relying
# on Restart & Run All. This also replaces the train_data built from the raw files.
train_data = pd.read_csv('input/cleaned_train.csv')
# Reload the cleaned test pairs saved by the previous section.
test_data = pd.read_csv('input/cleaned_test.csv')

In [53]:
# Every sentence from both splits, concatenated in a fixed order
# (train s1, train s2, test s1, test s2).
all_sents = train_data['s1'].tolist() + train_data['s2'].tolist() + test_data['s1'].tolist() + test_data['s2'].tolist()

# mytestDS receives the test pairs plus the full sentence list; presumably the
# list is used to build testDS.vocab -- TODO confirm in data.py.
testDS = mytestDS(test_data, all_sents)

### Embedding

In [54]:
full_embed_path = config['embedding']['full_embedding_path']
cur_embed_path = config['embedding']['cur_embedding_path']

# Reuse the cached word -> vector dict unless it is missing or
# config['make_dict'] forces a rebuild from the full pretrained file.
# Single-argument print(...) emits the same text under Python 2 and Python 3.
if os.path.exists(cur_embed_path) and not config['make_dict']:
    embed_dict = load_embed(cur_embed_path)
    print('Loaded existing embedding.')
else:
    embed_dict = get_embedding(testDS.vocab._id2word, full_embed_path)
    save_embed(embed_dict, cur_embed_path)
    print('Saved generated embedding.')


vocab_size = len(embed_dict)
# initialize nn embedding
embedding = nn.Embedding(vocab_size, config['model']['embed_size'])
# One vector per vocabulary entry, in testDS.vocab._id2word order (presumably
# ordered by word id, so that row i belongs to word id i -- TODO confirm).
# NOTE(review): a stale cache that lacks a vocabulary word raises KeyError
# here; set config['make_dict'] = True to rebuild it.
embed_list = [embed_dict[word] for word in testDS.vocab._id2word]
weight_matrix = np.array(embed_list)
# pass weights to nn embedding
# NOTE(review): requires_grad is fixed to False here; the value of
# config['model']['embedding_freeze'] is not consulted.
embedding.weight = nn.Parameter(torch.from_numpy(weight_matrix).type(torch.FloatTensor), requires_grad = False)

Loaded existing embedding.


### Model Loading

In [55]:
# embedding
# The model is built from config alone, so the prepared embedding layer and
# the vocabulary size are handed over as extra config entries.
config['embedding_matrix'] = embedding
config['vocab_size'] = len(embed_dict)

In [56]:
 # model
# Build the network from config (including the 'embedding_matrix' and
# 'vocab_size' entries added in the previous cell).
siamese = Siamese_lstm(config)

In [57]:
# optimizer
# No cell in this notebook steps the optimizer; it is created so that the
# checkpoint cell can restore its saved state alongside the model.
learning_rate = config['training']['learning_rate']
# Only parameters with requires_grad=True are handed to SGD.
optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad, siamese.parameters()) ,
                                        lr=learning_rate)

In [58]:
# Restore saved model (if one exists).
ckpt_path = os.path.join(config['ckpt_dir'], config['experiment_name']+'.pt')

if os.path.exists(ckpt_path):
    print('Loading checkpoint: %s' % ckpt_path)
    # map_location keeps every stored tensor on the CPU, so a checkpoint that
    # was saved from a GPU run can also be opened on a machine without CUDA.
    # NOTE: torch.load unpickles the file -- only load checkpoints you trust.
    ckpt = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    epoch = ckpt['epoch']
    siamese.load_state_dict(ckpt['siamese'])
    optimizer.load_state_dict(ckpt['optimizer'])
else:
    # Nothing is loaded in this branch, so any predictions made afterwards
    # come from a model whose weights were never restored. Say so loudly
    # instead of continuing silently.
    print('WARNING: checkpoint not found: %s (no weights were restored)' % ckpt_path)
    epoch = 0

Loading checkpoint: ckpt/siamese-dropout_0.5.pt


In [59]:
# Do not shuffle here
# (shuffle is left at its default, so samples are served in dataset order and
# predictions line up with the rows of the test file).
# NOTE(review): inference() builds its own identical DataLoader, so this
# module-level loader is not what inference() iterates over.
test_dataloader = DataLoader(dataset=testDS, num_workers=2, batch_size = 1)

In [63]:
def inference():
    """Run the siamese model over the whole test set.

    Iterates ``testDS`` one pair at a time, without shuffling, and collects
    the softmax probability at class index 1 for each pair.

    The model is switched to evaluation mode for the duration of the loop so
    that train-only layers such as dropout are disabled and predictions are
    deterministic; the previous mode is restored afterwards.

    Returns:
        list of float: one probability per test pair, in dataset order.
    """
    test_dataloader = DataLoader(dataset=testDS, num_workers=2, batch_size = 1)
    # Stateless module: build it once instead of once per sample.
    sm = nn.Softmax(dim=1)

    was_training = siamese.training
    siamese.eval()
    prob_res = []
    try:
        for s1, s2 in test_dataloader:
            # input
            output = siamese(s1,s2)
            output = output.squeeze(0)

            # Column 1 of the softmax output is the value that gets reported.
            res = sm(output.data)[:,1]
            prob_res += res.data.tolist()
    finally:
        # Leave the model in whatever mode the caller had it in.
        siamese.train(was_training)
    return prob_res

In [64]:
# Resolve the output location and make sure its directory exists *before* the
# long inference pass, so the run cannot fail at the very end on a missing
# folder.
res_path = os.path.join(config['result']['filepath'], config['result']['filename'])
res_dir = os.path.dirname(res_path)
if res_dir and not os.path.isdir(res_dir):
    os.makedirs(res_dir)

# NOTE(review): `result` previously held the training labels; it is reused
# here for the predictions.
result = inference()
result = pd.DataFrame(result)
# Single-argument print(...) emits the same text under Python 2 and Python 3.
print('Inference Done.')
# One probability per line: no header row, no index column.
result.to_csv(res_path,header=False,index=False)
print('Result has writtn to %s , Good Luck!' % res_path)

Inference Done.
Result has writtn to res/result.txt , Good Luck!


In [65]:
# Sanity check: expect one prediction row per test pair (the test frame has
# 5000 rows) and a single probability column.
result.shape

(5000, 1)