## Data Augmentation

# Calculate Unlabeled Data Similarities:

Unlabeled data: Spanish - English


In [1]:
from collections import Counter
from utils import save_embed, load_embed
import pandas as pd
import numpy as np
import nltk
import re

### Read Unlabeled Data

In [2]:
un = pd.read_csv('input/cikm_unlabel_spanish_train_20180516.txt',sep='	', header=None, error_bad_lines=False)
un.columns = ['spanish','english']
un.head()

Unnamed: 0,spanish,english
0,Por qué no se ha podido procesar mi pago por r...,Why my payment could not be processed due to s...
1,tengo una duda Tengo una pregunta sobre mis cu...,I have a question I have a question about my c...
2,Por favor necesito saber si mi compra todavía ...,Please I need to know if my purchase is still ...
3,Cómo cancelo un pedido Si cancelo el pedido re...,How do I cancel an order? If I cancel the orde...
4,Cómo puedo recibir un reembolso mediante tarje...,How can I receive a refund by card How can I r...


In [3]:
un.shape

(55669, 2)

### Read Labeled Data

In [4]:
# Training data
train_en_sp = pd.read_csv('./input/cikm_english_train_20180516.txt', sep='	', header=None, error_bad_lines=False)
train_sp_en = pd.read_csv('./input/cikm_spanish_train_20180516.txt', sep='	', header=None, error_bad_lines=False)

In [5]:
train_en_sp.columns = ['english1', 'spanish1', 'english2', 'spanish2', 'result']
train_sp_en.columns = ['spanish3', 'english3', 'spanish4', 'english4', 'result']

### Read Test Data

In [6]:
test_sp_sp = pd.read_csv('./input/cikm_test_a_20180516.txt', sep='	', header=None, error_bad_lines=False)
test_sp_sp.columns = ['spanish5', 'spanish6']

#### Get all English sentences

In [7]:
all_en = pd.DataFrame(pd.concat([train_en_sp['english1'], train_en_sp['english2'], 
                                   train_sp_en['english3'], train_sp_en['english4']], axis=0))
all_en.columns = ['english']
all_en = all_en.reset_index()
all_en = all_en.drop(columns='index')
print(all_en.shape)

(42800, 1)


In [8]:
all_en.head()

Unnamed: 0,english
0,"hello, i click in product received"
1,"Hello! I have closed the dispute on may 21, 20..."
2,l ordered from spain to spain now they send th...
3,Do I need to pay custom duty
4,I didn't receive my order?


#### Get all Spanish sentences

In [9]:
all_sp = pd.DataFrame(pd.concat([train_en_sp['spanish1'], train_en_sp['spanish2'], train_sp_en['spanish3'], 
                                   train_sp_en['spanish4'],test_sp_sp['spanish5'], test_sp_sp['spanish6']], axis=0))
all_sp.columns = ['spanish']
all_sp = all_sp.reset_index()
all_sp = all_sp.drop(columns='index')
print(all_sp.shape)

(52800, 1)


## Data preprocess

Clean English & Spanish: Stopwords, Punctuations

#### Lower and clean punctuations

In [10]:
def clean_sent(sent):
    sent = sent.lower()
    sent = re.sub(u"[_'\-;%()|+&=*%.,!?:#$@\[\]/]",' ',sent)
    sent = re.sub('¡',' ',sent)
    sent = re.sub('¿',' ',sent)
    sent = re.sub('Á','á',sent)
    sent = re.sub('Ó','ó',sent)
    sent = re.sub('Ú','ú',sent)
    sent = re.sub('É','é',sent)
    sent = re.sub('Í','í',sent)
    return sent

In [11]:
all_sp['spanish'] = all_sp.spanish.map(lambda x: ' '.join([word for word in
                                                     nltk.word_tokenize(clean_sent(x))]))
all_en['english'] = all_en.english.map(lambda x: ' '.join([word for word in
                                                     nltk.word_tokenize(clean_sent(x))]))

KeyboardInterrupt: 

In [None]:
all_sp.head()

In [None]:
all_en.head()

#### Remove stopwords

In [None]:
from nltk.corpus import stopwords
sp_stop = set(stopwords.words("spanish"))
en_stop = set(stopwords.words("english"))

In [None]:
all_sp['spanish'] = all_sp.spanish.map(lambda x: ' '.join([word for word in nltk.word_tokenize(x) if word not in sp_stop]))
all_en['english'] = all_en.english.map(lambda x: ' '.join([word for word in nltk.word_tokenize(x) if word not in en_stop]))

In [None]:
all_sp.head()

In [None]:
all_en.head()

In [None]:
all_en.replace('', np.nan, inplace=True)
dirty_data = all_en[all_en.isnull().any(axis=1)]
print('Before clean:', len(all_en))
print('English dirty sample count:', dirty_data.shape[0])
all_en = all_en.dropna()
print('After clean:', len(all_en))

In [None]:
all_sp.replace('', np.nan, inplace=True)
dirty_data = all_en[all_sp.isnull().any(axis=1)]
print('Before clean:', len(all_sp))
print('English dirty sample count:', dirty_data.shape[0])
all_sp = all_sp.dropna()
print('After clean:', len(all_sp))


In [None]:
all_en.to_csv("input/all_en.csv", index=False)
all_sp.to_csv("input/all_sp.csv", index=False)

### Make dictionary

In [12]:
all_en = pd.read_csv("input/all_en.csv")
all_sp = pd.read_csv("input/all_sp.csv")

In [13]:
def make_dict(sents):
    counter = Counter()
    for sent in sents:
        counter.update(sent.split())
    dic = counter.items()
    return dict(dic)

In [14]:
sp_dict = make_dict(all_sp['spanish'])
en_dict = make_dict(all_en['english'])

In [15]:
print('Unlabeled English Vocab Count:',len(en_dict), '\nUnlabeled Spanish Vocab Count:', len(sp_dict))

Unlabeled English Vocab Count: 2683 
Unlabeled Spanish Vocab Count: 5763


In [16]:
class Vocab(object):
    def __init__(self, all_sents, max_size=None, sos_token=None, eos_token=None, unk_token=None):
        """Initialize the vocabulary.
        Args:
            iter: An iterable which produces sequences of tokens used to update
                the vocabulary.
            max_size: (Optional) Maximum number of tokens in the vocabulary.
            sos_token: (Optional) Token denoting the start of a sequence.
            eos_token: (Optional) Token denoting the end of a sequence.
            unk_token: (Optional) Token denoting an unknown element in a
                sequence.
        """
        self.max_size = max_size
        self.pad_token = '<pad>'
        self.sos_token = sos_token
        self.eos_token = eos_token
        self.unk_token = unk_token

        # Add special tokens.
        id2word = [self.pad_token]
        if sos_token is not None:
            id2word.append(self.sos_token)
        if eos_token is not None:
            id2word.append(self.eos_token)
        if unk_token is not None:
            id2word.append(self.unk_token)

        # Update counter with token counts.
        counter = Counter()
        for x in all_sents:
            counter.update(x.split())

        # Extract lookup tables.
        if max_size is not None:
            counts = counter.most_common(max_size)
        else:
            counts = counter.items()
            counts = sorted(counts, key=lambda x: x[1], reverse=True)
        words = [x[0] for x in counts]
        id2word.extend(words)
        word2id = {x: i for i, x in enumerate(id2word)}

        self._id2word = id2word
        self._word2id = word2id

    def __len__(self):
        return len(self._id2word)

    def word2id(self, word):
        """Map a word in the vocabulary to its unique integer id.
        Args:
            word: Word to lookup.
        Returns:
            id: The integer id of the word being looked up.
        """
        if word in self._word2id:
            return self._word2id[word]
        elif self.unk_token is not None:
            return self._word2id[self.unk_token]
        else:
            raise KeyError('Word "%s" not in vocabulary.' % word)

    def id2word(self, id):
        """Map an integer id to its corresponding word in the vocabulary.
        Args:
            id: Integer id of the word being looked up.
        Returns:
            word: The corresponding word.
        """
        return self._id2word[id]

In [17]:
en_vocab = Vocab(all_en['english'].tolist())
sp_vocab = Vocab(all_sp['spanish'].tolist())

### Get Pre-trained Embeddings

In [18]:
en_embed_path = 'input/wiki.en.vec'
sp_embed_path = 'input/wiki.es.vec'

In [19]:
from utils import get_embedding

In [20]:
en_embed = get_embedding(en_vocab._id2word, en_embed_path)

Found 2608/2684 words with embedding vectors
Missing Ratio: 2.83%
Filled missing words' embeddings.
Embedding Matrix Size:  2684


In [21]:
sp_embed = get_embedding(sp_vocab._id2word, sp_embed_path)

Found 5142/5764 words with embedding vectors
Missing Ratio: 10.79%
Filled missing words' embeddings.
Embedding Matrix Size:  5764


In [22]:
save_embed(en_embed,'input/en_embed.pkl')
save_embed(sp_embed,'input/sp_embed.pkl')
print('Embedding saved!')

Embedding saved
Embedding saved
Embedding saved!


## Second Start

In [23]:
all_en = pd.read_csv("input/all_en.csv")
all_sp = pd.read_csv("input/all_sp.csv")

In [24]:
en_embed = load_embed('input/en_embed.pkl')
sp_embed = load_embed('input/sp_embed.pkl')

## Utilize the unlabeled data

1. Takes in one column of Spanish or English sentences list, corresponding embeddings.  

2. Calculate every sentence's embedding.
    How to represent sentences: word embeddings average?

3. Calculate every sentence pairs cosine distance. 
    E.g. s1-s2, s1-s3 ... s1-s1000, get the first 5% and last 5% score to be positive or negative sentence pairs.
    
4. Concat all results to be added into train data.

### Clean up unlabeled data

In [25]:
un = pd.read_csv('input/cikm_unlabel_spanish_train_20180516.txt',sep='	', header=None, error_bad_lines=False)
un.columns = ['spanish','english']
print('Unlabeled Sentences Count:',len(un['spanish']))

Unlabeled Sentences Count: 55669


In [26]:
from nltk.corpus import stopwords
sp_stop = set(stopwords.words("spanish"))
en_stop = set(stopwords.words("english"))

In [27]:
def clean_sent(sent):
    sent = sent.lower()
    sent = re.sub(u"[_'\-;%()|+&=*%.,!?:#$@\[\]/]",' ',sent)
    sent = re.sub('¡',' ',sent)
    sent = re.sub('¿',' ',sent)
    sent = re.sub('Á','á',sent)
    sent = re.sub('Ó','ó',sent)
    sent = re.sub('Ú','ú',sent)
    sent = re.sub('É','é',sent)
    sent = re.sub('Í','í',sent)
    return sent

In [28]:
un['spanish'] = un.spanish.map(lambda x: ' '.join([word for word in
                                                     nltk.word_tokenize(clean_sent(x))]))
un['english'] = un.english.map(lambda x: ' '.join([word for word in
                                                     nltk.word_tokenize(clean_sent(x))]))

KeyboardInterrupt: 

In [None]:
un['spanish'] = un.spanish.map(lambda x: ' '.join([word for word in nltk.word_tokenize(x) if word not in sp_stop]))
un['english'] = un.english.map(lambda x: ' '.join([word for word in nltk.word_tokenize(x) if word not in en_stop]))

In [None]:
un.head()

In [None]:
un.replace('', np.nan, inplace=True)
dirty_data = un[un.isnull().any(axis=1)]
print('Before clean:', len(un))
print('dirty sample count:', dirty_data.shape[0])
un = un.dropna()
print('After clean:', len(un))

### Drop duplicated English and Spanish Sentences respectively

In [None]:
un_sp = pd.DataFrame(un['spanish'])
un_en = pd.DataFrame(un['english'])

In [None]:
print('Spanish sentences count:', len(un_sp))
print('current duplicated sentences:',un_sp.duplicated().sum())
un_sp = un_sp.drop_duplicates()
print('After drop duplicated:', len(un_sp))

In [None]:
print('Spanish sentences count:', len(un_en))
print('current duplicated sentences:',un_en.duplicated().sum())
un_en = un_en.drop_duplicates()
print('After drop duplicated:', len(un_en))

In [105]:
un_sp.to_csv("input/cleaned_unlabeled_Spanish.csv", index=False)
un_en.to_csv("input/cleaned_unlabeled_English.csv", index=False)

In [80]:
un.to_csv("input/cleaned_unlabeled.csv", index=False)

## Second Start

In [29]:
un_sp = pd.read_csv("input/cleaned_unlabeled_Spanish.csv")
un_en = pd.read_csv("input/cleaned_unlabeled_English.csv")

In [30]:
en_embed = load_embed('input/en_embed.pkl')
sp_embed = load_embed('input/sp_embed.pkl')

### Drop sentences that less than 5 words to get better cosine results

In [31]:
sents = un_en['english'].tolist()
for i in range(len(sents)):
    if len(sents[i].split()) <= 5:
        un_en['english'].iloc[i] = np.nan
    
print('sent length < 5:',un_en['english'].isnull().sum())
un_en = un_en.dropna()
un_en = un_en.reset_index()
un_en = un_en.drop(['index'], axis=1)

sent length < 5: 12694


In [32]:
sents = un_sp['spanish'].tolist()
for i in range(len(sents)):
    if len(sents[i].split()) <= 5:
        un_sp['spanish'].iloc[i] = np.nan
    
print('sent length < 5:',un_sp['spanish'].isnull().sum())
un_sp = un_sp.dropna()
un_sp = un_sp.reset_index()
un_sp = un_sp.drop(['index'], axis=1)

sent length < 5: 8380


In [33]:
print('Spanish sentences count:',un_sp.shape[0],'\nEnglish sentences count:', un_en.shape[0])

Spanish sentences count: 44820 
English sentences count: 38299


In [34]:
def cosine_similarity(s1,s2):
    return np.linalg.norm(s1-s2)

In [35]:
def WMD_core(s1,s2,embedding):
    s1_vect = []
    # if can't find corresponding word embeddings, add single word embeddings
    # 分词后的词语若无对应词向量，则添加该单字向量
    
    for word in s1.split():
        if word in embedding:
            s1_vect.append(embedding[word])
        else: continue

    s2_vect = []
    for word in s2.split():
        if word in embedding:
            s2_vect.append(embedding[word])
        else: continue

#     print(len(s1_vect), len(s2_vect))
    total_min = []
    sum_min = 0.0
    
    # find every words' nearest
    for w1 in s1_vect: 
        cur_min = 1000.0
        min_dis = []
        # for every word in s1, find the nearest in s2
        for w2 in s2_vect:
            temp = cosine_similarity(w1,w2)
            if temp < cur_min:
                cur_min = temp
#         min_dis.append(cur_min)
        sum_min += cur_min
#         total_min.extend(min_dis)
#     print(total_min)
#     return round(sum_min/len(s1),6)
    return sum_min/len(s1)

In [36]:
def qselectmin(A,k):
    if len(A)<k:return A
    pivot = A[-1]
    right = [pivot] + [x for x in A[:-1] if x<pivot]
    rlen = len(right)
    if rlen==k:
        return right
    if rlen>k:
        return qselectmin(right, k)
    else:
        left = [x for x in A[:-1] if x>=pivot]
        return qselectmin(left, k-rlen) + right

def qselectmax(A,k):
    if len(A)<k:return A
    pivot = A[-1]
    right = [pivot] + [x for x in A[:-1] if x>pivot]
    rlen = len(right)
    if rlen==k:
        return right
    if rlen>k:
        return qselectmax(right, k)
    else:
        left = [x for x in A[:-1] if x<=pivot]
        return qselectmax(left, k-rlen) + right


In [37]:
"""
Input: sentence; embedding
Output: This sentence's embeddings

Using words' embedding averages as sentence embeddings
option: WMD?

"""
def sent_embed(sent,embedding):
    sent_len = len(sent)
    sent_vec = np.zeros(embed_size)
    for w in sent.split():
        sent_len = len(sent)
        if w in embedding:
            sent_vec += embedding[w]
        else: continue
    sent_vec = sent_vec/sent_len 
    return sent_vec

In [245]:
cosine_similarity(sent_embed(sents[0], embedding) , sent_embed(sents[1], embedding))

0.31838501826110932

In [39]:
sents1 = un_en['english'][100:200]
sents1 = sents1.reset_index()
sents1 = sents1.drop(['index'], axis=1)

In [48]:
en1 = un_en['english'][0:100]
en2 = un_en['english'][100:200]
en2 = en2.reset_index()
en2 = en2.drop(['index'],axis=1)['english']

sp1 = un_sp['spanish'][0:50]
sp2 = un_sp['spanish'][500:1000]
sp2 = sp2.reset_index()
sp2 = sp2.drop(['index'],axis=1)['spanish']


In [50]:
embed_size = 300

In [58]:
aug_sp = aug_data(sp1, sp2, sp_embed)

i: 0.0 / 0.5


KeyError: 51

In [54]:
def aug_data(sents1, sents2, embedding, embed_size=300):
    sum_pairs = pd.DataFrame()
    score = {}
    for i in range(len(sents1)):
        if(i%100==0): print('i:', i/100,'/',len(sents1)/100)
        origin_sent = sents1[i]
        origin_sent_embed = sent_embed(sents1[i], embedding)
        score[i] = []

        for j in range(len(sents2)):
    #         if(j%10000==0): print(j,'/',len(sents))
            score[i].append(cosine_similarity(origin_sent_embed, sent_embed(sents2[j], embedding)))

    #         WMD Score
    #         score[i].append(WMD_core(sents[i], sents[j], embedding))        

            # max distances to be negative scores
            neg_score = qselectmax(score[i], 10)
            # min distances to be positive scores
            pos_score = qselectmin(score[i], 10)
            pairs = []

            # neg pairs
            for s in neg_score:
                pair = [origin_sent, sents1[score[i].index(s)],0]
                pairs.append(pair)

            # pos pairs
            for s in pos_score:
                if s == 0: continue
                pair = [origin_sent, sents1[score[i].index(s)],1]
                pairs.append(pair)

            pairs = pd.DataFrame(pairs)
            pairs = pairs.drop_duplicates()
        sum_pairs = sum_pairs.append(pairs)
    print('Done')
    sum_pairs.columns = ['s1','s2','label']
    sum_pairs = sum_pairs.sort_values('label')
    print('Augmented Count:',len(sum_pairs))
    print('Positive Count:',sum_pairs[sum_pairs['label']==1].shape[0])
    return sum_pairs

In [408]:
from sklearn.utils import shuffle
sum_pairs = shuffle(sum_pairs)
sum_pairs.head()

Unnamed: 0,s1,s2,label
5,product received correspond description paid c...,product received correspond description satisf...,1
1,hello good night concern request already count...,use debit card mastercard debit card,0
3,receive refund pay credit card credit card pay...,fuck male want talk human need human,0
4,able open dispute request refund product arriv...,pay mercado pago pay mercado pago,0
7,receive refund credit card receive refund cred...,receive refund pay credit card paid credit car...,1


In [71]:
sents = un_sp['spanish'][0:1000]
embedding = en_embed
embed_size = 300
sum_pairs = pd.DataFrame()
score = {}
for i in range(len(sents)):
    if(i%100==0): print('i:', i/100,'/',len(sents)/100)
    origin_sent = sents[i]
    origin_sent_embed = sent_embed(sents[i], embedding)
    score[i] = []

    for j in range(len(sents)):
#         if(j%10000==0): print(j,'/',len(sents))
        score[i].append(cosine_similarity(origin_sent_embed, sent_embed(sents[j], embedding)))

#         WMD Score
#         score[i].append(WMD_core(sents[i], sents[j], embedding))        

        # max distances to be negative scores
        neg_score = qselectmax(score[i], 10)
        # min distances to be positive scores
        pos_score = qselectmin(score[i], 30)
        pairs = []

        # neg pairs
        for s in neg_score:
            pair = [origin_sent, sents[score[i].index(s)],0]
            pairs.append(pair)

        # pos pairs
        for s in pos_score:
            if s == 0: continue
            pair = [origin_sent, sents[score[i].index(s)],1]
            pairs.append(pair)

        pairs = pd.DataFrame(pairs)
        pairs = pairs.drop_duplicates()
    sum_pairs = sum_pairs.append(pairs)
print('Done')
sum_pairs.columns = ['s1','s2','label']
sum_pairs = sum_pairs.sort_values('label')
print('Augmented Count:',len(sum_pairs))
print('Positive Count:',sum_pairs[sum_pairs['label']==1].shape[0])

i: 0.0 / 10.0
i: 1.0 / 10.0
i: 2.0 / 10.0
i: 3.0 / 10.0
i: 4.0 / 10.0
i: 5.0 / 10.0
i: 6.0 / 10.0
i: 7.0 / 10.0
i: 8.0 / 10.0
i: 9.0 / 10.0
Done
Augmented Count: 18996
Positive Count: 9071


In [None]:
sents = un_sp['spanish'][0:1200]
embedding = en_embed
embed_size = 300
sum_pairs_wmd = pd.DataFrame()
score = {}
for i in range(len(sents)):
    if(i%100==0): print('i:', i/100,'/',len(sents)/100)
    origin_sent = sents[i]
    origin_sent_embed = sent_embed(sents[i], embedding)
    score[i] = []

    for j in range(len(sents)):
#         if(j%10000==0): print(j,'/',len(sents))
#         score[i].append(cosine_similarity(origin_sent_embed, sent_embed(sents[j], embedding)))

#         WMD Score
        score[i].append(WMD_core(sents[i], sents[j], embedding))        

        # max distances to be negative scores
        neg_score = qselectmax(score[i], 10)
        # min distances to be positive scores
        pos_score = qselectmin(score[i], 30)
        pairs = []

        # neg pairs
        for s in neg_score:
            pair = [origin_sent, sents[score[i].index(s)],0]
            pairs.append(pair)

        # pos pairs
        for s in pos_score:
            if s == 0: continue
            pair = [origin_sent, sents[score[i].index(s)],1]
            pairs.append(pair)

        pairs = pd.DataFrame(pairs)
        pairs = pairs.drop_duplicates()
        sum_pairs_wmd = sum_pairs_wmd.append(pairs)
print('Done')
sum_pairs_wmd.columns = ['s1','s2','label']
sum_pairs_wmd = sum_pairs_wmd.sort_values('label')
print('Augmented Count:',len(sum_pairs_wmd))
print('Positive Count:',sum_pairs_wmd[sum_pairs_wmd['label']==1].shape[0])

i: 0.0 / 12.0
i: 1.0 / 12.0
i: 2.0 / 12.0
i: 3.0 / 12.0
i: 4.0 / 12.0
i: 5.0 / 12.0
i: 6.0 / 12.0
i: 7.0 / 12.0
i: 8.0 / 12.0
i: 9.0 / 12.0


In [69]:
sum_pairs.to_csv('input/aug_sp_1000_less.csv')

In [415]:
aug_sp = pd.read_csv('aug_sp.csv')
aug_en = pd.read_csv('aug_en.csv')

In [380]:
# max distances to be negative scores
neg_score = qselectmax(score[i], 10)
# min distances to be positive scores
pos_score = qselectmin(score[i], 10)
pairs = []

# neg pairs
for s in neg_score:
    pair = [origin_sent, sents[score[i].index(s)],0,s]
    pairs.append(pair)

# pos pairs
for s in pos_score:
    if s == 0: continue
    pair = [origin_sent, sents[score[i].index(s)],1,s]
    pairs.append(pair)

In [376]:
pairs.iloc[6,:].tolist()

['payment could processed due security reasons order closed security reasons payment still charged',
 'payment problem presented order closed security reasons payment still charged',
 1,
 0.078975945574712936]

In [233]:
pos_score = qselectmax(score, 10)
neg_score = qselectmin(score, 10)
pairs = []

# neg pairs
for s in pos_score:
    pair = [origin_sent, sents[score.index(s)],0]
    pairs.append(pair)

# pos pairs
for s in neg_score:
    if s == 0: continue
    pair = [origin_sent, sents[score.index(s)],1]
    pairs.append(pair)

pairs = pd.DataFrame(pairs)
pairs = pairs.drop_duplicates()
pairs.columns = ['s1','s2','label']
print('Total:', pairs.shape[0], '\nPos:', pairs[pairs['label'] == 1].shape[0],
      '\nNeg:', pairs[pairs['label'] == 0].shape[0])

Total: 10 
Pos: 0 
Neg: 10


In [204]:
pairs.iloc[3,:].tolist()

['payment could processed due security reasons order closed security reasons payment still charged',
 'hello status order closed would like know done correctly order closed security reasons payment still charged',
 1,
 0.095889463288334739]

In [357]:
pd.DataFrame([[5,6,1],[1,2,0]],columns= ['spanish1','spanish2','label'])

Unnamed: 0,spanish1,spanish2,label
0,5,6,1
1,1,2,0


In [313]:
print('Max score index:', score[0].index(max(score[0])))
print('Original Sent:', un_sp[0])
print('Similar Sent:',un_sp[score[0].index(max(score[0]))])

Max score index: 277
Original Sent: podido procesar pago razones seguridad si pedido cerrado razones seguridad aún cobra pago
Similar Sent: hola hola


In [None]:
"""
Input: sentences list, embeddings.
Output: labeled data in pd forms.
"""
def aug_data(sents,embedding):
    score = {}
    sum_pairs = []
    for i in range(len(sents)):
        if(i%1000==0): print('i:', i/1000)
        origin_sent = sents[i]
        score[i] = []

        for j in range(len(sents)):

            score[i].append(WMD_core(sents[i], sents[j], embedding))
            # max distances to be negative scores
            neg_score = qselectmax(score[i], 5)
            # min distances to be positive scores
            pos_score = qselectmin(score[i], 5)
            pairs = []

            # neg pairs
            for s in neg_score:
                pair = [origin_sent, sents[score[i].index(s)],0]
                pairs.append(pair)

            # pos pairs
            for s in pos_score:
                if s == 0: continue
                pair = [origin_sent, sents[score[i].index(s)],1]
                pairs.append(pair)

            pairs = pd.DataFrame(pairs)
            pairs = pairs.drop_duplicates()

        sum_pairs = sum_pairs.append(pairs)
    return sum_pairs.drop_duplicates()
            
    
        
        

    

In [245]:
from torch import nn
embed_size = 300
sp_vocab_size = len(sp_embed)
en_vocab_size = len(en_embed)
# initialize nn embedding
sp_embedding = nn.Embedding(sp_vocab_size, embed_size)
en_embedding = nn.Embedding(en_vocab_size, embed_size)

ImportError: dlopen(/Users/liushijing/anaconda3/lib/python3.6/site-packages/torch/_C.cpython-36m-darwin.so, 9): no suitable image found.  Did find:
	/Users/liushijing/anaconda3/lib/python3.6/site-packages/torch/_C.cpython-36m-darwin.so: truncated mach-o error: segment __TEXT extends to 12820480 which is past end of file 7979008
	/Users/liushijing/anaconda3/lib/python3.6/site-packages/torch/_C.cpython-36m-darwin.so: truncated mach-o error: segment __TEXT extends to 12820480 which is past end of file 7979008