In [1]:
import csv
import re

import numpy as np
import pandas as pd
import torch

In [5]:
# Load the raw SNLI dev split (tab-separated) and display its first record
# to see which columns the file provides.
dev_path = './data/SNLI/snli_1.0_dev.txt'
dev = pd.read_csv(dev_path, delimiter='\t')
dev.iloc[0]

gold_label                                                          neutral
sentence1_binary_parse    ( ( Two women ) ( ( are ( embracing ( while ( ...
sentence2_binary_parse    ( ( The sisters ) ( ( are ( ( hugging goodbye ...
sentence1_parse           (ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar...
sentence2_parse           (ROOT (S (NP (DT The) (NNS sisters)) (VP (VBP ...
sentence1                 Two women are embracing while holding to go pa...
sentence2                 The sisters are hugging goodbye while holding ...
captionID                                                  4705552913.jpg#2
pairID                                                  4705552913.jpg#2r1n
label1                                                              neutral
label2                                                           entailment
label3                                                              neutral
label4                                                              neutral
label5      

In [8]:
def convert_label(x):
    '''
    Convert a textual relation label into its integer class id.

    'entailment' -> 0, 'contradiction' -> 1, and every other value
    (including 'neutral') -> 2.
    '''
    # The position of each known label in this tuple is its class id.
    for class_id, known_label in enumerate(('entailment', 'contradiction')):
        if x == known_label:
            return class_id
    # Anything that matched neither label above falls into the last class.
    return 2
    
def convert_string(text):
    '''
    Remove punctuation from a sentence and normalise it.

    Every occurrence of the characters . ! , ; : ? " ' is deleted (not
    replaced by a space), then surrounding whitespace is stripped and the
    result is lower-cased.
    '''
    punctuation = '.!,;:?"\''
    # The formatted pattern is the character class [.!,;:?"']+
    strip_pattern = re.compile(r'[{}]+'.format(punctuation))
    without_punctuation = strip_pattern.sub('', text)
    return without_punctuation.strip().lower()

def deal_SNLI(file):
    '''
    Load a raw SNLI split and reduce it to cleaned sentence pairs with
    integer labels.

    Parameters
    ----------
    file : str or path-like
        Path to a tab-separated SNLI file that contains the columns
        'sentence1', 'sentence2' and 'gold_label'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns 'sentence1', 'sentence2' (cleaned with
        ``convert_string``) and 'gold_label' (mapped with ``convert_label``).
        Rows with a missing value in any of the three columns, and rows
        whose gold label is the '-' placeholder, are removed. The original
        row index is kept.
    '''
    # SNLI sentences contain bare double-quote characters. With pandas'
    # default quote handling an unbalanced quote swallows the following tabs
    # and newlines, merging or corrupting rows, so quoting is disabled.
    data = pd.read_csv(file, sep='\t', quoting=csv.QUOTE_NONE)

    wanted_columns = ['sentence1', 'sentence2', 'gold_label']
    # A row is dropped if any of the three columns is empty.
    new_data = data.loc[:, wanted_columns].dropna(axis=0, how='any')

    # Per the SNLI README, '-' marks pairs whose annotators reached no
    # majority label. Without this filter convert_label would silently map
    # them to class 2 and inject label noise.
    new_data = new_data[new_data['gold_label'] != '-'].copy()

    new_data['gold_label'] = new_data['gold_label'].apply(convert_label)
    for sentence_column in ('sentence1', 'sentence2'):
        new_data[sentence_column] = new_data[sentence_column].apply(convert_string)

    return new_data

In [9]:
def _convert_split(raw_path, out_path):
    '''Clean one raw SNLI split, write it as a header-less TSV, and return the frame.'''
    cleaned = deal_SNLI(raw_path)
    cleaned.to_csv(out_path, sep='\t', header=None, index=False)
    return cleaned


# Each split is processed and written before the next one is read.
dev_data = _convert_split('./data/SNLI/snli_1.0_dev.txt', './data/SNLI/snli-dev.txt')
test_data = _convert_split('./data/SNLI/snli_1.0_test.txt', './data/SNLI/snli-test.txt')
train_data = _convert_split('./data/SNLI/snli_1.0_train.txt', './data/SNLI/snli-train.txt')