In [8]:
import pandas as pd
import numpy as np

In [24]:
df= pd.read_csv('data/SICK_test.txt', sep='\t')
df

Unnamed: 0,pair_ID,sentence_A,sentence_B,relatedness_score,entailment_judgment
0,6,There is no boy playing outdoors and there is ...,A group of kids is playing in a yard and an ol...,3.300,NEUTRAL
1,7,A group of boys in a yard is playing and a man...,The young boys are playing outdoors and the ma...,3.700,NEUTRAL
2,8,A group of children is playing in the house an...,The young boys are playing outdoors and the ma...,3.000,NEUTRAL
3,10,A brown dog is attacking another animal in fro...,A brown dog is attacking another animal in fro...,4.900,ENTAILMENT
4,11,A brown dog is attacking another animal in fro...,A brown dog is helping another animal in front...,3.665,NEUTRAL
...,...,...,...,...,...
4922,9991,The young girl is blowing a bubble that is huge,There is no girl in pink twirling a ribbon,2.100,NEUTRAL
4923,9992,A dog in a colored coat is running across the ...,The flute is being played by one man,1.000,NEUTRAL
4924,9994,A boy is happily playing the piano,A white bird is landing swiftly in the water,1.000,NEUTRAL
4925,9995,"The girl , who is little , is combing her hair...",Two people wearing helmets are driving over th...,1.000,NEUTRAL


In [9]:
np.unique(df['entailment_judgment'])

array(['CONTRADICTION', 'ENTAILMENT', 'NEUTRAL'], dtype=object)

In [10]:
df.loc[2]

pair_ID                                                                3
sentence_A             The young boys are playing outdoors and the ma...
sentence_B             The kids are playing outdoors near a man with ...
relatedness_score                                                    4.7
entailment_judgment                                           ENTAILMENT
Name: 2, dtype: object

In [15]:
df.loc[37]

pair_ID                                                 90
sentence_A             A man is jumping into an empty pool
sentence_B               A man is jumping into a full pool
relatedness_score                                      3.0
entailment_judgment                          CONTRADICTION
Name: 37, dtype: object

In [18]:
import re

def text_cleaner(text):
    newString = text.lower()
    newString = re.sub(r'\([^)]*\)', '', newString)
    newString = re.sub('"','', newString)
    newString = re.sub(r"'s\b","",newString)
    newString = re.sub("[^a-zA-Z]", " ", newString) 
    newString = re.sub('[m]{2,}', 'mm', newString) 
    return newString.strip()

In [25]:
df['sentence_A'] = df['sentence_A'].apply(lambda x: text_cleaner(x))
df['sentence_B'] = df['sentence_B'].apply(lambda x: text_cleaner(x))
df

Unnamed: 0,pair_ID,sentence_A,sentence_B,relatedness_score,entailment_judgment
0,6,there is no boy playing outdoors and there is ...,a group of kids is playing in a yard and an ol...,3.300,NEUTRAL
1,7,a group of boys in a yard is playing and a man...,the young boys are playing outdoors and the ma...,3.700,NEUTRAL
2,8,a group of children is playing in the house an...,the young boys are playing outdoors and the ma...,3.000,NEUTRAL
3,10,a brown dog is attacking another animal in fro...,a brown dog is attacking another animal in fro...,4.900,ENTAILMENT
4,11,a brown dog is attacking another animal in fro...,a brown dog is helping another animal in front...,3.665,NEUTRAL
...,...,...,...,...,...
4922,9991,the young girl is blowing a bubble that is huge,there is no girl in pink twirling a ribbon,2.100,NEUTRAL
4923,9992,a dog in a colored coat is running across the ...,the flute is being played by one man,1.000,NEUTRAL
4924,9994,a boy is happily playing the piano,a white bird is landing swiftly in the water,1.000,NEUTRAL
4925,9995,the girl who is little is combing her hair...,two people wearing helmets are driving over th...,1.000,NEUTRAL


In [27]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(df['entailment_judgment'])
df['target'] = le.transform(df['entailment_judgment'])
df

Unnamed: 0,pair_ID,sentence_A,sentence_B,relatedness_score,entailment_judgment,target
0,6,there is no boy playing outdoors and there is ...,a group of kids is playing in a yard and an ol...,3.300,NEUTRAL,2
1,7,a group of boys in a yard is playing and a man...,the young boys are playing outdoors and the ma...,3.700,NEUTRAL,2
2,8,a group of children is playing in the house an...,the young boys are playing outdoors and the ma...,3.000,NEUTRAL,2
3,10,a brown dog is attacking another animal in fro...,a brown dog is attacking another animal in fro...,4.900,ENTAILMENT,1
4,11,a brown dog is attacking another animal in fro...,a brown dog is helping another animal in front...,3.665,NEUTRAL,2
...,...,...,...,...,...,...
4922,9991,the young girl is blowing a bubble that is huge,there is no girl in pink twirling a ribbon,2.100,NEUTRAL,2
4923,9992,a dog in a colored coat is running across the ...,the flute is being played by one man,1.000,NEUTRAL,2
4924,9994,a boy is happily playing the piano,a white bird is landing swiftly in the water,1.000,NEUTRAL,2
4925,9995,the girl who is little is combing her hair...,two people wearing helmets are driving over th...,1.000,NEUTRAL,2


In [32]:
data_for_training = df[['sentence_A', 'sentence_B', 'target']]
pd.concat([data_for_training, df[:3000][['sentence_A', 'sentence_B', 'target']]], axis=0).to_csv('data/train.csv', index=False)
data_for_training

Unnamed: 0,sentence_A,sentence_B,target
0,there is no boy playing outdoors and there is ...,a group of kids is playing in a yard and an ol...,2
1,a group of boys in a yard is playing and a man...,the young boys are playing outdoors and the ma...,2
2,a group of children is playing in the house an...,the young boys are playing outdoors and the ma...,2
3,a brown dog is attacking another animal in fro...,a brown dog is attacking another animal in fro...,1
4,a brown dog is attacking another animal in fro...,a brown dog is helping another animal in front...,2
...,...,...,...
4922,the young girl is blowing a bubble that is huge,there is no girl in pink twirling a ribbon,2
4923,a dog in a colored coat is running across the ...,the flute is being played by one man,2
4924,a boy is happily playing the piano,a white bird is landing swiftly in the water,2
4925,the girl who is little is combing her hair...,two people wearing helmets are driving over th...,2


In [33]:
data_for_test = df[3000:][['sentence_A', 'sentence_B', 'target']]
data_for_test.to_csv('data/test.csv', index=False)
data_for_test

Unnamed: 0,sentence_A,sentence_B,target
3000,a boy and a girl are naked,a boy and a girl in swimsuits are wearing arm ...,0
3001,the boy and the girl are cheerfully playing an...,the boy and the girl are playing and wearing a...,1
3002,the boy and the girl are cheerfully playing an...,a boy and a girl in swimsuits are wearing arm ...,2
3003,the boy and the girl are not playing and weari...,a boy and a girl in swimsuits are wearing arm ...,2
3004,a boy and a girl in swimsuits are wearing floa...,the boy and the girl are playing and wearing a...,2
...,...,...,...
4922,the young girl is blowing a bubble that is huge,there is no girl in pink twirling a ribbon,2
4923,a dog in a colored coat is running across the ...,the flute is being played by one man,2
4924,a boy is happily playing the piano,a white bird is landing swiftly in the water,2
4925,the girl who is little is combing her hair...,two people wearing helmets are driving over th...,2


In [34]:
pd.read_csv('data/train.csv')

Unnamed: 0,sentence_A,sentence_B,target
0,there is no boy playing outdoors and there is ...,a group of kids is playing in a yard and an ol...,2
1,a group of boys in a yard is playing and a man...,the young boys are playing outdoors and the ma...,2
2,a group of children is playing in the house an...,the young boys are playing outdoors and the ma...,2
3,a brown dog is attacking another animal in fro...,a brown dog is attacking another animal in fro...,1
4,a brown dog is attacking another animal in fro...,a brown dog is helping another animal in front...,2
...,...,...,...
7922,two people are wading through the water,a couple or people are wading through the water,1
7923,nobody is standing in the ocean and watching t...,two people are wading through the water,2
7924,two people are standing in the ocean and watch...,a couple or people are wading through the water,2
7925,two people are standing in the ocean and watch...,two people are wading through the water,2
