In [4]:
## Pre-processing data

In [5]:
%%time
import pandas as pd
df_train = pd.read_csv(r'Export_CSV\training_clean.csv')

Wall time: 8.38 s


In [6]:
df_train.head()

Unnamed: 0,gold_label,sentence1_binary_parse,sentence2_binary_parse,sentence1_parse,sentence2_parse,sentence1,sentence2,label1,label2,label3,label4,label5
0,neutral,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,( ( A person ) ( ( is ( ( training ( his horse...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,neutral,,,,
1,contradiction,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,( ( A person ) ( ( ( ( is ( at ( a diner ) ) )...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",contradiction,,,,
2,entailment,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,"( ( A person ) ( ( ( ( is outdoors ) , ) ( on ...",(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",entailment,,,,
3,neutral,( Children ( ( ( smiling and ) waving ) ( at c...,( They ( are ( smiling ( at ( their parents ) ...,(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...,(ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VB...,Children smiling and waving at camera,They are smiling at their parents,neutral,,,,
4,entailment,( Children ( ( ( smiling and ) waving ) ( at c...,( There ( ( are children ) present ) ),(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...,(ROOT (S (NP (EX There)) (VP (VBP are) (NP (NN...,Children smiling and waving at camera,There are children present,entailment,,,,


## Build a dictionary of POS tags are used in dataset

In [7]:
%%time
dic_postag = {
    'CC' : 'Coordinating conjunction',
    'CD' : 'Cardinal number',
    'DT' : 'Determiner',
    'EX' : 'Existential there',
    'FW' : 'Foreign word',
    'IN' : 'Preposition or subordinating conjunction',
    'JJ' : 'Adjective',
    'JJR' : 'Adjective, comparative',
    'JJS' : 'Adjective, superlative',
    'LS' : 'List item marker',
    'MD' : 'Modal',
    'NN' : 'Noun, singular or mass',
    'NNS' : 'Noun, plural',
    'NNP' : 'Proper noun, singular',
    'NNPS' : 'Proper noun, plural',
    'PDT' : 'Perdeterminer',
    'POS' : 'Possessive ending',
    'PRP' : 'Personal pronoun',
    'PRP$' : 'Possessive pronoun',
    'RB' : 'Adverb',
    'RBR' : 'Adverb, comparative',
    'RBS' : 'Adverb, superlative',
    'RP' : 'Particle',
    'SYM' : 'Symbol',
    'TO' : 'to',
    'UH' : 'Interjection',
    'VB' : 'Verb, base form',
    'VBD' : 'Verb, past tense',
    'VBG' : 'Verb, gerund or present participle',
    'VBN': 'Verb, past participle',
    'VBP' : 'Verb, non-3rd persion singular present',
    'VBZ' : 'Verb, 3rd persion singular present',
    'WDT' : 'Wh-determiner',
    'WP' : 'Wh-pronoun',
    'WP$' : 'Possessive wh-pronoun',
    'WRB' : 'Wh-adverb'
}

first_ten = {k: dic_postag[k] for k in list(dic_postag)[:10]}
first_ten

Wall time: 0 ns


{'CC': 'Coordinating conjunction',
 'CD': 'Cardinal number',
 'DT': 'Determiner',
 'EX': 'Existential there',
 'FW': 'Foreign word',
 'IN': 'Preposition or subordinating conjunction',
 'JJ': 'Adjective',
 'JJR': 'Adjective, comparative',
 'JJS': 'Adjective, superlative',
 'LS': 'List item marker'}

## Count number of POS tags in each pair of sentence

In [8]:
# Create feature name (column name) for new dataframe vector feature
cols = [key+'1' for key in dic_postag.keys()] + [key+'2' for key in dic_postag.keys()]
print(cols)

['CC1', 'CD1', 'DT1', 'EX1', 'FW1', 'IN1', 'JJ1', 'JJR1', 'JJS1', 'LS1', 'MD1', 'NN1', 'NNS1', 'NNP1', 'NNPS1', 'PDT1', 'POS1', 'PRP1', 'PRP$1', 'RB1', 'RBR1', 'RBS1', 'RP1', 'SYM1', 'TO1', 'UH1', 'VB1', 'VBD1', 'VBG1', 'VBN1', 'VBP1', 'VBZ1', 'WDT1', 'WP1', 'WP$1', 'WRB1', 'CC2', 'CD2', 'DT2', 'EX2', 'FW2', 'IN2', 'JJ2', 'JJR2', 'JJS2', 'LS2', 'MD2', 'NN2', 'NNS2', 'NNP2', 'NNPS2', 'PDT2', 'POS2', 'PRP2', 'PRP$2', 'RB2', 'RBR2', 'RBS2', 'RP2', 'SYM2', 'TO2', 'UH2', 'VB2', 'VBD2', 'VBG2', 'VBN2', 'VBP2', 'VBZ2', 'WDT2', 'WP2', 'WP$2', 'WRB2']


In [9]:
%%time
df_vector_feature = pd.DataFrame()
df_vector_feature['gold_label'] = df_train['gold_label']
for feature_name in cols:
    df_vector_feature[feature_name] = [0] * df_train.shape[0] # df_train.shape[0] return the number of sample row in df_train
    
df_vector_feature.shape

Wall time: 23.5 s


(549361, 73)

In [10]:
df_vector_feature.head()

Unnamed: 0,gold_label,CC1,CD1,DT1,EX1,FW1,IN1,JJ1,JJR1,JJS1,...,VB2,VBD2,VBG2,VBN2,VBP2,VBZ2,WDT2,WP2,WP$2,WRB2
0,neutral,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,contradiction,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,entailment,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,neutral,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,entailment,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Example for building vector feature from POS Tag in sentence

In [11]:
df_train['sentence1_parse'][0]

'(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))'

In [15]:
import re
for key in dic_postag.keys():
    count = len(re.findall(key+r'\s', df_train['sentence1_parse'][0]))
    if count != 0: print(key, count)

DT 3
IN 2
JJ 2
NN 3
VBZ 1


## Start processing the whole dataset and export an csv file for fitting into classifier model

### Please run the code below in Pycharm

In [None]:
%%time
import re

print('Beginning processing dataframe.')
for index, row in df_train[['sentence1_parse', 'sentence2_parse']].iterrows():
    for key in dic_postag.keys():
        df_vector_feature.loc[index, key + '1'] = len(re.findall(key + r'\s', row['sentence1_parse']))
        df_vector_feature.loc[index, key + '2'] = len(re.findall(key + r'\s', row['sentence2_parse']))
    if index == 999:
        export_csv = df_vector_feature.to_csv(r'Export_CSV\Preprocessing_1,000.csv', index=None, header=True)
    elif index == 9999:
        export_csv = df_vector_feature.to_csv(r'Export_CSV\Preprocessing_10,000.csv', index=None, header=True)
    elif index == 99999:
        export_csv = df_vector_feature.to_csv(r'Export_CSV\Preprocessing_100,000.csv', index=None, header=True)
    elif index == 149999:
        export_csv = df_vector_feature.to_csv(r'Export_CSV\Preprocessing_150,000.csv', index=None, header=True)
    elif index == 199999:
        export_csv = df_vector_feature.to_csv(r'Export_CSV\Preprocessing_200,000.csv', index=None, header=True)
    elif index == 249999:
        export_csv = df_vector_feature.to_csv(r'Export_CSV\Preprocessing_250,000.csv', index=None, header=True)
    elif index == 299999:
        export_csv = df_vector_feature.to_csv(r'Export_CSV\Preprocessing_300,000.csv', index=None, header=True)
    elif index == 349999:
        export_csv = df_vector_feature.to_csv(r'Export_CSV\Preprocessing_350,000.csv', index=None, header=True)
    elif index == 399999:
        export_csv = df_vector_feature.to_csv(r'Export_CSV\Preprocessing_400,000.csv', index=None, header=True)
    elif index == 449999:
        export_csv = df_vector_feature.to_csv(r'Export_CSV\Preprocessing_450,000.csv', index=None, header=True)

    if index % 1000 == 0:
        print('Epoc', index)

print('Output CSV file complete.')
export_csv = df_vector_feature.to_csv(r'Export_CSV\Preprocessing_Complete.csv', index=None, header=True)