In [1]:
# Utilities
import itertools
import importlib
import sys
import os

In [2]:
# Make the sibling ``src`` directory importable so the project modules below
# can be found. ``os.path.abspath`` resolves a relative path against the
# current working directory, so the notebook must be started from its own folder.
src_path = os.path.abspath('../src')
if src_path not in sys.path:
    # Guarded so that re-running this cell does not add duplicate entries.
    sys.path.append(src_path)

# Project-local modules that live in ``src``.
import load_and_prepare_data as ld
import indexer

In [13]:
# Re-execute the project modules in place so that edits made on disk take
# effect without restarting the kernel. The reload order (indexer, then ld)
# is the same as before.
importlib.reload(indexer)
importlib.reload(ld)

# Re-bind the class from the freshly reloaded module. Objects created before
# the reload keep referring to the previous class definition.
Indexer = indexer.Indexer

In [4]:
## Preparing data

# Load the train and dev CSV files into (first texts, second texts, labels).
TRAIN_FIRST_TEXTS, TRAIN_SECOND_TEXTS, TRAIN_LABELS = ld.prepareData("../data/train.csv", True)
DEV_FIRST_TEXTS, DEV_SECOND_TEXTS, DEV_LABELS = ld.prepareData("../data/dev.csv", True)

# Split the loaded training data into a train part and a test part.
# The TRAIN_* names are rebound to the train part returned by split_data.
(
    TRAIN_FIRST_TEXTS, TRAIN_SECOND_TEXTS, TRAIN_LABELS,
    TEST_FIRST_TEXTS, TEST_SECOND_TEXTS, TEST_LABELS,
) = ld.split_data(TRAIN_FIRST_TEXTS, TRAIN_SECOND_TEXTS, TRAIN_LABELS)

# Report the size of each split.
print(
    "Training data size: ", len(TRAIN_FIRST_TEXTS),
    "\nDev data size: ", len(DEV_FIRST_TEXTS),
    "\nTest data size: ", len(TEST_FIRST_TEXTS),
)

Prepared 30000 data points.
Prepared 6000 data points.
Training data size:  27000 
Dev data size:  6000 
Test data size:  3000


In [5]:
# Wrap the test split in a Dataset and run feature extraction on it.
test = ld.Dataset(TEST_FIRST_TEXTS, TEST_SECOND_TEXTS, TEST_LABELS)
# Lengths before extraction: the Dataset attribute vs. the notebook-level list.
print(len(test.FIRST_TEXTS), len(TEST_FIRST_TEXTS))
test.ExtractFeatures()
# Lengths after extraction. In the recorded run both values drop from 3000 to
# 2995, so ExtractFeatures() appears to remove entries from the very list that
# was passed in (the Dataset seems to hold a reference rather than a copy).
# NOTE(review): this changes TEST_FIRST_TEXTS for every later cell and makes
# re-running this cell on its own non-idempotent; confirm that is intended.
print(len(test.FIRST_TEXTS), len(TEST_FIRST_TEXTS))

3000 3000
(1/3) Extracting POS . . .
(2/3) Extracting Punctuation . . .
(3/3) Extracting Information . . .
Cleaned up all data!
2995 2995


In [6]:
# Build an Indexer over the labels of the test dataset.
label_indexer = Indexer(values=test.LABELS)
# NOTE(review): Jupyter only displays the *last* expression of a cell, so this
# bare attribute access shows nothing (unless `i2v` has side effects). The two
# dicts in the recorded output are presumably printed by Indexer itself; verify.
label_indexer.i2v
# Save the indexer using two relative file names, which presumably hold the
# index->value and value->index mappings. TODO confirm against indexer.py.
label_indexer.save("i2v.json", "v2i.json")

{0: 0, 1: 1}
{0: 0, 1: 1}


In [14]:
# Flatten the POS sequences of the first two entries from each side into one
# flat list of tags.
# NOTE(review): only `[:2]` of FIRST_POS / SECOND_POS is used here; confirm this
# is intentional and not a leftover debugging slice.
POS_VALS = list(
    itertools.chain.from_iterable(test.FIRST_POS[:2] + test.SECOND_POS[:2])
)

# Index the collected tags; `pre` passes the module-level POS_PRE value.
pos_indexer = Indexer(values=POS_VALS, pre=indexer.POS_PRE)

In [15]:
# Display the index -> POS-tag mapping. The '<PAD>' and '<UNK>' entries at 0
# and 1 presumably come from `indexer.POS_PRE`; confirm in indexer.py.
# NOTE(review): the remaining tags are not in sorted order; if Indexer builds
# them from a set, index assignment may differ between kernel sessions. Verify.
pos_indexer.i2v

{0: '<PAD>',
 1: '<UNK>',
 2: 'UH',
 3: 'VBD',
 4: 'WP',
 5: 'VBG',
 6: 'NNP',
 7: 'IN',
 8: 'PRP',
 9: 'VB',
 10: 'RB',
 11: 'WDT',
 12: 'JJR',
 13: '(',
 14: 'NNS',
 15: ':',
 16: 'MD',
 17: ',',
 18: 'POS',
 19: 'JJ',
 20: 'PRP$',
 21: 'NNPS',
 22: 'CC',
 23: 'WRB',
 24: "''",
 25: ')',
 26: 'NN',
 27: 'EX',
 28: 'VBP',
 29: 'PDT',
 30: 'VBN',
 31: 'TO',
 32: '$',
 33: 'DT',
 34: 'CD',
 35: 'FW',
 36: '``',
 37: 'JJS',
 38: '.',
 39: 'RP',
 40: 'VBZ'}