In [None]:
import json
import string
import nltk
from collections import defaultdict, Counter
from sklearn.preprocessing import StandardScaler

# The Stanford tagger wrappers are not exported by a bare `import nltk`
# name; import them explicitly so this cell runs on a fresh kernel.
from nltk.tag import StanfordNERTagger, StanfordPOSTagger

# Question/answer records used to build the training vectors and classes.
FNAME = 'project_files/devel.json' # TODO: need to change later

# Question records that are only vectorised (no classes are extracted).
TESTNAME = 'project_files/testing.json'

# Question words that each get an indicator column in the feature vectors.
Q_WORDS = ['how','what','whom','when','who','where','which']
# Question words whose following token is counted as a candidate feature.
SELECTED_Q = ['how','what','which']

# Stanford NER tagger; all paths are relative to the working directory.
ner_dir = 'stanford-ner-2018-02-27/'
ner_jarfile = ner_dir + 'stanford-ner.jar'
ner_modelfile = ner_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz'
ner_tagger = StanfordNERTagger(model_filename=ner_modelfile, path_to_jar=ner_jarfile)

# Stanford POS tagger; all paths are relative to the working directory.
pos_dir = 'stanford-postagger-2018-02-27/'
pos_modelfile = pos_dir + 'models/english-bidirectional-distsim.tagger'
pos_jarfile = pos_dir + 'stanford-postagger.jar'
pos_tagger = StanfordPOSTagger(model_filename=pos_modelfile, path_to_jar=pos_jarfile)

# Parsed JSON content of FNAME (iterated record by record further down).
with open(FNAME) as json_data:
    infile = json.load(json_data)

# Parsed JSON content of TESTNAME.
with open(TESTNAME) as test_data:
    testfile = json.load(test_data)

### Select the required features for the multi-layer perceptron (MLP) algorithm in scikit-learn (deep learning)

In [3]:
def get_features(infile):
    """Count the token that directly follows each selected question word.

    Every record's question text is tokenized with ``nltk.word_tokenize``
    and lower-cased.  For each word in ``SELECTED_Q`` that appears in the
    question, the token immediately after its *first* occurrence is counted.

    Args:
        infile: iterable of records that can be indexed with ``'question'``.

    Returns:
        ``defaultdict(int)`` mapping a follower token to how often it was
        seen across all questions.
    """
    feature_count = defaultdict(int)

    for record in infile:
        tokens = [word.lower() for word in nltk.word_tokenize(record['question'])]
        for q_word in SELECTED_Q:
            if q_word not in tokens:
                continue
            follower_pos = tokens.index(q_word) + 1
            # The question word may be the final token (e.g. no trailing
            # punctuation); then nothing follows it and there is nothing
            # to count, instead of failing with an IndexError.
            if follower_pos < len(tokens):
                feature_count[tokens[follower_pos]] += 1
    return feature_count

# Follower-token counts collected from the loaded question records.
dic = get_features(infile)
# (token, count) pairs, most frequent first.  The resulting order is also the
# order in which get_vectors lays out the feature columns.
feature_count = sorted(dic.items(), key=lambda x: x[1],reverse=True)

# Only select the words whose frequencies are larger than one
selected_feature = [word for word, count in feature_count if count > 1]
# Parenthesised so the line is valid on Python 3 as well; on Python 2 it
# prints exactly what the bare print statement printed.
print(selected_feature)

[u'is', u'many', u'was', u'year', u'did', u'type', u'does', u'country', u'percentage', u'much', u'kind', u'?', u'language', u'other', u'date', u'city', u"'s", u'do', u'political', u'are', u'century', u'long', u'company', u'month', u'can', u'two', u'group', u'were', u'period', u'percent', u'sort', u'word', u'part', u'would', u'decade', u'term', u'notable', u'age', u'has', u'of', u'form', u'neighborhood', u'organization', u'color', u'old', u'branch', u'far', u'award', u'dutch', u'isotope', u'else', u'industry', u'area', u'era', u'nation', u'event', u'state', u'party', u'caused', u'large', u'town', u'dialect', u'philosophy', u'types', u'years', u'germanic', u'name', u'university', u'temperature', u'place', u'number', u'rank', u'individual', u'nationality', u'region', u'book', u'religion', u'magazine', u'tribe', u'span', u'empire', u'time', u'concept', u'dynasty', u'gender', u'street', u'action', u'family', u'county', u'must', u'work', u'topic', u'element', u'geographic', u'material', u'co

### Select the expected classes for MLP classification in scikit-learn
The target array `y` has size `(n_samples,)` and holds the target values (class labels) for the training samples.

In [None]:
def get_classes(infile):
    """Collect the NER and POS tag sets of every answer text.

    Each record's ``'text'`` value is tokenized with ``nltk.word_tokenize``
    and the tokens are passed to ``ner_tagger.tag`` and ``pos_tagger.tag``.

    Args:
        infile: iterable of records that can be indexed with ``'text'``.

    Returns:
        A list with one entry per record, in input order.  Each entry is a
        two-element list ``[ner_tags, pos_tags]`` of sets; the ``'O'`` tag is
        dropped from the NER set.

    NOTE(review): each entry is a pair of sets, not a single class label --
    confirm that a later step turns these into the labels a classifier
    expects.
    """
    tagged_answers = []

    for record in infile:
        tokens = nltk.word_tokenize(record['text'])
        ner_pairs = ner_tagger.tag(tokens)
        pos_pairs = pos_tagger.tag(tokens)

        # Second element of every (token, tag) pair is the tag.
        entity_labels = set(pair[1] for pair in ner_pairs)
        # 'O' is presumably the tagger's "no entity" label -- TODO confirm.
        entity_labels.discard('O')

        pos_labels = set(pair[1] for pair in pos_pairs)

        tagged_answers.append([entity_labels, pos_labels])

    return tagged_answers

### Create feature vectors of training samples
The array `X` has size `(n_samples, n_features)` and holds the training samples represented as floating-point feature vectors.

In [None]:
def get_vectors(infile):
    """Turn every question into a 0/1 indicator vector.

    The vector has one column per entry of ``Q_WORDS`` followed by one column
    per entry of ``selected_feature``.  A column is 1 when that word occurs
    among the lower-cased ``nltk.word_tokenize`` tokens of the question and 0
    otherwise.

    Args:
        infile: iterable of records that can be indexed with ``'question'``.

    Returns:
        A list of equal-length lists of ints (0 or 1), one per record, in
        input order.
    """
    # Column layout is the same for every question, so build it once.
    vocabulary = list(Q_WORDS) + list(selected_feature)

    all_vec = []
    for record in infile:
        # A set gives constant-time membership tests instead of rescanning
        # the token list once per vocabulary word.
        present = set(word.lower() for word in nltk.word_tokenize(record['question']))
        all_vec.append([1 if word in present else 0 for word in vocabulary])

    return all_vec

In [None]:
# Indicator vectors for the training and the test questions.
X_train = get_vectors(infile)
X_test = get_vectors(testfile)

# NER/POS tag sets of the training answers.
y_train = get_classes(infile)


# Standardise the features.  The scaler is fitted on the training vectors
# only and the same fitted transformation is then applied to both sets.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)