In [6]:
import json
import string
import nltk
from collections import defaultdict, Counter
from sklearn.preprocessing import StandardScaler
from nltk.tag import StanfordNERTagger
from nltk.tag.stanford import StanfordPOSTagger
from sklearn.neural_network import MLPClassifier

# Path of the JSON file loaded below as `trainfile`.
# NOTE(review): this points at the "devel" file and the author marked it as
# temporary -- confirm which split is meant to be used for training.
FNAME = 'project_files/devel.json' # Need to change later

# Path of the JSON file loaded below as `testfile`.
TESTNAME = 'project_files/testing.json'

# Question words; each one becomes a binary indicator at the front of the
# feature vector built by get_vectors.
Q_WORDS = ['how','what','whom','when','who','where','which']
# Question words whose following token is counted by get_features.
SELECTED_Q = ['how','what','which']

# Stanford NER tagger: jar and model are resolved relative to the working
# directory, so the distribution must be unpacked next to the notebook.
ner_dir = 'stanford-ner-2018-02-27/'
ner_jarfile = ner_dir + 'stanford-ner.jar'
ner_modelfile = ner_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz'
ner_tagger = StanfordNERTagger(model_filename=ner_modelfile, path_to_jar=ner_jarfile)

# Stanford POS tagger, located the same way as the NER tagger above.
pos_dir = 'stanford-postagger-2018-02-27/'
pos_modelfile = pos_dir + 'models/english-bidirectional-distsim.tagger'
pos_jarfile = pos_dir + 'stanford-postagger.jar'
pos_tagger = StanfordPOSTagger(model_filename=pos_modelfile, path_to_jar=pos_jarfile)

# Both files are iterated as sequences of dicts by the functions below, which
# read the 'question' key (and, for training data, the 'text' key).
with open(FNAME) as train_data:
    trainfile = json.load(train_data)

with open(TESTNAME) as test_data:
    testfile = json.load(test_data)

### Select the features required by the multi-layer perceptron (MLP) classifier in scikit-learn (deep learning)

In [7]:
def get_features(trainfile):
    """Count the words that directly follow a selected question word.

    For every record in ``trainfile`` the question is tokenized and
    lower-cased. For each word in ``SELECTED_Q`` that occurs in the question,
    the token immediately after its first occurrence is counted.

    Args:
        trainfile: iterable of dicts, each with a 'question' string.

    Returns:
        defaultdict(int) mapping follower token -> number of occurrences.
    """
    feature_count = defaultdict(int)

    for dic in trainfile:
        question_token = [word.lower()
                          for word in nltk.word_tokenize(dic['question'])]
        for q in SELECTED_Q:
            if q not in question_token:
                continue
            next_index = question_token.index(q) + 1
            # The question word may be the final token (e.g. a question with
            # no trailing "?"); there is no follower to count in that case
            # and indexing past the end would raise IndexError.
            if next_index < len(question_token):
                feature_count[question_token[next_index]] += 1
    return feature_count

dic = get_features(trainfile)
# (word, count) pairs, most frequent first.
feature_count = sorted(dic.items(), key=lambda x: x[1], reverse=True)

# Only keep the words whose frequency is larger than one; the order of
# feature_count (descending frequency) is preserved.
selected_feature = [word for word, count in feature_count if count > 1]
# Call form of print behaves the same for a single argument on Python 2 and
# keeps the cell valid on Python 3.
print(selected_feature)

[u'is', u'many', u'was', u'year', u'did', u'type', u'does', u'country', u'percentage', u'much', u'kind', u'?', u'language', u'other', u'date', u'city', u"'s", u'do', u'political', u'are', u'century', u'long', u'company', u'month', u'can', u'two', u'group', u'were', u'period', u'percent', u'sort', u'word', u'part', u'would', u'decade', u'term', u'notable', u'age', u'has', u'of', u'form', u'neighborhood', u'organization', u'color', u'old', u'branch', u'far', u'award', u'dutch', u'isotope', u'else', u'industry', u'area', u'era', u'nation', u'event', u'state', u'party', u'caused', u'large', u'town', u'dialect', u'philosophy', u'types', u'years', u'germanic', u'name', u'university', u'temperature', u'place', u'number', u'rank', u'individual', u'nationality', u'region', u'book', u'religion', u'magazine', u'tribe', u'span', u'empire', u'time', u'concept', u'dynasty', u'gender', u'street', u'action', u'family', u'county', u'must', u'work', u'topic', u'element', u'geographic', u'material', u'co

### Select the expected classes for MLP classification in scikit-learn
Array y of size (n_samples,), which holds the target values (class labels) for the training samples.

In [25]:
def get_classes(trainfile):
    """Return the NER and POS tag sets of every answer in ``trainfile``.

    Tags are looked up in the JSON cache 'project_files/tagged_ans.json'
    (which must already exist and contain a JSON object). Answers missing
    from the cache are tokenized, tagged with ``ner_tagger`` and
    ``pos_tagger``, and added to the cache. The NER tag 'O' is dropped.

    The cache file is rewritten at most once per call, after the loop, and
    only if new answers were tagged. The write happens in a ``finally`` so
    that results tagged before an exception or a keyboard interrupt are still
    saved.

    Args:
        trainfile: iterable of dicts, each with a 'text' answer string.

    Returns:
        list with one ``[ner_tags, pos_tags]`` pair (two lists of tag
        strings) per record, in input order.
    """
    cache_path = 'project_files/tagged_ans.json'
    with open(cache_path) as f:
        tagged_ans = json.load(f)

    all_ans = []
    cache_dirty = False

    try:
        for dic in trainfile:
            ans = dic['text']

            if ans in tagged_ans:
                cached = tagged_ans[ans]
                ans_tags = [cached[0], cached[1]]
            else:
                # Tokenize only on a cache miss; hits never need the tokens.
                ans_token = nltk.word_tokenize(ans)

                ner_tags = set(item[1] for item in ner_tagger.tag(ans_token))
                # 'O' is removed so only the remaining NER tags are kept.
                ner_tags.discard('O')
                pos_tags = set(item[1] for item in pos_tagger.tag(ans_token))

                ans_tags = [list(ner_tags), list(pos_tags)]
                tagged_ans[ans] = ans_tags
                cache_dirty = True

            all_ans.append(ans_tags)
    finally:
        # Persist once instead of re-serializing the whole cache after every
        # newly tagged answer.
        if cache_dirty:
            with open(cache_path, 'w') as f:
                json.dump(tagged_ans, f)

    return all_ans

### Create feature vectors of the training samples
Array X of size (n_samples, n_features), which holds the training samples represented as floating-point feature vectors.

In [26]:
def get_vectors(infile, vec_file_name):
    """Build one binary feature vector per question in ``infile``.

    Each vector has ``len(Q_WORDS) + len(selected_feature)`` entries: the
    first block flags which ``Q_WORDS`` occur in the lower-cased, tokenized
    question, the second block flags which ``selected_feature`` words occur.

    Vectors are cached in the JSON file ``vec_file_name`` (which must already
    exist and contain a JSON object), keyed by the question text. A cached
    vector is reused only if its length matches the current feature layout;
    otherwise it is recomputed, so a cache written with a different
    ``selected_feature`` list cannot produce rows of mixed length. The cache
    file is rewritten once at the end of the call.

    Args:
        infile: iterable of dicts, each with a 'question' string.
        vec_file_name: path of the JSON cache file.

    Returns:
        list of vectors (lists of 0/1 ints), one per record, in input order.
    """
    # Column order: question words first, then the selected follower words.
    vocabulary = list(Q_WORDS) + list(selected_feature)
    vec_len = len(vocabulary)

    with open(vec_file_name) as vec_f:
        vecfile = json.load(vec_f)

    all_vec = []
    for dic in infile:
        ques = dic['question']

        vec = vecfile.get(ques)
        if vec is None or len(vec) != vec_len:
            # A set makes each of the vec_len membership tests O(1).
            ques_token = set(word.lower()
                             for word in nltk.word_tokenize(ques))
            vec = [1 if word in ques_token else 0 for word in vocabulary]
            vecfile[ques] = vec

        all_vec.append(vec)

    with open(vec_file_name, 'w') as vec_f:
        json.dump(vecfile, vec_f)

    return all_vec

In [27]:
# Build (or load from the JSON caches) the feature vectors for both splits.
# The prints are progress markers; the call form prints the same text on
# Python 2 and stays valid on Python 3.
X_train = get_vectors(trainfile, 'project_files/train_vec.json')
print("X_train")

X_test = get_vectors(testfile, 'project_files/test_vec.json')
print("X_test")

# Only the last bare expression of a cell is displayed by the notebook.
X_train
X_test


In [28]:
# y_train_all_ans holds one [[NER tags], [POS tags]] pair per training record.
# The pairs are converted to two 1-d lists of integer class ids in a later cell.
y_train_all_ans = get_classes(trainfile)

In [29]:
# Save the raw per-answer tag pairs as JSON.
with open('project_files/y_train_all_ans.json', 'w') as f:
    json.dump(y_train_all_ans, f)

In [31]:
# check all items in y_train_all_ans are list type
# for item in y_train_all_ans:
#     for item1 in item:
#         if type(item1) != list:
#             print item1

In [32]:
def get_tags_index(tags, all_ans_type):
    """Return the position of ``tags`` in ``all_ans_type``, registering it if new.

    Args:
        tags: the value to look up (compared by equality).
        all_ans_type: list of known values; mutated in place when ``tags``
            is not yet present (it is appended at the end).

    Returns:
        int index of ``tags`` within ``all_ans_type``.
    """
    try:
        return all_ans_type.index(tags)
    except ValueError:
        # Not seen before: the new entry takes the next free position.
        all_ans_type.append(tags)
        return len(all_ans_type) - 1

In [33]:
# Distinct tag sets in first-seen order; a set's position is its class id.
all_ans_ner_type, all_ans_pos_type = [], []

# Integer class id of every training answer, aligned with y_train_all_ans.
all_ans_ner, all_ans_pos = [], []

for answer_tags in y_train_all_ans:
    # Sets make the comparison independent of the order of the tags.
    ner_key, pos_key = set(answer_tags[0]), set(answer_tags[1])

    all_ans_ner.append(get_tags_index(ner_key, all_ans_ner_type))
    all_ans_pos.append(get_tags_index(pos_key, all_ans_pos_type))

In [39]:
# Save the integer class ids of the training answers, one JSON file per tag kind.
for dump_path, label_ids in (
        ('project_files/all_ans_ner.json', all_ans_ner),
        ('project_files/all_ans_pos.json', all_ans_pos)):
    with open(dump_path, 'w') as f:
        json.dump(label_ids, f)

In [40]:
# Standardize the features. The scaler is fitted on the training vectors
# only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [41]:
# Two independent classifiers with the same architecture: one predicts the
# NER class id of the answer, the other its POS class id.
# NOTE(review): random_state is left unset, so results differ between runs.
mlp_ner = MLPClassifier(max_iter=500,
                        hidden_layer_sizes=(13, 13, 13)).fit(X_train, all_ans_ner)

mlp_pos = MLPClassifier(max_iter=500, hidden_layer_sizes=(13, 13, 13))
# Kept as the last expression so the notebook displays the fitted estimator.
mlp_pos.fit(X_train, all_ans_pos)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(13, 13, 13), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [53]:
# Predicted class ids for the test questions, one prediction per row of X_test.
testing_ner_pred = mlp_ner.predict(X_test)
testing_pos_pred = mlp_pos.predict(X_test)

In [62]:
def get_tags_from_index(pred, all_ans_type):
    """Translate predicted class ids back into lists of tags.

    Args:
        pred: iterable of indices into ``all_ans_type``.
        all_ans_type: sequence whose element at position ``i`` is the
            collection of tags belonging to class id ``i``.

    Returns:
        list with, for every entry of ``pred``, a new list built from the
        corresponding element of ``all_ans_type``.
    """
    return [list(all_ans_type[class_id]) for class_id in pred]
    
# Map the predicted class ids back to the tag collections recorded while
# indexing the training answers.
testing_ner_pred_tags = get_tags_from_index(testing_ner_pred, all_ans_ner_type)
testing_pos_pred_tags = get_tags_from_index(testing_pos_pred, all_ans_pos_type)

In [63]:
# Save the predicted tag lists for the test questions, one JSON file per tag kind.
for dump_path, predicted_tags in (
        ('project_files/testing_ner_pred_tags.json', testing_ner_pred_tags),
        ('project_files/testing_pos_pred_tags.json', testing_pos_pred_tags)):
    with open(dump_path, 'w') as f:
        json.dump(predicted_tags, f)