In [2]:
import json
import os
import string
from collections import defaultdict, Counter

import nltk
from nltk.tag import StanfordNERTagger
from nltk.tag.stanford import StanfordPOSTagger
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# Input data: JSON files loaded below as sequences of records. Later cells read
# each record's 'question' field (and 'text', the answer, for training records).
FNAME = 'project_files/training.json' # Need to change later

TESTNAME = 'project_files/testing.json'

# Question words encoded as the leading binary features of every question vector.
Q_WORDS = ['how','what','whom','when','who','where','which','name']
# Question words whose immediately following token is counted as a candidate feature.
SELECTED_Q = ['what','which']

# Stanford NER tagger: jar and 3-class English CRF model, given as relative paths.
ner_dir = 'stanford-ner-2018-02-27/'
ner_jarfile = ner_dir + 'stanford-ner.jar'
ner_modelfile = ner_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz'
ner_tagger = StanfordNERTagger(model_filename=ner_modelfile, path_to_jar=ner_jarfile)

# Stanford POS tagger: jar and English bidirectional model, given as relative paths.
pos_dir = 'stanford-postagger-2018-02-27/'
pos_modelfile = pos_dir + 'models/english-bidirectional-distsim.tagger'
pos_jarfile = pos_dir + 'stanford-postagger.jar'
pos_tagger = StanfordPOSTagger(model_filename=pos_modelfile, path_to_jar=pos_jarfile)

# Parse the training set into memory.
with open(FNAME) as train_data:
    trainfile = json.load(train_data)

# Parse the test set into memory.
with open(TESTNAME) as test_data:
    testfile = json.load(test_data)

### Select the features required by the multi-layer perceptron (MLP) classifier in scikit-learn (deep learning)

In [None]:
def get_features(trainfile):
    """Count the tokens that directly follow a selected question word.

    Each record's 'question' is tokenized with NLTK and lower-cased. For every
    word of ``SELECTED_Q`` that occurs in the question, the token right after
    its first occurrence (when there is one) is counted once.

    Returns a ``defaultdict(int)`` mapping following-token -> count.
    """
    following_counts = defaultdict(int)

    for record in trainfile:
        tokens = [tok.lower() for tok in nltk.word_tokenize(record['question'])]
        last_pos = len(tokens) - 1
        for q_word in SELECTED_Q:
            try:
                pos = tokens.index(q_word)
            except ValueError:
                # This question word does not occur in the question.
                continue
            # A question word in final position has no following token.
            if pos < last_pos:
                following_counts[tokens[pos + 1]] += 1

    return following_counts

# Tally the candidate feature words and rank them, most frequent first.
dic = get_features(trainfile)
feature_count = sorted(dic.items(), key=lambda pair: pair[1], reverse=True)

# Keep only the words seen more than once, preserving the frequency ranking.
selected_feature = [word for word, count in feature_count if count > 1]
print(selected_feature)

### Select the expected classes for MLP classification in scikit-learn
Array `y` of size `(n_samples,)`, which holds the target values (class labels) for the training samples.

In [9]:
# Load the previously saved POS class labels.
# NOTE(review): this file is written by a later cell of this notebook, so on a
# fresh checkout this cell only works if the file already exists.
with open('mlp_files/all_ans_pos.json') as f:
    lst_la = json.load(f)

In [10]:
# Sanity check: number of labels loaded from all_ans_pos.json.
len(lst_la)

43379

In [None]:
def get_classes(trainfile, progress_every=1000):
    """Return the tag lists of every training answer, backed by an on-disk cache.

    The cache file 'mlp_files/tagged_ans.json' maps an answer string to a
    two-element list ``[NER tags, POS tags]``. Cached answers are returned as
    stored. An answer that is not cached is tokenized and POS-tagged, and is
    stored with an empty NER slot (this function does not run the NER tagger).

    Parameters
    ----------
    trainfile : sequence of dict
        Training records; each record's 'text' entry is the answer string.
    progress_every : int, optional
        Print the record index every this many records. Pass 0 or None to
        silence progress output.

    Returns
    -------
    list
        One ``[NER tags, POS tags]`` pair per record, in input order.

    Notes
    -----
    A missing cache file is treated as an empty cache. The cache is written
    back only when new answers were tagged, and it is written even if tagging
    fails part-way, so completed work is not lost.
    """
    cache_path = 'mlp_files/tagged_ans.json'

    # No cache file simply means nothing has been tagged yet.
    if os.path.exists(cache_path):
        with open(cache_path) as f:
            tagged_ans = json.load(f)
    else:
        tagged_ans = {}

    all_ans = []
    cache_dirty = False

    try:
        for i, dic in enumerate(trainfile):
            if progress_every and i % progress_every == 0:
                print(i)

            ans = dic['text']

            if ans in tagged_ans:
                ans_tags = [tagged_ans[ans][0], tagged_ans[ans][1]]
            else:
                # Tokenize only on a cache miss; cached answers never need it.
                ans_token = nltk.word_tokenize(ans)
                pos = pos_tagger.tag(ans_token)

                # Keep each distinct POS tag once; the NER slot stays empty.
                pos_tags = set(item[1] for item in pos)
                ans_tags = [[], list(pos_tags)]

                tagged_ans[ans] = ans_tags
                cache_dirty = True

            all_ans.append(ans_tags)
    finally:
        # Persist whatever was tagged, even when the loop was interrupted.
        if cache_dirty:
            with open(cache_path, 'w') as f:
                json.dump(tagged_ans, f)

    return all_ans

In [None]:
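# NOTE: earlier version of get_classes, kept commented out for reference only.
# It also ran the NER tagger and rewrote the cache file after every cache miss.
# def get_classes(trainfile):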
#     all_ans = []
    
#     with open('mlp_files/tagged_ans.json') as f:
#         tagged_ans = json.load(f)
    
    
#     for dic in trainfile:
#         ans = dic['text']
#         ans_token = nltk.word_tokenize(ans)
        
#         if tagged_ans.has_key(ans):
#             ans_tags = [tagged_ans[ans][0], tagged_ans[ans][1]]
#         else:
#             ner = ner_tagger.tag(ans_token)
#             pos = pos_tagger.tag(ans_token)

#             ner_tags = set()
#             pos_tags = set()

#             for item in ner:
#                 ner_tags.add(item[1])
            
# #             print ner_tags
            
#             if 'O' in ner_tags:
#                 ner_tags.remove('O')

#             for item in pos:
#                 pos_tags.add(item[1])

#             ans_tags = [list(ner_tags), list(pos_tags)]
            
#             tagged_ans[ans] = ans_tags
            
#             with open('mlp_files/tagged_ans.json', 'w') as f:
#                 json.dump(tagged_ans, f)

#         all_ans.append(ans_tags)
        
#     return all_ans

### Create feature vectors of training samples
Array `X` of size `(n_samples, n_features)`, which holds the training samples represented as floating-point feature vectors.

In [None]:
def get_vectors(infile, vec_file_name):
    """Build one binary feature vector per question, backed by a JSON cache.

    A vector has ``len(Q_WORDS) + len(selected_feature)`` entries: first one
    0/1 flag per word of ``Q_WORDS``, then one 0/1 flag per word of
    ``selected_feature``. A flag is 1 when the word occurs among the
    lower-cased NLTK tokens of the question.

    Parameters
    ----------
    infile : iterable of dict
        Records whose 'question' entry is the question string.
    vec_file_name : str
        Path of the JSON cache mapping question -> vector. A missing file is
        treated as an empty cache; the file is (re)written before returning.

    Returns
    -------
    list of list of int
        One vector per record, in input order.

    Notes
    -----
    A cached vector whose length does not match the current feature count is
    recomputed instead of being reused. A feature list that changed without
    changing its length is not detected.
    """
    q_word_len = len(Q_WORDS)
    feature_len = len(selected_feature)
    vec_len = q_word_len + feature_len

    # No cache file simply means nothing has been vectorized yet.
    if os.path.exists(vec_file_name):
        with open(vec_file_name) as vec_f:
            vecfile = json.load(vec_f)
    else:
        vecfile = {}

    all_vec = []

    for dic in infile:
        ques = dic['question']
        vec = vecfile.get(ques)

        # Recompute on a miss, or when the cached vector has the wrong width.
        if vec is None or len(vec) != vec_len:
            # A set makes each of the many membership tests constant-time.
            ques_token = set(word.lower() for word in nltk.word_tokenize(ques))

            vec = [1 if word in ques_token else 0 for word in Q_WORDS]
            vec += [1 if word in ques_token else 0 for word in selected_feature]

            vecfile[ques] = vec

        all_vec.append(vec)

    with open(vec_file_name, 'w') as vec_f:
        json.dump(vecfile, vec_f)

    return all_vec

In [None]:
# Vectorize both splits; each split has its own cache file of question vectors.
X_train = get_vectors(trainfile, 'mlp_files/train_vec.json')
print("X_train")

X_test = get_vectors(testfile, 'mlp_files/test_vec.json')
print("X_test")

In [None]:
# y_train_all_ans holds one [NER tags, POS tags] pair per training record,
# in the same order as trainfile.
# The two tag lists are converted to two 1-d lists of integer class labels later.
y_train_all_ans = get_classes(trainfile)

In [None]:
# Save the per-record tag pairs returned by get_classes as JSON.
with open('mlp_files/y_train_all_ans.json', 'w') as f:
    json.dump(y_train_all_ans, f)

In [None]:
# check all items in y_train_all_ans are list type
# for item in y_train_all_ans:
#     for item1 in item:
#         if type(item1) != list:
#             print item1

In [None]:
def get_tags_index(tags, all_ans_type):
    """Return the position of ``tags`` in ``all_ans_type``, registering it if new.

    ``all_ans_type`` is searched by equality. When ``tags`` is not present it
    is appended (the list is mutated in place) and the index of that new last
    element is returned.
    """
    try:
        return all_ans_type.index(tags)
    except ValueError:
        # First time this tag combination is seen: it becomes the newest class.
        all_ans_type.append(tags)
        return len(all_ans_type) - 1

In [None]:
# Spot-check a single [NER tags, POS tags] pair.
print(y_train_all_ans[40000])

In [None]:
# Distinct tag sets seen so far; a set's position in its list is its class label.
all_ans_ner_type = []
all_ans_pos_type = []

# Integer class label per training record, aligned with y_train_all_ans.
all_ans_ner = []
all_ans_pos = []

for item in y_train_all_ans:
    ner_tags = set(item[0])
    pos_wanted = item[1]

    # Unwrap POS lists nested inside extra list levels (presumably left over from
    # older cache entries -- TODO confirm). The emptiness guard keeps a record
    # with no POS tags from raising IndexError on pos_wanted[0].
    while pos_wanted and isinstance(pos_wanted[0], list):
        pos_wanted = pos_wanted[0]

    pos_tags = set(pos_wanted)

    all_ans_ner.append(get_tags_index(ner_tags, all_ans_ner_type))
    all_ans_pos.append(get_tags_index(pos_tags, all_ans_pos_type))

In [None]:
# Save the integer class labels (one per training record) for the NER tag sets.
with open('mlp_files/all_ans_ner.json', 'w') as f:
    json.dump(all_ans_ner, f)
    
# Save the integer class labels (one per training record) for the POS tag sets.
with open('mlp_files/all_ans_pos.json', 'w') as f:
    json.dump(all_ans_pos, f)

In [None]:
# Standardize the features: the scaler is fitted on the training split only,
# and the same fitted transformation is then applied to both splits.
# NOTE: X_train and X_test are overwritten, so re-running this cell re-scales
# data that has already been scaled.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Earlier configuration, kept for reference:
# mlp_ner = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)
# mlp_ner.fit(X_train,all_ans_ner)

# mlp_pos = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)
# mlp_pos.fit(X_train,all_ans_pos)

# Fixed seed so that weight initialization and batch sampling, and therefore
# the trained models and their predictions, are repeatable across runs.
MLP_RANDOM_STATE = 0

# One classifier per target: NER tag-set class and POS tag-set class.
mlp_ner = MLPClassifier(hidden_layer_sizes=(100, 100), random_state=MLP_RANDOM_STATE)
mlp_ner.fit(X_train, all_ans_ner)

mlp_pos = MLPClassifier(hidden_layer_sizes=(100, 100), random_state=MLP_RANDOM_STATE)
mlp_pos.fit(X_train, all_ans_pos)

In [None]:
# Predict one class label per test question with each trained classifier.
mlp_ner_pred = mlp_ner.predict(X_test)
mlp_pos_pred = mlp_pos.predict(X_test)

In [None]:
def get_tags_from_index(pred, all_ans_type):
    """Translate predicted class indices back into lists of tags.

    ``pred`` is an iterable of indices into ``all_ans_type``. Each looked-up
    entry is copied into a fresh list. Returns one list per prediction, in
    the order of ``pred``.
    """
    return [list(all_ans_type[class_idx]) for class_idx in pred]
    
# Turn the predicted class indices back into the tag lists they stand for.
mlp_ner_pred_tags = get_tags_from_index(mlp_ner_pred, all_ans_ner_type)
mlp_pos_pred_tags = get_tags_from_index(mlp_pos_pred, all_ans_pos_type)

In [None]:
# Save the predicted NER tag lists for the test questions as JSON.
with open('mlp_files/mlp_ner_pred_tags.json', 'w') as f:
    json.dump(mlp_ner_pred_tags, f)
    
# Save the predicted POS tag lists for the test questions as JSON.
with open('mlp_files/mlp_pos_pred_tags.json', 'w') as f:
    json.dump(mlp_pos_pred_tags, f)