In [50]:
import numpy as np
import os
import json
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction import DictVectorizer
from collections import Counter
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics as mtr
import pickle

In [17]:
def bigram(words):
    """Return the bigrams of ``words`` as space-joined strings.

    Each element is paired with its immediate successor, so a sequence of
    ``n`` tokens yields ``n - 1`` bigrams (none for zero or one token).
    """
    return [left + ' ' + right for left, right in zip(words, words[1:])]

In [3]:
# Load the training queries; each record is expected to carry 'question',
# 'category' and 'type' keys (those are the only ones read below).
with open('smarttask_dbpedia_train.json') as f:
    train_query_file = json.load(f)

#list of categories, with literal divided into types
classes = np.array(['resource', 'date', 'number', 'string', 'boolean'])
# label name -> position in `classes`; used to encode targets as integers
class_index = {name: idx for idx, name in enumerate(classes.tolist())}

y_label = []
for query in train_query_file:
    question = query.get('question')
    if isinstance(question, str):
        words = word_tokenize(question.replace('?', ''))
        #bigrams in the query
        bigrams = bigram(words)
    else:
        # Missing / null question: keep the record but give it no features.
        # Genuine tokenizer failures are no longer silently swallowed.
        words = []
        bigrams = []
    # most_common() orders by descending count, matching the previous sort
    word_count = dict(Counter(words).most_common())
    bi_count = dict(Counter(bigrams).most_common())
    query['words'] = {**word_count, **bi_count}

    # 'literal' queries are labelled by their first (literal) type instead
    if query['category'] == 'literal':
        label = query['type'][0]
    else:
        label = query['category']
    if label not in class_index:
        raise ValueError('Unknown label {!r}; expected one of {}'.format(
            label, list(class_index)))
    y_label.append(class_index[label])

# integer targets: y_label[i] is the index into `classes` for query i
y_label = np.array(y_label, dtype=int)

In [None]:
#division into train & eval sets
# A single call applies the same shuffled indices to the queries and their
# labels, so X_* and y_* are aligned by construction (and a length mismatch
# raises instead of silently misaligning the two).
X_train, X_eval, y_train, y_eval = train_test_split(
    train_query_file, y_label, test_size=0.2, random_state=1)

In [None]:
#vectorizing
# Pull the per-query token-count dicts out of the records first, then fit the
# vectorizer on the training split only and reuse it for the eval split.
# NOTE: X_train / X_eval are rebound from query records to feature matrices,
# so this cell cannot be re-run without re-running the split cell first.
train_token_counts = [record['words'] for record in X_train]
eval_token_counts = [record['words'] for record in X_eval]

vec = DictVectorizer()
X_train = vec.fit_transform(train_token_counts)
X_eval = vec.transform(eval_token_counts)

In [30]:
def predict_query(query, dictVect, classifier):
    """Predict the class for a single question string.

    The question is converted to unigram + bigram count features (question
    marks removed before tokenizing), vectorized with ``dictVect`` and passed
    to ``classifier.predict``.

    Parameters
    ----------
    query : str
        Question text. A non-string value (e.g. ``None``) is treated as a
        question with no tokens instead of raising.
    dictVect : object
        Fitted vectorizer exposing ``transform`` over token-count dicts.
    classifier : object
        Fitted estimator exposing ``predict``.

    Returns
    -------
    The result of ``classifier.predict`` on the one-row feature matrix.
    """
    if isinstance(query, str):
        words = word_tokenize(query.replace('?', ''))
        #bigrams in the query
        bigrams = bigram(words)
    else:
        # No usable text: fall back to an empty feature dict.
        words = []
        bigrams = []
    # Bigram counts are merged last, so they win on a (theoretical) key clash.
    tokens = {**Counter(words), **Counter(bigrams)}
    # Use the vectorizer that was passed in (previously the global `vec` was
    # used and the argument ignored); one dict in a list == one sample row.
    word_vector = dictVect.transform([tokens])
    return classifier.predict(word_vector)

def metrics(y, pred):
    """Print accuracy, balanced accuracy and macro/micro F1 for predictions.

    Parameters
    ----------
    y : array-like
        True labels.
    pred : array-like
        Predicted labels, in the same order as ``y``.

    Returns
    -------
    None. The four scores are printed, each with 3 significant digits.
    """
    # Score against the labels passed in; the global `y_eval` was used before,
    # which made the `y` argument meaningless.
    acc = mtr.accuracy_score(y, pred)
    bacc = mtr.balanced_accuracy_score(y, pred)
    f1_mac = mtr.f1_score(y, pred, average='macro')
    f1_mic = mtr.f1_score(y, pred, average='micro')
    print('Accuracy: {:.3}\nBalanced Accuracy: {:.3}\nF1 Macro: {:.3}\nF1 Micro: {:.3}'.format(acc,bacc,f1_mac,f1_mic))

In [52]:
# Reuse a previously saved classifier when possible, otherwise train one.
# SECURITY: pickle.load can execute arbitrary code -- only load a 'biqpclas'
# file that this notebook produced itself.
# NOTE(review): a cached model is only valid if it was trained with the same
# feature vocabulary as the current `vec` -- confirm before reusing it.
clf = None
try:
    with open('biqpclas', 'rb') as f:
        clf = pickle.load(f)
except FileNotFoundError:
    print('No file, training')
except Exception as exc:
    # Unreadable / corrupt / incompatible cache: still fall back to training,
    # but say what actually went wrong instead of claiming the file is absent.
    print('Could not load saved classifier ({!r}), training'.format(exc))

if clf is None:
    clf = MLPClassifier(random_state=4)
    clf.fit(X_train, y_train)

In [53]:
# Predict labels for the vectorized evaluation split.
pred = clf.predict(X_eval)

In [54]:
# Print accuracy, balanced accuracy and macro/micro F1 for the eval predictions.
metrics(y_eval, pred)

Accuracy: 0.938
Balanced Accuracy: 0.9
F1 Macro: 0.92
F1 Micro: 0.938


In [55]:
#demonstrate query prediction
# predict_query returns the classifier's prediction for the single question;
# indexing `classes` with it maps the predicted index back to its label name.
query = 'What is the largest country in the world?'
predicted_index = predict_query(query, vec, clf)
classes[predicted_index]

array(['resource'], dtype='<U8')

In [51]:
# Persist the current classifier to 'biqpclas' so later runs can load it
# instead of retraining. Set the flag to False to skip writing the file.
SAVE_CLASSIFIER = True
if SAVE_CLASSIFIER:
    with open('biqpclas','wb') as model_file:
        pickle.dump(clf, model_file)