# SMART
This notebook loads the trained category and type classifiers, uses them to predict on the test set, and evaluates the results.

In [24]:
import utils
import pickle
import numpy as np
import gensim 
from gensim.models import Word2Vec 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords 
from elasticsearch import Elasticsearch

## Classify the category

In [2]:
# NOTE: pickle.load can execute arbitrary code; only load model files you trust.
# The context manager closes the file handle as soon as the model is loaded.
with open('category_classifier.sav', 'rb') as model_file:
    clf_category = pickle.load(model_file)

In [3]:
# Load the gold-annotated test split of the Wikidata answer-type dataset.
# Later cells index it as test[i]['question'] / test[i]['type'], so it is
# presumably a list of question dicts -- confirm against utils.load_dataset.
test = utils.load_dataset('../smart-dataset/datasets/Wikidata/lcquad2_anstype_wikidata_test_gold.json')

In [4]:
# Only the last two return values are used; the first two are discarded.
# The empty dict presumably stands in for the training set, and X_test / y_test
# are presumably the test questions and their gold categories -- TODO confirm
# against utils.prepare_X_y.
_, _, X_test, y_test = utils.prepare_X_y({}, test)
# Vectorize the test inputs with the vectorizer stored in 'category_vectorizer.sav'.
test_vectors = utils.transform_dataset(X_test, 'category_vectorizer.sav')

In [5]:
# Predict a category for every test question and report the share that
# matches y_test.
pred_category = clf_category.predict(test_vectors)
category_hits = sum(pred_category == y_test)
print("Accuracy:", category_hits / len(pred_category))

Accuracy: 0.9022095821483264


## Classify the type

In [6]:
# NOTE: pickle.load can execute arbitrary code; only load model files you trust.
# The context manager closes the file handle as soon as the model is loaded.
with open('type_literal_classifier.sav', 'rb') as model_file:
    clf_type_literal = pickle.load(model_file)


In [8]:
def split_bool_literal_reference(X, y):
    """Group the items of ``X`` by their category label in ``y``.

    Args:
        X: Indexable collection of items, aligned position-by-position with ``y``.
        y: Indexable collection of category labels.

    Returns:
        A tuple ``(bool_map, literal_map, resource_map)``. Each dict maps a
        position ``i`` to ``X[i]`` for the positions whose label equals
        ``'boolean'``, ``'literal'`` or ``'resource'`` respectively, in
        ascending position order. Positions with any other label are left
        out of all three dicts.
    """
    def _collect(category):
        # Positions labelled `category`, mapped to the matching item of X.
        return {pos: X[pos] for pos in range(len(y)) if y[pos] == category}

    return _collect('boolean'), _collect('literal'), _collect('resource')

In [9]:
# Partition the test inputs by *predicted* category (pred_category, not the
# gold y_test), keyed by their position in X_test.
bool_map, literal_map, resource_map = split_bool_literal_reference(X_test, pred_category)


In [10]:
   
# Inputs routed to the literal classifier, in literal_map order, paired with
# the first gold type of the test entry at the same index.
X_test_literal = [*literal_map.values()]
y_test_literal = [test[idx]['type'][0] for idx in literal_map]

In [11]:
# Vectorize the literal inputs with the vectorizer stored in
# 'literal_vectorizer.sav' (a different file from the category vectorizer).
test_vectors_literal = utils.transform_dataset(X_test_literal, 'literal_vectorizer.sav')

In [12]:
# Predict a literal type for every input routed to the literal classifier and
# report the share that matches the first gold type.
pred_literal = clf_type_literal.predict(test_vectors_literal)
literal_hits = sum(pred_literal == y_test_literal)
print("Accuracy:", literal_hits / len(pred_literal))

Accuracy: 0.8540596094552929


## Bundle

In [26]:
# Client created with default connection settings (no hosts are passed).
es = Elasticsearch()
# Index searched by the cells below. It is not created in the cells shown
# here, so it presumably has to be populated beforehand -- verify.
INDEX_NAME = 'questions'
# Shows cluster info; doubles as a connectivity check.
es.info()

In [27]:
# Smoke test: fetch the best-matching indexed questions for a sample query.
# NOTE(review): size=2 is requested, but the saved output below lists more
# than two hits, so that output is stale -- re-run the cell to refresh it.
q = 'Is cola healthy?'

es.search(index=INDEX_NAME, q=q, _source=True, size=2).get('hits', {}).get('hits', {})

[{'_index': 'questions',
  '_type': '_doc',
  '_id': '30077',
  '_score': 8.535293,
  '_source': {'question': 'Tell me about cola for product or material produced of The Coca-Cola Company?',
   'category': 'resource',
   'type': ['goods', 'physical object', 'class', 'trademark', 'soft drink']}},
 {'_index': 'questions',
  '_type': '_doc',
  '_id': '24488',
  '_score': 7.8734784,
  '_source': {'question': 'which cola starts with the letter p',
   'category': 'resource',
   'type': ['soft drink',
    'trademark',
    'carbonated beverage',
    'non-alcoholic beverage',
    'symbol',
    'class',
    'protected name',
    'intellectual property',
    'sign']}},
 {'_index': 'questions',
  '_type': '_doc',
  '_id': '22455',
  '_score': 7.3887763,
  '_source': {'question': 'Which is the corporation for the products produced by Coca-Cola?',
   'category': 'resource',
   'type': ['juridical person',
    'company',
    'separate legal entity',
    'legal form']}},
 {'_index': 'questions',
  '_t

In [23]:
# English stop words; predict_resource_types drops them from a question before
# searching.
stop_words = set(stopwords.words('english'))
# NOTE: pickle.load can execute arbitrary code; only load model files you trust.
# The context manager closes the file handle as soon as the model is loaded.
with open('word2vec_sg.sav', 'rb') as model_file:
    word2vec_sg = pickle.load(model_file)
word2vec_sg.wv.most_similar('cola') # Check if working

In [63]:
def predict_resource_types(es, index, model, index_name='questions', size=5, threshold=0.85):
    """Predict resource types for one test question.

    The question is stripped of stop words and used to retrieve similar
    indexed questions. Every single-word type attached to those hits becomes a
    candidate, and a candidate is kept when its embedding similarity to at
    least one question term exceeds ``threshold``.

    Relies on the notebook globals ``test`` and ``stop_words``.

    Args:
        es: Elasticsearch client used for the search.
        index: Position of the question in ``test``.
        model: Object exposing ``model.wv.similarity(a, b)``.
        index_name: Name of the Elasticsearch index to search.
        size: Maximum number of hits to retrieve.
        threshold: A candidate is kept only if its similarity to some question
            term is strictly greater than this value.

    Returns:
        List of predicted type strings without duplicates (order is not
        meaningful). Empty if the search fails or no candidate passes the
        threshold.
    """
    question = test[index]['question']
    # NOTE(review): the membership test is case-sensitive; if stop_words is all
    # lower-case, capitalized words such as "Which" are kept -- confirm intent.
    q = " ".join(w for w in word_tokenize(question) if w not in stop_words)

    try:
        results = es.search(index=index_name, q=q, _source=True, size=size).get('hits', {}).get('hits', {})
    except Exception:
        # Best effort: a failed search yields no prediction for this question.
        # Catching Exception (rather than a bare except) still lets
        # KeyboardInterrupt / SystemExit stop a long evaluation loop.
        return []

    # Multi-word types are skipped because each type is looked up in the
    # embedding as a single token.
    candidate_types = set()
    for result in results:
        for t in result['_source']['type']:
            if len(t.split()) == 1:
                candidate_types.add(t)

    pred_types = set()
    for term in q.split():
        for candidate in candidate_types:
            try:
                score = model.wv.similarity(term, candidate)
            except KeyError:
                # Term or candidate is missing from the embedding vocabulary.
                continue
            if score > threshold:
                pred_types.add(candidate)

    return list(pred_types)

In [47]:

# Spot-check a single test question (index 1431); 0.85 is also the function's
# default threshold.
predict_resource_types(es, 1431, word2vec_sg, threshold=0.85)

['measure']

In [48]:
# Gold entry for the same index, to compare against the predicted types.
test[1431]

{'id': 29188,
 'question': "Which is measured by Young's modulus?",
 'category': 'resource',
 'type': ['SI unit', 'unit of measurement', 'SI derived unit']}

In [79]:
# Assemble one type prediction per test entry so that type_preds[i] always
# lines up with test[i].
# NOTE(review): the three maps are keyed by position in X_test; this assumes
# X_test and test have the same length and order -- confirm that
# utils.prepare_X_y does not drop entries.

# pred_literal follows literal_map's iteration order, so pair the two up by
# index instead of tracking a running counter.
literal_pred_by_index = dict(zip(literal_map, pred_literal))

type_preds = []
for i in range(len(test)):
    if i in bool_map:
        # The boolean category has exactly one type, so no classifier is needed.
        type_preds.append('boolean')
    elif i in literal_map:
        type_preds.append(literal_pred_by_index[i])
    elif i in resource_map:
        type_preds.append(predict_resource_types(es, i, word2vec_sg, threshold=0.85))
    else:
        # Index is in none of the maps (e.g. an unexpected predicted category).
        # Append a placeholder so every later prediction keeps its position.
        type_preds.append(None)

In [53]:
# Gold type list of every test entry, in test order.
y_test_types = [entry['type'] for entry in test]

In [83]:
# Average per-question type score over the whole test set.
#   * boolean / literal: strict 0-or-1 match against the first gold type.
#   * resource: every predicted type adds 1/len(gold) if it is a gold type and
#     subtracts 1/len(gold) otherwise; the per-question score is floored at 0.
score = 0
for i in range(len(test)):
    labels = y_test_types[i]
    if i in bool_map or i in literal_map:
        # An entry without gold types scores 0 instead of raising IndexError.
        if labels and type_preds[i] == labels[0]:
            score += 1
    elif i in resource_map:
        if not labels:
            # Nothing to match against; also avoids dividing by zero below.
            continue
        sub_score = 0
        for pred in type_preds[i]:
            if pred in labels:
                sub_score += 1/len(labels)
            else:
                sub_score -= 1/len(labels)
        score += max(sub_score, 0)
# Guard the final average against an empty test set.
score / len(y_test_types) if y_test_types else 0.0

0.32242900803181723

Decisions made:

* We used an MLP for category classification because there is a known, fixed set of labels (multi-class classification)
* The boolean category always has the boolean type, so no further classification/prediction is needed
* We used an MLP for literal type classification because there is a known, fixed set of labels (multi-class classification)
* We use CountVectorizer because TF-IDF penalizes frequent words (who, what, ...)
* We use pickle to serialize MLP models and vectorizers
* For resource, we start by looking for similar questions using Elasticsearch (BM25)
* Use skip-gram word2vec embeddings to cross-reference the types found in the search with the terms in the query
* Skip-gram predicts the context better than CBOW (CBOW often produces meaningless predictions)
* Mention that we ignore types that are more than one word due to limitations in word2vec
* Boolean and literal types have a strict scoring (0 or 1)
* Have not decided how to score resource types (+/- proportion of terms)

TODO:
* Optimize MLP models
* Improve implementation of resource scoring
* Extend the indexed text by concatenating description of the types
* Type hierarchy
* Try dbpedia

* How we concatenate questions and types in word2vec has a major impact on performance
