# Type classifier
text goes here

In [1]:
import json
from sklearn.neural_network import MLPClassifier
import numpy as np
import pickle
import utils
import gensim 
from gensim.models import Word2Vec 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords 
from elasticsearch import Elasticsearch

In [2]:
train = utils.load_dataset('datasets/DBpedia/train.json')
test = utils.load_dataset('datasets/DBpedia/test_grnd.json')

In [3]:
X_train, y_train, X_test, y_test = utils.prepare_X_y(train, test)
# train_vectors, test_vectors = utils.extract_features(X_train, X_test, 'literal_vectorizer.sav')

# Predict types

In [4]:
def split_bool_literal_reference(X, y):
    bool_map = {}
    literal_map = {}
    resource_map = {}

    for i in range(len(y)):
        if y[i] == 'boolean':
            bool_map[i] = X[i]
        elif y[i] == 'literal':
            literal_map[i] = X[i]
        elif y[i] == 'resource':
            resource_map[i] = X[i]
            
    return bool_map, literal_map, resource_map

In [5]:
bool_map, literal_map, resource_map = split_bool_literal_reference(X_train, y_train)
test_bool_map, test_literal_map, test_resource_map = split_bool_literal_reference(X_test, y_test)

## Train literal

In [6]:
X_train_literal = list(literal_map.values())
y_train_literal = []
for i in list(literal_map.keys()):
    y_train_literal.append(train[i]['type'][0])
    
X_test_literal = list(test_literal_map.values())
y_test_literal = []
for i in list(test_literal_map.keys()):
    y_test_literal.append(test[i]['type'][0])

In [7]:
train_vectors_literal, test_vectors_literal = utils.extract_features(X_train_literal, X_test_literal, 'literal_vectorizer.sav')

In [8]:
clf = MLPClassifier(random_state=1, max_iter=300)
clf.fit(train_vectors_literal, y_train_literal)
pred_literal = clf.predict(test_vectors_literal)
print("Accuracy:", sum(pred_literal==y_test_literal)/len(pred_literal))
pickle.dump(clf, open('type_literal_classifier.sav', 'wb'))

Accuracy: 0.9502407704654896


## Bundle

In [9]:
es = Elasticsearch()
INDEX_NAME = 'questions'
es.info()
stop_words = set(stopwords.words('english'))
word2vec_sg = pickle.load(open('word2vec_sg.sav', 'rb'))
# word2vec_sg.wv.most_similar('cola') # Check if working

In [10]:
def predict_resource_types(es, index, model, index_name='questions', size=5, threshold=0.85):
    q = test[index]['question']
    q = " ".join([w for w in word_tokenize(q)  if not w in stop_words])
    results = []
    try:
        results = es.search(index=index_name, q=q, _source=True, size=size).get('hits', {}).get('hits', {})
    except:
        return []
    types = set([])
    for result in results:
        for t in result['_source']['type']:
            if len(t.split()) == 1:
                types.add(t)

    q_terms = q.split() # Question terms
    t_terms = list(types) # Types from search
    pred_types = set([])

    for i in q_terms:
        for j in t_terms:
            try:
                score = model.wv.similarity(i, j[4:])
                if score > threshold:
                    pred_types.add(j)
#                     print(i, j, score)
            except:
#                 print(i, j)
                pass

    return list(pred_types)

In [11]:
predict_resource_types(es, 12, word2vec_sg, threshold=0.85)

['dbo:Location', 'dbo:CelestialBody', 'dbo:Planet', 'dbo:Asteroid']

In [12]:
test[12]

{'id': 'dbpedia_10882',
 'question': "What is the furthest planet with the lowest synodic period from the source Otto's encyclopdedia?",
 'category': 'resource',
 'type': ['dbo:Planet', 'dbo:CelestialBody', 'dbo:Place', 'dbo:Location']}

In [13]:
type_preds = []
count_literal = 0
for i in range(len(test)):
    
    if i in test_bool_map:
        type_preds.append('boolean')
    if i in test_literal_map:
        type_preds.append(pred_literal[count_literal])
        count_literal += 1
    if i in test_resource_map:
        type_preds.append(predict_resource_types(es, i, word2vec_sg, threshold=0.85))

In [14]:
y_test_types = []
    
for doc in test:
    y_test_types.append(doc['type'])

In [15]:
y_test_types[:5]

[['number'],
 ['dbo:Single', 'dbo:MusicalWork', 'dbo:Work'],
 ['number'],
 ['boolean'],
 ['string']]

In [16]:
score = 0
bool_score = 0
literal_score = 0
resource_score_penalty = 0
resource_score_no_penalty = 0
resource_score_min_1_correct = 0
for i in range(len(test)):
    if i in test_bool_map:
        score += type_preds[i] == y_test_types[i][0]
        bool_score += type_preds[i] == y_test_types[i][0]
    if i in test_literal_map:
        score += type_preds[i] == y_test_types[i][0]
        literal_score += type_preds[i] == y_test_types[i][0]
    if i in test_resource_map:
        preds = type_preds[i]
        labels = y_test_types[i]
        sub_score = 0
        penalty = 0
        min_1_correct = False
        for pred in preds:
            if pred in labels:
                sub_score += 1/len(labels)
                min_1_correct = True
            else:
                penalty += 1/len(labels)
                
        resource_score_no_penalty += sub_score
        resource_score_penalty += max(sub_score - penalty, 0)
        if min_1_correct:
            resource_score_min_1_correct += 1
        score += max(sub_score, 0)
        
        
# print(score / len(y_test_types))
print("Individual category accuracy on their respective category subsets")
print("Boolean accuracy:", bool_score/ len(test_bool_map))
print("Literal accuracy:", literal_score/ len(test_literal_map))
print("Resource min 1 correct accuracy:", resource_score_min_1_correct/ len(test_resource_map))
print("Resource no penalty accuracy:", resource_score_no_penalty/ len(test_resource_map))
print("Resource penalty accuracy:", resource_score_penalty/ len(test_resource_map))
print()
print("Individual category accuracy on across entire test set")
print("Boolean accuracy:", bool_score/ len(y_test_types))
print("Literal accuracy:", literal_score/ len(y_test_types))
print("Resource min 1 correct accuracy:", resource_score_min_1_correct/ len(y_test_types))
print("Resource no penalty accuracy:", resource_score_no_penalty/ len(y_test_types))
print("Resource penalty accuracy:", resource_score_penalty/ len(y_test_types))
print()
print("Total score")
print("Resource min 1 correct accuracy:", (resource_score_min_1_correct + bool_score + literal_score)/ len(y_test_types))
print("Resource no penalty accuracy:", (resource_score_no_penalty + bool_score + literal_score)/ len(y_test_types))
print("Resource penalty accuracy:", (resource_score_penalty + bool_score + literal_score)/ len(y_test_types))

ZeroDivisionError: division by zero