# Literal classifier
This notebook trains the literal category classifier.

The type classifier is saved as: 'type_literal_classifier.sav'

Use the vectorizer 'literal_vectorizer.sav' to extract the feature vectors

In [1]:
import json
from sklearn.neural_network import MLPClassifier
import numpy as np
import pickle
import utils
import gensim 
from gensim.models import Word2Vec 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords 
from elasticsearch import Elasticsearch

In [2]:
# Load the DBpedia train and test files with the shared dataset loader.
# The generator is consumed in order, so `train` is read before `test`.
train, test = (
    utils.load_dataset(dataset_path)
    for dataset_path in (
        'datasets/DBpedia/train.json',
        'datasets/DBpedia/test_grnd.json',
    )
)

In [3]:
# Unpack the four values returned by utils.prepare_X_y for the two splits.
# NOTE(review): presumably X_* are the model inputs and y_* the labels for each
# split — confirm against utils.prepare_X_y.
X_train, y_train, X_test, y_test = utils.prepare_X_y(train, test)

Extract the literal train/test objects, keyed by their index in the original dataset, so that we can iterate over them in the correct order.


In [4]:
# split_bool_literal_reference returns three values; only the middle one (the
# literal part) is kept for each split, the other two are discarded.
# The resulting maps are later read via .keys() (used as indices into
# train/test) and .values() (used as the classifier inputs).
_, literal_map, _ = utils.split_bool_literal_reference(X_train, y_train)
_, test_literal_map, _ = utils.split_bool_literal_reference(X_test, y_test)

Build train and test sets consisting only of literal objects.
Then extract the features. 

In [5]:
# Inputs are the map values; labels are looked up in the original dataset by
# map key, taking the first entry of each record's 'type' list. Keys and values
# of a dict iterate in the same order, so inputs and labels stay aligned.
X_train_literal = list(literal_map.values())
y_train_literal = [train[idx]['type'][0] for idx in literal_map.keys()]

X_test_literal = list(test_literal_map.values())
y_test_literal = [test[idx]['type'][0] for idx in test_literal_map.keys()]

# Turn both literal sets into feature vectors using the vectorizer file
# 'literal_vectorizer.sav'.
train_vectors_literal, test_vectors_literal = utils.extract_features(
    X_train_literal,
    X_test_literal,
    'literal_vectorizer.sav',
)

Train the literal type classifier, report its accuracy on the test set, and save it to disk.

In [6]:
# Fit a multi-layer perceptron on the literal feature vectors. random_state is
# fixed so that re-running the notebook reproduces the same model.
clf = MLPClassifier(random_state=1, max_iter=300)
clf.fit(train_vectors_literal, y_train_literal)
pred_literal = clf.predict(test_vectors_literal)

# Accuracy = fraction of test samples whose predicted type equals the label.
# The labels are converted to an array so the comparison is element-wise.
print("Accuracy:", np.mean(pred_literal == np.asarray(y_test_literal)))

# Persist the fitted classifier. The context manager guarantees the file is
# flushed and closed even if pickling raises.
with open('type_literal_classifier.sav', 'wb') as model_file:
    pickle.dump(clf, model_file)

Accuracy: 0.9502407704654896
