In [17]:
import json
import re
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /home/karl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/karl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def load_dataset(filepath):
    '''
    Read a JSON file from disk and return its deserialized content.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (for the dataset files
        used in this notebook, the callers iterate over it as a sequence of
        dicts).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    '''
    # JSON text is UTF-8; be explicit so the result does not depend on the
    # platform's locale default encoding (e.g. cp1252 on Windows).
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)

In [3]:
# Load the training split and the gold test split from the Wikidata folder of
# the smart-dataset checkout (paths are relative to the notebook's directory).
# The DBpedia training file below is an alternative input that is left disabled.
# train = load_dataset('datasets/DBpedia/smarttask_dbpedia_train.json')
# NOTE(review): a later cell defines `def train(X, y)`, which rebinds the name
# `train` from this dataset to a function. Cells that read the dataset through
# `train` only work if they run before that definition.
train = load_dataset('../smart-dataset/datasets/Wikidata/lcquad2_anstype_wikidata_train.json')
test = load_dataset('../smart-dataset/datasets/Wikidata/lcquad2_anstype_wikidata_test_gold.json')
# Trigger term -> answer category used by the keyword heuristic in categorize().
# Every term is matched as a whole lowercase word inside the question.
categories = {
    'who': 'resource',
    'what': 'resource',
    'when': 'literal',
    'where': 'resource',
    'which': 'literal',
    'whom': 'resource',
    'whose': 'resource',
    'why': 'literal',
    'was': 'boolean',
    'do': 'boolean',
    'does': 'boolean',
    'is': 'boolean',
    'did': 'boolean',
    'how': 'literal',
    'list': 'resource',
    'are': 'boolean',
    'name': 'resource',
    'tell': 'resource',
    'were': 'boolean',
    'count': 'literal',
}

In [4]:
def categorize(question, term_categories=None):
    '''
    Guess the answer category of a question from its earliest trigger term.

    The question is lowercased and every term of the mapping is searched as a
    whole word. The category of the term that occurs first (smallest start
    offset) in the question is returned.

    Args:
        question: The question text. Anything that is not a ``str`` (or a
            subclass of it) yields ``None``.
        term_categories: Optional mapping from lowercase trigger term to
            category. Defaults to the module-level ``categories`` mapping.

    Returns:
        The category of the earliest matching term, or ``None`` when the
        input is not a string or no term occurs in the question.
    '''
    if not isinstance(question, str):
        return None
    if term_categories is None:
        # Resolved at call time so the notebook-level table stays the default.
        term_categories = categories

    lowered = question.lower()
    earliest_start = None
    earliest_term = None
    for term in term_categories:
        # re.escape keeps terms containing regex metacharacters literal.
        found = re.search(r'\b({})\b'.format(re.escape(term)), lowered)
        if found is None:
            continue
        # `<=` keeps the historical tie-break: on an equal start offset the
        # term listed later in the mapping wins.
        if earliest_start is None or found.start() <= earliest_start:
            earliest_start = found.start()
            earliest_term = term

    if earliest_term is None:
        return None
    return term_categories[earliest_term]

In [5]:
# Quick sanity check of the heuristic: both 'what' and 'does' are trigger
# terms, and 'what' occurs first, so its category is the one returned.
categorize("What does the numbers mean?")

'resource'

In [6]:
# Score the keyword heuristic against the gold categories of the loaded
# training documents.
count = 0    # documents whose predicted category equals the gold category
invalid = 0  # documents for which categorize() returned None
qs = []      # the documents counted in `invalid`, kept for inspection
for doc in train:
    question, category = doc['question'], doc['category']
    prediction = categorize(question)
    if prediction is None:
        invalid += 1
        qs.append(doc)
    if prediction == category:
        count += 1

# Fraction of correct predictions, then the number of uncategorized documents.
print(count / len(train))
print(invalid)
# Show up to ten uncategorized documents, each followed by a blank line.
for q in qs[:10]:
    print(q, end='\n\n')

0.7248917867514109
225
{'id': 19829, 'question': "Could you summarize Korea's history of this topic?", 'category': 'resource', 'type': ['aspect of history', 'history of the world']}

{'id': 22651, 'question': 'n/a', 'category': 'resource', 'type': ['human settlement']}

{'id': 18673, 'question': 'Of the century breaks of the Colm Gilcreest equal less than 9.6?', 'category': 'boolean', 'type': ['boolean']}

{'id': 25365, 'question': 'Give me a film character from a fictional universe, such as Marvel comics that starts with a W.', 'category': 'resource', 'type': ['fictional character']}

{'id': 21297, 'question': 'Mention the fictional universe described or included in The Matrix.', 'category': 'resource', 'type': ['fictional location', 'setting', 'fictional entity']}

{'id': 22857, 'question': 'n/a', 'category': 'resource', 'type': ['community', 'geographic region', 'geographic location', 'artificial geographic entity', 'administrative territorial entity of Guatemala', 'second-level adm

In [8]:
def prepare_X_y(train, test):
    '''
    Split two collections of documents into question and label lists.

    Args:
        train: Iterable of dicts that each have a 'question' and a
            'category' key.
        test: Iterable of dicts with the same two keys.

    Returns:
        A 4-tuple ``(X_train, y_train, X_test, y_test)`` of lists: the
        questions and categories of ``train`` followed by those of ``test``,
        each in input order.
    '''
    # Read both fields in a single pass per collection, so that one-shot
    # iterables are handled the same way as lists.
    train_rows = [(doc['question'], doc['category']) for doc in train]
    test_rows = [(doc['question'], doc['category']) for doc in test]

    return (
        [text for text, _ in train_rows],
        [label for _, label in train_rows],
        [text for text, _ in test_rows],
        [label for _, label in test_rows],
    )

In [9]:
def extract_features(train_dataset, test_dataset):
    """Turns raw texts into token-count feature vectors.

    A ``CountVectorizer`` with default settings is fitted on the training
    texts only; the test texts are then transformed with that same fitted
    vectorizer.

    Args:
        train_dataset: List of strings used to fit the vectorizer.
        test_dataset: List of strings transformed with the fitted vectorizer.

    Returns:
        A pair ``(train_vectors, test_vectors)``: the outputs of
        ``fit_transform`` on the training texts and of ``transform`` on the
        test texts.
    """
    # A TfidfVectorizer(sublinear_tf=True, max_df=0.5) variant existed here as
    # a disabled alternative; plain counts are what is actually used.
    bag_of_words = CountVectorizer()
    train_matrix = bag_of_words.fit_transform(train_dataset)
    return train_matrix, bag_of_words.transform(test_dataset)

In [12]:
# Split the loaded documents into texts and labels, then vectorize the texts.
# NOTE(review): `train` must still refer to the loaded dataset here. The cell
# below that defines `def train(X, y)` rebinds the name, so re-running this
# cell after that definition passes a function instead of the data.
X_train, y_train, X_test, y_test = prepare_X_y(train, test)
train_vectors, test_vectors = extract_features(X_train, X_test)

In [13]:
def train(X, y):
    """Fits a multinomial Naive Bayes classifier.

    Args:
        X: Feature matrix (2D), one row per instance.
        y: Labels (1D), one per row of ``X``.

    Returns:
        The fitted ``MultinomialNB`` model (smoothing parameter alpha=0.01).
    """
    # fit() returns the estimator itself, so the fitted model can be returned
    # straight from the call chain.
    return MultinomialNB(alpha=0.01).fit(X, y)

In [18]:
def trainMLP(X, y):
    """Fits a multi-layer perceptron classifier.

    Args:
        X: Feature matrix (2D), one row per instance.
        y: Labels (1D), one per row of ``X``.

    Returns:
        The fitted ``MLPClassifier`` (random_state=1 for repeatable runs,
        at most 300 training iterations).
    """
    # fit() returns the estimator itself, so the fitted model can be returned
    # straight from the call chain.
    return MLPClassifier(random_state=1, max_iter=300).fit(X, y)

In [14]:
# Fit the Naive Bayes model (see train()) on the vectorized training questions.
classifier = train(train_vectors, y_train)

In [15]:
# Predict a category for every test question, then display the fraction of
# predictions that equal the gold label (accuracy).
pred = classifier.predict(test_vectors)
# NOTE(review): `pred == y_test` compares against a plain list; this relies on
# `pred` comparing element-wise (presumably a NumPy array - confirm).
sum(pred==y_test)/len(pred)

0.8361408882082695

In [16]:
# Spot-check a single prediction: the one for the third test question.
pred[2]

'literal'

In [19]:
# Repeat the evaluation with the MLP model (see trainMLP()) on the same
# feature vectors, so the two accuracies are directly comparable.
classifierMLP = trainMLP(train_vectors, y_train)
# NOTE(review): this rebinds `pred`, discarding the predictions produced by
# `classifier` in the earlier cell.
pred = classifierMLP.predict(test_vectors)
# Fraction of test predictions that equal the gold label.
sum(pred==y_test)/len(pred)

0.9022095821483264