# Predict the topic of a Math Question on Math Education Resources

We will use **Machine Learning** to predict the topic of a Math Question from the [Math Education Resources](http://math-education-resources.com). For simplicity we will only consider two topics. Using [multiclass classification](https://en.wikipedia.org/wiki/Multiclass_classification) this can be extended to more than two topics (at the time of writing, April 2015, we have about 1500 questions with 150 topics on MER).

## Data inspection

In [312]:
import os
import json
import numpy as np
from pymongo import MongoClient

In [313]:
client = MongoClient()

In [314]:
questions_collection = client['merdb'].questions


questions_collection.find_one()

{u'ID': u'UBC+MATH307+April_2012+01_(d)',
 u'_id': ObjectId('55383310cec2a2367cebc622'),
 u'answer_html': u'<p>No content found.</p>',
 u'answer_latex': u'No content found.',
 u'contributors': [u'Konradbe'],
 u'course': u'MATH307',
 u'flags': [u'RQ', u'CH', u'CS', u'CT'],
 u'hints_html': [u'<p>No content found.</p>'],
 u'hints_latex': [u'No content found.'],
 u'hints_raw': [u'No content found.'],
 u'num_votes': 0,
 u'question': u'1 (d)',
 u'rating': -1,
 u'sols_html': [u'<p>No content found.</p>'],
 u'sols_latex': [u'No content found.'],
 u'sols_raw': [u'No content found.'],
 u'statement_html': u'<p>Suppose you are given a set of <em>N</em> data points <em>(x<sub>n</sub>, y<sub>n</sub>)</em>, with <em>x<sub>n</sub></em> increasing, and you wish to interpolate these points with a spline function <em><span class="math">\\(f\\)</span></em>, where <em><span class="math">\\(f\\)</span>(x)</em> is given by the cubic polynomial <em>p<sub>n</sub>(x)</em> on each interval <em>(x<sub>n</sub>, x<

In [315]:
# create an array of all topics of interest

topic_tags = ["Eigenvalues_and_eigenvectors", "Probability_density_function", "Taylor_series", "Substitution"]

num_classes = len(topic_tags)

In [316]:
# create an array of questions with topics contained in topic_tags

questions = []
for q in questions_collection.find({"topics": 
                                         {"$in": topic_tags}
                                        }):
    questions.append(q)


In [317]:
# count how many questions there are for each topic in topic_tags

for t in topic_tags:
    print(t, questions_collection.find({"topics": t}).count())
    

('Eigenvalues_and_eigenvectors', 45)
('Probability_density_function', 39)
('Taylor_series', 50)
('Substitution', 37)


In [318]:
questions[77].keys()

[u'rating',
 u'contributors',
 u'topics',
 u'year',
 u'answer_html',
 u'course',
 u'solvers',
 u'sols_raw',
 u'hints_html',
 u'question',
 u'statement_raw',
 u'num_votes',
 u'statement_html',
 u'term',
 u'statement_latex',
 u'hints_raw',
 u'hints_latex',
 u'ID',
 u'sols_latex',
 u'url',
 u'flags',
 u'answer_latex',
 u'sols_html',
 u'_id']

In [319]:
# data preprocessing modules

import helpers
from nltk import PorterStemmer
from nltk.corpus import stopwords
from sklearn.preprocessing import label_binarize

In [320]:
# split the questions into training and test


# for reproducibility we set the seed of the random number generator
np.random.seed(23)

test_proportion = 0.75
num_samples = int(test_proportion * len(questions))
test_indices = np.random.choice(range(len(questions)), num_samples, replace=False)


questions_train = [q for i, q in enumerate(questions) if not i in test_indices]
questions_test = [q for i, q in enumerate(questions) if i in test_indices]

for topic in topic_tags:
    print('%s questions in test set: %d' % (topic, sum([1 for q in questions_test if topic in q['topics']])))

Eigenvalues_and_eigenvectors questions in test set: 30
Probability_density_function questions in test set: 32
Taylor_series questions in test set: 40
Substitution questions in test set: 26


In [321]:
# clean & filter data
def words_from_question(q):
    # we are only interested in question statement, hints and solution
    all_text = q['statement_html'] + q['hints_html'][0] + q['sols_html'][0] 
    return helpers.strip_text(all_text)

# remove commonly used words and combine words with the same root
def words_stemmed_no_stop(words):
    stop = stopwords.words('english')
    res = []
    for word in words:
        stemmed = PorterStemmer().stem_word(word)
        if stemmed not in stop and len(stemmed) > 1: #take words longer than 1 char
            res.append(stemmed)
    return res

In [322]:
# collect all stemmed words from training set, excluding stop words

vocabulary = []

for q in questions_train:
    vocabulary += words_stemmed_no_stop(words_from_question(q))
vocabulary_sorted = sorted(set(vocabulary))



print('Number of distinct words:', len(vocabulary_sorted))
print(vocabulary_sorted[:15])

('Number of distinct words:', 462)
[u'abov', u'accord', u'ad', u'addit', u'age', u'ahead', u'allow', u'almost', u'also', u'altern', u'alway', u'analyt', u'ani', u'anoth', u'answer']


In [323]:
# returns a binary vector for a question, indicating whether a word
# from vocabulary is contained in the question

def question_to_vector(q, voc):
    # vector of dimension 1xlen(voc)
    x_vec = np.zeros(len(voc))
    words = words_stemmed_no_stop(words_from_question(q))
    for word in words:
        if word in voc:
            x_vec[voc.index(word)] = 1
    return x_vec

In [324]:
print(question_to_vector(questions[0], vocabulary))
sum(question_to_vector(questions[0], vocabulary))

[ 1.  1.  1. ...,  0.  0.  0.]


24.0

In [325]:
# create an array of numerical labels for classes

class_indices = range(0, num_classes)

# transform questions into appropriate labels
def questions_to_y(qs):
    topic_labels = []
    for q in qs:
        # go through topic_tags, if any of the topics is in the question's topic list
        # append its index to topic_labels
        for i in class_indices:
            if topic_tags[i] in q['topics']:
                topic_labels.append(i)
                # assumes there is only one topic for each question
                break 
                
    return label_binarize(topic_labels, class_indices)

In [326]:
def questions_to_X(qs, voc):
    X = np.zeros(shape=(len(qs), len(voc)))

    for i, q in enumerate(qs):
        X[i, :] = question_to_vector(q, voc)
    return X

In [327]:
X_train = questions_to_X(questions_train, vocabulary_sorted)
X_test = questions_to_X(questions_test, vocabulary_sorted)

In [328]:
assert len(X_train) == len(questions_train)

In [329]:
y_train = questions_to_y(questions_train)
y_test = questions_to_y(questions_test)

In [330]:
# machine learning modules
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm

In [331]:
# SVM classifier

classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability = True, random_state=np.random.RandomState(0)))

In [332]:
trained_classifier = classifier.fit(X_train, y_train)

In [334]:
preds = trained_classifier.predict_proba(X_test)

print(np.around(preds, 3))

[[ 0.743  0.038  0.037  0.04 ]
 [ 0.823  0.093  0.08   0.003]
 [ 0.827  0.087  0.022  0.01 ]
 [ 0.839  0.103  0.013  0.012]
 [ 0.978  0.025  0.003  0.006]
 [ 0.96   0.067  0.011  0.003]
 [ 0.994  0.04   0.025  0.   ]
 [ 0.981  0.049  0.017  0.002]
 [ 0.979  0.011  0.156  0.001]
 [ 0.875  0.12   0.024  0.004]
 [ 0.95   0.02   0.023  0.007]
 [ 0.98   0.055  0.01   0.002]
 [ 0.562  0.054  0.113  0.029]
 [ 0.858  0.065  0.027  0.012]
 [ 0.988  0.013  0.005  0.008]
 [ 0.992  0.023  0.016  0.001]
 [ 0.991  0.015  0.041  0.001]
 [ 0.917  0.023  0.014  0.027]
 [ 0.838  0.055  0.655  0.001]
 [ 0.042  0.131  0.948  0.014]
 [ 0.008  0.039  0.009  0.991]
 [ 0.03   0.022  0.011  0.961]
 [ 0.038  0.247  0.969  0.005]
 [ 0.018  0.044  0.007  0.977]
 [ 0.032  0.936  0.028  0.017]
 [ 0.048  0.027  1.     0.003]
 [ 0.015  0.024  1.     0.012]
 [ 0.009  0.951  0.065  0.043]
 [ 0.024  0.04   0.008  0.971]
 [ 0.05   0.047  0.001  0.985]
 [ 0.022  0.039  0.995  0.025]
 [ 0.033  0.979  0.013  0.01 ]
 [ 0.012

In [311]:
# code not edited

from sklearn.metrics import roc_curve, auc

fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(3):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], preds[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), preds.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

print(roc_auc)

IndexError: index 1 is out of bounds for axis 1 with size 1