In [1]:
%pylab inline
from parser import *

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Parse everything under ../logs and keep only the entries whose length
# exceeds 100 (reported below as "tokens").
parsed = list(filter(lambda entry: len(entry) > 100, parse_all('../logs')))
print('There are', len(parsed), 'files with over 100 tokens')

Read successfully 1384 out of 1480 files
There are 800 files with over 100 tokens


In [3]:
def get_classified_responses(tokens):
    """Label each command response by whether the description changed afterwards.

    Walks ``tokens`` in order. Each token is indexed as ``tok[0]`` (its kind,
    compared against 'description', 'response' and 'command') and ``tok[1]``
    (its text, which is stripped before use).

    A 'response' token is collected only when the token immediately before it
    was a 'command'. Whenever a 'description' token arrives and an earlier
    description has already been seen, every collected response is emitted
    with label 1 if the stripped description text differs from the previous
    one, or 0 if it is the same.

    Notes on edge behaviour:
      * Responses collected before the first description are not discarded;
        they are emitted together with the first labelled batch.
      * Responses collected after the last description are never emitted.

    Returns:
        A ``(responses, positives)`` pair of parallel lists: the stripped
        response texts and their 0/1 labels.
    """
    responses, positives = [], []
    pending = []        # responses waiting for the next description to label them
    last_desc = None    # stripped text of the most recent description
    prev_kind = None    # kind of the token handled in the previous iteration

    for tok in tokens:
        kind = tok[0]
        if kind == 'description':
            desc = tok[1].strip()
            if last_desc is not None:
                # 1 when the description changed, 0 when it stayed the same.
                label = int(desc != last_desc)
                responses.extend(pending)
                positives.extend([label] * len(pending))
                pending = []
            last_desc = desc
        elif kind == 'response' and prev_kind == 'command':
            pending.append(tok[1].strip())
        prev_kind = kind

    return responses, positives

In [4]:
# Flatten the per-file results into two parallel lists: response texts and
# their 0/1 labels.
responses, positives = [], []
for file_tokens in parsed:
    file_responses, file_labels = get_classified_responses(file_tokens)
    responses.extend(file_responses)
    positives.extend(file_labels)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Upper bound on how many labelled responses are used for fitting.
limit = 10 ** 6
train_texts = responses[:limit]
train_labels = positives[:limit]

# Word counts -> term-frequency weighting (IDF disabled) -> multinomial naive Bayes.
count_vect = CountVectorizer()
X_count = count_vect.fit_transform(train_texts)
tf_transformer = TfidfTransformer(use_idf=False)
X_tf = tf_transformer.fit_transform(X_count)
clf = MultinomialNB().fit(X_tf, train_labels)

In [8]:
# X_tf is the same matrix the classifier was fitted on, so the figure printed
# here is training accuracy, not a held-out estimate.
predicted = clf.predict(X_tf)
matches = predicted == positives[:limit]
print('Accuracy:', matches.mean())

Accuracy: 0.875293742458


In [9]:
# Hand-written sample responses for a quick sanity check of the fitted model.
test = [
    "You can't do that thing",
    "Taken",
    "You opened the chest. You found a golden key inside it",
    "You need a key to open the chest",
]
# Apply the already-fitted vectorizer and transformer (transform only, no refit).
X_test_count = count_vect.transform(test)
X_test_tf = tf_transformer.transform(X_test_count)
predicted = clf.predict(X_test_tf)
for sentence, label in zip(test, predicted):
    print(sentence, label)

You can't do that thing 0
Taken 1
You opened the chest. You found a golden key inside it 1
You need a key to open the chest 0


In [10]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23, so import the standalone joblib package first and fall back to the
# vendored copy only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the classifier together with the fitted vectorizer and transformer as
# a single tuple, so the three objects are always saved and restored as a set.
# protocol=2 selects pickle protocol 2 for the written file.
joblib.dump((clf, count_vect, tf_transformer), 'response_bayes_classifier.pkl', protocol=2)

['response_bayes_classifier.pkl']

In [11]:
# Read back the (classifier, vectorizer, transformer) tuple written by the
# previous cell. joblib.load unpickles the file, so only load files you trust.
restored = joblib.load('response_bayes_classifier.pkl')
clf2, count_vect2, tf_transformer2 = restored

In [12]:
# Run the same sample responses through the objects restored from disk; the
# printed labels can be compared with those from the in-memory model.
test = [
    "You can't do that thing",
    "Taken",
    "You opened the chest. You found a golden key inside it",
    "You need a key to open the chest",
]
X_test_count = count_vect2.transform(test)
X_test_tf = tf_transformer2.transform(X_test_count)
predicted = clf2.predict(X_test_tf)
for sentence, label in zip(test, predicted):
    print(sentence, label)

You can't do that thing 0
Taken 1
You opened the chest. You found a golden key inside it 1
You need a key to open the chest 0
