In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from pprint import pprint
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, HuberRegressor
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.externals import joblib

In [2]:
# Load the pre-processed ingredient data from disk.
# read_pickle unpickles the file, which can execute arbitrary code: only point
# this at files from a trusted source.
# NOTE(review): later cells index each element by 'lemma', 'pos', 'tag',
# 'is_alpha', 'is_num' and 'label' — presumably a sequence of dict-like
# sentence records; confirm against the preprocessing step.
sents = pd.read_pickle('../data/processed-my-ingredients.pickle')

In [36]:
# Change Sents to Features & Labels!

def sents_to_features(sents):
    """Flatten annotated sentences into per-token feature dicts and labels.

    Every element of ``sents`` is read through the keys 'lemma', 'pos',
    'tag', 'is_alpha', 'is_num' and 'label', each of which must support
    positional indexing. The length of the 'lemma' entry decides how many
    tokens are emitted for that sentence.

    For each token the feature dict holds its own attributes, start/end
    flags, and the pos/tag/is_num of the immediately preceding and
    following token. A neighbour that falls outside the sentence is encoded
    as the empty string ''. The neighbouring lemmas ('prev_word' /
    'next_word') are intentionally not part of this feature set.

    Returns:
        (features, labels): two parallel lists, one entry per token.
    """
    features, labels = [], []
    for sent in sents:
        n_tokens = len(sent['lemma'])
        last_idx = n_tokens - 1

        def neighbour(key, pos):
            # Attribute `key` of the token at `pos`, or '' when `pos` lies
            # outside the current sentence.
            if 0 <= pos < n_tokens:
                return sent[key][pos]
            return ''

        for idx in range(n_tokens):
            before, after = idx - 1, idx + 1
            row = {
                # current token
                'word': sent['lemma'][idx],
                'pos': sent['pos'][idx],
                'tag': sent['tag'][idx],
                'is_alpha': sent['is_alpha'][idx],
                'is_num': sent['is_num'][idx],
                'is_start': idx == 0,
                'is_end': idx == last_idx,
                # previous token
                'prev_pos': neighbour('pos', before),
                'prev_tag': neighbour('tag', before),
                'prev_is_num': neighbour('is_num', before),
                # next token
                'next_pos': neighbour('pos', after),
                'next_tag': neighbour('tag', after),
                'next_is_num': neighbour('is_num', after),
            }
            features.append(row)
            labels.append(sent['label'][idx])
    return features, labels

features, labels = sents_to_features(sents)
# Cache the extracted (features, labels) pair on disk. The context manager
# guarantees the file handle is flushed and closed even if pickling fails;
# passing a bare open(...) to pickle.dump leaves the handle open.
with open('../data/features-my-ingredients.pickle', 'wb') as features_file:
    pickle.dump((features, labels), features_file)

In [37]:
# NOTE: `features` holds one dict per token (sents_to_features appends once per
# token position), so this count is the number of tokens even though the
# printed label says "Recipes".
print('Num Recipes:', len(features))
# Bare last expression: the sample feature dict is rendered as the cell output.
features[8]

Num Recipes: 654788


{'word': '4.0',
 'pos': 'NUM',
 'tag': 'CD',
 'is_alpha': False,
 'is_num': True,
 'is_start': True,
 'is_end': False,
 'prev_pos': '',
 'prev_tag': '',
 'prev_is_num': '',
 'next_pos': 'PUNCT',
 'next_tag': '-LRB-',
 'next_is_num': False}

In [38]:
# 75/25 train/test split. random_state is pinned so that the split, and every
# score computed from it below, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, train_size=0.75, random_state=42)



# Overview
We are going to build several basic models, each using progressively more of the data provided.

1. only pos / tag
1. pos and tag
1. pos, tag, is_alpha, is_num

In [29]:
def _dict_pipeline(step_name, estimator):
    """Wrap `estimator` in a Pipeline behind a dense DictVectorizer step."""
    return Pipeline([
        ('vectorizer', DictVectorizer(sparse=False)),
        (step_name, estimator),
    ])


# One pipeline per baseline classifier; all share the same vectorizer setup.
clf_dt = _dict_pipeline('decision_tree', DecisionTreeClassifier(criterion='entropy'))
clf_lr = _dict_pipeline('log_reg', LogisticRegression())
clf_multi_nb = _dict_pipeline('multi_nb', MultinomialNB())
clf_gaus_nb = _dict_pipeline('gaus_nb', GaussianNB())
clf_bern_nb = _dict_pipeline('bern_nb', BernoulliNB())

# (printed label, pipeline) pairs, in the order they are fitted and reported.
_baselines = [
    ('Decision Tree Score:      ', clf_dt),
    ('Logistic Regression Score:', clf_lr),
    ('Multi. Naive Bayes Score: ', clf_multi_nb),
    ('Gauss Naive Bayes Score:  ', clf_gaus_nb),
    ('Bern Naive Bayes Score:   ', clf_bern_nb),
]

# Quick comparison only: fit on the first 5000 training samples and score on
# the first 15000 test samples.
_fit_X, _fit_y = X_train[:5000], y_train[:5000]
_eval_X, _eval_y = X_test[:15000], y_test[:15000]

# Fit every model first, then report, matching the original ordering.
for _baseline_label, _baseline_clf in _baselines:
    _baseline_clf.fit(_fit_X, _fit_y)
for _baseline_label, _baseline_clf in _baselines:
    print(_baseline_label, _baseline_clf.score(_eval_X, _eval_y))



Decision Tree Score:       0.8435333333333334
Logistic Regression Score: 0.8635333333333334
Multi. Naive Bayes Score:  0.8118
Gauss Naive Bayes Score:   0.7548666666666667
Bern Naive Bayes Score:    0.8061333333333334


In [30]:
# WARNING: these are very SLOW!
# The RBF-kernel variant is kept here, disabled, for reference:
# clf_svc_rbf = Pipeline([
#     ('vectorizer', DictVectorizer(sparse=False)),
#     ('svc', SVC())
# ])
# clf_svc_rbf.fit(X_train[:5000], y_train[:5000])
# print('SVC RBF Score:   ', clf_svc_rbf.score(X_test[:15000], y_test[:15000]))

# Linear-kernel SVC behind the same dense DictVectorizer as the other baselines.
_svc_linear_steps = [
    ('vectorizer', DictVectorizer(sparse=False)),
    ('svc', SVC(kernel='linear')),
]
clf_svc_linear = Pipeline(_svc_linear_steps)

# Same subsets as the other baselines: 5000 training / 15000 test samples.
clf_svc_linear.fit(X_train[:5000], y_train[:5000])
_svc_linear_score = clf_svc_linear.score(X_test[:15000], y_test[:15000])
print('SVC Linear Score:', _svc_linear_score)

SVC Linear Score: 0.8739333333333333


# Best Models
1. Logistic Regression - 86.4%
1. Linear SVC - 87.4%
1. Decision Tree - 84.4%

In [19]:
def sents_to_features(sents):
    """Build lemma/tag features over a +/-2 token window for every token.

    Every element of ``sents`` is read through the keys 'lemma', 'tag' and
    'label', each of which must support positional indexing. The length of
    the 'lemma' entry decides how many tokens are emitted for that sentence.

    Each feature dict contains the token's own lemma ('word') and 'tag',
    plus the lemma and tag of the tokens one and two positions before and
    after it. A window position outside the sentence is encoded as ''.
    The pos / is_alpha / is_num / is_start / is_end features are
    deliberately disabled in this variant.

    Returns:
        (features, labels): two parallel lists, one entry per token.
    """
    # (offset from the current token, lemma feature key, tag feature key)
    window = (
        (-1, 'prev_word', 'prev_tag'),
        (-2, 'prev_prev_word', 'prev_prev_tag'),
        (1, 'next_word', 'next_tag'),
        (2, 'next_next_word', 'next_next_tag'),
    )
    features, labels = [], []
    for sent in sents:
        n_tokens = len(sent['lemma'])
        for idx in range(n_tokens):
            row = {
                'word': sent['lemma'][idx],
                'tag': sent['tag'][idx],
            }
            for offset, word_key, tag_key in window:
                pos = idx + offset
                inside = 0 <= pos < n_tokens
                row[word_key] = sent['lemma'][pos] if inside else ''
                row[tag_key] = sent['tag'][pos] if inside else ''
            features.append(row)
            labels.append(sent['label'][idx])
    return features, labels

features, labels = sents_to_features(sents)
# random_state is pinned so that the split, and therefore the score reported
# below, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, train_size=0.75, random_state=42)

# NOTE(review): sparse=False makes DictVectorizer materialise a dense array,
# and this feature set one-hot encodes five lemma-valued features.
# LogisticRegression also accepts sparse input, so the default sparse=True is
# worth considering if memory becomes a problem.
clf_lr = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('log_reg', LogisticRegression())
])
# Alternative models, disabled:
# clf_dt = Pipeline([
#     ('vectorizer', DictVectorizer(sparse=False)),
#     ('decision_tree', DecisionTreeClassifier(criterion='entropy'))
# ])
# clf_svc_linear = Pipeline([
#     ('vectorizer', DictVectorizer(sparse=False)),
#     ('svc', SVC(kernel='linear'))
# ])

# Train on the first 275000 training samples; score on the whole test split.
clf_lr.fit(X_train[:275000], y_train[:275000])
print('Logistic Regression Score:', clf_lr.score(X_test, y_test))
# clf_dt.fit(X_train[:50000], y_train[:50000])
# print('Decision Tree Score:', clf_dt.score(X_test[:75000], y_test[:75000]))
# clf_svc_linear.fit(X_train[:50000], y_train[:50000])
# print('SVC Linear Score:', clf_svc_linear.score(X_test[:75000], y_test[:75000]))



Logistic Regression Score: 0.9278606205367234


In [20]:
# Spot-check: show the lemmas of 15 samples next to the labels the model assigns.
# Note these samples come from X_train, i.e. data the model was fitted on.
ex = X_train[10:25]
ex_words = [sample['word'] for sample in ex]
pprint(ex_words)
ex_predictions = clf_lr.predict(ex)
pprint(ex_predictions)
# pprint(clf_dt.predict(ex))
# pprint(clf_svc_linear.predict(ex))

['salt',
 'sheet',
 'slice',
 'small',
 'parmesan',
 'lettuce',
 'meat',
 ',',
 'small',
 'juice',
 '3.0',
 'large',
 '(',
 '3.5',
 ')']
array(['NAME', 'UNIT', 'COMMENT', 'COMMENT', 'NAME', 'NAME', 'NAME',
       'PUNCTUATION', 'COMMENT', 'NAME', 'QUANTITY', 'COMMENT',
       'PUNCTUATION', 'QUANTITY', 'PUNCTUATION'], dtype='<U11')


In [21]:
# Persist the fitted logistic-regression pipeline to disk.
# The "92.8" in the file name is hard-coded — presumably the rounded test
# accuracy printed above; keep it in sync when the model is retrained.
# NOTE(review): `joblib` is imported from `sklearn.externals`, which was
# deprecated in scikit-learn 0.21 and removed in 0.23; newer environments need
# `import joblib` instead.
filename = '../data/logistic-regression-92.8.pickle'
# Bare last expression: the return value of joblib.dump is shown as the cell output.
joblib.dump(clf_lr, filename)

['../data/logistic-regression-92.8.pickle']

In [22]:
# Reload the pipeline from `filename`, replacing the in-memory clf_lr.
# joblib files are pickle-based and can execute arbitrary code when loaded:
# only load files from a trusted source.
clf_lr = joblib.load(filename)