In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pprint import pprint
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, HuberRegressor
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.externals import joblib

In [2]:
# Load the contents of the processed-ingredients pickle into `sents`.
# NOTE(review): unpickling can execute arbitrary code; only read files you trust.
sents = pd.read_pickle('../data/processed-my-ingredients.pickle')
# Seed NumPy's global RNG so the shuffle below produces the same order on every run.
np.random.seed(4)
np.random.shuffle(sents) # randomize the order of `sents` in place

In [None]:
# Change Sents to Features & Labels!

def sents_to_features(sents):
    """Flatten sentences into one feature dict and one label per word.

    Each element of ``sents`` is indexed by the keys ``'lemma'``, ``'pos'``,
    ``'tag'``, ``'is_alpha'``, ``'is_num'`` and ``'label'``; every one of those
    is indexed by word position. For each position a dict is emitted holding
    the word's own attributes, start/end flags, and the pos/tag/is_num values
    of the immediate neighbours (``''`` when there is no neighbour).

    Returns:
        A ``(features, labels)`` pair of equally long lists.
    """
    features = []
    labels = []
    for sent in sents:
        n_words = len(sent['lemma'])

        def neighbour(field, position):
            # Value of `field` at `position`, or '' when it falls outside the sentence.
            return sent[field][position] if 0 <= position < n_words else ''

        for idx in range(n_words):
            before = idx - 1
            after = idx + 1
            row = {
                # the word itself
                'word': sent['lemma'][idx],
                'pos': sent['pos'][idx],
                'tag': sent['tag'][idx],
                'is_alpha': sent['is_alpha'][idx],
                'is_num': sent['is_num'][idx],
                'is_start': idx == 0,
                'is_end': idx == n_words - 1,
                # left neighbour (the neighbouring lemma is intentionally not used)
                'prev_pos': neighbour('pos', before),
                'prev_tag': neighbour('tag', before),
                'prev_is_num': neighbour('is_num', before),
                # right neighbour (the neighbouring lemma is intentionally not used)
                'next_pos': neighbour('pos', after),
                'next_tag': neighbour('tag', after),
                'next_is_num': neighbour('is_num', after),
            }
            features.append(row)
            labels.append(sent['label'][idx])
    return features, labels

# Build the per-word features/labels and cache them on disk for later reuse.
features, labels = sents_to_features(sents)
# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes, rather than leaving the handle open until garbage collection.
with open('../data/features-my-ingredients.pickle', 'wb') as features_file:
    pickle.dump((features, labels), features_file)

In [None]:
# `features` holds one dict per word (not per recipe), so label the count accordingly.
print('Num Tokens:', len(features))
# Display a single feature dict as a sanity check of the extraction.
features[8]

In [None]:
# Shuffled 75/25 split of the per-word rows. Pinning random_state makes this cell
# produce the same split every time it runs, instead of depending on how much of
# the global NumPy random stream earlier cells have already consumed.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, train_size=0.75, random_state=4
)

# Overview
We are going to build several basic models, each one using progressively more of the features extracted above.

1. only pos / tag
1. pos and tag
1. pos, tag, is_alpha, is_num

In [None]:
# Five baseline classifiers; each pipeline one-hot encodes the feature dicts
# into a dense matrix before handing them to the estimator.
clf_dt = Pipeline(steps=[
    ('vectorizer', DictVectorizer(sparse=False)),
    ('decision_tree', DecisionTreeClassifier(criterion='entropy')),
])
clf_lr = Pipeline(steps=[
    ('vectorizer', DictVectorizer(sparse=False)),
    ('log_reg', LogisticRegression()),
])
clf_multi_nb = Pipeline(steps=[
    ('vectorizer', DictVectorizer(sparse=False)),
    ('multi_nb', MultinomialNB()),
])
clf_gaus_nb = Pipeline(steps=[
    ('vectorizer', DictVectorizer(sparse=False)),
    ('gaus_nb', GaussianNB()),
])
clf_bern_nb = Pipeline(steps=[
    ('vectorizer', DictVectorizer(sparse=False)),
    ('bern_nb', BernoulliNB()),
])

# (report label, model) pairs, in the order the scores are printed.
baselines = [
    ('Decision Tree Score:      ', clf_dt),
    ('Logistic Regression Score:', clf_lr),
    ('Multi. Naive Bayes Score: ', clf_multi_nb),
    ('Gauss Naive Bayes Score:  ', clf_gaus_nb),
    ('Bern Naive Bayes Score:   ', clf_bern_nb),
]

# Fit every model on the first 5000 training rows ...
for title, model in baselines:
    model.fit(X_train[:5000], y_train[:5000])

# ... then report accuracy on the first 15000 test rows.
for title, model in baselines:
    print(title, model.score(X_test[:15000], y_test[:15000]))

In [None]:
# WARNING: SVC training is very SLOW, so only the linear kernel is enabled.
# The RBF variant is kept here, disabled, for reference:
# clf_svc_rbf = Pipeline([
#     ('vectorizer', DictVectorizer(sparse=False)),
#     ('svc', SVC())
# ])
# clf_svc_rbf.fit(X_train[:5000], y_train[:5000])
# print('SVC RBF Score:   ', clf_svc_rbf.score(X_test[:15000], y_test[:15000]))
svc_linear_steps = [
    ('vectorizer', DictVectorizer(sparse=False)),
    ('svc', SVC(kernel='linear')),
]
clf_svc_linear = Pipeline(steps=svc_linear_steps)

# Same budget as the other baselines: 5000 training rows, 15000 test rows.
clf_svc_linear.fit(X_train[:5000], y_train[:5000])
svc_linear_score = clf_svc_linear.score(X_test[:15000], y_test[:15000])
print('SVC Linear Score:', svc_linear_score)

# Best Models
1. Logistic Regression - 86.4%
1. Linear SVC - 86.9%
1. Decision Tree - 84.4%

In [3]:
def sents_to_features(sents):
    """Flatten sentences into per-word feature dicts using a +/-2 word window.

    Each element of ``sents`` is indexed by ``'lemma'``, ``'tag'`` and
    ``'label'``, each of which is indexed by word position. For every position
    the emitted dict contains the lemma and tag of the word itself and of the
    words one and two positions to either side; a neighbour that falls outside
    the sentence contributes ``''``.

    Only lemma and tag are used here. The other per-word fields (pos,
    is_alpha, is_num, start/end flags) are deliberately left out of this
    feature set.

    Returns:
        A ``(features, labels)`` pair of equally long lists.
    """
    # (word key, tag key, offset from the current position), in output order.
    window = (
        ('prev_word', 'prev_tag', -1),
        ('prev_prev_word', 'prev_prev_tag', -2),
        ('next_word', 'next_tag', 1),
        ('next_next_word', 'next_next_tag', 2),
    )

    features, labels = [], []
    for sent in sents:
        n_words = len(sent['lemma'])
        for idx in range(n_words):
            # Current word
            row = {
                'word': sent['lemma'][idx],
                'tag': sent['tag'][idx],
            }
            # Context words
            for word_key, tag_key, offset in window:
                other = idx + offset
                in_sentence = 0 <= other < n_words
                row[word_key] = sent['lemma'][other] if in_sentence else ''
                row[tag_key] = sent['tag'][other] if in_sentence else ''
            features.append(row)
            labels.append(sent['label'][idx])
    return features, labels

# Rebuild the features with the +/-2 word window, then split WITHOUT shuffling:
# the first 75% of rows train the model and the remaining 25% are held out.
features, labels = sents_to_features(sents)
split_parts = train_test_split(features, labels, train_size=0.75, shuffle=False)
X_train, X_test, y_train, y_test = split_parts

# Logistic regression on the full training set.
lr_steps = [
    ('vectorizer', DictVectorizer(sparse=False)),
    ('log_reg', LogisticRegression()),
]
clf_lr = Pipeline(steps=lr_steps)
clf_lr.fit(X_train, y_train)
lr_score = clf_lr.score(X_test, y_test)
print('Logistic Regression Score:', lr_score)

# Disabled alternatives, kept for reference (trained on a 50000-row subset):
# clf_dt = Pipeline([
#     ('vectorizer', DictVectorizer(sparse=False)),
#     ('decision_tree', DecisionTreeClassifier(criterion='entropy'))
# ])
# clf_dt.fit(X_train[:50000], y_train[:50000])
# print('Decision Tree Score:', clf_dt.score(X_test[:75000], y_test[:75000]))
# clf_svc_linear = Pipeline([
#     ('vectorizer', DictVectorizer(sparse=False)),
#     ('svc', SVC(kernel='linear'))
# ])
# clf_svc_linear.fit(X_train[:50000], y_train[:50000])
# print('SVC Linear Score:', clf_svc_linear.score(X_test[:75000], y_test[:75000]))



Logistic Regression Score: 0.9317153032737313


In [5]:
# Persist the fitted logistic-regression pipeline to disk with joblib.
# NOTE(review): this notebook imports joblib from `sklearn.externals`, which newer
# scikit-learn releases no longer provide; there the standalone `joblib` package
# has to be imported instead. Confirm against the pinned scikit-learn version.
filename = '../data/logistic-regression-93.2.pickle'
joblib.dump(clf_lr, filename)

['../data/logistic-regression-93.2.pickle']

In [6]:
# Reload the saved pipeline from disk, replacing the in-memory `clf_lr`.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
filename = '../data/logistic-regression-93.2.pickle'
clf_lr = joblib.load(filename)

In [7]:
# Look at a 50-row slice of the held-out data and the model's predictions for it.
ex = X_test[150:200]
labels = y_test[150:200]
preds = clf_lr.predict(ex)

# One (true label, predicted label, feature dict, offset in slice) tuple per miss.
incorrect = [
    (truth, guess, feats, offset)
    for offset, (truth, guess, feats) in enumerate(zip(labels, preds, ex))
    if truth != guess
]

if not incorrect:
    print('All correct!')
else:
    pprint(incorrect)

# pprint([w['is_end'] for w in X_test[150+23:150+40]])
# Show the raw feature rows at offsets 23-39 of the slice as a table.
pd.DataFrame(X_test[150+23:150+40])

[('OTHER',
  'NAME',
  {'next_next_tag': '',
   'next_next_word': '',
   'next_tag': '',
   'next_word': '',
   'prev_prev_tag': 'NN',
   'prev_prev_word': 'pine',
   'prev_tag': 'NNS',
   'prev_word': 'nut',
   'tag': 'NNS',
   'word': 'almond'},
  14),
 ('COMMENT',
  'NAME',
  {'next_next_tag': 'NN',
   'next_next_word': 'wine',
   'next_tag': 'JJ',
   'next_word': 'white',
   'prev_prev_tag': 'CD',
   'prev_prev_word': '0.5',
   'prev_tag': 'NNS',
   'prev_word': 'cup',
   'tag': 'VBP',
   'word': 'dry'},
  32),
 ('COMMENT',
  'NAME',
  {'next_next_tag': 'NN',
   'next_next_word': 'chicken',
   'next_tag': 'NN',
   'next_word': 'wine',
   'prev_prev_tag': 'NNS',
   'prev_prev_word': 'cup',
   'prev_tag': 'VBP',
   'prev_word': 'dry',
   'tag': 'JJ',
   'word': 'white'},
  33),
 ('OTHER',
  'NAME',
  {'next_next_tag': 'NN',
   'next_next_word': 'broth',
   'next_tag': 'NN',
   'next_word': 'chicken',
   'prev_prev_tag': 'VBP',
   'prev_prev_word': 'dry',
   'prev_tag': 'JJ',
   'prev

Unnamed: 0,next_next_tag,next_next_word,next_tag,next_word,prev_prev_tag,prev_prev_word,prev_tag,prev_word,tag,word
0,NNP,tbsp,NNP,½,,,,,CD,1.0
1,.,.,NNP,tbsp,,,CD,1.0,NNP,½
2,NN,sake,.,.,CD,1.0,NNP,½,NNP,tbsp
3,,,NN,sake,NNP,½,NNP,tbsp,.,.
4,,,,,NNP,tbsp,.,.,NN,sake
5,CD,0.5,SYM,-,,,,,CD,1
6,NNS,cup,CD,0.5,,,CD,1,SYM,-
7,VBP,dry,NNS,cup,CD,1,SYM,-,CD,0.5
8,JJ,white,VBP,dry,SYM,-,CD,0.5,NNS,cup
9,NN,wine,JJ,white,CD,0.5,NNS,cup,VBP,dry


In [11]:
# Number of training rows (one per word) produced by the split.
len(X_train)

491091