In [38]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import StratifiedKFold, cross_val_score, train_test_split
from nltk.stem import *
from nltk import wordpunct_tokenize, word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import *
import string
from keras.layers import *
import itertools
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from scipy.sparse import csr_matrix

In [2]:
train_dat = pd.read_json("data/train.json")
test_dat = pd.read_json("data/test.json")

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.stemmer = SnowballStemmer("english")
    def __call__(self, doc):
        return [self.convert_origin(t) for t in word_tokenize(doc)
                if t not in string.punctuation]
    def transform(self,doc):
        return self.__call__(doc)
    def convert_origin(self,word):
        word = word.replace(".","").replace('/','').replace('-','').replace("'s","").replace("'n","")
        return self.wnl.lemmatize(self.stemmer.stem(word).encode('ascii','ignore').lower())
# Smooth idf, sublinear df, norm
stemmer = SnowballStemmer("english")
lemma_normalizer = LemmaTokenizer()
encoder = LabelEncoder()

total_dat = train_dat.drop(['cuisine'],axis=1).append(test_dat)

In [3]:
# normalize_ingres = lambda ingres: str([lemma_normalizer.transform(ingre) for ingre in ingres])
def normalize_ingres(ingres):
    result = [lemma_normalizer.transform(ingre) for ingre in ingres]
    return ','.join(list(itertools.chain(*result)))

In [4]:
total_dat['ingre_str'] = total_dat.ingredients.map(normalize_ingres) 

In [5]:
lemma = lambda x: x.strip().lower().split(',')
ingredient_lemmatized = total_dat.ingre_str.map(lemma)

In [6]:
vocab = []
result = [vocab.extend(recipe) for recipe in ingredient_lemmatized]
vocab = set(vocab)

In [7]:
word2idx = dict((v, i) for i, v in enumerate(vocab))

In [13]:
# idx2word = list(words)
vocab_size = len(vocab)
print vocab_size

2823


In [14]:
recipe_to_array = []

In [15]:
#convert word to integer
to_idx = lambda x: [word2idx[word] for word in x]

In [16]:
recipe_to_array = np.zeros((vocab_size))

In [17]:
# vocbulary of word to one-hot vector

In [18]:
word2hot = {}
for k,v in word2idx.iteritems():
    recipe_to_array = np.zeros((vocab_size))
    recipe_to_array[v] = 1
    word2hot[k] = recipe_to_array

In [19]:
# convert to k-hot vector
def to_k_hot(recipes):
    result = np.zeros((vocab_size))
    for recipe in recipes:
        result = result + word2hot[recipe]
#         print recipe
    return result

In [34]:
recipe_to_array = csr_matrix(np.array(ingredient_lemmatized.map(to_k_hot).tolist()))

In [49]:
onehotEncoder = OneHotEncoder()
labels = onehotEncoder.fit_transform(encoder.fit_transform(train_dat.cuisine).reshape(-1,1))

In [52]:
train_len = train_dat.shape[0]
word_embedding = 400
output_size = encoder.classes_.shape[0]

In [53]:
X_train, X_val, y_train, y_val = train_test_split(recipe_to_array[:train_len],labels, test_size = 0.2)

In [68]:
# input_layer = Input(shape=(X_train))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=word_embedding, input_length=vocab_size)

In [69]:
from keras.models import Sequential

In [80]:
model = Sequential([
        embedding_layer,
        Conv1D(128, 5, activation='relu'),
        MaxPooling1D(5),
        Flatten(),
        Dense(output_size, activation='softmax')
    ])
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

In [74]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_4 (Embedding)          (None, 2823, 400)     1129200     embedding_input_2[0][0]          
____________________________________________________________________________________________________
convolution1d_2 (Convolution1D)  (None, 2819, 128)     256128      embedding_4[0][0]                
____________________________________________________________________________________________________
maxpooling1d_2 (MaxPooling1D)    (None, 563, 128)      0           convolution1d_2[0][0]            
____________________________________________________________________________________________________
flatten_2 (Flatten)              (None, 72064)         0           maxpooling1d_2[0][0]             
___________________________________________________________________________________________

In [78]:
# model.fit(X_train.toarray(),y_train,validation_data=(X_val.toarray(),y_val), nb_epoch=256, batch_size=64, verbose=2)

In [71]:
# x = Conv1D(128, 5, activation='relu')(embedded_sequences)
# x = MaxPooling1D(5)(x)
# x = Conv1D(128, 5, activation='relu')(x)
# x = MaxPooling1D(5)(x)
# x = Conv1D(128, 5, activation='relu')(x)
# x = MaxPooling1D(35)(x)  # global max pooling
# x = Flatten()(x)
# x = Dense(128, activation='relu')(x)
# preds = Dense(len(labels_index), activation='softmax')(x)