In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import StratifiedKFold, cross_val_score, train_test_split
from nltk.stem import *
from nltk import wordpunct_tokenize, word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import *
import string
import itertools
from sklearn.preprocessing import LabelEncoder

# Import train and test data

In [3]:
train_dat = pd.read_json("data/train.json")
test_dat = pd.read_json("data/test.json")

In [4]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.stemmer = SnowballStemmer("english")
    def __call__(self, doc):
        return [self.convert_origin(t) for t in word_tokenize(doc)
                if t not in string.punctuation]
    def transform(self,doc):
        return self.__call__(doc)
    def convert_origin(self,word):
        word = word.replace(".","").replace('/','').replace('-','').replace("'s","").replace("'n","")
        return self.wnl.lemmatize(self.stemmer.stem(word).encode('ascii','ignore').lower())
# Smooth idf, sublinear df, norm
stemmer = SnowballStemmer("english")
lemma_normalizer = LemmaTokenizer()
encoder = LabelEncoder()

In [5]:
total_dat = train_dat.drop(['cuisine'],axis=1).append(test_dat)

In [6]:
# normalize_ingres = lambda ingres: str([lemma_normalizer.transform(ingre) for ingre in ingres])
def normalize_ingres(ingres):
    result = [lemma_normalizer.transform(ingre) for ingre in ingres]
    return ' '.join(list(itertools.chain(*result)))

In [7]:
total_dat['ingre_str'] = total_dat.ingredients.map(normalize_ingres)

# Build vocabulary

In [8]:
# set_ingredients = np.array([[stemmer.stem(item).encode('utf-8') for item in ingre] 
#                                 for ingre in total_dat.ingredients])
set_ingredients = np.array([[lemma_normalizer.convert_origin(item) for item in ingre] 
                                for ingre in total_dat.ingredients])

chain = list(itertools.chain(*set_ingredients))
total_items = np.array(list(chain))

total_items = np.unique(total_items)

vocabulary = []
for item in total_items:
    for word in lemma_normalizer.transform(item):
        vocabulary.append(word)
vocab = np.unique(vocabulary)
del vocabulary, set_ingredients

In [12]:
total_dat["ingredient_str"] = total_dat.ingredients.map(lambda d: ' '.join(d))
total_dat_ing = total_dat.drop(['ingredients'],axis=1)

# Vectorize TFIDF

In [13]:
tfIdfTrans = TfidfVectorizer(tokenizer=LemmaTokenizer(), vocabulary=vocab)

In [14]:
tfIdfTrans.fit(total_dat_ing.ingredient_str)

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=<__main__.LemmaTokenizer object at 0x7f5d3056eed0>,
        use_idf=True,
        vocabulary=array([u'', u'00', ..., u'ziti', u'zucchini'],
      dtype='<U18'))

In [15]:
vectorIngr = tfIdfTrans.transform(total_dat_ing.ingredient_str[:train_dat.shape[0]])

In [16]:
encoder = LabelEncoder()
labels = train_dat.cuisine
X_train, X_test, y_train, y_test = train_test_split(vectorIngr, labels, train_size=0.8)
cv = StratifiedKFold(y_train, n_folds=8, shuffle=True)

In [181]:
# np.savez("train_data.npz",X_train=X_train,X_val=X_test,y_val = y_test,y_train=y_train,
#          X_test=tfIdfTrans.transform(total_dat_ing.ingredient_str[train_dat.shape[0]:]))

In [None]:
estimator = LogisticRegression(C=10)
# estimator = LinearSVC()
# gausianEstimator = GaussianNB()

In [None]:
for train,test in cv:
    estimator.fit(X_train[train],y_train[train])
# scores = estimator.score(X_val,y_val)

In [None]:
scores = estimator.score(X_test,y_test)

In [None]:
print scores

# Get test submissions

In [128]:
# def save_submission(model_name,loss_model,y_test):
test = pd.read_json('data/test.json')
new_test = test.drop(['id'],axis=1)
y_test_sub = estimator.predict(tfIdfTrans.transform(total_dat_ing.ingredient_str[train_dat.shape[0]:]))
submisstion = pd.read_csv('data/sample_submission.csv')
nn_sub = pd.DataFrame(columns=submisstion.columns, index=test.index)
nn_sub.id = test.id
nn_sub[nn_sub.columns[1:]] = y_test_sub
log_loss = scores
nn_sub.to_csv("results/%s_%s.csv"%("LogiRegression",log_loss),index=None)
print "Save submission completed"

Save submission completed


In [228]:
np.savez("preprocess_data_remove_punct_duplicate", X_train=X_train,y_train=y_train,X_val=X_test,y_val=y_test,X_test=total_dat_ing.ingredient_str[train_dat.shape[0]:])

# Reduce data dimensions

In [244]:
del submisstion, y_test_sub

In [247]:
del test, nn_sub

In [104]:
from sklearn.feature_selection import *
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.grid_search import GridSearchCV

In [233]:
model = SelectFromModel(ExtraTreesClassifier(n_estimators=100,n_jobs=-1, warm_start=True, criterion='entropy'))

In [234]:
model.fit(X_train,y_train)

SelectFromModel(estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=True),
        prefit=False, threshold=None)

In [235]:
X_train_reduce = model.transform(X_train)
X_test_reduce = model.transform(X_test)

In [264]:
classifier = GradientBoostingClassifier()
param_grid = {
    "n_estimators":[40,60],
    "learning_rate":[0.1]
}
gridSearch = GridSearchCV(estimator=classifier,param_grid=param_grid,n_jobs=1, cv=3)
# estimator.fit(X_train_reduce,y_train)

In [None]:
gridSearch.fit(X_train_reduce.toarray(),y_train)

# Latent Sematic Allocation

In [None]:
reducer = TruncatedSVD(n_components=300)

In [None]:
vecLSA = reducer.fit_transform(vectorIngr)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(vecLSA, labels, train_size=0.8)

In [None]:
# estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100)
estimator.fit(X_train,y_train)

In [None]:
scores = estimator.score(X_test,y_test)

In [103]:
# # def save_submission(model_name,loss_model,y_test):
# test = pd.read_json('data/test.json')
# new_test = test.drop(['id'],axis=1)
# y_test_sub = estimator.predict(reducer.transform(tfIdfTrans.transform(total_dat_ing.ingredient_str[train_dat.shape[0]:])))
# submisstion = pd.read_csv('data/sample_submission.csv')
# nn_sub = pd.DataFrame(columns=submisstion.columns, index=test.index)
# nn_sub.id = test.id
# nn_sub[nn_sub.columns[1:]] = y_test_sub
# log_loss = scores
# nn_sub.to_csv("results/%s_%s.csv"%("LogiRegression",log_loss),index=None)
# print "Save submission completed"

Save submission completed


In [20]:
import pandas as pd
import numpy as np
from utils import *
from keras.layers import Dense, Activation, Dropout
from keras.models import Sequential
from scipy.sparse import csr_matrix
from data_hub import *

In [22]:
a,b, y_train,y_test,cv = load_onehot_train_and_kfold(n_folds=4)

In [50]:
input_shape = 2895
output_shape = y_train.shape[1]
model = Sequential([
        Dense(256, input_dim=input_shape, init='glorot_uniform',activation='relu'),
        Dropout(0.25),
#         Dense(512, activation='relu'),
        Dense(output_shape, activation='softmax')
    ])

In [54]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [55]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dense_14 (Dense)                 (None, 256)           741376      dense_input_7[0][0]              
____________________________________________________________________________________________________
dropout_7 (Dropout)              (None, 256)           0           dense_14[0][0]                   
____________________________________________________________________________________________________
dense_15 (Dense)                 (None, 20)            5140        dropout_7[0][0]                  
Total params: 746516
____________________________________________________________________________________________________


In [None]:
history = model.fit(X_train.toarray(),y_train, validation_data=(X_test.toarray(),y_test), 
                    verbose=2, nb_epoch=256, batch_size=64)