In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense,Dropout
from keras.models import Sequential
from keras.callbacks import EarlyStopping,ModelCheckpoint

from sklearn.model_selection import train_test_split

In [23]:
# Read the training and test sets from JSON files in the working directory.
train_df, test_df = (pd.read_json(path) for path in ('train.json', 'test.json'))

In [24]:
# Separate the label column from the training features; the test frame is used as-is.
train = train_df.drop(columns='cuisine')
target = train_df['cuisine']
test = test_df

In [25]:
# Fit the tokenizer vocabulary on the training ingredients, then encode that
# same column as a TF-IDF weighted matrix (one row per training sample).
train_ingredients = train['ingredients']
t = Tokenizer()
t.fit_on_texts(train_ingredients)
train_encoded = t.texts_to_matrix(train_ingredients, mode='tfidf')

In [26]:
# Give every distinct cuisine an integer id, following the order returned by unique().
cuisines = train_df['cuisine'].unique()
label2index = dict(zip(cuisines, range(len(cuisines))))

# Translate each target label to its id (labels missing from the mapping are skipped).
y = [label2index[name] for name in target if name in label2index]

# One-hot encode into 20 columns.
y_encoded = to_categorical(y, 20)

In [27]:
# Sanity check: feature matrix shape on the first line, one-hot label shape on the second.
print(train_encoded.shape, y_encoded.shape, sep='\n')

(29774, 6189)
(29774, 20)


In [28]:
def build_model(input_dim=None, num_classes=20, hidden_units=256, dropout_rate=0.4):
    """Build and compile a dense classifier with a single hidden layer.

    Args:
        input_dim: Number of input features. When ``None`` (the default) the
            width of the notebook-level ``train_encoded`` matrix is used, which
            is what the function did before it took parameters.
        num_classes: Number of units in the output layer.
        hidden_units: Number of units in the ReLU hidden layer.
        dropout_rate: Dropout rate applied after the hidden layer.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, categorical hinge
        loss, accuracy metric).
    """
    if input_dim is None:
        # Backward-compatible default: size the input from the encoded training matrix.
        input_dim = train_encoded.shape[1]

    model = Sequential()
    model.add(Dense(hidden_units, input_shape=(input_dim,), activation='relu', name='hidden_1'))
    model.add(Dropout(dropout_rate, name='dropout_1'))
    # No activation is set on the output layer; its raw scores feed the hinge loss.
    model.add(Dense(num_classes, name='output'))
    model.compile(optimizer='adam',
                  loss='categorical_hinge',
                  metrics=['accuracy'])
    return model

In [29]:
# Hold out 20% of the encoded training data for validation; the seed is fixed for a repeatable split.
X_train, X_val, y_train, y_val = train_test_split(
    train_encoded,
    y_encoded,
    test_size=0.2,
    random_state=22,
)

In [30]:
# Instantiate the compiled classifier returned by build_model().
model=build_model()

In [32]:
# Both callbacks watch the validation loss: one stops training after 5 epochs
# without improvement, the other writes the weights of the best epoch to disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
best_checkpoint = ModelCheckpoint(
    'best-model-0.h5',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
)
monitor = [early_stopping, best_checkpoint]

model.fit(
    X_train, y_train,
    batch_size=128,
    epochs=6,
    validation_data=(X_val, y_val),
    callbacks=monitor,
)

Train on 23819 samples, validate on 5955 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 00006: early stopping


<keras.callbacks.History at 0x18b380d4828>

In [33]:
# Encode the test ingredients with the tokenizer `t` in the same 'tfidf' mode
# used for the training matrix.
test_encoded=t.texts_to_matrix(test_df['ingredients'],mode='tfidf')

In [34]:
# Restore the weights saved to 'best-model-0.h5' before predicting.
model.load_weights('best-model-0.h5')

# Highest-scoring output unit per test sample.
y_pred = model.predict(test_encoded).argmax(axis=1)

# Invert the label mapping once so each prediction is a direct lookup instead of
# a scan over every (label, index) pair. A predicted index with no known label
# now raises KeyError rather than being dropped silently, which would have
# shortened `results` and misaligned it with the test rows.
index2label = {index: label for label, index in label2index.items()}
results = [index2label[index] for index in y_pred]

results[:10]

['korean',
 'italian',
 'italian',
 'filipino',
 'italian',
 'cajun_creole',
 'italian',
 'thai',
 'chinese',
 'southern_us']

In [35]:
# Pair each test id with its predicted cuisine and write the two-column CSV.
id_cuisine_pairs = list(zip(test_df['id'], results))
submission = pd.DataFrame(id_cuisine_pairs, columns=['id', 'cuisine'])
submission.to_csv('submission.csv', index=False, header=True)