In [0]:
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools

!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null

!apt-get update -qq 2>&1 > /dev/null

!apt-get -y install -qq google-drive-ocamlfuse fuse

from google.colab import auth

auth.authenticate_user()

from oauth2client.client import GoogleCredentials

creds = GoogleCredentials.get_application_default()

import getpass

!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL

vcode = getpass.getpass()

!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [0]:
!mkdir -p drive 
!google-drive-ocamlfuse drive 

In [1]:
!dir

adc.json  drive  sample_data


In [2]:
import os
import re

import numpy as np
import pandas as pd
import tensorflow as tf
import keras.backend as K
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints
from keras.layers import *
from keras.models import Model


# For reproducibility.
seed = 2019
np.random.seed(seed)
tf.set_random_seed(seed)
session_conf = tf.ConfigProto(
    intra_op_parallelism_threads=1,
    inter_op_parallelism_threads=1
)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

Using TensorFlow backend.


In [0]:
def remove_numbers(ing):
    # remove numbers from ingredients
    
    return [[re.sub("\d+", "", x) for x in y] for y in ing]

    
def remove_special_chars(ing):
    # remove certain special characters from ingredients
   
    ing = [[x.replace("-", " ") for x in y] for y in ing] 
    ing = [[x.replace("&", " ") for x in y] for y in ing] 
    ing = [[x.replace("'", " ") for x in y] for y in ing] 
    ing = [[x.replace("''", " ") for x in y] for y in ing] 
    ing = [[x.replace("%", " ") for x in y] for y in ing] 
    ing = [[x.replace("!", " ") for x in y] for y in ing] 
    ing = [[x.replace("(", " ") for x in y] for y in ing] 
    ing = [[x.replace(")", " ") for x in y] for y in ing] 
    ing = [[x.replace("/", " ") for x in y] for y in ing] 
    ing = [[x.replace("/", " ") for x in y] for y in ing] 
    ing = [[x.replace(",", " ") for x in y] for y in ing] 
    ing = [[x.replace(".", " ") for x in y] for y in ing] 
    ing = [[x.replace(u"\u2122", " ") for x in y] for y in ing] 
    ing = [[x.replace(u"\u00AE", " ") for x in y] for y in ing] 
    ing = [[x.replace(u"\u2019", " ") for x in y] for y in ing] 

    return ing
    
    
def make_lowercase(ing):
    # make all letters lowercase for all ingredients
    
    return [[x.lower() for x in y] for y in ing]
    
    
def remove_extra_whitespace(ing):
    # removes extra whitespaces
    
    return [[re.sub( '\s+', ' ', x).strip() for x in y] for y in ing] 
    
    
def stem_words(ing):
    # word stemming for ingredients
    
    lmtzr = WordNetLemmatizer()
    
    def word_by_word(strng):
        
        return " ".join(["".join(lmtzr.lemmatize(w)) for w in strng.split()])
    
    return [[word_by_word(x) for x in y] for y in ing] 
    
    
def remove_units(ing):
    # remove certain words from ingredients
    
    remove_list = ['g', 'lb', 's', 'n']
        
    def check_word(strng):
        
        s = strng.split()
        resw  = [word for word in s if word.lower() not in remove_list]
        
        return ' '.join(resw)

    return [[check_word(x) for x in y] for y in ing] 


In [0]:
### by all the words

df=pd.read_json('drive/Colab_Notebooks/SKT_Assignment/real_data/train.json')
X = df['ingredients'].values
Y = df['cuisine'].values

In [0]:
X = make_lowercase(X)           # 소문자로 변환
X = remove_numbers(X)           # 숫자 제거 
X = remove_special_chars(X)     # 특수 문자제거
X = remove_extra_whitespace(X)  # 추가 공백 제거
X = remove_units(X)             # ['g', 'lb', 's', 'n'] 와 같은 단위 제거
# X = stem_words(X)               # Lemmatization(원형화) nltk를 활용한 WordNetLemmatizer

In [0]:
df['ingredients_preprocessed'] = X

In [0]:
df['ingredients_as_sentence'] = df['ingredients_preprocessed'].apply(', '.join)

In [19]:
uniques = list(set([item for sublist in X for item in sublist]))
print(len(uniques))

6686


In [0]:
X = df['ingredients_as_sentence'].values

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2 ,random_state=2019, stratify = Y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(31819,) (7955,) (31819,) (7955,)


In [26]:
print(X_train[0])

bourbon whiskey, water, simple syrup, granulated sugar, fresh mint, powdered sugar, mint sprigs


In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU, CuDNNGRU, CuDNNLSTM, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras import backend as K
from keras.engine import InputSpec, Layer
from keras.optimizers import Adam

from keras.callbacks import ModelCheckpoint, TensorBoard, Callback, EarlyStopping

In [0]:
full_text = df['ingredients_as_sentence'].values

In [0]:
tk = Tokenizer(lower = True, filters='')
tk.fit_on_texts(full_text)

In [0]:
train_tokenized = tk.texts_to_sequences(X_train)
test_tokenized = tk.texts_to_sequences(X_test)

In [0]:
# Model[5]  참고
# max_len = 50
# maxlen = len(max((s for s in np.r_[x_train, x_test]), key=len))

max_len = 141

X_train = pad_sequences(train_tokenized, maxlen = max_len)
X_test = pad_sequences(test_tokenized, maxlen = max_len)

In [0]:
embedding_path = "drive/Colab_Notebooks/SKT_Assignment/crawl-300d-2M.vec"
embed_size = 300
max_features = 30000

In [0]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embedding_index = dict(get_coefs(*o.strip().split(" ")) for o in open(embedding_path))

word_index = tk.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [38]:
def k_to_one_hot(k_hot_vector):
    # This function converts k-hot target vector to one-hot target matrix
    
    classes = np.unique(k_hot_vector)
    one_hot_matrix = []
    
    for i in np.arange(len(classes)):
        row = (k_hot_vector == classes[i]).astype(int, copy = False)
        if len(one_hot_matrix) == 0:
            one_hot_matrix = row
        else:
            one_hot_matrix = np.vstack((one_hot_matrix, row))
            
    return classes, one_hot_matrix.conj().transpose()
    
classes, y = k_to_one_hot(y_train)
print(len(y[0]))

20


In [0]:
y_ohe = y

In [0]:
def build_model1(lr=0.0, lr_d=0.0, units=0, spatial_dr=0.0, kernel_size1=3, kernel_size2=2, dense_units=128, dr=0.1, conv_size=32):
    file_path = "drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5"
    check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                                  save_best_only = True, mode = "min")
    early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 3)
    
    inp = Input(shape = (max_len,))
    
    ###### changed 4 dimension
    #x = Embedding(4150, embed_size, weights = [embedding_matrix], trainable = False)(inp)
    x = Embedding(4147, embed_size, weights = [embedding_matrix], trainable = False)(inp)
    x1 = SpatialDropout1D(spatial_dr)(x)

    x_gru = Bidirectional(CuDNNGRU(units, return_sequences = True))(x1)
    x1 = Conv1D(conv_size, kernel_size=kernel_size1, padding='valid', kernel_initializer='he_uniform')(x_gru)
    avg_pool1_gru = GlobalAveragePooling1D()(x1)
    max_pool1_gru = GlobalMaxPooling1D()(x1)
    
    x3 = Conv1D(conv_size, kernel_size=kernel_size2, padding='valid', kernel_initializer='he_uniform')(x_gru)
    avg_pool3_gru = GlobalAveragePooling1D()(x3)
    max_pool3_gru = GlobalMaxPooling1D()(x3)
    
    x_lstm = Bidirectional(CuDNNLSTM(units, return_sequences = True))(x1)
    x1 = Conv1D(conv_size, kernel_size=kernel_size1, padding='valid', kernel_initializer='he_uniform')(x_lstm)
    avg_pool1_lstm = GlobalAveragePooling1D()(x1)
    max_pool1_lstm = GlobalMaxPooling1D()(x1)
    
    x3 = Conv1D(conv_size, kernel_size=kernel_size2, padding='valid', kernel_initializer='he_uniform')(x_lstm)
    avg_pool3_lstm = GlobalAveragePooling1D()(x3)
    max_pool3_lstm = GlobalMaxPooling1D()(x3)
    
    x = concatenate([avg_pool1_gru, max_pool1_gru, avg_pool3_gru, max_pool3_gru,
                    avg_pool1_lstm, max_pool1_lstm, avg_pool3_lstm, max_pool3_lstm])
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(dense_units, activation='relu') (x))
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(int(dense_units / 2), activation='relu') (x))
    
    ###### changed 4 dimension
    x = Dense(20, activation = "sigmoid")(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss = "binary_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    history = model.fit(X_train, y_ohe, batch_size = 128, epochs = 20, validation_split=0.1, 
                        verbose = 1, callbacks = [check_point, early_stop])
    model = load_model(file_path)
    return model

In [46]:
model1 = build_model1(lr = 1e-3, lr_d = 1e-10, units = 64, spatial_dr = 0.3, kernel_size1=3, kernel_size2=2, dense_units=32, dr=0.1, conv_size=32)

Train on 28637 samples, validate on 3182 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.15045, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 0.15045 to 0.13918, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 3/20

Epoch 00003: val_loss improved from 0.13918 to 0.10908, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 4/20

Epoch 00004: val_loss improved from 0.10908 to 0.10045, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 5/20

Epoch 00005: val_loss improved from 0.10045 to 0.09808, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 6/20

Epoch 00006: val_loss improved from 0.09808 to 0.09479, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 7/20

Epoch 00007: val_loss did not improve from 0.09479
Epoc

In [47]:
model2 = build_model1(lr = 1e-3, lr_d = 1e-10, units = 128, spatial_dr = 0.5, kernel_size1=3, kernel_size2=2, dense_units=64, dr=0.2, conv_size=32)

Train on 28637 samples, validate on 3182 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.12890, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 0.12890 to 0.11192, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 3/20

Epoch 00003: val_loss improved from 0.11192 to 0.10440, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 4/20

Epoch 00004: val_loss improved from 0.10440 to 0.09861, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 5/20

Epoch 00005: val_loss improved from 0.09861 to 0.09517, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 6/20

Epoch 00006: val_loss improved from 0.09517 to 0.09389, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 7/20

Epoch 00007: val_loss improved from 0.09389 to 0.09104,

In [0]:
def build_model2(lr=0.0, lr_d=0.0, units=0, spatial_dr=0.0, kernel_size1=3, kernel_size2=2, dense_units=128, dr=0.1, conv_size=32):
    file_path = "drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5"
    check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                                  save_best_only = True, mode = "min")
    early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 3)

    inp = Input(shape = (max_len,))
    
    ###### changed 4 dimension
    x = Embedding(4147, embed_size, weights = [embedding_matrix], trainable = False)(inp)
    x1 = SpatialDropout1D(spatial_dr)(x)

    x_gru = Bidirectional(CuDNNGRU(units, return_sequences = True))(x1)
    x_lstm = Bidirectional(CuDNNLSTM(units, return_sequences = True))(x1)
    
    x_conv1 = Conv1D(conv_size, kernel_size=kernel_size1, padding='valid', kernel_initializer='he_uniform')(x_gru)
    avg_pool1_gru = GlobalAveragePooling1D()(x_conv1)
    max_pool1_gru = GlobalMaxPooling1D()(x_conv1)
    
    x_conv2 = Conv1D(conv_size, kernel_size=kernel_size2, padding='valid', kernel_initializer='he_uniform')(x_gru)
    avg_pool2_gru = GlobalAveragePooling1D()(x_conv2)
    max_pool2_gru = GlobalMaxPooling1D()(x_conv2)
    
    
    x_conv3 = Conv1D(conv_size, kernel_size=kernel_size1, padding='valid', kernel_initializer='he_uniform')(x_lstm)
    avg_pool1_lstm = GlobalAveragePooling1D()(x_conv3)
    max_pool1_lstm = GlobalMaxPooling1D()(x_conv3)
    
    x_conv4 = Conv1D(conv_size, kernel_size=kernel_size2, padding='valid', kernel_initializer='he_uniform')(x_lstm)
    avg_pool2_lstm = GlobalAveragePooling1D()(x_conv4)
    max_pool2_lstm = GlobalMaxPooling1D()(x_conv4)
    
    
    x = concatenate([avg_pool1_gru, max_pool1_gru, avg_pool2_gru, max_pool2_gru,
                    avg_pool1_lstm, max_pool1_lstm, avg_pool2_lstm, max_pool2_lstm])
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(dense_units, activation='relu') (x))
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(int(dense_units / 2), activation='relu') (x))
    
    ###### changed 4 dimension
    x = Dense(20, activation = "sigmoid")(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss = "binary_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    history = model.fit(X_train, y_ohe, batch_size = 128, epochs = 20, validation_split=0.1, 
                        verbose = 1, callbacks = [check_point, early_stop])
    model = load_model(file_path)
    return model

In [49]:
model3 = build_model2(lr = 1e-4, lr_d = 0, units = 64, spatial_dr = 0.5, kernel_size1=4, kernel_size2=3, dense_units=32, dr=0.1, conv_size=32)

Train on 28637 samples, validate on 3182 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.55143, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 0.55143 to 0.38734, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 3/20

Epoch 00003: val_loss improved from 0.38734 to 0.30604, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 4/20

Epoch 00004: val_loss improved from 0.30604 to 0.24678, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 5/20

Epoch 00005: val_loss improved from 0.24678 to 0.20577, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 6/20

Epoch 00006: val_loss improved from 0.20577 to 0.18159, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 7/20

Epoch 00007: val_loss improved from 0.18159 to 0.16599,

In [50]:
model4 = build_model2(lr = 1e-3, lr_d = 0, units = 64, spatial_dr = 0.5, kernel_size1=3, kernel_size2=3, dense_units=64, dr=0.3, conv_size=32)

Train on 28637 samples, validate on 3182 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.14259, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 0.14259 to 0.12117, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 3/20

Epoch 00003: val_loss improved from 0.12117 to 0.11177, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 4/20

Epoch 00004: val_loss improved from 0.11177 to 0.10683, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 5/20

Epoch 00005: val_loss improved from 0.10683 to 0.10653, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 6/20

Epoch 00006: val_loss improved from 0.10653 to 0.10446, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 7/20

Epoch 00007: val_loss improved from 0.10446 to 0.09755,

In [51]:
model5 = build_model2(lr = 1e-3, lr_d = 1e-7, units = 64, spatial_dr = 0.3, kernel_size1=3, kernel_size2=3, dense_units=64, dr=0.4, conv_size=64)

Train on 28637 samples, validate on 3182 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.13764, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 0.13764 to 0.11620, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 3/20

Epoch 00003: val_loss improved from 0.11620 to 0.10983, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 4/20

Epoch 00004: val_loss improved from 0.10983 to 0.10263, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 5/20

Epoch 00005: val_loss improved from 0.10263 to 0.09832, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 6/20

Epoch 00006: val_loss improved from 0.09832 to 0.09472, saving model to drive/Colab_Notebooks/SKT_Assignment/MODEL4SAVE/best_model.hdf5
Epoch 7/20

Epoch 00007: val_loss did not improve from 0.09472
Epoc

In [52]:
pred1 = model1.predict(X_test, batch_size = 1024, verbose = 1)
pred = pred1
pred2 = model2.predict(X_test, batch_size = 1024, verbose = 1)
pred += pred2
pred3 = model3.predict(X_test, batch_size = 1024, verbose = 1)
pred += pred3
pred4 = model4.predict(X_test, batch_size = 1024, verbose = 1)
pred += pred4
pred5 = model5.predict(X_test, batch_size = 1024, verbose = 1)
pred += pred5



In [0]:
predictions = np.round(np.argmax(pred, axis=1)).astype(int)

In [0]:
y_test = Lec.fit_transform(y_test)

In [59]:
import sklearn
from sklearn.metrics import classification_report

target_names = ['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino', 'french', 'greek',
           'indian', 'irish', 'italian', 'jamaican', 'japanese', 'korean', 'mexican', 'moroccan',
           'russian', 'southern_us', 'spanish', 'thai', 'vietnamese']


score = sklearn.metrics.accuracy_score(y_test, predictions)

print(score)
print(classification_report(y_test, predictions ,target_names=target_names))

0.671904462602137
              precision    recall  f1-score   support

   brazilian       0.67      0.04      0.08        93
     british       0.00      0.00      0.00       161
cajun_creole       0.72      0.65      0.69       309
     chinese       0.70      0.81      0.75       535
    filipino       0.47      0.06      0.11       151
      french       0.52      0.52      0.52       529
       greek       0.76      0.61      0.68       235
      indian       0.71      0.89      0.79       601
       irish       1.00      0.04      0.07       133
     italian       0.75      0.85      0.79      1568
    jamaican       0.78      0.07      0.12       105
    japanese       0.73      0.52      0.61       284
      korean       0.67      0.63      0.65       166
     mexican       0.86      0.86      0.86      1288
    moroccan       0.57      0.49      0.53       164
     russian       0.00      0.00      0.00        98
 southern_us       0.45      0.77      0.57       864
     span

  'precision', 'predicted', average, warn_for)


In [0]:
model1.