# Kaggle - What's Cooking
(https://www.kaggle.com/c/whats-cooking-kernels-only)

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
import re

### 1. Read Data

In [3]:
train_data = pd.read_json("./dataset/train.json")
test_data = pd.read_json("./dataset/test.json")

In [4]:
train_data.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [5]:
test_data.head()

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."


In [6]:
def trim_ingredient(ingredient_list):
    return [re.sub("[ ]{0,1}\([\w+\.\s]*\.\)[ ]{0,1}", "", ingredient) for ingredient in ingredient_list]

In [7]:
ingredients_set = np.empty(0)

In [8]:
test_ingredients_set = np.empty(0)

In [9]:
train_data["ingredients"]

0        [romaine lettuce, black olives, grape tomatoes...
1        [plain flour, ground pepper, salt, tomatoes, g...
2        [eggs, pepper, salt, mayonaise, cooking oil, g...
3                      [water, vegetable oil, wheat, salt]
4        [black pepper, shallots, cornflour, cayenne pe...
5        [plain flour, sugar, butter, eggs, fresh ginge...
6        [olive oil, salt, medium shrimp, pepper, garli...
7        [sugar, pistachio nuts, white almond bark, flo...
8        [olive oil, purple onion, fresh pineapple, por...
9        [chopped tomatoes, fresh basil, garlic, extra-...
10       [pimentos, sweet pepper, dried oregano, olive ...
11       [low sodium soy sauce, fresh ginger, dry musta...
12       [Italian parsley leaves, walnuts, hot red pepp...
13       [ground cinnamon, fresh cilantro, chili powder...
14       [fresh parmesan cheese, butter, all-purpose fl...
15       [tumeric, vegetable stock, tomatoes, garam mas...
16       [greek yogurt, lemon curd, confectioners sugar.

In [10]:
train_data["trimmed_ingredients"] = train_data["ingredients"].map(trim_ingredient)

In [11]:
test_data["trimmed_ingredients"] = test_data["ingredients"].map(trim_ingredient)

In [12]:
train_data.head()

Unnamed: 0,cuisine,id,ingredients,trimmed_ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...","[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...","[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]","[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...","[black pepper, shallots, cornflour, cayenne pe..."


In [13]:
test_data.head()

Unnamed: 0,id,ingredients,trimmed_ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi...","[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta...","[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil...","[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,...","[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l...","[ground black pepper, salt, sausage casings, l..."


In [14]:
for ingredient in train_data["trimmed_ingredients"]:
    ingredients_set = np.union1d(ingredients_set, ingredient) 

In [15]:
# for ingredient in test_data["trimmed_ingredients"]:
#     test_ingredients_set = np.unioni

In [16]:
ingredients_set

array(['1% low-fat buttermilk', '1% low-fat chocolate milk',
       '1% low-fat cottage cheese', ..., 'ziti', 'zucchini',
       'zucchini blossoms'], dtype='<U71')

In [17]:
len(ingredients_set)

6707

In [18]:
entire_ingredients_dict = dict()
for index, value in enumerate(ingredients_set):
    entire_ingredients_dict[value] = index

In [19]:
entire_ingredients_dict

{'red bliss potato': 5005,
 'sour cherries': 5714,
 'hellmannâ€™ or best food canola cholesterol free mayonnais': 3262,
 'dill seed': 2238,
 'baby corn': 710,
 'less sodium mushroom flavored soy sauce': 3708,
 'chile paste': 1557,
 'bone in skin on chicken thigh': 1010,
 'mild Italian sausage': 4092,
 'linguisa': 3762,
 'mortadella': 4182,
 'canned chicken broth': 1317,
 'egg whites': 2424,
 'boysenberries': 1094,
 'aged gouda': 528,
 'Cavenders Greek Seasoning': 78,
 'foccacia': 2638,
 'halibut fillets': 3214,
 'chili flakes': 1573,
 'ouzo': 4406,
 'veal breast': 6329,
 'red radishes': 5037,
 'clove': 1793,
 "devil's food cake mix": 2209,
 'bertolli vineyard premium collect marinara with burgundi wine sauc': 898,
 'fresh corn': 2691,
 'Greek feta': 162,
 'brown cardamom': 1162,
 'gluten-free bread': 2956,
 'granny smith apples': 3016,
 'ice cream salt': 3375,
 'Kraft Shredded Pepper Jack Cheese': 289,
 'watermelon': 6437,
 'cream filled chocolate sandwich cookies': 2019,
 'kinchay': 3

In [20]:
def convert_data_to_index_with_datatable(data_list):
    return np.array(list(map(lambda data: entire_ingredients_dict[data], data_list)))

In [21]:
def remove_data_if_not_exist(data_list):
    return None

In [22]:
train_data["trimmed_index"] = train_data["trimmed_ingredients"].map(convert_data_to_index_with_datatable)

In [23]:
test_data["trimmed_index"] = test_data["trimmed_ingredients"].map(convert_data_to_index_with_datatable)

KeyError: 'Grey Poupon Dijon Mustard'

In [24]:
train_data

Unnamed: 0,cuisine,id,ingredients,trimmed_ingredients,trimmed_index
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...","[romaine lettuce, black olives, grape tomatoes...","[5215, 949, 3026, 2877, 4562, 4904, 5398, 2871..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...","[plain flour, ground pepper, salt, tomatoes, g...","[4702, 3159, 5302, 6139, 3128, 6083, 2429, 311..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[eggs, pepper, salt, mayonaise, cooking oil, g...","[2429, 4562, 5302, 4024, 1933, 3083, 3117, 289..."
3,indian,22213,"[water, vegetable oil, wheat, salt]","[water, vegetable oil, wheat, salt]","[6428, 6368, 6443, 5302]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...","[black pepper, shallots, cornflour, cayenne pe...","[951, 5444, 1971, 1408, 4354, 2892, 4101, 1213..."
5,jamaican,6602,"[plain flour, sugar, butter, eggs, fresh ginge...","[plain flour, sugar, butter, eggs, fresh ginge...","[4702, 5881, 1213, 2429, 2700, 5302, 3139, 410..."
6,spanish,42779,"[olive oil, salt, medium shrimp, pepper, garli...","[olive oil, salt, medium shrimp, pepper, garli...","[4336, 5302, 4053, 4562, 2877, 1699, 3438, 261..."
7,italian,3735,"[sugar, pistachio nuts, white almond bark, flo...","[sugar, pistachio nuts, white almond bark, flo...","[5881, 4672, 6465, 2629, 6307, 4336, 552, 2429..."
8,mexican,16903,"[olive oil, purple onion, fresh pineapple, por...","[olive oil, purple onion, fresh pineapple, por...","[4336, 4904, 2729, 4747, 4718, 1961, 1454, 312..."
9,italian,12734,"[chopped tomatoes, fresh basil, garlic, extra-...","[chopped tomatoes, fresh basil, garlic, extra-...","[1719, 2674, 2877, 2478, 3584, 2610]"


In [25]:
def create_runnable_data(data_series, datatable_dict):
    datatable_dict_len = len(datatable_dict)
    runnable_data = np.empty(0).reshape(0, datatable_dict_len)
    
    for data_list in data_series:
        data_row = np.zeros(datatable_dict_len)
        
        for index in data_list:
            data_row[index] = 1
        
        runnable_data = np.vstack([runnable_data, data_row])
    
    return runnable_data

In [26]:
# runnable_train_data = create_runnable_data(train_data["trimmed_index"], entire_ingredients_dict)

In [27]:
# np.savetxt("./runnable_train_data.csv", runnable_train_data, delimiter=",")

### @TODO
- 큰 파일을 스레드로 나눠서 읽기 

In [28]:
runnable_train_data = np.loadtxt("./runnable_train_data.csv", delimiter=",", dtype=np.float32)

In [30]:
runnable_train_data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [31]:
labels = np.array(list(set(train_data["cuisine"])))

In [32]:
len(labels)

20

### 2. Making Model

In [33]:
tf.set_random_seed(777)

In [37]:
# Hyperparams
total_data_size = len(runnable_train_data)  # 39774
input_size = len(runnable_train_data[0])  # 6707
output_size = len(labels)  # 20
hidden_size_1 = 3000
hidden_size_2 = 1000
hidden_size_3 = 500

learning_rate = 1e-2
batch_size = 100
training_epochs = 100
keep_prob = 0.7

In [35]:
class MyModel(object):
    
    def __init__(self, sess, name):
        self.sess = sess
        self.name = name
        
        self.X = tf.placeholder(tf.float32, shape=[None, input_size])
        self.Y = tf.placeholder(tf.float32, shape=[None, output_size])
        self.keep_prob = tf.placeholder(tf.float32)
        
        self.hypothesis = None
        self.loss = None
        self.optimizer = None
        
        self.prediction = None
        self.is_correct = None
        self.accuracy = None
    
    def build_layers(self):
        with tf.variable_scope(self.name):
            with tf.variable_scope("Layer1"):
                W1 = tf.get_variable(name="W", shape=[input_size, hidden_size_1], initializer=tf.contrib.layers.xavier_initializer())
                b1 = tf.Variable(tf.random_normal([hidden_size_1]), name="b")
                logits1 = tf.matmul(self.X, W1) + b1
                L1 = tf.nn.relu(logits1)
                L1 = tf.nn.dropout(L1, keep_prob=self.keep_prob)
            
            with tf.variable_scope("Layer2"):
                W2 = tf.get_variable(name="W", shape=[hidden_size_1, hidden_size_2], initializer=tf.contrib.layers.xavier_initializer())
                b2 = tf.Variable(tf.random_normal([hidden_size_2]), name="b")
                logits2 = tf.matmul(L1, W2) + b2
                L2 = tf.nn.relu(logits2)
                L2 = tf.nn.dropout(L2, keep_prob=self.keep_prob)
            
            with tf.variable_scope("Layer3"):
                W3 = tf.get_variable(name="W", shape=[hidden_size_2, hidden_size_3], initializer=tf.contrib.layers.xavier_initializer())
                b3 = tf.Variable(tf.random_normal([hidden_size_3]), name="b")
                logits3 = tf.matmul(L2, W3) + b3
                L3 = tf.nn.relu(logits3)
                L3 = tf.nn.dropout(L3, keep_prob=self.keep_prob)
            
            with tf.variable_scope("Layer4"):
                W4 = tf.get_variable(name="W", shape=[hidden_size_3, output_size], initializer=tf.contrib.layers.xavier_initializer())
                b4 = tf.Variable(tf.random_normal([output_size]), name="b")
                logits4 = tf.matmul(L3, W4) + b4
            
        self.hypothesis = logits4
        self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.hypothesis, labels=self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss)

        self.prediction = tf.argmax(self.hypothesis, axis=1)
        self.is_correct = tf.equal(self.prediction, tf.argmax(self.Y, axis=1))
        self.accuracy = tf.reduce_mean(tf.cast(self.is_correct, dtype=tf.float32))

    def train(x_train, y_train, keep_prob: float=.7):
        return self.sess(fetches=[self.loss, self.optimizer], feed_dict={self.X: x_train, self.Y: y_train, self.keep_prob: keep_prob})
    
    def predict(x_test, keep_prob: float=1.0):
        return self.sess(fetches=[self.prediction], feed_dict={self.X: x_test, self.keep_prob: keep_prob})
    
    def get_accuracy(x_test, y_test, keep_prob: float=1.0):
        return self.sess(fetches=[self.accuracy], feed_dict={self.X: x_test, self.Y: y_test, self.keep_prob: keep_prob})
        

### 3. Run

In [36]:
sess = tf.Session()

model = MyModel(sess=sess, name="WC_Model")
model.build_layers()

sess.run(tf.global_variables_initializer())


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [None]:
# Training
print("Training statred ...")

for epoch in range(training_epochs):
    avg_loss = 0
    
    for i in range(batch_iter):
        batch_xs = None
        batch_ys = None
        
        loss_val, _ = model.train(x_train=batch_xs, y_train=batch_ys, keep_prob=keep_prob)
        
        avg_loss += loss_val / batch_iter
    
    print("Epoch : {:04d}, Loss : {:9f}".format(epoch, avg_loss))

print("Training finished ...")

In [None]:
# Getting the accuracy
x_test = None
y_test = None

acc = model.get_accuracy(x_test=x_test, y_test=y_test, keep_prob=1.0)
print("Accuracy : {}".format(acc))