## read the dataset

In [85]:
import pandas as pd
# Load the training recipes indexed by recipe id and store the target
# column as a pandas categorical.
df = (
    pd.read_json("../data/train.json")
    .set_index("id")
    .assign(cuisine=lambda frame: frame["cuisine"].astype("category"))
)
df.head()

Unnamed: 0_level_0,cuisine,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
22213,indian,"[water, vegetable oil, wheat, salt]"
13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


## preprocess data

1. lowercase all
2. extract the stem of each word (so "tomatoes" and "tomato" are treated as the same ingredient)
3. remove "stopwords" (so "extra-virgin olive oil" and "olive oil" are the same ingredient)
4. replace whitespace (so that "olive oil" becomes "olive_oil" and gets treated as a single ingredient)
5. join ingredients (join ingredients back to a string format, so we can treat the recipe as a sentence)

In [86]:
%%writefile preprocess.py
## define a preprocessing function that can be used on a dataframe in other files
def preprocess_recipes(df):
    """Turn each recipe's ingredient list into one whitespace-separated string.

    Every ingredient is lower-cased, stemmed word by word with the Porter
    stemmer, stripped of the filler fragments matched by the pattern below
    (matched as plain substrings of the stemmed text), and has its remaining
    runs of spaces collapsed into a single underscore. The ingredients of a
    recipe are then joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``ingredients`` column whose values are iterables of
        ingredient strings.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` itself is not modified) whose ``ingredients``
        column holds one string per recipe.
    """
    import re
    from nltk.stem.porter import PorterStemmer

    stemmer = PorterStemmer()
    # The fragments are stems, so they match the output of the stemming step.
    filler = re.compile(r'crush|crumbl|ground|minc|powder|chop|slice|free|less|sodium|kosher|plain|natur|extra-virgin')
    space_run = re.compile(' +')

    def normalise(ingredient):
        # Ingredient level: lowercase -> stem -> drop fillers -> underscore.
        lowered = ingredient.lower()
        stemmed = " ".join([stemmer.stem(token) for token in lowered.split(' ')])
        trimmed = filler.sub('', stemmed).strip()
        return space_run.sub('_', trimmed)

    # Recipe level: one "sentence" per recipe, one token per ingredient.
    sentences = [
        " ".join([normalise(ingredient) for ingredient in recipe])
        for recipe in df.ingredients
    ]
    return df.assign(ingredients=sentences)


Overwriting preprocess.py


In [87]:
from preprocess import preprocess_recipes
# Replace every recipe's ingredient list with a single string of normalised tokens.
# NOTE: this rebinds `df`, so the cell is not idempotent: preprocess_recipes
# iterates over lists of ingredient strings, so reload `df` before re-running it.
df = preprocess_recipes(df)
df.head()

Unnamed: 0_level_0,cuisine,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10259,greek,romain_lettuc black_oliv grape_tomato garlic p...
25693,southern_us,flour pepper salt tomato black_pepper thyme eg...
20130,filipino,egg pepper salt mayonais cook_oil green_chili ...
22213,indian,water veget_oil wheat salt
13162,indian,black_pepper shallot cornflour cayenn_pepper o...


The classes are unbalanced, which might cause a problem for our model

In [88]:
# Number of recipes per cuisine, used to inspect how unbalanced the classes are.
df['cuisine'].value_counts()

italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: cuisine, dtype: int64

## Bag of words

In [89]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
# Bag-of-words features over unigrams and bigrams of the recipe "sentences",
# capped at the 50000 most frequent terms; terms present in more than 90% of
# recipes are dropped.
# min_df=1 is the smallest valid integer threshold and keeps every term,
# exactly as min_df=0 did; recent scikit-learn releases reject the integer 0
# during parameter validation.
count_vectorizer = CountVectorizer(max_features=50000, min_df=1, max_df=0.9, ngram_range=(1, 2))

X = count_vectorizer.fit_transform(df['ingredients'].values)

# One-hot encode the cuisine labels. The dtype is pinned to uint8 so the
# targets stay numeric on pandas 2.x, where get_dummies defaults to bool.
df_y = pd.get_dummies(df['cuisine'], dtype="uint8")
Y = np.array(df_y)

In [90]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the recipes for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3628800,
)

## Neural network classifier

In [91]:
from tensorflow import keras

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Layer sizes are derived from the data rather than hard-coded: the vectorizer
# may produce fewer than `max_features` columns, and the number of output
# units must equal the number of one-hot label columns.
n_features = X_train.shape[1]
n_classes = y_train.shape[1]

# One hidden ReLU layer with dropout, followed by a softmax over the cuisines.
model = Sequential()

model.add(Dense(activation="relu", input_dim=n_features, units=150, kernel_initializer="uniform"))
model.add(Dropout(0.5, noise_shape=None, seed=None))
model.add(Dense(activation="softmax", units=n_classes, kernel_initializer="uniform"))

model.compile(optimizer = 'adam', loss = 'categorical_crossentropy' , metrics = ['categorical_accuracy', 'top_k_categorical_accuracy'])
model.fit(X_train, y_train, batch_size = 128, epochs = 5, verbose=1)

# evaluate() returns the loss first, then the metrics in the order given to compile().
score = model.evaluate(X_test, y_test, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss: 0.775366881761365
Test accuracy: 0.7769956


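We see decent accuracy on the held-out split, but upon submitting to Kaggle the score is 0.17, which is pretty bad. I suspect it's the class imbalance: the top-5 categorical accuracy (`top_k_categorical_accuracy`) is really high, which suggests that the model learns the imbalance and doesn't predict the individual cuisines properly. I could find a proper loss function to account for this. A gap this large between the local and the Kaggle score can also come from a mismatch between the training and submission feature spaces, so the test recipes must be encoded with the vectorizer that was fitted on the training data.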

## Logistic regression

In [95]:
from sklearn.model_selection import train_test_split

# Rebuild the features and use the raw cuisine names as targets
# (no one-hot encoding for the scikit-learn classifier).
X = count_vectorizer.fit_transform(df['ingredients'].values)
Y = np.array(df['cuisine'])

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=1,
    test_size=0.2,
)

In [97]:
def get_label_weights():
    """Return a dict mapping each cuisine to its share of the recipes.

    Reads the notebook-level ``df``; each value is that cuisine's count
    divided by the total number of counted recipes.
    """
    counts = df['cuisine'].value_counts()
    total = sum(counts)
    return {label: count / total for label, count in zip(counts.index, counts)}


In [104]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

# Multinomial logistic regression with 'balanced' class weights.
model = LogisticRegression(
    multi_class='multinomial',
    solver='newton-cg',
    class_weight='balanced',
)

# 5-fold cross-validated search over the inverse regularisation strength C.
param_grid = {'C': [1, 10]}
classifier = GridSearchCV(model, param_grid, cv=5).fit(X_train, y_train)


In [105]:
from sklearn.metrics import accuracy_score

# Accuracy of the fitted grid search on the held-out 20% split.
predict = classifier.predict(X_test)
accuracy_score(y_test, predict)

0.7614079195474545

## Submit

In [106]:
import pandas as pd
from preprocess import preprocess_recipes
def predict_test(model, path="../data/test.json"):
    """Predict a cuisine for every recipe in the Kaggle test file.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``; it must have been trained on
        features produced by the notebook-level ``count_vectorizer``.
    path : str, optional
        Location of the test JSON file. Defaults to the path used so far.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``id`` and ``cuisine``, one row per test recipe.
    """
    df_submit = pd.read_json(path).set_index("id")
    df_submit = preprocess_recipes(df_submit)
    # Use transform, not fit_transform: re-fitting on the test recipes builds
    # a different vocabulary, so the feature columns would no longer line up
    # with the ones the model was trained on.
    X = count_vectorizer.transform(df_submit['ingredients'].values)
    predictions = model.predict(X)
    return pd.DataFrame(list(zip(df_submit.index.values, predictions)), columns=['id', 'cuisine'])

In [108]:
# Predict the test-set cuisines and write the Kaggle submission file.
# The result is bound to the same name that is written out below; the earlier
# misspelled name left `predictions` undefined and raised a NameError.
predictions = predict_test(classifier)
predictions.to_csv('sub.csv', columns=['id', 'cuisine'], index=False)