In [1]:
import json
import numpy as np

In [2]:
# Load the labelled training recipes.
# JSON is UTF-8 by specification; pin the encoding so the load does not depend
# on the platform's default locale encoding.
with open('whats-cooking/train.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [3]:
# Load the unlabelled test recipes.
# JSON is UTF-8 by specification; pin the encoding so the load does not depend
# on the platform's default locale encoding.
with open('whats-cooking/test.json', encoding='utf-8') as json_file:
    data_test = json.load(json_file)

## Information about the data

In [5]:
## NUMBER OF DISHES
# Count of recipe records loaded from train.json.
len(data)

39774

In [4]:
## NUMBER OF DISTINCT CUISINES

# One label per training recipe, in the same order as `data`.
cuisine = [meal["cuisine"] for meal in data]

# np.unique returns the distinct labels in sorted order.
unique_cuisine = np.unique(cuisine)
print(unique_cuisine)
len(unique_cuisine)

['brazilian' 'british' 'cajun_creole' 'chinese' 'filipino' 'french'
 'greek' 'indian' 'irish' 'italian' 'jamaican' 'japanese' 'korean'
 'mexican' 'moroccan' 'russian' 'southern_us' 'spanish' 'thai'
 'vietnamese']


20

In [5]:
# Flatten every recipe's ingredient list into one long list (duplicates kept).
ingredients = [name for meal in data for name in meal["ingredients"]]

# Distinct ingredient strings, sorted by np.unique.
unique_ingredients = np.unique(ingredients)
print(unique_ingredients)
len(unique_ingredients)

['(    oz.) tomato sauce' '(   oz.) tomato paste'
 '(10 oz.) frozen chopped spinach' ... 'ziti' 'zucchini'
 'zucchini blossoms']


6714

## Binary ingredients feature vector

In [6]:
# Vocabulary of distinct ingredient strings seen in the training recipes.
# It is sorted so that the column order of the feature matrices is identical on
# every kernel start: iterating a set of strings depends on per-process hash
# randomisation, which would otherwise shuffle the columns between runs.
ingredients = sorted({name for meal in data for name in meal["ingredients"]})

# Maps each ingredient string to its column position in the feature matrices.
ingredientIndex = {ingredient: index for index, ingredient in enumerate(ingredients)}

In [7]:
# Dense recipe-by-ingredient indicator matrix, initialised to all zeros.
# np.zeros defaults to float64; with the recipe and ingredient counts printed
# earlier (39774 x 6714) that is roughly 2 GB of memory.
foodMatrix = np.zeros(shape=(len(data), len(ingredients)))

In [8]:
# Set a 1 in each recipe's row for every ingredient that recipe lists.
for row, meal in enumerate(data):
    for name in meal["ingredients"]:
        foodMatrix[row, ingredientIndex[name]] = 1

### Test food matrix

In [9]:
# Indicator matrix for the test recipes, using the training vocabulary's columns.
foodMatrix_test = np.zeros(shape=(len(data_test), len(ingredients)))

for row, meal in enumerate(data_test):
    for name in meal["ingredients"]:
        # Ingredients never seen in training have no column and are skipped.
        col = ingredientIndex.get(name)
        if col is not None:
            foodMatrix_test[row, col] = 1

## Naive Bayes Classifier

In [52]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

In [53]:
from sklearn.model_selection import KFold

In [54]:
# Two naive Bayes estimators, both with default settings; they are fitted
# inside the cross-validation loops that follow.
gnb = GaussianNB()
bnb = BernoulliNB()

In [55]:
# 3-fold splitter used by the naive Bayes evaluation loops.
# NOTE(review): with default arguments the folds are contiguous, unshuffled
# index ranges - confirm the recipe order in train.json carries no structure.
kf = KFold(n_splits=3)

In [56]:
#GaussianNB
# Fit on each training split and record the accuracy on the held-out split.
labels = np.array(cuisine)  # array form so it can be indexed by the fold indices
accuracy_gnb = []
for fit_rows, eval_rows in kf.split(foodMatrix):
    print("TRAIN:", fit_rows, "TEST:", eval_rows)
    gnb.fit(foodMatrix[fit_rows], labels[fit_rows])
    accuracy_gnb.append(gnb.score(foodMatrix[eval_rows], labels[eval_rows]))

TRAIN: [13258 13259 13260 ... 39771 39772 39773] TEST: [    0     1     2 ... 13255 13256 13257]
TRAIN: [    0     1     2 ... 39771 39772 39773] TEST: [13258 13259 13260 ... 26513 26514 26515]
TRAIN: [    0     1     2 ... 26513 26514 26515] TEST: [26516 26517 26518 ... 39771 39772 39773]


In [57]:
# Per-fold accuracies collected by the GaussianNB cross-validation loop.
accuracy_gnb

[0.37901644290239855, 0.3829386031075577, 0.37758334590435966]

In [58]:
#BernoulliNB
# Fit on each training split and record the accuracy on the held-out split.
labels = np.array(cuisine)  # array form so it can be indexed by the fold indices
accuracy_bnb = []
for fit_rows, eval_rows in kf.split(foodMatrix):
    print("TRAIN:", fit_rows, "TEST:", eval_rows)
    bnb.fit(foodMatrix[fit_rows], labels[fit_rows])
    accuracy_bnb.append(bnb.score(foodMatrix[eval_rows], labels[eval_rows]))

TRAIN: [13258 13259 13260 ... 39771 39772 39773] TEST: [    0     1     2 ... 13255 13256 13257]
TRAIN: [    0     1     2 ... 39771 39772 39773] TEST: [13258 13259 13260 ... 26513 26514 26515]
TRAIN: [    0     1     2 ... 26513 26514 26515] TEST: [26516 26517 26518 ... 39771 39772 39773]


In [59]:
# Per-fold accuracies collected by the BernoulliNB cross-validation loop.
accuracy_bnb

[0.684190677326897, 0.6795142555438226, 0.6869060190073918]

## Logistic Regression

In [60]:
from sklearn.linear_model import LogisticRegression

In [61]:
# Logistic-regression estimator evaluated by cross-validation below:
# SAGA solver, multinomial formulation, fixed random_state.
clf = LogisticRegression(
    solver='saga',
    multi_class='multinomial',
    random_state=0,
)

In [62]:
# 3-fold splitter for the logistic-regression evaluation (same settings as the
# splitter used for the naive Bayes models).
kf = KFold(n_splits=3)

In [63]:
# Fit on each training split and record the accuracy on the held-out split.
labels = np.array(cuisine)  # array form so it can be indexed by the fold indices
accuracy_logreg = []
for fit_rows, eval_rows in kf.split(foodMatrix):
    print("TRAIN:", fit_rows, "TEST:", eval_rows)
    clf.fit(foodMatrix[fit_rows], labels[fit_rows])
    accuracy_logreg.append(clf.score(foodMatrix[eval_rows], labels[eval_rows]))

TRAIN: [13258 13259 13260 ... 39771 39772 39773] TEST: [    0     1     2 ... 13255 13256 13257]




TRAIN: [    0     1     2 ... 39771 39772 39773] TEST: [13258 13259 13260 ... 26513 26514 26515]




TRAIN: [    0     1     2 ... 26513 26514 26515] TEST: [26516 26517 26518 ... 39771 39772 39773]




In [64]:
# Per-fold accuracies collected by the logistic-regression cross-validation loop.
accuracy_logreg

[0.7723638557851863, 0.7705536279981898, 0.7762860159903454]

## Predictions on the test data

In [14]:
# Final model: same settings as the cross-validated one, fitted on every
# training recipe. `fit` returns the estimator, so `clf` is the fitted model.
clf = LogisticRegression(
    solver='saga',
    multi_class='multinomial',
    random_state=0,
).fit(foodMatrix, np.array(cuisine))



In [15]:
# Predict a cuisine label for every row of the test feature matrix.
predictions = clf.predict(foodMatrix_test)

In [16]:
# Display the predicted labels (NumPy abbreviates the repr of long arrays).
predictions

array(['british', 'southern_us', 'italian', ..., 'italian', 'southern_us',
       'mexican'], dtype='<U12')

## Export CSV

In [44]:
# Recipe ids of the test set, in the same order as the rows of foodMatrix_test.
ids = [meal["id"] for meal in data_test]

In [51]:
import csv

# Write one "id,cuisine" row per test recipe.
# newline='' lets the csv module control line endings itself; without it an
# extra blank line appears after every row on Windows.
with open('cuisine.csv', 'w', newline='') as csvfile:
    headers = ['id', 'cuisine']
    writer = csv.DictWriter(csvfile, fieldnames=headers)

    writer.writeheader()
    # The loop variables use their own names so the `ids` list is not
    # overwritten by its last element, which made this cell fail when re-run.
    for recipe_id, predicted_cuisine in zip(ids, predictions):
        writer.writerow({headers[0]: recipe_id, headers[1]: predicted_cuisine})
        