## Read data

In [None]:
import pandas as pd
import numpy as np
import nltk

# Load the training set into a DataFrame; later cells read its "cuisine"
# and "ingredients" columns.
train_data = pd.read_json('data/train.json')

## Preprocess columns

In [None]:
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer

# Encode the cuisine names as integer class ids. The fitted encoder is kept in
# `labels` so predictions can be mapped back to names with inverse_transform.
labels =  preprocessing.LabelEncoder()
train_data["cuisine"] = labels.fit_transform(train_data["cuisine"])
# NOTE(review): astype(list) presumably just leaves the column as object dtype
# (it does not convert each cell to a list) - confirm whether it is needed.
train_data["ingredients"] = train_data["ingredients"].astype(list)


## Make a bag-of-words representation

In [None]:
import re
from nltk.stem.porter import PorterStemmer


def stem_tokens(tokens, stemmer):
    """Return the stem of every token, preserving input order.

    Args:
        tokens: Iterable of word strings to stem.
        stemmer: Object exposing a ``stem(word)`` method, e.g. an NLTK
            ``PorterStemmer`` instance.

    Returns:
        A list containing ``stemmer.stem(token)`` for each token.
    """
    # Use the stemmer supplied by the caller instead of constructing a new
    # PorterStemmer for each token.
    return [stemmer.stem(token) for token in tokens]

def stem_tokenize(doc):
    """Split *doc* into alphabetic words and return their Porter stems.

    A word is any run of two or more ASCII letters; everything else
    (digits, punctuation, single letters) is discarded.

    Args:
        doc: The text to tokenize.

    Returns:
        A list of stemmed tokens in order of appearance.
    """
    porter = PorterStemmer()
    words = re.findall(r'[a-zA-Z]{2,}', doc)
    return stem_tokens(words, porter)

# Concatenate the per-recipe ingredient sequences into one flat array.
# (Not consumed by the vectorizer below.)
all_ingredients = np.concatenate(train_data["ingredients"])

# token_pattern is not passed: CountVectorizer ignores it (and warns) whenever
# a custom tokenizer is supplied, and stem_tokenize applies the same pattern.
# NOTE(review): the built-in 'english' stop-word list is unstemmed while
# stem_tokenize emits stems, so some stop words may slip through - confirm
# this is acceptable.
vect = CountVectorizer(tokenizer=stem_tokenize, stop_words='english')

# Each recipe becomes one space-separated document for the vectorizer.
joined_ingredients = [" ".join(recipe) for recipe in train_data["ingredients"]]
words_vects = vect.fit_transform(joined_ingredients).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back for older releases. The result stays a list.
if hasattr(vect, "get_feature_names_out"):
    ingredients_names = list(vect.get_feature_names_out())
else:
    ingredients_names = vect.get_feature_names()

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
# sklearn.cross_validation was removed in scikit-learn 0.20; KFold lives in
# sklearn.model_selection (available since 0.18).
from sklearn.model_selection import KFold

# The current KFold takes n_splits and learns the sample count from split().
# The three (train_idx, test_idx) pairs are materialised in a list so `kfold`
# can still be iterated directly (and more than once).
kfold = list(KFold(n_splits=3).split(words_vects))

clf = MultinomialNB()

# Collect out-of-fold predictions. KFold does not shuffle by default, so the
# test folds are consecutive index ranges and concatenating them in fold order
# lines the predictions up with the rows of train_data.
predictions = []
for train, test in kfold:
    train_predictors = words_vects[train, :]
    test_predictors = words_vects[test, :]
    train_target = train_data.iloc[train]["cuisine"]
    clf.fit(train_predictors, train_target)
    test_predictions = clf.predict(test_predictors)
    predictions.append(test_predictions)
predictions = np.concatenate(predictions)


In [None]:
def get_accuracy(actual, predicted):
    """Return the fraction of positions where *predicted* equals *actual*.

    Elements are compared positionally, so a pandas Series with a
    non-default index is handled the same way as a list or array.

    Args:
        actual: Sized iterable of true labels.
        predicted: Sized iterable of predicted labels, at least as long as
            ``actual``; any extra trailing elements are ignored.

    Returns:
        A float in ``[0.0, 1.0]``; ``0.0`` when ``actual`` is empty.

    Raises:
        ValueError: If ``predicted`` has fewer elements than ``actual``.
    """
    total = len(actual)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    if len(predicted) < total:
        raise ValueError("predicted has fewer elements than actual")
    # zip iterates values in order, independent of any pandas index labels.
    hits = sum(1 for truth, guess in zip(actual, predicted) if truth == guess)
    # Explicit float keeps this a true division under Python 2 as well.
    return hits / float(total)

# Score the cross-validated predictions against the encoded cuisine labels.
accuracy = get_accuracy(train_data["cuisine"], predictions)
print(accuracy)

In [None]:
# Load the test recipes and turn them into count vectors using the vocabulary
# already fitted on the training data (transform only, no re-fitting).
test_data = pd.read_json('data/kaggle_recipes/test.json')
test_data["ingredients"] = test_data["ingredients"].astype(list)
test_joined_ingredients = [
    " ".join(recipe) for recipe in test_data["ingredients"]
]
test_words_vects = vect.transform(test_joined_ingredients).toarray()



## Fit on the whole dataset and predict on the test data

In [None]:
# Refit the classifier on every training row rather than on a single fold.
target = train_data["cuisine"]
clf.fit(words_vects, target)

# Predict encoded class ids for the test set, then map them back to the
# original cuisine names with the fitted LabelEncoder.
final_predictions = clf.predict(test_words_vects)
predicted_labels = labels.inverse_transform(final_predictions)
print(predicted_labels)

In [None]:
# Write the submission file: the test "id" column next to a "cuisine" column
# holding the predicted labels, without the DataFrame index.
pd.concat(
    [test_data["id"], pd.Series(predicted_labels, name="cuisine")],
    axis=1,
).to_csv('data/kaggle_recipes/submission.csv', index=False)