In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

In [2]:
# Load both JSON files into pandas objects: train.json first, then test.json.
train, test = (pd.read_json(path) for path in ("train.json", "test.json"))

In [3]:
# Predictors are every column except 'cuisine'; the target is 'cuisine' kept as a
# one-column DataFrame. Half of the rows are held out for evaluation, and the
# fixed random_state makes the shuffle repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    train.drop(columns='cuisine'),
    train['cuisine'].to_frame(),
    test_size=0.5,
    random_state=123,
)


In [9]:
# Define how the CountVectorizer will 'clean' each list of ingredient strings into something useful: lowercase every element, then join the elements with ", " into a single string
def xform_string(str_list):
    """Collapse a list of strings into a single lowercase string.

    Parameters
    ----------
    str_list : iterable of str
        The strings to combine (here, the ingredients of one recipe).

    Returns
    -------
    str
        Every element lower-cased, joined with ", ". An empty input gives "".
    """
    lowered = (item.lower() for item in str_list)
    return ", ".join(lowered)
def toUnicode(test_str):
    r"""Spell out every character of ``test_str`` as a Unicode escape sequence.

    Code points up to U+FFFF are written as ``\uXXXX`` (four upper-case hex
    digits). Code points above U+FFFF are written as ``\UXXXXXXXX`` (eight
    digits): a ``\u`` escape is read as exactly four hex digits, so a ``\u``
    followed by five or six digits would be ambiguous.

    Parameters
    ----------
    test_str : str
        Text to convert.

    Returns
    -------
    str
        The escape sequences concatenated in input order; "" for empty input.
    """
    def escape(char):
        code_point = ord(char)
        # Characters outside the Basic Multilingual Plane need the long form.
        if code_point > 0xFFFF:
            return r'\U{:08X}'.format(code_point)
        return r'\u{:04X}'.format(code_point)

    return ''.join(escape(char) for char in test_str)


In [10]:
# Preprocessing
# xform_string turns each document into one lowercase, comma-separated string
# before tokenisation.
# token_pattern: (?u) enables Unicode matching; a token is a run of 2 to 40
# lowercase ASCII letters between word boundaries.
# max_features caps the vocabulary at 500 terms.
vector = CountVectorizer(
    preprocessor = xform_string,
    analyzer = "word",
    token_pattern = r"(?u)\b[a-z]{2,40}\b",
    max_features = 500
)

# Learn the vocabulary from the training split only: fitting on x_test as well
# would let held-out data influence preprocessing. The pipeline built from this
# vectorizer fits it again on the training data when pipe.fit is called.
vector.fit(x_train.ingredients)

CountVectorizer(max_features=500,
                preprocessor=<function xform_string at 0x7f6aad4f5710>,
                token_pattern='(?u)\\b[a-z]{2,40}\\b')

In [11]:
# Chain the vectorizer and a seeded decision tree into one estimator. The step
# names are spelled out as the lowercased class names of the two estimators.
model = DecisionTreeClassifier(random_state=123)
pipe = Pipeline(steps=[
    ("countvectorizer", vector),
    ("decisiontreeclassifier", model),
])

# Train on the training split; the fitted pipeline is the cell's displayed value.
pipe.fit(x_train["ingredients"], y_train["cuisine"])

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=500,
                                 preprocessor=<function xform_string at 0x7f6aad4f5710>,
                                 token_pattern='(?u)\\b[a-z]{2,40}\\b')),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(random_state=123))])

In [16]:
# Predict a cuisine for every held-out recipe, then print the fraction that
# matches the true label.
results = pipe.predict(x_test["ingredients"])
print(accuracy_score(y_true=y_test["cuisine"], y_pred=results))

0.601850455071152


Options for our first milestone:
1. Decide cuisine based on number of ingredients
  

*   Make a DataFrame with (id, number of ingredients, cuisine) and use a DecisionTreeClassifier with max_depth = 1


2. One-hot encode the ingredients, and pick a token ingredient for every cuisine type
* Make a pipeline and a column transformer, then use a decision tree classifier