# Assignment 2 
# (Author: Jan Klinkosz, id number: 394 342, kaggle nick: Johny7013)

First of all, we need to import the Python modules that are required to run this notebook

In [36]:
import numpy as np
import pandas as pd
import re

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.base import clone
from sklearn.metrics import accuracy_score

from nltk.corpus import wordnet as wn

import random as rn

Load data

In [37]:
# read the training and test recipes from the JSON files (paths are relative to the working directory)
train, test = (
    pd.read_json(path)
    for path in ('cooking_train.json', 'cooking_test.json')
)

Global data & functions

In [38]:
# fix the seed of Python's random module so random draws are repeatable
rn.seed(123)

# fitted pairwise models and their cached predictions, both keyed by a cuisine pair (i, j)
models, predictions_from_models = {}, {}

# number of distinct cuisine labels present in the training data
number_of_cuisines = len(set(train['cuisine']))

def initialise_dict(size):
    """Return a dict mapping every integer in ``range(size)`` to 0."""
    return dict.fromkeys(range(size), 0)


# Solution number 1

Generally, this solution is built from extra-trees models, which were trained for every pair of cuisines. After building N * (N - 1) / 2 models we hold a tournament (or a series of tournaments) in which every cuisine "plays" against every other cuisine, which means that we check what value was returned by the model dedicated to deciding between cuisine1 and cuisine2. We count the number of models which pick a particular cuisine, and the k cuisines with the best scores go to another tournament (the next tournament is held only among these k cuisines). At the end, our prediction for a particular record is the cuisine that remains after all tournaments.

# Motivation behind this approach:

I found out that extra trees are quite good at deciding between just 2 cuisines. It is a much simpler task, and cross-validation on these "pair models" was really promising, because I got quite good accuracy, ~ 85 - 90 on most of them, sometimes 75 - 78. I thought that if it is simpler to get an answer between just two cuisines, then the right one should get a high score on its N - 1 models, and the rest of them something around the expected value for random sampling (of course it's not precise, because some of the cuisines are really similar to one another, but it's just my intuition).

Feature engineering

In [39]:
# not all of them are used in the preprocessor because some of them gave worse results than the others

def has_no_numbers(input_string):
    """Return True when ``input_string`` contains no digit character."""
    digit_match = re.search(r'[\d]', input_string)
    return digit_match is None


def is_noun(input_string):
    """Return True when the first WordNet synset of ``input_string`` is a noun.

    Words that WordNet does not know (no synsets at all) yield False.
    The result is always a real ``bool``; the previous ``tmp and ...`` form
    leaked the empty synset list to the caller for unknown words.
    """
    synsets = wn.synsets(input_string)

    # don't add if it isn't in dictionary
    return bool(synsets) and synsets[0].pos() == "n"


def is_noun_plus_unrecognised(input_string):
    """Return True for words unknown to WordNet or whose first synset is a noun."""
    synsets = wn.synsets(input_string)

    # a word missing from the dictionary is accepted too (just in case)
    if not synsets:
        return True

    # otherwise accept it only when its first synset is tagged as a noun
    return synsets[0].pos() == "n"


def is_actual_ingredient(input_string):
    """Accept a word only if it is digit-free and ``is_noun`` accepts it."""
    # reject early so the WordNet lookup is skipped for words containing digits
    if not has_no_numbers(input_string):
        return False
    return is_noun(input_string)


# line of words to single ones
def to_single_words(line):
    """Flatten an iterable of ingredient strings into lower-case single words."""
    joined = ' '.join(line)
    lowered = joined.lower()
    return lowered.split()


# remove letter s from the end of words
# create singular from plural (heuristic approach)
def to_singular_form(s):
    """Heuristically turn a plural word into its singular by dropping one trailing 's'.

    Args:
        s: the word to normalise.

    Returns:
        ``s`` without its final character when it ends with 's', otherwise
        ``s`` unchanged. The empty string is returned unchanged.
    """
    # endswith() is safe on the empty string, whereas indexing the last
    # character would raise IndexError
    return s[:-1] if s.endswith('s') else s


def preprocessor(line):
    """Turn a recipe's ingredient list into one normalised, space-separated string.

    The ingredients are split into lower-case words, words containing digits
    are dropped and a trailing 's' is stripped from each remaining word.
    """
    singular_words = [
        to_singular_form(word)
        for word in to_single_words(line)
        if has_no_numbers(word)
    ]
    return ' '.join(singular_words).lower()




Functions for ensembling

In [40]:
# function to hold tournament
# list_of_tournament_cuisines - cuisines in tournament
# number_of_winners - best number_of_winners cuisine's guesses to pick
def make_tournament(list_of_tournament_cuisines, number_of_winners, number_of_prediction):
    """Run one round-robin tournament and return the best-scoring cuisines.

    Every pair of competing cuisines is looked up in the module-level
    ``predictions_from_models`` cache; the cuisine predicted for sample
    ``number_of_prediction`` earns one vote.

    Args:
        list_of_tournament_cuisines: integer cuisine labels taking part.
        number_of_winners: how many of the top-voted cuisines to return.
        number_of_prediction: index of the sample within the cached predictions.

    Returns:
        Sorted list of at most ``number_of_winners`` cuisine labels, all of
        them taken from ``list_of_tournament_cuisines``. Ties in the vote
        count are broken in favour of the larger label.
    """
    # the cache is keyed by (smaller label, larger label), so put the
    # contestants in ascending order before pairing them up
    contestants = sorted(list_of_tournament_cuisines)

    # only contestants get a vote counter; seeding it with every cuisine
    # would let a non-participant with zero votes win a tie-break
    votes = {cuisine: 0 for cuisine in contestants}

    for position, first in enumerate(contestants):
        for second in contestants[position + 1:]:
            votes[predictions_from_models[(first, second)][number_of_prediction]] += 1

    # rank by (votes, label) descending, exactly like the original tie-break
    ranking = sorted(((score, cuisine) for cuisine, score in votes.items()), reverse=True)

    return sorted(cuisine for _, cuisine in ranking[:number_of_winners])

# predict cuisines for test X using all N * (N - 1) / 2 pairwise models
def predict(X):
    """Predict a cuisine label for every row of ``X`` via successive tournaments.

    The predictions of every pairwise model are first stored in the
    module-level ``predictions_from_models`` cache (overwriting earlier
    entries). Each sample then goes through three tournaments that keep
    5, 2 and finally 1 cuisine.

    Returns:
        A float numpy array of length ``len(X)`` holding the winning
        integer cuisine label for each sample.
    """
    # cache the output of each pair model for the whole of X
    for low in range(number_of_cuisines):
        for high in range(low + 1, number_of_cuisines):
            pair = (low, high)
            predictions_from_models[pair] = models[pair].predict(X)

    sample_count = len(X)
    result = np.zeros(sample_count)

    for sample in range(sample_count):
        # start with all cuisines, then narrow down to 5, 2 and 1 survivors
        survivors = range(number_of_cuisines)
        for keep in (5, 2, 1):
            survivors = make_tournament(survivors, keep, sample)

        result[sample] = survivors[0]

    return result


Prepare the data for training and generating a result

In [41]:
recipes = train.ingredients

# use preprocessor
vect = CountVectorizer(preprocessor=preprocessor)

# make token-count vectors from recipes; toarray() returns a plain ndarray,
# whereas todense() would return the discouraged np.matrix type
vectors = vect.fit_transform(recipes).toarray()

# the sorted vocabulary terms are used as the column labels
ingredients_one_hot_vectors = pd.DataFrame(data=vectors, columns=sorted(vect.vocabulary_))

X_train = ingredients_one_hot_vectors
y_train = train['cuisine']

# cuisines to integers
encoder = preprocessing.LabelEncoder()
encoder = encoder.fit(y_train)
y_train = pd.Series(encoder.transform(y_train))

# concatenate training data with cuisine types
# (both parts were just built with the default 0..n-1 index, so they line up
# row by row and no index reset is needed)
data = pd.concat([X_train, y_train.rename("cuisine_type")], axis=1)

cuisines_data = {}

# split data from particular cuisine into one bucket: (features, labels)
for i in range(number_of_cuisines):
    cuisine_i = data.loc[data["cuisine_type"] == i]
    cuisines_data[i] = (cuisine_i.drop(columns="cuisine_type"), cuisine_i["cuisine_type"])


Split the data

In [42]:
# I didn't use it when generating the final score
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=323)

Params for generating models

In [43]:
# I submitted with n_estimators equal to 600, but that requires something around 12 GB of RAM, so in the notebook
# I decided to cut this parameter a little
# it may take some time to build all models, especially with 600
# pipeline with a single extra-trees classification step
et_pipeline_pair_models = Pipeline(steps=[
    (
        'classifier',
        ExtraTreesClassifier(n_estimators=50, random_state=671232, n_jobs=-1),
    ),
])

Train models

In [44]:
# Building models (could take some time - around 1 minute on my computer for n_estimators=50)

for i in range(number_of_cuisines):
    for j in range(i + 1, number_of_cuisines):
        # stack the feature rows and the labels of the two cuisines
        X = pd.concat([cuisines_data[c][0] for c in (i, j)], axis=0)
        y = pd.concat([cuisines_data[c][1] for c in (i, j)], axis=0)
        # every pair gets its own unfitted copy of the pipeline to train
        models[(i, j)] = clone(et_pipeline_pair_models).fit(X, y)

In [45]:
# Used for testing with split (split also commented above)
# predicting
#prediction = predict(X_test)

# check accuracy
#print(accuracy_score(prediction, y_test))

Generate predictions

In [46]:
X_test = test['ingredients']

# transform test data to appropriate form with the vectorizer fitted on the
# training recipes; toarray() yields a plain ndarray instead of np.matrix
vectors = vect.transform(X_test).toarray()
X_test = pd.DataFrame(data=vectors, columns=sorted(vect.vocabulary_))

# could take some time too
prediction = predict(X_test)

# go back from integer to cuisine names
# (predict returns floats, so cast to int before decoding)
prediction = encoder.inverse_transform(prediction.astype(int))


Generate a CSV file with predictions for the records whose cuisine (y value) is unknown

In [47]:
# name of the CSV file the predictions are written to
result_name = "jk394342_predictions_solution1.csv"

# attach the predicted cuisines to a copy of the test frame and export only id + cuisine
submission = test.assign(cuisine=prediction)
submission[['id', 'cuisine']].to_csv(result_name, index=False)

# Solution number 2

The only difference between solution number 1 and solution number 2 is that in solution number 2 there is no tournament. The predict function works this way: we randomly pick 2 cuisines from the cuisines that are still left, take the model dedicated to this pair of cuisines and check which cuisine it picks. The one picked stays in the set of cuisines and the other one is thrown away. We repeat this until there is only one cuisine left. This is our prediction for this record. The intuition behind this solution is quite similar to the one behind solution number 1.

Changed function

In [48]:
def predict(X):
    """Predict a cuisine label for every row of ``X`` by random pairwise knockout.

    The predictions of every pairwise model are first stored in the
    module-level ``predictions_from_models`` cache. For each sample two of
    the remaining cuisines are drawn with the ``rn`` random module, the
    cuisine rejected by their dedicated model is discarded, and this repeats
    until a single cuisine is left.

    Returns:
        A float numpy array of length ``len(X)`` holding the surviving
        integer cuisine label for each sample.
    """
    # cache the output of each pair model for the whole of X
    for low in range(number_of_cuisines):
        for high in range(low + 1, number_of_cuisines):
            pair = (low, high)
            predictions_from_models[pair] = models[pair].predict(X)

    sample_count = len(X)
    result = np.zeros(sample_count)

    for sample in range(sample_count):
        # stays in ascending order because elements are only ever deleted
        remaining = list(range(number_of_cuisines))

        while len(remaining) > 1:
            last = len(remaining) - 1

            # draw two distinct positions in `remaining`
            # NOTE(review): the second position is only bumped on an exact
            # collision, so pairs are presumably not drawn uniformly - confirm
            # whether that is intended
            pos_a = rn.randint(0, last)
            pos_b = rn.randint(0, last - 1)
            if pos_a == pos_b:
                pos_b += 1

            # order the positions so the cache key is (smaller, larger) label
            lower, upper = min(pos_a, pos_b), max(pos_a, pos_b)
            first, second = remaining[lower], remaining[upper]

            # keep the cuisine the pair model voted for, drop the other one
            if predictions_from_models[(first, second)][sample] == first:
                del remaining[upper]
            else:
                del remaining[lower]

        result[sample] = remaining[0]

    return result


Rest of the solution is the same

In [49]:
# Could take some time (like I said above)

recipes = train.ingredients

vect = CountVectorizer(preprocessor=preprocessor)

# make token-count vectors from recipes; toarray() returns a plain ndarray,
# whereas todense() would return the discouraged np.matrix type
vectors = vect.fit_transform(recipes).toarray()

# the sorted vocabulary terms are used as the column labels
ingredients_one_hot_vectors = pd.DataFrame(data=vectors, columns=sorted(vect.vocabulary_))

X_train = ingredients_one_hot_vectors
y_train = train['cuisine']

# cuisines to integers
encoder = preprocessing.LabelEncoder()
encoder = encoder.fit(y_train)
y_train = pd.Series(encoder.transform(y_train))


#X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.15, random_state=123)


# concatenate training data with cuisine types
# (both parts were just built with the default 0..n-1 index, so they line up
# row by row and no index reset is needed)
data = pd.concat([X_train, y_train.rename("cuisine_type")], axis=1)

cuisines_data = {}

# split data about particular cuisine into one bucket: (features, labels)
for i in range(number_of_cuisines):
    cuisine_i = data.loc[data["cuisine_type"] == i]
    cuisines_data[i] = (cuisine_i.drop(columns="cuisine_type"), cuisine_i["cuisine_type"])

# For the submitted solution I used n_estimators=500, but that requires around 12 GB of RAM, so here I cut it a little
# it may take some time to build all models, especially with 500
# pipeline with a single extra-trees classification step
et_pipeline_pair_models = Pipeline(steps=[
    (
        'classifier',
        ExtraTreesClassifier(n_estimators=50, random_state=671232, n_jobs=-1),
    ),
])

print("Building models")

for i in range(number_of_cuisines):
    for j in range(i + 1, number_of_cuisines):
        # stack the feature rows and the labels of the two cuisines
        X = pd.concat([cuisines_data[c][0] for c in (i, j)], axis=0)
        y = pd.concat([cuisines_data[c][1] for c in (i, j)], axis=0)
        # every pair gets its own unfitted copy of the pipeline to train
        models[(i, j)] = clone(et_pipeline_pair_models).fit(X, y)
        
# Used for testing with split (split also commented above)
# predicting
#prediction = predict(X_test)

# check accuracy
#print(accuracy_score(prediction, y_test))
        
test = pd.read_json('cooking_test.json')
X_test = test['ingredients']

# transform test data with the vectorizer fitted on the training recipes;
# toarray() yields a plain ndarray instead of np.matrix
vectors = vect.transform(X_test).toarray()

X_test = pd.DataFrame(data=vectors, columns=sorted(vect.vocabulary_))

prediction = predict(X_test)

# go back from integer to cuisine names
# (predict returns floats, so cast to int before decoding)
prediction = encoder.inverse_transform(prediction.astype(int))

# name of the CSV file the predictions are written to
result_name = "jk394342_predictions_solution2.csv"

# attach the predicted cuisines to a copy of the test frame and export only id + cuisine
submission = test.assign(cuisine=prediction)
submission[['id', 'cuisine']].to_csv(result_name, index=False)

Building models


I also tried some solutions with neural networks, but they gave me worse results, so I stayed with these solutions. Maybe I did something wrong. I was sure that neural networks should give better results, but well ...