In [63]:
from collections import Counter

import json
import numpy as np
import os
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

import warnings
from xgboost import XGBClassifier
import zipfile


# Notebook-wide settings.
# NOTE: every warning category is silenced from here on, so library warnings
# raised by later cells will not be shown.
warnings.simplefilter('ignore')
# Remove the cap on the number of rows pandas prints.
pd.options.display.max_rows = None

In [64]:
#Open the files
# Both archives are unpacked into an explicit directory, and the JSON files are
# read back from that same directory. Calling extractall() without a target
# would unpack into the current working directory, which only matches the read
# paths below when the notebook happens to run from /kaggle/working.
extract_dir = '/kaggle/working'

train_path = '/kaggle/input/whats-cooking/train.json.zip'
with zipfile.ZipFile(train_path, 'r') as zip_train:
    zip_train.extractall(extract_dir)

test_path = '/kaggle/input/whats-cooking/test.json.zip'
with zipfile.ZipFile(test_path, 'r') as zip_test:
    zip_test.extractall(extract_dir)

train_json_path = f'{extract_dir}/train.json'

df_train = pd.read_json(train_json_path)
# `train` is a second name for the same DataFrame object (not a copy).
train = df_train

test_json_path = f'{extract_dir}/test.json'
df_test = pd.read_json(test_json_path)

In [65]:
# Display the (rows, columns) shape of the train and test frames.
df_train.shape, df_test.shape

((39774, 3), (9944, 2))

In [66]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


In [67]:
# Preview the first rows of the test frame.
df_test.head()

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."


In [68]:
# Remove the id column from both frames (in place).
# The test ids are kept in `test_id`; pop() returns the column and deletes it
# from df_test in a single step.
df_train.drop(columns='id', inplace=True)
test_id = df_test.pop('id')

In [69]:
# Summarise column dtypes and non-null counts of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39774 entries, 0 to 39773
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   cuisine      39774 non-null  object
 1   ingredients  39774 non-null  object
dtypes: object(2)
memory usage: 621.6+ KB


In [70]:
# Summarise column dtypes and non-null counts of the test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9944 entries, 0 to 9943
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ingredients  9944 non-null   object
dtypes: object(1)
memory usage: 77.8+ KB


There are no NaN values in this dataset.

## We will begin with a brute-force approach: every ingredient in the dataframe will be a feature

In [71]:
#Count the occurrences of every distinct ingredient in the training recipes.
#Exploding the list column yields one row per (recipe, ingredient) pair.
expanded_df = df_train.explode('ingredients')['ingredients']
ingredient_counts = expanded_df.value_counts()
ingredient_counts_list = ingredient_counts.tolist()
#Number of distinct ingredients.
print(len(ingredient_counts_list))

6714


In [72]:
#That would be far too many features, so we drop every ingredient that appears fewer than 4 times.
min_count = 4
is_frequent = ingredient_counts >= min_count
filtered_ingredient_counts = ingredient_counts.loc[is_frequent]
filtered_ingredient_list = filtered_ingredient_counts.index.to_list()
#Number of ingredients that survive the filter.
print(len(filtered_ingredient_list))

3675


In [73]:
#Creating a boolean indicator column for each retained ingredient.
# Column names are the normalised (stripped, lower-cased) ingredient names, in
# first-seen order. A name that collides with an existing column ('cuisine' or
# 'ingredients') is skipped so that the label column can never be overwritten.
feature_names = [
    name
    for name in dict.fromkeys(raw.strip().lower() for raw in filtered_ingredient_list)
    if name not in df_train.columns
]
feature_position = {name: pos for pos, name in enumerate(feature_names)}

#Filling the indicators according to the ingredients column.
# All flags are written into one pre-allocated matrix. Inserting thousands of
# columns one by one fragments the DataFrame, and per-cell writes through
# iterrows()/.at are very slow.
train_indicator = np.zeros((len(df_train), len(feature_names)), dtype=bool)
for row_pos, recipe in enumerate(df_train['ingredients']):
    for raw in recipe:
        col_pos = feature_position.get(raw.strip().lower())
        # Ingredients that were filtered out have no column and are ignored.
        if col_pos is not None:
            train_indicator[row_pos, col_pos] = True

#Replacing the ingredients column with the indicator columns in a single concat.
df_train = pd.concat(
    [
        df_train.drop(columns=['ingredients']),
        pd.DataFrame(train_indicator, index=df_train.index, columns=feature_names),
    ],
    axis=1,
)
# `train` was bound to the same object as df_train when the data was loaded;
# rebind it so both names keep referring to the encoded frame.
train = df_train

In [74]:
#Separate the target (the cuisine label) from the feature columns.
y = df_train['cuisine']
X = df_train.drop(columns='cuisine')

In [75]:
#Create the K-Fold splitter: 5 shuffled folds with a fixed seed so the splits are repeatable.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=0,
)

In [76]:
# NOTE: this XGBoost experiment is disabled. The code is wrapped in a
# triple-quoted string, so nothing in it runs; the cell merely evaluates the
# string and echoes it as output.
# NOTE(review): the accuracy quoted in the following markdown presumably comes
# from an earlier run of this code - confirm before relying on it.
"""
#We begin with an XGBoost model
model = XGBClassifier(use_label_encoder=False)

#Transform from pandas dataframe to numpy array for XGBoost
X_array = X.values
y_array = y.values

#Encode the target to numerical for XGBoost
le = LabelEncoder()
y_array = le.fit_transform(y.values)

accuracy_list = []
for train_index, valid_index in kf.split(X_array):
    X_train, X_valid = X_array[train_index], X_array[valid_index]
    y_train, y_valid = y_array[train_index], y_array[valid_index]
    
    #Train the model
    model.fit(X_train, y_train, eval_metric='logloss')

    #Make predictions on the validation set
    y_pred = model.predict(X_valid)

    #Evaluate the model and append the accuracy to the list
    accuracy = accuracy_score(y_valid, y_pred)
    accuracy_list.append(accuracy)

print(f'Mean Accuracy: {np.mean(accuracy_list)}')
"""

"\n#We begin with an XGBoost model\nmodel = XGBClassifier(use_label_encoder=False)\n\n#Transform from pandas dataframe to numpy array for XGBoost\nX_array = X.values\ny_array = y.values\n\n#Encode the target to numerical for XGBoost\nle = LabelEncoder()\ny_array = le.fit_transform(y.values)\n\naccuracy_list = []\nfor train_index, valid_index in kf.split(X_array):\n    X_train, X_valid = X_array[train_index], X_array[valid_index]\n    y_train, y_valid = y_array[train_index], y_array[valid_index]\n    \n    #Train the model\n    model.fit(X_train, y_train, eval_metric='logloss')\n\n    #Make predictions on the validation set\n    y_pred = model.predict(X_valid)\n\n    #Evaluate the model and append the accuracy to the list\n    accuracy = accuracy_score(y_valid, y_pred)\n    accuracy_list.append(accuracy)\n\nprint(f'Mean Accuracy: {np.mean(accuracy_list)}')\n"

We get an accuracy of 75%, which is not great. For reference, this accuracy would put this method in the bottom 33% of this Kaggle competition's results.

In [77]:
#We try a Logistic Regression model
model = LogisticRegression()

#Transform from pandas dataframe to numpy array for Logistic Regression
X_array = X.values

#Encode the target to numerical for Logistic Regression
# `le` is kept so predictions can later be mapped back to cuisine names.
le = LabelEncoder()
y_array = le.fit_transform(y.values)

#Estimate the accuracy with K-Fold cross-validation.
accuracy_list = []
for train_index, valid_index in kf.split(X_array):
    X_train, X_valid = X_array[train_index], X_array[valid_index]
    y_train, y_valid = y_array[train_index], y_array[valid_index]

    # Train the model on the training part of this fold
    model.fit(X_train, y_train)

    # Make predictions on the validation part of this fold
    y_pred = model.predict(X_valid)

    # Evaluate the model and append the accuracy to the list
    accuracy = accuracy_score(y_valid, y_pred)
    accuracy_list.append(accuracy)

print(f'Mean Accuracy: {np.mean(accuracy_list)}')

#Refit on the complete training set.
# After the loop `model` only holds the fit from the last fold, i.e. it was
# trained on 4/5 of the rows. Refitting here means any later prediction made
# with `model` uses all the labelled data. The cross-validated accuracy
# printed above is not affected.
model.fit(X_array, y_array)

Mean Accuracy: 0.7761097302575921


We get an accuracy of 77.6%, which is better and ranks us in the top 50% of this Kaggle competition's results.

In [78]:
#We process the test dataframe in the same way as the training dataframe.

# The indicator columns are taken directly from the training feature matrix X,
# so the test matrix has exactly the same columns in exactly the same order as
# the data the model was fitted on.
test_feature_names = list(X.columns)
test_feature_position = {name: pos for pos, name in enumerate(test_feature_names)}

# All flags are written into one pre-allocated matrix. Inserting thousands of
# columns one by one fragments the DataFrame, and per-cell writes through
# iterrows()/.at are very slow.
test_indicator = np.zeros((len(df_test), len(test_feature_names)), dtype=bool)
for row_pos, recipe in enumerate(df_test['ingredients']):
    for raw in recipe:
        col_pos = test_feature_position.get(raw.strip().lower())
        # Ingredients without a training column are ignored.
        if col_pos is not None:
            test_indicator[row_pos, col_pos] = True

# The new frame holds only the indicator columns; the raw 'ingredients' list
# column is dropped by not being carried over.
df_test = pd.DataFrame(test_indicator, index=df_test.index, columns=test_feature_names)

In [99]:
#Predict the cuisine of every test recipe and write the submission file.

X_test = df_test.to_numpy()
y_test = model.predict(X_test)

#Map the encoded class ids back to the cuisine names.
prediction_np = le.inverse_transform(y_test)

#Pair each prediction with the recipe id that was set aside earlier.
prediction_df = pd.DataFrame({'id': test_id, 'cuisine': prediction_np})

prediction_df.to_csv('submission.csv', index=False)