In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
df = pd.read_json('train.json')

In [3]:
df.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39774 entries, 0 to 39773
Data columns (total 3 columns):
cuisine        39774 non-null object
id             39774 non-null int64
ingredients    39774 non-null object
dtypes: int64(1), object(2)
memory usage: 932.3+ KB


In [5]:
df['ingredients'][df['cuisine'] == 'greek'][0]

['romaine lettuce',
 'black olives',
 'grape tomatoes',
 'garlic',
 'pepper',
 'purple onion',
 'seasoning',
 'garbanzo beans',
 'feta cheese crumbles']

In [6]:
corpus = []
for i in range (0,39774):
    ingredients = ' '.join(df['ingredients'][i])
    corpus.append(ingredients)

In [7]:
corpus[0]

'romaine lettuce black olives grape tomatoes garlic pepper purple onion seasoning garbanzo beans feta cheese crumbles'

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
countvec = CountVectorizer(max_features=2000)

In [10]:
X = countvec.fit_transform(corpus).toarray()

In [11]:
X[0].__len__()

2000

In [12]:
y = df['cuisine']

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [15]:
randf = RandomForestClassifier(n_estimators=100)

In [16]:
randf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [17]:
randf.score(X_test, y_test)

0.7501904616791102

In [18]:
predictions = randf.predict(X_test)

In [19]:
from sklearn.metrics import confusion_matrix, classification_report

In [20]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

   brazilian       0.79      0.50      0.61       164
     british       0.76      0.27      0.39       271
cajun_creole       0.81      0.64      0.72       530
     chinese       0.72      0.89      0.80       835
    filipino       0.80      0.48      0.60       225
      french       0.59      0.51      0.55       911
       greek       0.80      0.57      0.67       382
      indian       0.81      0.90      0.85       992
       irish       0.80      0.35      0.49       222
     italian       0.71      0.92      0.80      2581
    jamaican       0.92      0.51      0.65       150
    japanese       0.85      0.62      0.72       488
      korean       0.91      0.62      0.74       275
     mexican       0.84      0.93      0.88      2138
    moroccan       0.83      0.61      0.70       261
     russian       0.72      0.30      0.42       155
 southern_us       0.65      0.76      0.70      1426
     spanish       0.80    

In [21]:
print(X[0])

[0 0 0 ... 0 0 0]


In [22]:
from sklearn.cluster import KMeans

In [23]:
kmeans = KMeans(n_clusters=20)

In [24]:
kmeans.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=20, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [25]:
dummy = kmeans.cluster_centers_

In [26]:
predictions = randf.predict(dummy)

In [27]:
predictions

array(['italian', 'italian', 'indian', 'italian', 'british', 'italian',
       'southern_us', 'thai', 'chinese', 'thai', 'italian', 'mexican',
       'italian', 'mexican', 'chinese', 'mexican', 'southern_us',
       'filipino', 'southern_us', 'french'], dtype=object)