In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

In [2]:
cuisines_df = pd.read_csv("data/cleaned_cuisines.csv")
cuisines_df.head()

Unnamed: 0.1,Unnamed: 0,cuisine,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,indian,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [3]:
#Prepare our label 

cuisines_label_df = cuisines_df['cuisine']
cuisines_label_df.head()

0    indian
1    indian
2    indian
3    indian
4    indian
Name: cuisine, dtype: object

In [4]:
#Extract feature

cuisines_feature_df = cuisines_df.drop(['Unnamed: 0', 'cuisine'], axis=1)
cuisines_feature_df.head()

Unnamed: 0,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,artichoke,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [5]:
X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df, cuisines_label_df, test_size=0.3)

In [6]:
#Testing with logistic regression

lr = LogisticRegression(multi_class='ovr',solver='liblinear')
model = lr.fit(X_train, np.ravel(y_train))

accuracy = model.score(X_test, y_test)
print ("Accuracy is {}".format(accuracy))

Accuracy is 0.7964970809007507


In [7]:
#Testing the model w/ real data

print(f'ingredients: {X_test.iloc[50][X_test.iloc[50]!=0].keys()}')
print(f'cuisine: {y_test.iloc[50]}')

test= X_test.iloc[50].values.reshape(-1, 1).T
proba = model.predict_proba(test)
classes = model.classes_
resultdf = pd.DataFrame(data=proba, columns=classes)

topPrediction = resultdf.T.sort_values(by=[0], ascending = [False])
topPrediction.head()

ingredients: Index(['seaweed', 'sesame_oil', 'vegetable_oil'], dtype='object')
cuisine: korean




Unnamed: 0,0
japanese,0.749165
korean,0.155916
chinese,0.060249
indian,0.022665
thai,0.012005


In [8]:
#Classification report
y_pred = model.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

     chinese       0.73      0.70      0.71       230
      indian       0.87      0.89      0.88       222
    japanese       0.72      0.74      0.73       229
      korean       0.85      0.79      0.82       255
        thai       0.82      0.86      0.84       263

    accuracy                           0.80      1199
   macro avg       0.80      0.80      0.79      1199
weighted avg       0.80      0.80      0.80      1199



In [9]:
X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df, cuisines_label_df, test_size=0.3)

In [12]:
'''
Support-Vector clustering (SVC) is a child of the Support-Vector machines family of ML techniques (learn more about these below). 
In this method, you can choose a 'kernel' to decide how to cluster the labels. 
The 'C' parameter refers to 'regularization' which regulates the influence of parameters. 
The kernel can be one of several; here we set it to 'linear' to ensure that we leverage linear SVC. 
Probability defaults to 'false'; here we set it to 'true' to gather probability estimates. 
We set the random state to '0' to shuffle the data to get probabilities.
'''

C = 10

# Good algorithm cheat sheet @ https://github.com/NateSiwel/ML-For-Beginners/blob/main/4-Classification/3-Classifiers-2/images/map.png?raw=true

# Create different classifiers.
classifiers = {
    'Linear SVC': SVC(kernel='linear', C=C, probability=True,random_state=0),
    'KNN classifier': KNeighborsClassifier(C), # Can be used for supervised & unsupervised learning
    'SVC': SVC(),
    'RFST': RandomForestClassifier(n_estimators=100),
     'ADA': AdaBoostClassifier(n_estimators=100)
    
}

In [13]:
n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test,y_pred))

Accuracy (train) for Linear SVC: 79.8% 
              precision    recall  f1-score   support

     chinese       0.71      0.77      0.74       232
      indian       0.87      0.85      0.86       255
    japanese       0.84      0.72      0.78       250
      korean       0.83      0.76      0.79       213
        thai       0.76      0.89      0.82       249

    accuracy                           0.80      1199
   macro avg       0.80      0.80      0.80      1199
weighted avg       0.80      0.80      0.80      1199

Accuracy (train) for KNN classifier: 74.1% 
              precision    recall  f1-score   support

     chinese       0.62      0.75      0.68       232
      indian       0.87      0.75      0.80       255
    japanese       0.71      0.78      0.74       250
      korean       0.90      0.55      0.69       213
        thai       0.72      0.85      0.78       249

    accuracy                           0.74      1199
   macro avg       0.76      0.74      0.74    



Accuracy (train) for ADA: 71.1% 
              precision    recall  f1-score   support

     chinese       0.64      0.47      0.55       232
      indian       0.87      0.82      0.85       255
    japanese       0.67      0.68      0.67       250
      korean       0.67      0.73      0.70       213
        thai       0.69      0.83      0.75       249

    accuracy                           0.71      1199
   macro avg       0.71      0.71      0.70      1199
weighted avg       0.71      0.71      0.71      1199

