# Build More Classification Models

In [1]:
# 데이터 호출을 위한 pandas import 진행 및 csv 파일 출력

import pandas as pd
cuisines_df = pd.read_csv("./cleaned_cuisines.csv")
cuisines_df.head()

Unnamed: 0.1,Unnamed: 0,cuisine,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,indian,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [2]:
cuisines_label_df = cuisines_df['cuisine']
cuisines_label_df.head()

# cuisine을 타겟으로 label을 생성하여 헤드 출력

0    indian
1    indian
2    indian
3    indian
4    indian
Name: cuisine, dtype: object

In [4]:
cuisines_feature_df = cuisines_df.drop(['Unnamed: 0', 'cuisine'], axis=1)
cuisines_feature_df.head()

# unnamed를 날리고 정리한 데이터 출력

Unnamed: 0,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,artichoke,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


# Try different classifiers

In [6]:
# import the needed libraries

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
import numpy as np

In [9]:
# split your training and test data

X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df, cuisines_label_df, test_size=0.3)

# 0.3으로 나누어 데이터를 각각 생성함

# Linear SVC classifier

Support-Vector clustering (SVC) is a child of the Support-Vector machines family of ML techniques.
In this method, we can choose a 'kernel' to decide how to cluster the labels.
The 'C' parameter refers to 'regularization' which regulates the influence of parameters.
The kernel can be one of several; here we set it to 'linear' to ensure that we leverage linear SVC.
Probability defaults to 'false'; here we set it to 'true' to gather probability estimates.
We set the random state to '0' to shuffle the data to get probabilities.

옥수수 내부의 부분을 커널이라 칭하는데, 

In [12]:
# start with a Linear SVC

C = 10
# Create different classifiers. # 커널 여부, 정류, r[[[[]]]]
classifiers = {
    'Linear SVC': SVC(kernel='linear', C=C, probability=True,random_state=0)
    
}


In [13]:
# train your model using the Linear SVC and print out a report

n_classifiers = len(classifiers) #1개만 돌리는 classifier

for index, (name, classifier) in enumerate(classifiers.items()): #classifier 훈련 진행
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test) # 예측 진행
    accuracy = accuracy_score(y_test, y_pred) # 예측 값에 대한 정확도 측정
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100)) # 예측 결과 출력
    print(classification_report(y_test,y_pred))

Accuracy (train) for Linear SVC: 79.2% 
              precision    recall  f1-score   support

     chinese       0.69      0.75      0.72       235
      indian       0.89      0.83      0.86       258
    japanese       0.82      0.79      0.81       248
      korean       0.82      0.77      0.79       231
        thai       0.76      0.82      0.79       227

    accuracy                           0.79      1199
   macro avg       0.79      0.79      0.79      1199
weighted avg       0.80      0.79      0.79      1199



# K-Neighbors classifier (지도, 비지도 모두 활용 가능)

K-Neighbors is part of the "neighbors" family of ML methods, which can be used for both supervised and unsupervised learning. In this method, a predefined number of points is created and data are gathered around these points such that generalized labels can be predicted for the data.

In [17]:
# Apply the K-Neighbors classifier
# The previous classifier was good, and worked well with the data, but maybe we can get better accuracy.
# Try a K-Neighbors classifier.

C = 10
# Create different classifiers.
classifiers = {
    'Linear SVC': SVC(kernel='linear', C=C, probability=True,random_state=0),
    'KNN classifier': KNeighborsClassifier(C),
    
}

In [18]:
# train your model using two classifiers  and print out a report

n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test,y_pred))

Accuracy (train) for Linear SVC: 79.2% 
              precision    recall  f1-score   support

     chinese       0.69      0.75      0.72       235
      indian       0.89      0.83      0.86       258
    japanese       0.82      0.79      0.81       248
      korean       0.82      0.77      0.79       231
        thai       0.76      0.82      0.79       227

    accuracy                           0.79      1199
   macro avg       0.79      0.79      0.79      1199
weighted avg       0.80      0.79      0.79      1199

Accuracy (train) for KNN classifier: 74.9% 
              precision    recall  f1-score   support

     chinese       0.69      0.72      0.71       235
      indian       0.86      0.76      0.81       258
    japanese       0.71      0.88      0.78       248
      korean       0.92      0.58      0.71       231
        thai       0.67      0.80      0.73       227

    accuracy                           0.75      1199
   macro avg       0.77      0.75      0.75    

The result is a little worse

# Support Vector Classifier

Support-Vector classifiers are part of the Support-Vector Machine family of ML methods that are used for classification and regression tasks. SVMs "map training examples to points in space" to maximize the distance between two categories. Subsequent data is mapped into this space so their category can be predicted.

In [24]:
# Let's try for a little better accuracy with a Support Vector Classifier.

C = 10
# Create different classifiers.
classifiers = {
    'Linear SVC': SVC(kernel='linear', C=C, probability=True,random_state=0),
    'KNN classifier': KNeighborsClassifier(C),
    'SVC': SVC(),
    
}

In [25]:
# train your model using three classifiers and print out a report

n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test,y_pred))

Accuracy (train) for Linear SVC: 79.2% 
              precision    recall  f1-score   support

     chinese       0.69      0.75      0.72       235
      indian       0.89      0.83      0.86       258
    japanese       0.82      0.79      0.81       248
      korean       0.82      0.77      0.79       231
        thai       0.76      0.82      0.79       227

    accuracy                           0.79      1199
   macro avg       0.79      0.79      0.79      1199
weighted avg       0.80      0.79      0.79      1199

Accuracy (train) for KNN classifier: 74.9% 
              precision    recall  f1-score   support

     chinese       0.69      0.72      0.71       235
      indian       0.86      0.76      0.81       258
    japanese       0.71      0.88      0.78       248
      korean       0.92      0.58      0.71       231
        thai       0.67      0.80      0.73       227

    accuracy                           0.75      1199
   macro avg       0.77      0.75      0.75    

The result is quite good!

# Ensemble Classifiers (가장 최근에 나온 classifier)

Let's try some 'Ensemble Classifiers, specifically Random Forest and AdaBoost

The ensemble method of Machine Learning "combines the predictions of several base estimators" to improve the model's quality.

Random Forest, an averaging method, builds a 'forest' of 'decision trees' infused with randomness to avoid overfitting. The n_estimators parameter is set to the number of trees.

여러 개의 추론 트리를 만들어서 여러가지를 묶어 randomness를 적용하여 overfitting을 피하는 방식으로 동작한다.

AdaBoost fits a classifier to a dataset and then fits copies of that classifier to the same dataset. It focuses on the weights of incorrectly classified items and adjusts the fit for the next classifier to correct.

In [26]:
# Let's try for a better accuracy with a Random Forest and AdaBoost.

C = 10
# Create different classifiers.
classifiers = {
    'Linear SVC': SVC(kernel='linear', C=C, probability=True,random_state=0),
    'KNN classifier': KNeighborsClassifier(C),
    'SVC': SVC(),
    'RFST': RandomForestClassifier(n_estimators=100),
    'ADA': AdaBoostClassifier(n_estimators=100)
}

In [27]:
# train your model using five classifiers and print out a report

n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test,y_pred))

Accuracy (train) for Linear SVC: 79.2% 
              precision    recall  f1-score   support

     chinese       0.69      0.75      0.72       235
      indian       0.89      0.83      0.86       258
    japanese       0.82      0.79      0.81       248
      korean       0.82      0.77      0.79       231
        thai       0.76      0.82      0.79       227

    accuracy                           0.79      1199
   macro avg       0.79      0.79      0.79      1199
weighted avg       0.80      0.79      0.79      1199

Accuracy (train) for KNN classifier: 74.9% 
              precision    recall  f1-score   support

     chinese       0.69      0.72      0.71       235
      indian       0.86      0.76      0.81       258
    japanese       0.71      0.88      0.78       248
      korean       0.92      0.58      0.71       231
        thai       0.67      0.80      0.73       227

    accuracy                           0.75      1199
   macro avg       0.77      0.75      0.75    



Accuracy (train) for ADA: 70.7% 
              precision    recall  f1-score   support

     chinese       0.62      0.54      0.58       235
      indian       0.86      0.81      0.83       258
    japanese       0.64      0.73      0.69       248
      korean       0.73      0.72      0.72       231
        thai       0.68      0.73      0.70       227

    accuracy                           0.71      1199
   macro avg       0.71      0.71      0.70      1199
weighted avg       0.71      0.71      0.71      1199

