# ANN

## 1. Data input

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import urllib3
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import mean_absolute_error
from sklearn import metrics
from sklearn.neural_network import MLPClassifier

df=pd.read_json('train.json')
df

Unnamed: 0,id,cuisine,ingredients
0,22675,italian,"[1% low-fat cottage cheese, low-fat marinara s..."
1,32288,southern_us,"[brown sugar, salt, eggs, butter, chopped peca..."
2,44406,thai,"[red chili peppers, bell pepper, garlic, fish ..."
3,29355,moroccan,"[water, green tea leaves, tangerine, fresh min..."
4,39350,chinese,"[vegetable oil, chile sauce, tomato paste, gar..."
...,...,...,...
29769,2278,japanese,"[soy sauce, sesame oil, garlic, sake, flour, g..."
29770,474,vietnamese,"[mint, garlic sauce, chinese chives, rice nood..."
29771,44229,indian,"[potatoes, vegetable broth, oil, cashew nuts, ..."
29772,20311,southern_us,"[butter, powdered sugar, cream cheese, soften,..."


## 2. Data preprocessing

In [2]:
from collections import Counter
features_all_list=[]
for i in df.ingredients:
    features_all_list+=i

features=list(set(features_all_list))
len(features)

6231

In [3]:
onehot_ingredients = np.zeros((df.shape[0], len(features)))
feature_lookup = sorted(features)

for index, row in df.iterrows():
    for ingredient in row['ingredients']:
        onehot_ingredients[index, feature_lookup.index(ingredient)] = 1
y = df.cuisine.values.reshape(-1,1)
y=y.ravel()

In [4]:
df_features = pd.DataFrame(onehot_ingredients)
d = {}
for i in range(len(features)):
    d[df_features.columns[i]] = features[i]
    
df_features = df_features.rename(columns=d)
df_features.shape

(29774, 6231)

## 3. Model construction

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df_features, y, test_size=0.3, shuffle=True)

### model_1: solver用'adam'，官方建議在資料集較大的時候用'adam'這個優化方式。

In [6]:
model_1=MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(100, ), random_state=1)
model_1.fit(X_train,y_train)

MLPClassifier(alpha=1e-05, random_state=1)

### model_2: solver用'lbfgs'，官方建議在資料集較小的時候用'lbfgs'這個優化方式。

In [7]:
model_2=MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(100, ), random_state=1)
model_2.fit(X_train,y_train)

MLPClassifier(alpha=1e-05, random_state=1, solver='lbfgs')

## 4. Result

### model_1 (solver='adam')

In [8]:
y_true = y_test
y_pred_1 = model_1.predict(X_test)
con_matrix=pd.DataFrame(confusion_matrix(y_true,y_pred_1))
print(con_matrix)

    0   1    2    3   4    5    6    7   8     9   10   11   12    13   14  \
0   53   0    3    1   4    4    0    1   1     6   1    0    0    21    1   
1    1  73    1    1   1   24    3    3  20    17   0    2    0     3    0   
2    1   3  212    2   1   14    1    0   4    13   1    1    0    17    0   
3    1   1    4  462  10    5    1    6   0     4   0   29   22     8    1   
4    5   0    1   18  86    1    0    3   1     9   0    6    3    12    0   
5    1  11    9    7   3  305   15    7  10   131   2    1    2    18    3   
6    0   0    1    4   1    7  164    6   2    36   0    0    1     6    4   
7    2   3    2    4   4    8    7  574   1     5   1   20    0    12   15   
8    0  11    2    0   3    9    3    1  45     9   1    0    0     3    0   
9    1   6    9    7   4  124   44    5  10  1422   4    1    0    30    5   
10   5   2    1    1   3    1    2    5   0     4  74    0    0    16    0   
11   0   0    3   33   4   14    0   14   1     6   0  204   10 

In [9]:
print(classification_report(y_test, y_pred_1))

              precision    recall  f1-score   support

   brazilian       0.60      0.45      0.51       118
     british       0.51      0.40      0.45       181
cajun_creole       0.67      0.61      0.64       349
     chinese       0.72      0.76      0.74       609
    filipino       0.56      0.49      0.52       174
      french       0.49      0.51      0.50       596
       greek       0.60      0.65      0.63       251
      indian       0.83      0.83      0.83       693
       irish       0.36      0.38      0.37       118
     italian       0.78      0.80      0.79      1781
    jamaican       0.77      0.58      0.66       128
    japanese       0.66      0.66      0.66       311
      korean       0.75      0.67      0.71       183
     mexican       0.84      0.87      0.85      1435
    moroccan       0.79      0.73      0.76       202
     russian       0.34      0.34      0.34       106
 southern_us       0.67      0.68      0.68       931
     spanish       0.36    

### model_2 (solver='lbfgs')

In [10]:
y_true = y_test
y_pred_2 = model_2.predict(X_test)
con_matrix=pd.DataFrame(confusion_matrix(y_true,y_pred_2))
print(con_matrix)

    0   1    2    3   4    5    6    7   8     9   10   11   12    13   14  \
0   55   2    4    0   3    2    0    1   1     8   3    0    0    18    0   
1    1  70    2    1   1   18    5    4  19    12   3    0    0     7    0   
2    0   3  239    1   1    9    0    1   2    15   1    1    0    13    0   
3    3   1    3  479  10    6    0    2   0     4   0   27   21     2    2   
4    5   0    1   10  98    3    0    2   1     4   3   10    2    10    0   
5    0   7   11    5   1  331    8    3   6   129   3    1    1    14    4   
6    0   1    2    1   1    8  168    4   0    37   0    0    1     1    4   
7    1   3    3    3   3    4    4  580   0     3   1   25    0    15   18   
8    0  10    0    1   0   13    2    0  50     8   1    0    0     2    0   
9    4   5   11    5   5  102   37    5   9  1487   2    2    0    27    7   
10   5   1    2    2   3    0    0    6   0     3  80    3    0    11    0   
11   3   0    3   32   4    5    1   17   0     9   0  211    8 

In [11]:
print(classification_report(y_test, y_pred_2))

              precision    recall  f1-score   support

   brazilian       0.60      0.47      0.52       118
     british       0.53      0.39      0.45       181
cajun_creole       0.68      0.68      0.68       349
     chinese       0.77      0.79      0.78       609
    filipino       0.61      0.56      0.59       174
      french       0.56      0.56      0.56       596
       greek       0.69      0.67      0.68       251
      indian       0.85      0.84      0.84       693
       irish       0.39      0.42      0.41       118
     italian       0.80      0.83      0.82      1781
    jamaican       0.70      0.62      0.66       128
    japanese       0.65      0.68      0.67       311
      korean       0.78      0.70      0.74       183
     mexican       0.86      0.88      0.87      1435
    moroccan       0.74      0.74      0.74       202
     russian       0.39      0.39      0.39       106
 southern_us       0.70      0.70      0.70       931
     spanish       0.44    

## 5. Comparison & conclusion 

### model_1(solver='adam')的準確率是0.70，model_2(solver='lbfgs')的準確率是0.73。 兩種模型的表現差不多。lbfgs適合用在小型數據集（收斂更快），但在此作業裡它對大型數據的處理也很好。

### 助教們這學期辛苦了！ 