In [26]:
pip install --upgrade nbformat

You should consider upgrading via the '/usr/local/opt/python@3.9/bin/python3.9 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import pandas as pd

# Raw training table (first row is the header); its `duration_label` column is
# the classification target used throughout the notebook.
X_train_original = pd.read_csv(
    r"./COMP30027_2021_Project2_datasets/recipe_train.csv",
    header=0,
    delimiter=',',
    index_col=False,
)
y_train = X_train_original["duration_label"]

# The doc2vec feature files have no header row, so all three are read with the
# same options: comma-separated, no index column, integer column names.
_train_doc2vec_options = {"index_col": False, "delimiter": ',', "header": None}

# Doc2Vec features of the recipe names
X_train_name = pd.read_csv(
    r"./COMP30027_2021_Project2_datasets/recipe_text_features_doc2vec100/train_name_doc2vec100.csv",
    **_train_doc2vec_options,
)

# Doc2Vec features of the recipe steps
X_train_steps = pd.read_csv(
    r"./COMP30027_2021_Project2_datasets/recipe_text_features_doc2vec100/train_steps_doc2vec100.csv",
    **_train_doc2vec_options,
)

# Doc2Vec features of the recipe ingredients
X_train_ingr = pd.read_csv(
    r"./COMP30027_2021_Project2_datasets/recipe_text_features_doc2vec100/train_ingr_doc2vec100.csv",
    **_train_doc2vec_options,
)

In [4]:
# Test-set doc2vec features, read with the same options as the training files:
# comma-separated, no header row, no index column.
_test_doc2vec_options = {"index_col": False, "delimiter": ',', "header": None}

# Doc2Vec features of the recipe names
X_test_name = pd.read_csv(
    r"./COMP30027_2021_Project2_datasets/recipe_text_features_doc2vec100/test_name_doc2vec100.csv",
    **_test_doc2vec_options,
)

# Doc2Vec features of the recipe steps
X_test_steps = pd.read_csv(
    r"./COMP30027_2021_Project2_datasets/recipe_text_features_doc2vec100/test_steps_doc2vec100.csv",
    **_test_doc2vec_options,
)

# Doc2Vec features of the recipe ingredients
X_test_ingr = pd.read_csv(
    r"./COMP30027_2021_Project2_datasets/recipe_text_features_doc2vec100/test_ingr_doc2vec100.csv",
    **_test_doc2vec_options,
)

In [5]:
from sklearn import linear_model, naive_bayes
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model, svm, naive_bayes
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report

import tensorflow as tf 
import tensorflow.keras.utils as utils
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, LSTM, Bidirectional

from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Single seed shared by every stochastic step in the notebook.
SEED_NO = 0

# scikit-learn receives SEED_NO explicitly through `random_state`, but Keras
# draws weight initialisation, dropout masks and batch shuffling from the
# global generators, so seed those too to make the LSTM runs repeatable on a
# fresh kernel (as far as the backend allows).
np.random.seed(SEED_NO)
tf.random.set_seed(SEED_NO)

## Sequential Model

In [6]:
# Join the three doc2vec tables column-wise (ingredients, name, steps) and add a
# length-1 middle axis, giving the (samples, 1, features) layout fed to the LSTM.
_stacked_train = np.hstack((X_train_ingr, X_train_name, X_train_steps))
_n_rows, _n_cols = _stacked_train.shape
X_train_3dim = _stacked_train.reshape(_n_rows, 1, _n_cols)
print("Transformed X_train Dimensions:          ", X_train_3dim.shape)

# Map the duration labels to consecutive integers, then to one-hot rows.
_train_label_encoder = LabelEncoder()
y_train_le = _train_label_encoder.fit_transform(y_train)
y_train_onehot = utils.to_categorical(y_train_le)
print("One-hot Encoding Dimensions for y_train: ", y_train_onehot.shape)

Transformed X_train Dimensions:           (40000, 1, 300)
One-hot Encoding Dimensions for y_train:  (40000, 3)


In [7]:
def Bidirectional_LSTM_clf(X, y, epochs_size, batch_size, dropout=0.45, validation_split=0.2):
    """Build, compile and fit a two-layer bidirectional LSTM classifier.

    Parameters
    ----------
    X : array-like of shape (samples, timesteps, features)
        Input sequences. The last dimension sets the width of the LSTM and
        hidden Dense layers.
    y : array-like of shape (samples, classes)
        One-hot encoded targets. The number of columns sets the width of the
        softmax output layer.
    epochs_size : int
        Number of training epochs.
    batch_size : int
        Mini-batch size passed to `model.fit`.
    dropout : float, default 0.45
        Input dropout rate applied inside both LSTM layers.
    validation_split : float, default 0.2
        Fraction of `X`/`y` that Keras holds out of training and evaluates
        after every epoch (reported as the `val_*` history entries).

    Returns
    -------
    (model, model_history)
        The fitted Keras model and the History object returned by `fit`.
    """
    n_features = X.shape[2]
    n_classes = y.shape[1]  # one output unit per one-hot column

    model = Sequential()

    # Two stacked bidirectional LSTMs; the first returns the full sequence so
    # the second has a sequence to consume. No `input_shape` is given: the model
    # is built from the shape of `X` on the first `fit` call. Previously the
    # argument was attached to the wrapped LSTM layers, and the value on the
    # second layer did not describe that layer's actual (concatenated) input.
    model.add(Bidirectional(LSTM(n_features, return_sequences=True, dropout=dropout)))
    model.add(Bidirectional(LSTM(n_features, return_sequences=False, dropout=dropout)))

    # Fully connected head
    model.add(Dense(n_features, activation='tanh'))
    model.add(Dense(n_features * 2, activation='relu'))
    model.add(Dense(n_features, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))  # class-probability output

    # Cross-entropy over the class probabilities, matching the one-hot targets
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model_history = model.fit(X, y,
                              epochs=epochs_size,
                              batch_size=batch_size,
                              validation_split=validation_split,  # held-out share used for diagnostics only
                              verbose=1)
    return model, model_history

# Fit on the stacked training arrays for 10 epochs with mini-batches of 128.
# The returned history provides the per-epoch accuracy/loss values used for the
# curves further down.
bi_lstm_model, bi_lstm_model_history = Bidirectional_LSTM_clf(
    X_train_3dim,
    y_train_onehot,
    epochs_size=10,
    batch_size=128,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
# bi_lstm_model.summary()
# print("Bidirectional LSTM Model Training Accuracy:          {:.4f}".format(bi_lstm_model_history.history['accuracy'][-1]))
# print("Bidirectional LSTM Model Cross-Validation Accuracy:  {:.4f}".format(bi_lstm_model_history.history['val_accuracy'][-1]))

In [9]:
# Collect the per-epoch training history into two frames (accuracy and loss)
# that the plotting cell reads.
#
# The epoch index is sized from the recorded history instead of a fixed
# np.arange(300): the fixed range only matched the data because zip() truncated
# it, and it would have silently capped any run longer than 300 epochs.
_bi_lstm_history = bi_lstm_model_history.history
_epoch_index = np.arange(len(_bi_lstm_history['accuracy']))

# Columns for a plain (unidirectional) LSTM were disabled in the original cell;
# add them as extra dict entries here if that model's history is reinstated.
acc_df = pd.DataFrame({
    "Epoch Iteration": _epoch_index,
    "Bidirectional LSTM Model (Training)": _bi_lstm_history['accuracy'],
    "Bidirectional LSTM Model (Cross Validation)": _bi_lstm_history['val_accuracy'],
})

loss_df = pd.DataFrame({
    "Epoch Iteration": _epoch_index,
    "Bidirectional LSTM Model (Training)": _bi_lstm_history['loss'],
    "Bidirectional LSTM Model (Cross Validation)": _bi_lstm_history['val_loss'],
})

In [10]:
# Side-by-side panels: accuracy on the left, loss on the right.
fig = make_subplots(rows=1, cols=2,
                    subplot_titles=["Accuracy per Epoch Iteration", "Loss per Epoch Iteration"])

# One entry per plotted series: (frame column / legend name, legend group, colour).
# Groups 1 and 3 belonged to plain-LSTM traces that are disabled in this notebook.
_series_specs = (
    ("Bidirectional LSTM Model (Training)", 'group2', 'purple'),
    ("Bidirectional LSTM Model (Cross Validation)", 'group4', 'red'),
)

# One entry per panel: (source frame, subplot column, extra Scatter options).
# The loss panel hides its legend entries so each series is listed only once.
_panel_specs = (
    (acc_df, 1, {}),
    (loss_df, 2, {"showlegend": False}),
)

for _frame, _subplot_col, _extra_options in _panel_specs:
    for _series_name, _legend_group, _colour in _series_specs:
        fig.add_trace(
            go.Scatter(x=_frame['Epoch Iteration'], y=_frame[_series_name],
                       name=_series_name, legendgroup=_legend_group, line_color=_colour,
                       **_extra_options),
            row=1, col=_subplot_col
        )

fig.update_layout(height=600, width=1000, title_text="Accuracy & Loss per Epoch Iterations")
fig.show()

In [11]:
y_train = X_train_original.duration_label


def _mean_cv_accuracy(clf, feature_sets, y, cv=5):
    """Average the k-fold cross-validation score of `clf` over several feature tables.

    `clf` is cross-validated separately on each table in `feature_sets` (the
    runs are sequential, one table at a time). The per-table mean scores are
    then averaged with equal weight.
    """
    per_table_means = [cross_val_score(clf, X, y, cv=cv).mean() for X in feature_sets]
    return sum(per_table_means) / len(per_table_means)


# Each benchmark is scored on the three doc2vec tables independently.
_benchmark_feature_sets = (X_train_name, X_train_steps, X_train_ingr)

# Linear benchmark classifier
print("--- \nMulti-class Logistic Regression ")
LR_clf = linear_model.LogisticRegression(random_state=SEED_NO,
                                         C=0.9,
                                         max_iter=50000,
                                         multi_class='multinomial')
LR_accuracy = _mean_cv_accuracy(LR_clf, _benchmark_feature_sets, y_train)
print("Multinomial Logistic Regression Accuracy:", LR_accuracy)

# Naive Bayes benchmark. The estimator is GaussianNB, so the printed labels say
# "Gaussian" (they previously read "Multinomial", which misnamed the model).
print("--- \nGaussian Naive Bayes")
NB_clf = naive_bayes.GaussianNB()
NB_accuracy = _mean_cv_accuracy(NB_clf, _benchmark_feature_sets, y_train)
print("Gaussian NB Accuracy", NB_accuracy)

--- 
Multi-class Logistic Regression 
Multinomial Logistic Regression Accuracy: 0.62045
--- 
Multinomial Naive Bayes
Multinomial NB Accuracy 0.5743916666666666


In [12]:
# Label names shown in the classification reports
target_names = ['1.0', '2.0', '3.0']

# Column order here (name, steps, ingr) is the order the model below is trained
# on; any matrix later passed to bi_lstm_model.predict must use the same order.
X_train = pd.concat([X_train_name, X_train_steps, X_train_ingr], axis=1)

# Hold out 30% of the labelled data to measure the LSTM's accuracy
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(
    X_train, y_train, test_size=0.3, random_state=SEED_NO)

# Reshape both splits to (samples, 1, features). The target shape is passed
# positionally because the `newshape=` keyword is deprecated in recent NumPy.
X_train_3dim = np.reshape(np.array(X_train_lstm), (X_train_lstm.shape[0], 1, X_train_lstm.shape[1]))
X_test_3dim = np.reshape(np.array(X_test_lstm), (X_test_lstm.shape[0], 1, X_test_lstm.shape[1]))
print("Transformed X_train Dimensions:          ", X_train_3dim.shape)

# One-hot encoding of the training-split labels
y_train_le = LabelEncoder().fit_transform(y_train_lstm)
y_train_onehot = utils.to_categorical(y_train_le)
print("One-hot Encoding Dimensions for y_train: ", y_train_onehot.shape)

# Refit the model on the 70% split only, replacing the earlier full-data fit
bi_lstm_model, bi_lstm_model_history = Bidirectional_LSTM_clf(X_train_3dim, y_train_onehot, 10, 128)

Transformed X_train Dimensions:           (28000, 1, 300)
One-hot Encoding Dimensions for y_train:  (28000, 3)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
print("------- Bidirectional LSTM Classifier Report -------")
# argmax over the softmax outputs gives the 0-based position in the one-hot
# vector; adding 1 turns that position back into the label value.
_holdout_class_positions = np.argmax(bi_lstm_model.predict(X_test_3dim), axis=-1)
y_pred_BLSTM = (_holdout_class_positions + 1).astype(float)
print(classification_report(y_test_lstm.astype(str), y_pred_BLSTM.astype(str), target_names=target_names, digits=4))

------- Bidirectional LSTM Classifier Report -------
              precision    recall  f1-score   support

         1.0     0.6486    0.8493    0.7355      5283
         2.0     0.7974    0.6205    0.6979      6110
         3.0     0.6433    0.3476    0.4513       607

    accuracy                         0.7074     12000
   macro avg     0.6964    0.6058    0.6283     12000
weighted avg     0.7241    0.7074    0.7020     12000



## Exporting files to CSV 

In [14]:
def export_pred_to_csv(y_pred, fname):
    """Write predictions to `fname` as a CSV with columns `id` and `duration_label`.

    Ids run from 1 to len(y_pred), assigned in the order the predictions are
    given. The file gets a header row and no index column.
    """
    row_ids = np.arange(1, len(y_pred) + 1)
    id_label_pairs = zip(row_ids, y_pred)
    submission = pd.DataFrame(id_label_pairs, columns=["id", "duration_label"])
    submission.to_csv("{}".format(fname), header=True, index=False)

In [15]:
# Modelling Linear Classifier to be used within benchmarks
NB_clf = naive_bayes.GaussianNB()

# Fit the model and name 
NB_clf.fit(X_train_name, y_train)
y_pred_name = NB_clf.predict(X_test_name)

# Fit the model and predict steps
NB_clf.fit(X_train_steps, y_train)
y_pred_steps = NB_clf.predict(X_test_steps)

# Fit the model and predict ingredietns 
NB_clf.fit(X_train_ingr, y_train)
y_pred_ingr = NB_clf.predict(X_test_ingr)

# Package the prediction 
y_pred_NB_pool = pd.DataFrame({"name": y_pred_name, "steps": y_pred_steps, "ingr": y_pred_ingr})
y_pred_NB_pool = y_pred_NB_pool.mode(axis=1)[0]

In [16]:
# Modelling Linear Classifier to be used within benchmarks
LR_clf = linear_model.LogisticRegression(random_state=SEED_NO,
                                         C=0.9, 
                                         max_iter = 50000,
                                         multi_class='multinomial')
# Fit the model and name 
LR_clf.fit(X_train_name, y_train)
y_pred_name = LR_clf.predict(X_test_name)

# Fit the model and predict steps
LR_clf.fit(X_train_steps, y_train)
y_pred_steps = LR_clf.predict(X_test_steps)

# Fit the model and predict ingredietns 
LR_clf.fit(X_train_ingr, y_train)
y_pred_ingr = LR_clf.predict(X_test_ingr)

# Package the prediction 
y_pred_LR_pool = pd.DataFrame({"name": y_pred_name, "steps": y_pred_steps, "ingr": y_pred_ingr})
y_pred_LR_pool = y_pred_LR_pool.mode(axis=1)[0]

In [17]:
#Stacking all Vectorized feature and reshape them into  3-D Array
X_train_3dim = np.hstack((X_train_ingr, X_train_name, X_train_steps))
X_train_3dim = np.reshape(X_train_3dim, (X_train_3dim.shape[0], 1, X_train_3dim.shape[1]))
print("Transformed X_train Dimensions:          ", X_train_3dim.shape)

# One-hot Encoding for Y_train
y_train_le = LabelEncoder().fit_transform(y_train)
y_train_onehot = utils.to_categorical(y_train_le)
print("One-hot Encoding Dimensions for y_train: ", y_train_onehot.shape)

Transformed X_train Dimensions:           (40000, 1, 300)
One-hot Encoding Dimensions for y_train:  (40000, 3)


In [18]:
print("------- Bidirectional LSTM Classifier Report -------")
# Hold-out report: score the model on the 30% split kept back from training.
# argmax gives the 0-based one-hot position; +1 maps it back to the label value.
_holdout_pred = (np.argmax(bi_lstm_model.predict(X_test_3dim), axis=-1) + 1).astype(float)
print(classification_report(y_test_lstm.astype(str), _holdout_pred.astype(str), target_names=target_names, digits=4))

# Predictions for export. Previously y_pred_BLSTM held the hold-out predictions
# above, so the exported BLSTM file described rows of the training data rather
# than the test set that the NB/LR exports cover. Predict on the real test
# features instead, stacked in the (name, steps, ingr) column order that
# bi_lstm_model was trained on.
_X_test_stacked = np.hstack((X_test_name, X_test_steps, X_test_ingr))
_X_test_stacked_3dim = np.reshape(_X_test_stacked, (_X_test_stacked.shape[0], 1, _X_test_stacked.shape[1]))
y_pred_BLSTM = (np.argmax(bi_lstm_model.predict(_X_test_stacked_3dim), axis=-1) + 1).astype(float)

------- Bidirectional LSTM Classifier Report -------
              precision    recall  f1-score   support

         1.0     0.6486    0.8493    0.7355      5283
         2.0     0.7974    0.6205    0.6979      6110
         3.0     0.6433    0.3476    0.4513       607

    accuracy                         0.7074     12000
   macro avg     0.6964    0.6058    0.6283     12000
weighted avg     0.7241    0.7074    0.7020     12000



In [19]:
# Write one CSV per model, pairing each prediction vector with its output file.
_exports = (
    (y_pred_BLSTM, "BLSTM_y_pred_vec100.csv"),
    (y_pred_NB_pool, "NB_y_pred_vec100.csv"),
    (y_pred_LR_pool, "LR_y_pred_vec100.csv"),
)
for _predictions, _csv_name in _exports:
    export_pred_to_csv(_predictions, _csv_name)