In [5]:
import numpy as np
import pandas as pd
import scipy

from sklearn import linear_model, naive_bayes
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold, train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import classification_report, accuracy_score

import tensorflow as tf 
import tensorflow.keras.utils as utils
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, LSTM, Bidirectional
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

SEED_NO=0

In [6]:
# Load the Training Data
X_train_name = scipy.sparse.load_npz(r'./COMP30027_2021_Project2_datasets/recipe_text_features_countvec/train_ingr_vec.npz')
X_train_steps = scipy.sparse.load_npz(r'./COMP30027_2021_Project2_datasets/recipe_text_features_countvec/train_ingr_vec.npz')
X_train_ingr = scipy.sparse.load_npz(r'./COMP30027_2021_Project2_datasets/recipe_text_features_countvec/train_steps_vec.npz')

# Load the Test Data
X_test_name = scipy.sparse.load_npz(r'./COMP30027_2021_Project2_datasets/recipe_text_features_countvec/test_ingr_vec.npz')
X_test_steps = scipy.sparse.load_npz(r'./COMP30027_2021_Project2_datasets/recipe_text_features_countvec/test_ingr_vec.npz')
X_test_ingr = scipy.sparse.load_npz(r'./COMP30027_2021_Project2_datasets/recipe_text_features_countvec/test_steps_vec.npz')

#Grab Original Label
X_train_original = pd.read_csv(r"./COMP30027_2021_Project2_datasets/recipe_train.csv", index_col = False, delimiter = ',', header=0)
y_train = X_train_original.duration_label

In [7]:
Kbest_name =  SelectKBest(chi2, k=300).fit(X_train_name, y_train)
Kbest_steps = SelectKBest(chi2, k=300).fit(X_train_steps, y_train)
Kbest_ingr = SelectKBest(chi2, k=300).fit(X_train_ingr, y_train)


In [8]:
X_train_name, X_train_steps, X_train_ingr = Kbest_name.transform(X_train_name), Kbest_steps.transform(X_train_steps), Kbest_ingr.transform(X_train_ingr)
X_test_name, X_test_steps, X_test_ingr = Kbest_name.transform(X_test_name), Kbest_steps.transform(X_test_steps), Kbest_ingr.transform(X_test_ingr)

X_train = np.concatenate((X_train_name.toarray(), X_train_steps.toarray(), X_train_ingr.toarray()), axis=1)
X_test = np.concatenate((X_test_name.toarray(), X_test_steps.toarray(), X_test_ingr.toarray()), axis=1)

In [9]:
X_train = pd.DataFrame(X_train)

# Normalize the data to optimize for high learning rate. 
sc = StandardScaler()
X_train, X_test = sc.fit_transform(X_train), sc.fit_transform(X_test)
X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)
print(X_test.shape)

(10000, 900)


In [10]:
NB_clf = naive_bayes.GaussianNB()
NB_accuracy = cross_val_score(NB_clf, X_train, y_train, cv=5).mean()
print("Cross Validation Accuracy for Naive Bayes: ", NB_accuracy)

Cross Validation Accuracy for Naive Bayes:  0.658425


In [11]:
LR_clf = linear_model.LogisticRegression(random_state=SEED_NO,
                                         C=0.9, 
                                         max_iter = 50000,
                                   multi_class='multinomial')
lr_fit = LR_clf.fit(X_train, y_train)
LR_accuracy = cross_val_score(LR_clf, X_train, y_train, cv=5).mean()
print("Cross Validation Accuracy for Logistic Regression: ", LR_accuracy)

Cross Validation Accuracy for Logistic Regression:  0.7986


In [12]:
# #Stacking all Vectorized feature and reshape them into  3-D Array
X_train_3dim = np.reshape(np.array(X_train), newshape=(X_train.shape[0], 1, X_train.shape[1]))
print("Transformed X_train Dimensions:          ", X_train_3dim.shape)

# One-hot Encoding for Y_train
y_train_le = LabelEncoder().fit_transform(y_train)
y_train_onehot = utils.to_categorical(y_train_le)
print("One-hot Encoding Dimensions for y_train: ", y_train_onehot.shape)

Transformed X_train Dimensions:           (40000, 1, 900)
One-hot Encoding Dimensions for y_train:  (40000, 3)


In [13]:
def Bidirectional_LSTM_clf(X, y, epochs_size, batch_size):
    model = Sequential()

    #Bidirectional LSTM
    model.add(Bidirectional(LSTM(X.shape[2], return_sequences=True, dropout=0.45, input_shape=(1, X.shape[2]))))
    model.add(Bidirectional(LSTM(X.shape[2],return_sequences=False, dropout=0.45, input_shape=(1, X.shape[2]))))

    #Adding dense layer to implement activation layer 
    model.add(Dense(X.shape[2], activation='tanh'))
    model.add(Dense(X.shape[2]*2, activation='relu'))
    model.add(Dense(X.shape[2], activation='relu'))
    model.add(Dense(3, activation='softmax')) #Softmax output for 3 corresponding categorical variables

    #Implement Loss function w.r.t probabiltiy over possible classes
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model_history = model.fit(X, y, 
                            epochs=epochs_size, 
                            batch_size=batch_size, 
                            validation_split=0.2, #Only 80% of input data will be trained for diagnostics purposes
                            verbose=1)
    return model, model_history 

In [14]:
# bi_lstm_model.summary()
# print("Bidirectional LSTM Model Training Acccuracy:          {:.2f}".format(bi_lstm_model_history.history['accuracy'][-1]))
# print("Bidirectional LSTM Model Cross-Validation Acccuracy:  {:.2f}".format(bi_lstm_model_history.history['val_accuracy'][-1]))

In [15]:
target_names =['1.0', '2.0', '3.0']

#Split the data to test for LSTM accuracy 
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X_train, y_train, test_size=0.3, random_state=SEED_NO)

# #Stacking all Vectorized feature and reshape them into  3-D Array
X_train_3dim = np.reshape(np.array(X_train_lstm), newshape=(X_train_lstm.shape[0], 1, X_train_lstm.shape[1]))
X_test_3dim = np.reshape(np.array(X_test_lstm), newshape=(X_test_lstm.shape[0], 1, X_test_lstm.shape[1]))
print("Transformed X_train Dimensions:          ", X_train_3dim.shape)

# One-hot Encoding for Y_train
y_train_le = LabelEncoder().fit_transform(y_train_lstm)
y_train_onehot = utils.to_categorical(y_train_le)
print("One-hot Encoding Dimensions for y_train: ", y_train_onehot.shape)

bi_lstm_model, bi_lstm_model_history = Bidirectional_LSTM_clf(X_train_3dim, y_train_onehot, 10, 128)

Transformed X_train Dimensions:           (28000, 1, 900)
One-hot Encoding Dimensions for y_train:  (28000, 3)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
print("------- Bidirectional LSTM Classifier Report -------")
y_pred_BLSTM = np.array(list(map(lambda x: float(x+1), # Increment output to match category as np.argmax return position of one-hot array
                        np.argmax(bi_lstm_model.predict(X_test_3dim), axis=-1))))
print(classification_report(y_test_lstm.astype(str), y_pred_BLSTM.astype(str), target_names=target_names, digits=4))

------- Bidirectional LSTM Classifier Report -------
              precision    recall  f1-score   support

         1.0     0.7775    0.7594    0.7684      5283
         2.0     0.7857    0.8126    0.7989      6110
         3.0     0.7140    0.6129    0.6596       607

    accuracy                         0.7791     12000
   macro avg     0.7591    0.7283    0.7423     12000
weighted avg     0.7785    0.7791    0.7784     12000



## Export prediction labels to CSV

In [17]:
def export_pred_to_csv(y_pred, fname):
    pd.DataFrame(zip(np.arange(1, len(y_pred)+1), y_pred), columns=["id", "duration_label"]).to_csv("{}".format(fname), header=True, index=False)

In [20]:
# Predict using Naive Bayes
NB_clf.fit(X_train, y_train)
y_pred_NB = NB_clf.predict(X_test)

# Predict using Logistic Regression
LR_clf.fit(X_train, y_train)
y_pred_LR = LR_clf.predict(X_test)

#Predict using BLSTM 
y_pred_BLSTM = np.array(list(map(lambda x: float(x+1), # Increment output to match category as np.argmax return position of one-hot array
                        np.argmax(bi_lstm_model.predict(X_test_3dim), axis=-1))))

In [23]:
export_pred_to_csv(y_pred_BLSTM, "BLSTM_y_pred_chi300.csv")
export_pred_to_csv(y_pred_NB, "NB_y_pred_chi300.csv")
export_pred_to_csv(y_pred_LR, "LR_y_pred_chi300.csv")