In [153]:
import numpy as np
import pandas as pd
import scipy

from sklearn import linear_model, naive_bayes
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import classification_report, accuracy_score

import tensorflow as tf 
import tensorflow.keras.utils as utils
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, LSTM, Bidirectional
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

SEED_NO=0

In [175]:
# Load the Training Data
X_train_name = scipy.sparse.load_npz(r'./COMP30027_2021_Project2_datasets/recipe_text_features_countvec/train_ingr_vec.npz')
X_train_steps = scipy.sparse.load_npz(r'./COMP30027_2021_Project2_datasets/recipe_text_features_countvec/train_ingr_vec.npz')
X_train_ingr = scipy.sparse.load_npz(r'./COMP30027_2021_Project2_datasets/recipe_text_features_countvec/train_steps_vec.npz')

# Load the Test Data
X_test_name = scipy.sparse.load_npz(r'./COMP30027_2021_Project2_datasets/recipe_text_features_countvec/test_ingr_vec.npz')
X_test_steps = scipy.sparse.load_npz(r'./COMP30027_2021_Project2_datasets/recipe_text_features_countvec/test_ingr_vec.npz')
X_test_ingr = scipy.sparse.load_npz(r'./COMP30027_2021_Project2_datasets/recipe_text_features_countvec/test_steps_vec.npz')

In [176]:
X_train_original = pd.read_csv(r"./COMP30027_2021_Project2_datasets/recipe_train.csv", index_col = False, delimiter = ',', header=0)
y_train = X_train_original.duration_label

In [177]:
Kbest_name =  SelectKBest(chi2, k=100).fit(X_train_name, y_train)
Kbest_steps = SelectKBest(chi2, k=100).fit(X_train_steps, y_train)
Kbest_ingr = SelectKBest(chi2, k=100).fit(X_train_ingr, y_train)

In [178]:
#Selecting the k-best form respective datasets
X_train_name, X_train_steps, X_train_ingr = Kbest_name.transform(X_train_name), Kbest_steps.transform(X_train_steps), Kbest_ingr.transform(X_train_ingr)
X_test_name, X_test_steps, X_test_ingr = Kbest_name.transform(X_test_name), Kbest_steps.transform(X_test_steps), Kbest_ingr.transform(X_test_ingr)

In [179]:
#Transform date sources into X_train, Y_train 
X_train = np.concatenate((X_train_name.toarray(), X_train_steps.toarray(), X_train_ingr.toarray()), axis=1)
X_test = np.concatenate((X_test_name.toarray(), X_test_steps.toarray(), X_test_ingr.toarray()), axis=1)

In [180]:
X_train = pd.DataFrame(X_train)

# Normalize the data to optimize for logistic regression
sc = StandardScaler()
X_train, X_test = sc.fit_transform(X_train), sc.fit_transform(X_test)
X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)
print(X_test.shape)

(10000, 300)


In [181]:
pca = PCA(n_components=3, random_state=SEED_NO)
components = pca.fit_transform(X_train)
labels = {
    str(i): f"PCA {i+1} ({var:.1f}%)" for i, var in enumerate(pca.explained_variance_ratio_ * 100)
} 

#Representing Single Value Decomposition in Low Dimension Settings
fig = px.scatter_matrix(components, 
                        labels=labels, 
                        dimensions=range(pca.components_.shape[0]), 
                        color=y_train,
                        title="Total Explained Ratio (R-Sq): {:.2f}%".format(pca.explained_variance_ratio_.sum()*100),
                        width=800, height=500
                       ).update_traces(diagonal_visible=False, marker=dict(size=3))
fig.show()

In [182]:
# Implementing logistic regression 
LR_clf = linear_model.LogisticRegression(random_state=SEED_NO,
                                         C=0.9, 
                                         max_iter = 50000,
                                   multi_class='multinomial')
lr_fit = LR_clf.fit(X_train, y_train)
LR_accuracy = cross_val_score(LR_clf, X_train, y_train, cv=5).mean()
print("Cross Validation Accuracy for Logistic Regression: ", LR_accuracy)

Cross Validation Accuracy for Logistic Regression:  0.7951750000000001


In [183]:
#Implementing Guassian Naive Bayes
B_clf = naive_bayes.GaussianNB()
NB_accuracy = cross_val_score(NB_clf, X_train, y_train, cv=5).mean()
print("Cross Validation Accuracy for Naive Bayes: ", NB_accuracy)

Cross Validation Accuracy for Naive Bayes:  0.652475


In [184]:
# #Stacking all Vectorized feature and reshape them into  3-D Array
X_train_3dim = np.reshape(np.array(X_train), newshape=(X_train.shape[0], 1, X_train.shape[1]))
print("Transformed X_train Dimensions:          ", X_train_3dim.shape)

# One-hot Encoding for Y_train
y_train_le = LabelEncoder().fit_transform(y_train)
y_train_onehot = utils.to_categorical(y_train_le)
print("One-hot Encoding Dimensions for y_train: ", y_train_onehot.shape)


Transformed X_train Dimensions:           (40000, 1, 300)
One-hot Encoding Dimensions for y_train:  (40000, 3)


In [185]:
def Bidirectional_LSTM_clf(X, y, epochs_size, batch_size):
    model = Sequential()

    #Bidirectional LSTM
    model.add(Bidirectional(LSTM(X.shape[2], return_sequences=True, dropout=0.45, input_shape=(1, X.shape[2]))))
    model.add(Bidirectional(LSTM(X.shape[2],return_sequences=False, dropout=0.45, input_shape=(1, X.shape[2]))))

    #Adding dense layer to implement activation layer 
    model.add(Dense(X.shape[2], activation='tanh'))
    model.add(Dense(X.shape[2]*2, activation='relu'))
    model.add(Dense(X.shape[2], activation='relu'))
    model.add(Dense(3, activation='softmax')) #Softmax output for 3 corresponding categorical variables

    #Implement Loss function w.r.t probabiltiy over possible classes
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model_history = model.fit(X, y, 
                            epochs=epochs_size, 
                            batch_size=batch_size, 
                            validation_split=0.2, #Only 80% of input data will be trained for diagnostics purposes
                            verbose=1)
    return model, model_history 

bi_lstm_model, bi_lstm_model_history = Bidirectional_LSTM_clf(X_train_3dim, y_train_onehot, 10, 128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [186]:
bi_lstm_model.summary()
print("Bidirectional LSTM Model Training Acccuracy:          {:.2f}".format(bi_lstm_model_history.history['accuracy'][-1]))
print("Bidirectional LSTM Model Cross-Validation Acccuracy:  {:.2f}".format(bi_lstm_model_history.history['val_accuracy'][-1]))

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_4 (Bidirection (None, 1, 600)            1442400   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 600)               2162400   
_________________________________________________________________
dense_8 (Dense)              (None, 300)               180300    
_________________________________________________________________
dense_9 (Dense)              (None, 600)               180600    
_________________________________________________________________
dense_10 (Dense)             (None, 300)               180300    
_________________________________________________________________
dense_11 (Dense)             (None, 3)                 903       
Total params: 4,146,903
Trainable params: 4,146,903
Non-trainable params: 0
____________________________________________

In [187]:
# Plot line for epoch and loss function
acc_df = pd.DataFrame(
            zip(np.arange(36), 
                # lstm_model_history.history['accuracy'],  
                bi_lstm_model_history.history['accuracy'],
                # lstm_model_history.history['val_accuracy'],  
                bi_lstm_model_history.history['val_accuracy']
                ),
            columns=["Epoch Iteration", 
                    # "LSTM Model (Training)", 
                    "Bidirectional LSTM Model (Training)",
                    # "LSTM Model (Cross Validation)",
                    "Bidirectional LSTM Model (Cross Validation)",
                    ],
            )

loss_df = pd.DataFrame(
            zip(np.arange(36), 
                # lstm_model_history.history['loss'],  
                bi_lstm_model_history.history['loss'],
                # lstm_model_history.history['val_loss'],  
                bi_lstm_model_history.history['val_loss']
                ),
            columns=["Epoch Iteration", 
                    # "LSTM Model (Training)", 
                    "Bidirectional LSTM Model (Training)",
                    # "LSTM Model (Cross Validation)",
                    "Bidirectional LSTM Model (Cross Validation)",
                    ],
            )

In [188]:
fig = make_subplots(rows=1, cols=2,
                    subplot_titles=["Accuracy per Epoch Iteration", "Loss per Epoch Iteration"])

fig.add_trace(
    go.Scatter(x=acc_df['Epoch Iteration'], y=acc_df["Bidirectional LSTM Model (Training)"], 
                name="Bidirectional LSTM Model (Training)", legendgroup='group2', line_color='purple'),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(x=acc_df['Epoch Iteration'], y=acc_df["Bidirectional LSTM Model (Cross Validation)"], 
                name="Bidirectional LSTM Model (Cross Validation)", legendgroup='group4', line_color='red'),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(x=loss_df['Epoch Iteration'], y=loss_df["Bidirectional LSTM Model (Training)"], 
                name="Bidirectional LSTM Model (Training)", legendgroup='group2', line_color='purple', showlegend = False),
    row=1, col=2
)

fig.add_trace(
    go.Scatter(x=loss_df['Epoch Iteration'], y=loss_df["Bidirectional LSTM Model (Cross Validation)"], 
                name="Bidirectional LSTM Model (Cross Validation)", legendgroup='group4', line_color='red', showlegend = False), 
    row=1, col=2
)

fig.update_layout(height=600, width=1000, title_text="Accuracy & Loss per Epoch Iterations")
fig.show()

In [189]:
X_test_3dim = np.reshape(np.array(X_test), newshape=(X_test.shape[0], 1, X_test.shape[1]))
print("Transformed X_test Dimensions:          ", X_test_3dim.shape)

y_pred_BLSTM = np.array(list(map(lambda x: float(x+1), # Increment output to match category as np.argmax return position of one-hot array
                        np.argmax(bi_lstm_model.predict(X_test_3dim), axis=-1))))
print(y_pred_BLSTM.shape)

#Naive Bayes fit & predict output
NB_clf.fit(X_train, y_train)
y_pred_NB = NB_clf.predict(X_test)

#Logistic Regression fit & predict output
LR_clf.fit(X_train, y_train)
y_pred_LR = LR_clf.predict(X_test)

Transformed X_test Dimensions:           (10000, 1, 300)
(10000,)


In [169]:
def export_pred_to_csv(y_pred, fname):
    pd.DataFrame(zip(np.arange(1, len(y_pred)+1), y_pred), columns=["id", "duration_label"]).to_csv("{}".format(fname), header=True, index=False)

In [190]:
target_names =['1.0', '2.0', '3.0']
y_cv_pred_LR = cross_val_predict(LR_clf, X_train, y_train, cv=5)
y_cv_pred_NB = cross_val_predict(NB_clf, X_train, y_train, cv=5)

print("------- Logistic Regression Classifier Report -------")
# print("Accuracy Score: ", accuracy_score(y_train.astype(str), y_pred_NB.astype(str)))
print(classification_report(y_train.astype(str), y_cv_pred_LR.astype(str), target_names=target_names, digits=4))
print("----------- Naive Bayes Classifier Report -----------")
# print("Accuracy Score: ", accuracy_score(y_train.astype(str), y_pred_LR.astype(str)))
print(classification_report(y_train.astype(str), y_cv_pred_NB.astype(str), target_names=target_names, digits=4))

------- Logistic Regression Classifier Report -------
              precision    recall  f1-score   support

         1.0     0.7656    0.8232    0.7934     17705
         2.0     0.8271    0.7867    0.8064     20246
         3.0     0.7644    0.6364    0.6945      2049

    accuracy                         0.7952     40000
   macro avg     0.7857    0.7488    0.7648     40000
weighted avg     0.7967    0.7952    0.7949     40000

----------- Naive Bayes Classifier Report -----------
              precision    recall  f1-score   support

         1.0     0.6200    0.8801    0.7275     17705
         2.0     0.8610    0.4563    0.5965     20246
         3.0     0.3090    0.6237    0.4133      2049

    accuracy                         0.6525     40000
   macro avg     0.5966    0.6534    0.5791     40000
weighted avg     0.7260    0.6525    0.6451     40000



In [None]:
#Split the data for Classificaiton report purposes
X_train_cr, X_test_cr, y_train_cr, y_test_cr = train_test_split(X_train, y_train, test_size=0.3, random_state=SEED_NO)

# #Stacking all Vectorized feature and reshape them into  3-D Array
X_train_3dim = np.reshape(np.array(X_train_cr), newshape=(X_train_cr.shape[0], 1, X_train_cr.shape[1]))
X_test_3dim = np.reshape(np.array(X_test_cr), newshape=(X_test_cr.shape[0], 1, X_test_cr.shape[1]))
print("Transformed X_train Dimensions:          ", X_train_3dim.shape)

# One-hot Encoding for Y_train
y_train_le = LabelEncoder().fit_transform(y_train_cr)
y_train_onehot = utils.to_categorical(y_train_le)
print("One-hot Encoding Dimensions for y_train: ", y_train_onehot.shape)

# Training Bidirectional BLSTM Model
bi_lstm_model, bi_lstm_model_history = Bidirectional_LSTM_clf(X_train_3dim, y_train_onehot, 10, 128)

In [None]:
print("------- Bidirectional LSTM Classifier Report -------")
y_pred_BLSTM = np.array(list(map(lambda x: float(x+1), # Increment output to match category as np.argmax return position of one-hot array
                        np.argmax(bi_lstm_model.predict(X_test_3dim), axis=-1))))
print(classification_report(y_test_lstm.astype(str), y_pred_BLSTM.astype(str), target_names=target_names, digits=4))

In [None]:
target_names =['1.0', '2.0', '3.0']

#Naive Bayes fit & predict output
NB_clf.fit(X_train_cr, y_train_cr)
y_pred_NB = NB_clf.predict(X_test_cr)

#Logistic Regression fit & predict output
LR_clf.fit(X_train_cr, y_train_cr)
y_pred_LR = LR_clf.predict(X_test_cr)

print("------- Logistic Regression Classifier Report -------")
# print("Accuracy Score: ", accuracy_score(y_train.astype(str), y_pred_NB.astype(str)))
print(classification_report(y_train.astype(str), y_cv_pred_LR.astype(str), target_names=target_names, digits=4))
print("----------- Naive Bayes Classifier Report -----------")
# print("Accuracy Score: ", accuracy_score(y_train.astype(str), y_pred_LR.astype(str)))
print(classification_report(y_train.astype(str), y_cv_pred_NB.astype(str), target_names=target_names, digits=4))

## Export prediction labels to CSV

In [None]:
def export_pred_to_csv(y_pred, fname):
    pd.DataFrame(zip(np.arange(1, len(y_pred)+1), y_pred), columns=["id", "duration_label"]).to_csv("{}".format(fname), header=True, index=False)

In [None]:
export_pred_to_csv(y_pred_BLSTM, "BLSTM_y_pred_chi100.csv")
export_pred_to_csv(y_pred_NB, "NB_y_pred_chi100.csv")
export_pred_to_csv(y_pred_LR, "LR_y_pred_chi100.csv")

# Miscellaneous Code -- PLEASE IGNORE BELOW (For Competition purposes only)

In [129]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import StackingClassifier, AdaBoostClassifier
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier

# mlpc_clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train)

In [57]:
y_pred_mlpc = mlpc_clf.predict(X_test_cr)
accuracy_score(y_pred_mlpc, y_test_cr)

0.9803333333333333

In [152]:
# export_pred_to_csv(bi_lstm_model.predict(X_test), "Spare_y_pred_comp.csv")

In [133]:
# estimators = [
#     ('rf', RandomForestClassifier(n_estimators=100, random_state=SEED_NO)),
#     ('gnb', naive_bayes.GaussianNB()),
#     ('knn', KNeighborsClassifier()),
#     ('dt',  DecisionTreeClassifier()),
#     # ('qda', QuadraticDiscriminantAnalysis())
# ]
# sc_clf = StackingClassifier(
#      estimators=estimators, final_estimator=linear_model.LogisticRegression()
# )

In [134]:
sc_clf.fit(X_train_cr, y_train_cr).score(X_test_cr, y_test_cr)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8011666666666667

In [135]:
export_pred_to_csv(sc_clf.predict(X_test), "Spare_y_pred_comp.csv")