In [None]:
import sys
# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")

import numpy as np
import pandas as pd
import datetime

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM, Input, Layer, BatchNormalization, Bidirectional, TextVectorization
import tensorflow.keras.backend as K
from tensorflow.keras.utils import plot_model

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from mlxtend.plotting import plot_confusion_matrix
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

# Report the TensorFlow version this notebook runs against.
print(tf.__version__)

# ignore the warnings
import warnings
warnings.filterwarnings("ignore")

# set Randomseed
# RSEED is reused for the train/test split; seeding NumPy and TensorFlow here
# additionally makes weight initialisation, dropout and shuffling repeatable
# (as far as the hardware backend allows) across "Restart & Run All".
RSEED = 42
np.random.seed(RSEED)
tf.random.set_seed(RSEED)

# import needed functions
#from scripts.processing import *

# Load the TensorBoard notebook extension
# (IPython line magic: only valid when this cell runs inside a Jupyter/IPython kernel)
%load_ext tensorboard

In [None]:
# Upper bound on the vocabulary size; passed to the vectorizer as max_tokens
# and to the embedding layer as its input dimension.
VOCAB_SIZE = 1_000_000

# Width of each dense embedding vector.
EMBEDDING_DIM = 128

# Length of the token sequence produced for each review
# (the vectorizer's output_sequence_length).
MAX_LENGTH = 200

# Training schedule for the network.
EPOCHS, BATCH_SIZE = 2, 512

In [None]:
# Read the review data from CSV into a DataFrame.
# TODO change dfr to dataset when csv clean
#      (presumably the "_preprocessed_final" variant of this file; confirm)
df = pd.read_csv(filepath_or_buffer='../data/review_1819_eng.csv')
# dfr = dfr[DFR_START:DFR_END]

In [None]:
# Feature: the review text, kept as a one-column DataFrame.
review = df.loc[:, ['text']]
# Target: 1 where `useful` is greater than 0, otherwise 0.
# TODO delete this binarisation after changes in csv
target = df["useful"].gt(0).astype("int64").values


In [None]:
# Hold out 20% of the rows as the test set; RSEED keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    review,
    target,
    test_size=0.20,
    random_state=RSEED,
)
# Sanity check: feature/label shapes of the train split, then of the test split.
print(*(part.shape for part in (X_train, y_train)))
print(*(part.shape for part in (X_test, y_test)))

In [None]:
# Text -> integer-id layer: lower-cases and strips punctuation, builds
# n-grams (ngrams=3), caps the vocabulary at VOCAB_SIZE entries and emits
# integer sequences of length MAX_LENGTH.
vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize='lower_and_strip_punctuation',
    ngrams=3,
    output_mode='int',
    output_sequence_length=MAX_LENGTH,
)

# Build the vocabulary from the training reviews only.
vectorize_layer.adapt(X_train)

In [None]:
# build model architecture
# Input is one raw string per sample; the adapted vectorizer is part of the
# model graph, so the model consumes text directly.
text_in = Input(shape=(1,), dtype=tf.string)
vector = vectorize_layer(text_in)
# Map token ids to trainable vectors of size EMBEDDING_DIM.
embedding = Embedding(VOCAB_SIZE, EMBEDDING_DIM)(vector)
# Two stacked bidirectional LSTMs with dropout in between. The first sets
# return_sequences=True so the second receives the full sequence; the second
# does not set return_sequences.
text_bidir_lstm_1 = Bidirectional(LSTM(128, return_sequences=True))(embedding)
text_dropout_1 = Dropout(0.5)(text_bidir_lstm_1)
text_bidir_lstm_2 = Bidirectional(LSTM(128))(text_dropout_1)
text_dropout_2 = Dropout(0.5)(text_bidir_lstm_2)
text_dense = Dense(64, activation='relu')(text_dropout_2)
text_out = Dropout(0.5)(text_dense)

# Dense head on top of the text branch (128 -> 32 units, dropout after each).
dense_1 = Dense(128, activation='relu')(text_out)
dropout_1 = Dropout(0.5)(dense_1)
dense_2 = Dense(32, activation='relu')(dropout_1)
dropout_2 = Dropout(0.5)(dense_2)

# Single sigmoid output unit, matching the binary 0/1 target.
out = Dense(1, activation='sigmoid')(dropout_2)
model = Model(inputs=[text_in], outputs=[out])

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the tracked metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# TensorBoard logging: every run writes into its own timestamped directory.
# TODO get tensorboard working again
#      (as written, the training cell only passes cp_callback to model.fit)
log_dir = "../logs/new/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    histogram_freq=1,
    log_dir=log_dir,
)

# Checkpointing: save the model weights only, always to the same path.
# TODO save several checkpoints
checkpoint_path = "../training/model_extended_3_no_attention/cp.ckpt"
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path,
    verbose=1,
    save_weights_only=True,
)

In [None]:
# Train on the training split, reserving 20% of it for validation and
# checkpointing the weights through cp_callback.
history = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    shuffle=True,
    validation_split=0.2,
    callbacks=[cp_callback],
)

# Persist the trained model to disk.
model.save('../saved_model/model')

In [None]:
# Score the test set and threshold the sigmoid outputs at 0.5 to get 0/1 labels.
y_pred = np.greater(model.predict(X_test), 0.5).astype('int32')

In [None]:
# show the classification report (per-class precision / recall / F1 on the test set)
print(classification_report(y_test, y_pred))

# show confusion matrix as an annotated heatmap.
# tf.math.confusion_matrix puts the true labels on the rows and the
# predictions on the columns, so the axes are labelled accordingly.
# `linewidths` is the documented seaborn.heatmap parameter for the cell borders.
# NOTE(review): vmax=45000 is a fixed colour-scale ceiling, presumably chosen
# for this dataset's size; confirm it still fits if the data changes.
ax = sns.heatmap(
    tf.math.confusion_matrix(y_test, y_pred),
    annot=True,
    fmt='g',
    cmap='viridis_r',
    linewidths=0.01,
    linecolor='k',
    vmin=0,
    vmax=45000,
)
ax.set(xlabel='Predicted label', ylabel='True label', title='Confusion matrix (test set)');

In [None]:
# Render the layer graph, including tensor shapes, to a PNG file.
plot_model(model, to_file="multi_input_and_output_model_2.png", show_shapes=True)