In [None]:
import sys
# adding to the path variables the one folder higher (locally, not changing system variables)
sys.path.append("..")

import numpy as np
import pandas as pd
import datetime

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Dense, Embedding, Conv1D, GlobalMaxPooling1D, Dropout, LSTM
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import plot_model
from tensorflow.keras.metrics import Precision

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from mlxtend.plotting import plot_confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
import seaborn as sns

print(tf.__version__)

# ignore the warnings
import warnings
warnings.filterwarnings("ignore")

# set Randomseed
RSEED = 42

# import needed functions
from scripts.processing import *

# Load the TensorBoard notebook extension
%load_ext tensorboard



In [None]:
# --- Text preprocessing / embedding hyper-parameters ---

# Vocabulary cap: only this many of the most frequent words are kept.
vocab_size = 100000

# Length of each dense embedding vector.
embedding_dim = 128

# Number of tokens every review is padded or truncated to.
max_length = 400

# Token substituted for out-of-vocabulary words.
oov_tok = '<OOV>'

# Both padding and truncation happen at the end of a sequence.
padding_type = 'post'
trunc_type = 'post'

In [None]:
# Load only the first 500k reviews to keep test runs fast. Passing `nrows`
# stops the parser after that many rows instead of reading the entire CSV and
# slicing afterwards, and it yields a standalone frame rather than a slice of
# a larger one (columns are added to `dfr` later in the notebook).
n_reviews = 500000
dfr = pd.read_csv('../data/yelp_dataset/review_1819.csv', nrows=n_reviews)


In [None]:
# Narrow the review frame down to the text, the `useful` column and the stars.
selected_columns = ['text', 'useful', 'stars']
dataset = dfr[selected_columns]

In [None]:
# Filter for English reviews via the project helper, then discard the
# 'language' column that was only needed for that filtering.
# NOTE(review): the helper is fed the full `dfr`, so the column-reduced
# `dataset` built in the previous cell is overwritten without being used —
# confirm which input is intended.
dataset = language_processing(dfr)
dataset.drop(columns=['language'], inplace=True)

# Run the text-cleaning helper over every review.
dataset["text"] = dataset["text"].apply(clean_text)

In [None]:
# Feature: the (cleaned) review text.
review = dataset["text"].values

# Target: 1 when `useful` is greater than zero, otherwise 0. A vectorised
# comparison replaces the per-row Python lambda and gives the same int64 labels.
target = (dataset["useful"] > 0).astype("int64").values

# 80/20 train/test split, seeded with the notebook-wide RSEED instead of a
# second hard-coded 42 so the seed only has to be changed in one place.
X_train, X_test, y_train, y_test = train_test_split(
    review, target, test_size=0.20, random_state=RSEED
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
# Build the vocabulary from the training texts only; the test texts are not
# used for fitting. The OOV placeholder comes from the `oov_tok` config
# constant rather than a duplicated literal, so the setting has one source.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

# Mapping word -> integer index learned by the tokenizer.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Turn each text into a sequence of word indices and pad/truncate every
# sequence to `max_length` so all rows have the same length.
train_seq = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)

test_seq = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)

print('Shape of train tensor:', train_padded.shape)
# This is the held-out test set; the validation data is split off the training
# tensor inside model.fit, so the label says "test" rather than "validation".
print('Shape of test tensor:', test_padded.shape)

In [None]:
# Label arrays for training and testing are the split targets, under the
# names the later cells use.
training_labels, test_labels = y_train, y_test

In [None]:
# Binary classifier over padded token sequences:
# embedding -> 1D convolution -> LSTM -> dense layer with dropout -> sigmoid.
# (Alternative stacks previously kept here as commented-out code: Conv1D with
# GlobalMaxPooling1D, a plain LSTM(128) with smaller dense layers, and a
# Bidirectional LSTM(64).)
model = Sequential([
    # Input length is taken from the padded training tensor.
    Embedding(vocab_size, embedding_dim, input_length=train_padded.shape[1]),
    Conv1D(48, 5, activation='relu', padding='valid'),
    LSTM(64),
    Dense(256, activation='relu'),
    Dropout(0.5),
    # Single sigmoid unit: one score in [0, 1] per review.
    Dense(1, activation='sigmoid'),
])

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# TensorBoard logging: each run writes to its own timestamped directory.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "../logs/new/" + run_stamp
tensorboard_callback = tf.keras.callbacks.TensorBoard(histogram_freq=1, log_dir=log_dir)

# Checkpoint callback that stores the model weights only (not the full model)
# at a fixed path.
checkpoint_path = "../training/model_4/cp.ckpt"
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
)

In [None]:
# Training settings.
epochs = 4
batch_size = 128

# Fit on the padded training sequences; 20% of them are held back for
# validation, and both callbacks run during training.
history = model.fit(
    train_padded,
    training_labels,
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    validation_split=0.2,
    callbacks=[tensorboard_callback, cp_callback],
)

# Persist the trained model.
model.save('../saved_model/model_4')

In [None]:
# Score the trained model on the held-out test tensors.
print("Evaluate on test data")
results = model.evaluate(x=test_padded, y=test_labels, batch_size=128)

In [None]:
# Threshold the predicted scores at 0.5 to get hard 0/1 labels, and flatten
# the model output to 1-D so it has the same shape as `test_labels`.
pred_labels = (model.predict(test_padded) > 0.5).astype('int32').ravel()

# Confusion matrix on the test set. tf.math.confusion_matrix puts the true
# labels on the rows and the predictions on the columns; the axes are labelled
# so the figure can be read on its own.
fig, ax = plt.subplots()
sns.heatmap(tf.math.confusion_matrix(test_labels, pred_labels), annot=True, fmt='g', ax=ax)
ax.set(xlabel='Predicted label', ylabel='True label', title='Confusion matrix (test set)');

In [None]:
# Show the classification report. scikit-learn expects the ground truth first
# and the predictions second; with the arguments swapped, precision and recall
# are reported for the wrong roles.
print(classification_report(test_labels, pred_labels))

In [None]:
# Encode and pad every review (not just one split) with the fitted tokenizer,
# using the same padding settings as for the train/test tensors.
text_seq = tokenizer.texts_to_sequences(review)
text_padded = pad_sequences(
    text_seq,
    maxlen=max_length,
    truncating=trunc_type,
    padding=padding_type,
)

In [None]:
# Score every review with the trained model and store the score as a column.
# The prediction array is flattened to 1-D explicitly instead of relying on
# pandas to accept a single-column 2-D array for a column assignment.
# NOTE(review): `text_padded` covers all reviews, so this includes the ones
# the model was trained on.
dataset['helpfulness'] = model.predict(text_padded).ravel()

In [None]:
# Reviews ordered from lowest to highest predicted helpfulness.
dataset.sort_values(by='helpfulness', ascending=True)

In [None]:
# Show the text of the review with the highest predicted helpfulness.
# The row is selected by score rather than by a hard-coded index label: a fixed
# label such as 1414387 is not expected to exist when only the first 500k rows
# are loaded, and would raise a KeyError on a fresh run.
dataset.loc[dataset['helpfulness'].idxmax(), 'text']

In [None]:
# Copy the predicted scores onto the original review frame. The assignment is
# aligned on the index, so rows of `dfr` that have no counterpart in `dataset`
# end up with NaN in 'helpfulness'.
dfr['helpfulness'] = dataset['helpfulness']

In [None]:
# Keep only the reviews that received a helpfulness score. Restricting dropna
# to that column avoids silently discarding scored reviews that merely have a
# missing value in some unrelated column.
dfr_ea = dfr.dropna(subset=['helpfulness'])

In [None]:
# Inspect a single scored review by its index label.
# NOTE(review): the label is hard-coded; it raises a KeyError if that row was
# dropped by the earlier filtering — confirm it exists for the current subset.
dfr_ea.loc[170477]

In [None]:
# All scored reviews of one business, ordered by ascending predicted helpfulness.
(
    dfr_ea
    .query('business_id == "toRNyzwkG59NYJP2ti-qTQ"')
    .sort_values(by='helpfulness')
)