<a href="https://colab.research.google.com/github/Mozzer2310/COMP34711-Deep-Learning/blob/main/task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import glob
import string
from nltk.tokenize import word_tokenize
import tensorflow as tf
import numpy as np


class NeuralNetwork:
    """Sentiment-classification pipeline for annotated product-review files.

    The pipeline reads the raw review corpora, turns every annotated line into
    a cleaned, tokenised review string with a binary label (1 = positive,
    0 = negative), and trains/evaluates a bidirectional LSTM on the result.

    Attributes are filled in by `preprocess`:
        vocab: set of unique tokens over all kept reviews.
        reviews: list of processed review strings (tokens joined by spaces).
        classification: list of 0/1 labels, parallel to `reviews`.
    """

    # Translation table that deletes every ASCII punctuation character.
    _PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

    def __init__(self) -> None:
        self.vocab = set()
        self.reviews = []
        self.classification = []

    def read_data(self, path: str) -> list:
        """Read every review file in a directory.

        Args:
            path: directory containing the ``.txt`` review files.

        Returns:
            A list with the full text of each ``.txt`` file found directly in
            `path`, excluding ``README.txt`` (if one is present).
        """
        # Local import: `os` is not among this file's top-level imports.
        import os

        # Filter on the base name so a missing README (or a different path
        # separator in the glob results) does not raise. Sort because glob
        # returns files in arbitrary order and `test` splits positionally.
        file_paths = sorted(
            file_path for file_path in glob.glob(path + "/*.txt")
            if os.path.basename(file_path) != "README.txt")

        corpora = []
        for file_path in file_paths:
            # Context manager guarantees the handle is closed after reading
            with open(file_path, "r") as f:
                corpora.append(f.read())

        return corpora

    def preprocess(self, corpora: list) -> None:
        """Rebuild `reviews`, `classification` and `vocab` from raw corpora.

        Args:
            corpora: list of raw corpus strings, e.g. as returned by
                `read_data`.
        """
        self.reviews = []
        self.classification = []
        # process the raw data of each corpus in the list
        for corpus in corpora:
            self.process_raw(corpus)

        # Every space-separated token of every kept review forms the vocab
        self.vocab = {
            token for review in self.reviews for token in review.split(" ")}

    @staticmethod
    def _polarity_score(review_info: str, sign: str) -> int:
        """Return the weighted count of `sign` ("+" or "-") annotations.

        NOTE: the bare-sign count also matches the sign inside the numbered
        tags (so "+3" contributes 3 + 1), and any other occurrence of the
        character in `review_info` counts as 1.
        """
        return (3 * review_info.count(sign + "3")
                + 2 * review_info.count(sign + "2")
                + review_info.count(sign + "1")
                + review_info.count(sign))

    def process_raw(self, raw: str) -> None:
        """Process one raw corpus, appending to `reviews`/`classification`.

        Lines that are empty, equal to ``[t]``, carry no annotation before
        the ``##`` delimiter, or whose positive and negative scores tie are
        skipped.

        Args:
            raw: the full text of one review file.
        """
        for line in raw.splitlines():
            # Skip blank lines and bare '[t]' tags
            if not line or line == "[t]":
                continue

            processed_review, review_info = self.process_line(line)
            # Only consider reviews which have been classified in the text file
            if not review_info:
                continue

            # Weighted scores decide mixed reviews: the review is labelled by
            # whichever polarity is 'stronger'
            num_pos = self._polarity_score(review_info, "+")
            num_neg = self._polarity_score(review_info, "-")
            if num_pos == num_neg:
                # Tie: cannot be classified either way, so drop the review
                continue

            # 1 for positive and 0 for negative review
            self.classification.append(1 if num_pos > num_neg else 0)
            self.reviews.append(processed_review)

    @classmethod
    def _normalise_text(cls, text: str) -> str:
        """Lower-case `text` and delete all ASCII punctuation characters."""
        return text.lower().translate(cls._PUNCTUATION_TABLE)

    def process_line(self, line: str):
        """Split one corpus line into its cleaned review and annotation.

        Args:
            line: a single line of a review file, expected to look like
                ``<annotation>##<review text>``.

        Returns:
            ``(tokens, review_info)`` where `tokens` is the lower-cased,
            punctuation-free review re-joined from its word tokens with single
            spaces and `review_info` is the text before the first ``##``.
            If the line has no ``##`` delimiter, ``([], "")`` is returned so
            callers can detect and skip it.
        """
        # Text before the first '##' is the class information, text after it
        # is the review itself
        review_info, delimiter, review = line.partition("##")
        if not delimiter:
            return [], ""

        # The cleaned text must be the value that is tokenised; previously the
        # result of translate() was discarded, so punctuation was never removed
        line_clean = self._normalise_text(review)
        # Tokenize the review
        line_tokens = " ".join(word_tokenize(line_clean))

        return line_tokens, review_info

    def test(self):
        """Train the LSTM on the first 1800 reviews and evaluate on the rest.

        Reviews are hash-encoded with Keras `one_hot`, padded to the longest
        review, and fed to an Embedding + stacked bidirectional LSTM model
        that outputs a single logit. Prints the padded sequence length and
        the test accuracy.
        """
        train_list_reviews = self.reviews[:1800]
        train_list_class = self.classification[:1800]
        test_list_reviews = self.reviews[1800:]
        test_list_class = self.classification[1800:]

        BATCH_SIZE = 32

        ###################################
        vocab_size = len(self.vocab)
        encoded_train_reviews = [tf.keras.preprocessing.text.one_hot(d, vocab_size) for d in train_list_reviews]
        encoded_test_reviews = [tf.keras.preprocessing.text.one_hot(d, vocab_size) for d in test_list_reviews]
        # print(f'encoded reviews: {encoded_train_reviews}')
        # Pad everything to the longest review across both splits
        max_length = max([len(sublist) for sublist in encoded_train_reviews + encoded_test_reviews])
        print(max_length)
        train_padded = tf.keras.preprocessing.sequence.pad_sequences(encoded_train_reviews, maxlen=max_length, padding='post')
        # print(train_padded)
        train_labels = np.array(train_list_class)
        test_padded = tf.keras.preprocessing.sequence.pad_sequences(encoded_test_reviews, maxlen=max_length, padding='post')
        # print(train_padded)
        test_labels = np.array(test_list_class)
        num_words = vocab_size
        maxlen = max_length

        #### Using Tokenizer ####
        # num_words = 1000
        # oov_token = '<UNK>'
        # pad_type = 'post'
        # trunc_type = 'post'

        # # Tokenize training data
        # tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_words, oov_token=oov_token)
        # tokenizer.fit_on_texts(train_list_reviews)

        # # Get training data word index
        # word_index = tokenizer.word_index

        # # Encode training data sentences into sequences
        # train_sequences = tokenizer.texts_to_sequences(train_list_reviews)

        # # Get max training sequence length
        # maxlen = max([len(x) for x in train_sequences])

        # # Pad the training sequences
        # train_padded = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, padding=pad_type, truncating=trunc_type)

        # # Convert training labels to numpy array
        # train_labels = np.array(train_list_class)

        # # Tokenize the test data
        # test_sequences = tokenizer.texts_to_sequences(test_list_reviews)
        # test_padded = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, padding=pad_type, truncating=trunc_type, maxlen=maxlen)

        # # Convert test labels to numpy array
        # test_labels = np.array(test_list_class)

        # Performs well
        model = tf.keras.Sequential([
            tf.keras.layers.Embedding(
                input_dim=num_words,
                output_dim=64,
                input_length=maxlen),
            tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
            tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dropout(0.5),
            # No activation: the model emits a raw logit (see from_logits below)
            tf.keras.layers.Dense(1)

        ])

        # Performs badly
        # model = tf.keras.Sequential([
        #     tf.keras.layers.Embedding(
        #         input_dim=num_words,
        #         output_dim=32,
        #         input_length=maxlen),
        #     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, recurrent_dropout=0.2)),
        #     # tf.keras.layers.Dense(64, activation='relu'),
        #     tf.keras.layers.Dense(1, activation="sigmoid")
        # ])

        model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                      optimizer=tf.keras.optimizers.Adam(1e-4),
                      metrics=['accuracy'])

        model.fit(
            train_padded,
            train_labels,
            epochs=20,
            batch_size=BATCH_SIZE)

        test_loss, test_acc = model.evaluate(test_padded, test_labels)
        print(test_acc)

    def nfold_cv(self, n: int = 5):
        """Split the stored reviews by class and print both groups.

        Args:
            n: intended number of folds; not used yet, as this method
                currently only performs the per-class split.
        """
        labelled_reviews = list(zip(self.reviews, self.classification))
        pos_reviews = [review for review, label in labelled_reviews if label == 1]
        neg_reviews = [review for review, label in labelled_reviews if label == 0]
        print(pos_reviews)
        print(neg_reviews)


In [20]:
def main():
    """Load the product-review corpora, preprocess them and run `nfold_cv`."""
    # Directory that holds the review .txt files
    reviews_dir = "product_reviews"

    network = NeuralNetwork()
    network.preprocess(network.read_data(reviews_dir))

    # Debugging aids for inspecting the preprocessed data:
    # print(network.vocab)
    # print(len(network.reviews))
    # print(len(network.classification))
    # print(network.classification.count(1))
    # print(network.classification.count(0))
    # print(network.classification.count(1)//5)
    # print(network.classification.count(0)//5)

    network.nfold_cv()
    # Alternative entry point: train and evaluate the model instead
    # network.test()


# Run the pipeline when this cell executes. main() has no return statement,
# so `test` is bound to None.
test = main()


77
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
0.7142857313156128
