<a href="https://colab.research.google.com/github/Mozzer2310/COMP34711-Deep-Learning/blob/main/task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import glob
import tensorflow as tf
import numpy as np


class NeuralNetwork:

    def __init__(self) -> None:
        self.vocab = set()
        self.reviews = []
        self.classification = []

    def read_data(self, path: str) -> list:
        # Find all the .txt files at the path, remove the README from the list
        file_paths = glob.glob(path + "/*.txt")
        file_paths.remove(path + "/README.txt")

        corpora = []
        # Read each file in the list of files
        for file_path in file_paths:
            f = open(file_path, "r")
            # Add the data to an array of corpora
            corpora.append(f.read())

        return corpora

    def preprocess(self, corpora: list):
        self.reviews = []
        self.classification = []
        # process the raw data of each corpus in the list
        for corpus in corpora:
            self.process_raw(corpus)

    def process_raw(self, raw: str):
        # split over the lines (## defines a line and is on each new line as defined by README)
        lines = raw.splitlines()
        # remove '[t]' tags
        lines = [ele for ele in lines if ele != "[t]"]

        # process each line in the text, add the result to an array and add review class to an array
        for line in lines:
            # Check that the line isn't empty
            if len(line) != 0:
                # Process the line, get returned processed line and its review info for classifying
                processed_review, review_info = self.process_line(line)
                # Only consider reviews which can be classified, i.e. have been classified in text file
                if len(review_info) != 0:
                    # Consider weights of reviews, in the case that a review is part positive and part negative
                    # the weights will help when classifying a review if it is 'more' postive than negative, and vice versa
                    num_pos = 3 * review_info.count("+3") + 2 * review_info.count(
                        "+2") + review_info.count("+1") + review_info.count("+")
                    num_neg = 3 * review_info.count("-3") + 2 * review_info.count(
                        "-2") + review_info.count("-1") + review_info.count("-")
                    # 1 for postive and 0 for negative review, add to list
                    if num_pos > num_neg:
                        self.classification.append(1)
                        # add the review to an array
                        self.reviews.append(processed_review)
                    elif num_pos < num_neg:
                        self.classification.append(0)
                        # add the review to an array
                        self.reviews.append(processed_review)

    def process_line(self, line: str):
        # Get the substring before the ## delimiter, if not present return empty values for error handling
        try:
            delim_index = line.index("##")
        except ValueError:
            delim_index = None
        if delim_index == None:
            return [], ""
        # sub-string before the delimiter is the information about the class of review
        review_info = line[:delim_index]
        # sub-string after the delimiter is the review
        line = line[delim_index+2:]

        return line, review_info

    def test(self):
        train_list_reviews = self.reviews[:1800]
        train_list_class = self.classification[:1800]
        test_list_reviews = self.reviews[1800:]
        test_list_class = self.classification[1800:]

        train_dataset = tf.data.Dataset.from_tensor_slices(
            (train_list_reviews, train_list_class))
        test_dataset = tf.data.Dataset.from_tensor_slices(
            (test_list_reviews, test_list_class))
        print(train_dataset)
        print(test_dataset)

        BUFFER_SIZE = 10000
        BATCH_SIZE = 32
        train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(
            BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
        test_dataset = test_dataset.batch(
            BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

        # for example, label in train_dataset.take(1):
        #     print('texts: ', example.numpy()[:3])
        #     print()
        #     print('labels: ', label.numpy()[:3])

        VOCAB_SIZE = 5000
        encoder = tf.keras.layers.TextVectorization(
            max_tokens=VOCAB_SIZE)
        encoder.adapt(train_dataset.map(lambda text, label: text))

        vocab = np.array(encoder.get_vocabulary())
        print(vocab[:20])

        model = tf.keras.Sequential([
            encoder,
            tf.keras.layers.Embedding(
                input_dim=len(encoder.get_vocabulary()),
                output_dim=32,
                # Use masking to handle the variable sequence lengths
                mask_zero=True),
            tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dense(1)
        ])

        # model = tf.keras.Sequential([
        #     encoder,
        #     tf.keras.layers.Embedding(len(encoder.get_vocabulary()), 32, mask_zero=True),
        #     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32,  return_sequences=True)),
        #     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
        #     tf.keras.layers.Dense(32, activation='relu'),
        #     tf.keras.layers.Dropout(0.5),
        #     tf.keras.layers.Dense(1)
        # ])

        model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                      optimizer=tf.keras.optimizers.Adam(1e-4),
                      metrics=['accuracy'])

        history = model.fit(train_dataset, epochs=10,
                            validation_data=test_dataset,
                            validation_steps=10)

        test_loss, test_acc = model.evaluate(test_dataset)
        print(test_acc)

        # positive = ("The arm band is fantastic and it doesn't budge even at the gym.")
        # negative = ("This router was a huge disapointment.")
        # predictions = model.predict(np.array([positive, negative]))
        # print(predictions)

    def nfold_cv(self, n: int = 5):
        pos_inds = np.where(np.array(self.classification) == 1)
        neg_inds = np.where(np.array(self.classification) == 0)

        pos_reviews = list(np.array(self.reviews)[pos_inds])
        neg_reviews = list(np.array(self.reviews)[neg_inds])
        print(pos_reviews)
        print(neg_reviews)


2022-12-07 17:59:27.854619: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-07 17:59:28.434090: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/lib
2022-12-07 17:59:28.434144: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/lib


In [2]:
def main():
    neural = NeuralNetwork()
    # specify the directory path to the review files
    corpora = neural.read_data("product_reviews")

    neural.preprocess(corpora)
    print(len(neural.reviews))
    print(len(neural.classification))
    print(neural.classification.count(1))
    print(neural.classification.count(0))
    print(neural.classification.count(1)//5)
    print(neural.classification.count(0)//5)
    # neural.nfold_cv()
    neural.test()


test = main()


2094
2094
1351
743
270
148
<TensorSliceDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int32, name=None))>
<TensorSliceDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int32, name=None))>
Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
['' '[UNK]' 'the' 'and' 'to' 'i' 'a' 'it' 'is' 'of' 'this' 'with' 'you'
 'for' 'that' 'in' 'have' 'but' 'my' 'not']


2022-12-07 17:59:29.059831: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-07 17:59:29.077817: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/lib
2022-12-07 17:59:29.077837: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-12-07 17:59:29.078135: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary 

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.704081654548645
