# In [1]:
import glob
import os
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

class NeuralNetwork:
    """Load annotated product-review files and turn them into labelled token lists.

    After ``preprocess`` has run, ``reviews[i]`` is the list of cleaned tokens
    of the i-th usable review line and ``classification[i]`` is its label
    (1 = positive, 0 = negative or tied).
    """

    # An opening bracket followed by a sign and an optional strength, e.g.
    # "[+2]", "[-3]", "[+]". Anchoring on "[" keeps hyphens inside feature
    # names ("auto-focus[+2]") from being mistaken for negative tags.
    _SENTIMENT_TAG = re.compile(r"\[([+-])(\d*)")
    # Everything that is not a lower-case letter, digit or whitespace.
    _NON_ALNUM = re.compile(r"[^a-z0-9\s]+")
    # Stop words not in the nltk list; these would appear in a top-50 word
    # list otherwise.
    _EXTRA_STOP_WORDS = ("ive", "im")
    # Lazily built stop-word set (see ``_get_stop_words``); None until first use.
    _stop_words = None

    def __init__(self) -> None:
        self.vocab = set()
        self.reviews = []
        self.classification = []

    def read_data(self, path: str) -> list:
        """Return the contents of every ``.txt`` file directly inside *path*.

        A file named ``README.txt`` is skipped if present. Files are read in
        sorted path order so repeated runs yield the same corpus order.
        """
        file_paths = sorted(
            file_path
            for file_path in glob.glob(os.path.join(path, "*.txt"))
            # Compare base names so a missing README or a different path
            # separator does not raise.
            if os.path.basename(file_path) != "README.txt"
        )

        corpora = []
        for file_path in file_paths:
            # The context manager closes each handle as soon as it is read.
            with open(file_path, "r") as f:
                corpora.append(f.read())

        return corpora

    def preprocess(self, corpora: list):
        """Reset ``reviews``/``classification`` and rebuild them from *corpora*.

        *corpora* is a list of raw file contents as returned by ``read_data``.
        """
        self.reviews = []
        self.classification = []
        # process the raw data of each corpus in the list
        for corpus in corpora:
            self.process_raw(corpus)

    def process_raw(self, raw: str):
        """Process one raw corpus, appending to ``reviews`` and ``classification``.

        Empty lines, lines that are exactly ``[t]``, and lines whose annotation
        part (the text before ``##``) is empty or missing are skipped. Every
        remaining line adds one token list and one label: 1 when its positive
        weight exceeds its negative weight, otherwise 0 (ties included).
        """
        for line in raw.splitlines():
            if not line or line == "[t]":
                continue

            processed_review, review_info = self.process_line(line)
            # Only consider reviews which have been annotated in the text file.
            if not review_info:
                continue

            num_pos, num_neg = self._sentiment_weights(review_info)
            self.classification.append(1 if num_pos > num_neg else 0)
            self.reviews.append(processed_review)

    def process_line(self, line: str) -> tuple[list, str]:
        """Split *line* at the first ``##`` and clean the review text.

        Returns ``(tokens, review_info)``: *review_info* is the text before the
        delimiter and *tokens* is the text after it, lower-cased, stripped of
        everything but ``a-z``, ``0-9`` and whitespace, tokenized and with
        stop words removed. Returns ``([], "")`` when the line has no ``##``.
        """
        review_info, delimiter, review_text = line.partition("##")
        if not delimiter:
            return [], ""

        line_clean = self._NON_ALNUM.sub("", review_text.lower())
        stop_words = self._get_stop_words()
        filtered_line = [w for w in word_tokenize(line_clean) if w not in stop_words]

        return filtered_line, review_info

    @classmethod
    def _sentiment_weights(cls, review_info: str) -> tuple[int, int]:
        """Return ``(positive, negative)`` weight totals of the tags in *review_info*.

        A tag with a number contributes that number; a bare ``[+]``/``[-]``
        contributes 1. Each tag is counted once.
        """
        num_pos = num_neg = 0
        for sign, digits in cls._SENTIMENT_TAG.findall(review_info):
            weight = int(digits) if digits else 1
            if sign == "+":
                num_pos += weight
            else:
                num_neg += weight
        return num_pos, num_neg

    def _get_stop_words(self) -> set:
        """Return the stop-word set, building it on first use.

        Cached because ``process_line`` runs once per review line.
        """
        if self._stop_words is None:
            stop_words = set(stopwords.words('english'))
            stop_words.update(self._EXTRA_STOP_WORDS)
            self._stop_words = stop_words
        return self._stop_words


# In [2]:
def main():
    """Load the review files, preprocess them and print the resulting list sizes.

    Prints the number of processed reviews, then the number of labels.
    """
    # specify the directory path to the review files
    data_dir = "product_reviews"

    model = NeuralNetwork()
    model.preprocess(model.read_data(data_dir))

    for collected in (model.reviews, model.classification):
        print(len(collected))
 
# Run the pipeline when this cell/module executes. main() has no return
# statement, so `test` is bound to None.
test = main()

# Output:
# 2129
# 1351
