In [17]:
import glob
import os
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

class NeuralNetwork:
    """Load product-review corpora and turn them into labelled token lists.

    After ``preprocess`` has run, ``reviews`` holds one list of cleaned
    tokens per labelled line and ``classification`` holds the parallel
    labels (1 = positive, 0 = not positive). ``vocab`` is initialised empty
    and is not populated by any method in this class.
    """

    # File in the data directory that must not be read as a corpus.
    _README_NAME = "README.txt"
    # Separator between the annotation prefix and the review text of a line.
    _DELIMITER = "##"
    # Anything that is not a lower-case letter, digit or whitespace.
    # Compiled once because it is applied to every review line.
    _NON_ALNUM = re.compile(r'[^a-z0-9\s]+')
    # Lazily-built stopword set; cached on the instance on first use so the
    # NLTK list is loaded once instead of once per review line.
    _stop_words = None

    def __init__(self) -> None:
        self.vocab = set()
        self.reviews = []
        self.classification = []

    def read_data(self, path: str) -> list:
        """Read every ``.txt`` file directly inside *path* except the README.

        Files are read in sorted path order so the resulting corpus order
        (and therefore the order of ``reviews``) is reproducible. A missing
        README is not an error.

        Args:
            path: Directory containing the review ``.txt`` files.

        Returns:
            A list with the full text of each file, one string per file.
        """
        # Filter by base name rather than removing an exact string: glob may
        # return a different path separator than "/" on some platforms, and
        # the README may simply not exist.
        file_paths = sorted(
            file_path
            for file_path in glob.glob(path + "/*.txt")
            if os.path.basename(file_path) != self._README_NAME
        )

        corpora = []
        for file_path in file_paths:
            # The context manager guarantees the handle is closed even if
            # reading fails part-way through.
            with open(file_path, "r") as f:
                corpora.append(f.read())

        return corpora

    def preprocess(self, corpora: list):
        """Rebuild ``reviews`` and ``classification`` from raw corpus texts.

        Any results from a previous call are discarded first.

        Args:
            corpora: Raw corpus strings, e.g. as returned by ``read_data``.
        """
        self.reviews = []
        self.classification = []
        for corpus in corpora:
            self.process_raw(corpus)

    def process_raw(self, raw: str):
        """Process one raw corpus, appending its labelled lines to the instance.

        Every ``[t]`` tag is stripped from the text before it is split into
        lines. Empty lines and lines whose annotation prefix (the part before
        ``##``) is empty are skipped. For the rest, the label is 1 when the
        weighted count of positive scores exceeds that of negative scores,
        otherwise 0 (ties are labelled 0).

        Args:
            raw: Full text of one corpus file.
        """
        for line in raw.replace("[t]", "").splitlines():
            if not line:
                continue

            processed_review, review_type = self.process_line(line)
            if not review_type:
                # No annotation before the delimiter: nothing to label.
                continue

            num_pos = self._polarity_score(review_type, "+")
            num_neg = self._polarity_score(review_type, "-")
            self.classification.append(1 if num_pos > num_neg else 0)
            self.reviews.append(processed_review)

    def process_line(self, line: str) -> tuple[list, str]:
        """Split a line into its annotation prefix and cleaned review tokens.

        The text after the first ``##`` is lower-cased, stripped of every
        character that is not ``a-z``, ``0-9`` or whitespace, tokenized with
        NLTK's ``word_tokenize`` and filtered against the stopword set.

        If the line contains no ``##`` it is printed as a diagnostic, the
        whole line is treated as review text and the returned prefix is
        the empty string.

        Args:
            line: A single line of a corpus.

        Returns:
            A ``(tokens, review_type)`` tuple, where ``review_type`` is the
            raw text that preceded the first ``##``.
        """
        review_type, delimiter, text = line.partition(self._DELIMITER)
        if not delimiter:
            # Same diagnostic the lookup failure used to produce.
            print(line)
            review_type = ""
            text = line

        line_clean = self._NON_ALNUM.sub('', text.lower())
        stop_words = self._get_stop_words()
        filtered_line = [w for w in word_tokenize(line_clean) if w not in stop_words]

        return filtered_line, review_type

    def _get_stop_words(self) -> set:
        """Return the stopword set, building and caching it on first use."""
        if self._stop_words is None:
            stop_words = set(stopwords.words('english'))
            # Punctuation stripping turns "I've"/"I'm" into these forms,
            # which the NLTK list does not contain.
            stop_words.update(["ive", "im"])
            self._stop_words = stop_words
        return self._stop_words

    @staticmethod
    def _polarity_score(review_type: str, sign: str) -> int:
        """Sum the strengths (1-3) of every ``<sign><digit>`` in *review_type*."""
        return sum(
            strength * review_type.count(f"{sign}{strength}")
            for strength in (1, 2, 3)
        )


In [18]:
def main():
    """Load the review corpora, preprocess them and return the result.

    Returns:
        The NeuralNetwork instance after ``preprocess`` has filled its
        ``reviews`` and ``classification`` lists.
    """
    neural = NeuralNetwork()
    # Directory holding the review files, relative to the working directory.
    corpora = neural.read_data("product_reviews")

    neural.preprocess(corpora)
    # Hand the populated instance back; without a return value the
    # module-level `test` binding below would always be None.
    return neural


test = main()









































































































































































































































































































variety of colours[+2] - Colors: 10 variety of HOT colors made it difficult to choose from. 
touch pad[-2], design[-1], volume control[-1] Weak Points: - Touch Pad: design could be better - Volume Control: controlled through the vertical strip, but purchasing a headphone with built-in volume control will do the trick. 



touchpad[-1][u] The touch buttons do take a little getting used to and I don't like how the scroll button is the same button used as an "enter" key, but you get used to it and it isn't a big deal. 

built[+3][u] I've dropped the device several times onto hard surfaces, and it still works flawlessly.     




You may need to briefly disable various spam blockers, cookie swatters