In [1]:
import glob
import os
import random
import re
from collections import Counter

import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


class DistributionalSemantics:
    """Prepare review text for a pseudo-word substitution experiment.

    The class reads raw ``.txt`` corpora, tokenises every line into a list of
    lower-cased, stop-word-free tokens, finds the most frequent tokens
    ("target words") and builds a second copy of the corpus in which half of
    the occurrences of each target word are replaced by the reversed word
    (the "pseudo word").

    Attributes:
        vocab: set of every distinct token found in ``reviews``.
        reviews: list of token lists, one per processed line.
        reviews_sampled: copy of ``reviews`` with pseudo words substituted;
            filled in by ``replace_target_words``.
    """

    # Stop words that are not in the NLTK English list; without them they
    # would appear among the most frequent words.
    _EXTRA_STOP_WORDS = ("ive", "im")

    def __init__(self) -> None:
        self.vocab = set()
        self.reviews = []
        self.reviews_sampled = []
        # Built lazily on the first call to process_line so the NLTK corpus is
        # loaded once instead of once per line.
        self._stop_words = None

    def read_data(self, path: str) -> list:
        """Read every ``.txt`` file in ``path`` except ``README.txt``.

        Args:
            path: directory containing the corpus files.

        Returns:
            A list with the full text of each file, ordered by file path so
            the result does not depend on the filesystem's listing order.
        """
        # Filter on the base name instead of list.remove(): a missing README or
        # a differently spelled separator no longer raises ValueError.
        file_paths = sorted(
            file_path
            for file_path in glob.glob(os.path.join(path, "*.txt"))
            if os.path.basename(file_path) != "README.txt"
        )

        corpora = []
        for file_path in file_paths:
            # The context manager guarantees the handle is closed.
            with open(file_path, "r") as f:
                corpora.append(f.read())

        return corpora

    def preprocess(self, corpora):
        """Tokenise each raw corpus into ``self.reviews`` and rebuild ``self.vocab``.

        Args:
            corpora: iterable of raw corpus strings (as returned by
                ``read_data``).
        """
        for corpus in corpora:
            self.process_raw(corpus)

        # The vocabulary is the set of all tokens over all processed lines.
        self.vocab = {token for review in self.reviews for token in review}

    def process_raw(self, raw: str):
        """Split one raw corpus into lines and append each processed line to ``self.reviews``.

        Args:
            raw: full text of one corpus file.
        """
        # Only lines that consist of exactly "[t]" are dropped.
        # NOTE(review): a line that merely starts with "[t]" is kept and
        # tokenised like any other line - confirm that is intended.
        lines = [line for line in raw.splitlines() if line != "[t]"]

        for line in lines:
            self.reviews.append(self.process_line(line))

    def process_line(self, line: str) -> list:
        """Turn one raw line into a list of cleaned tokens.

        Everything up to and including the first ``##`` is discarded, the rest
        is lower-cased, stripped of every character that is not ``a-z``,
        ``0-9`` or whitespace, tokenised, and filtered against the stop-word
        list.

        Args:
            line: one line of a raw corpus.

        Returns:
            The remaining tokens, in their original order.
        """
        stop_words = getattr(self, "_stop_words", None)
        if stop_words is None:
            stop_words = set(stopwords.words('english'))
            stop_words.update(self._EXTRA_STOP_WORDS)
            self._stop_words = stop_words

        # Remove any information before (and including) the first '##'
        line = re.sub(r'^.*?##', '', line)
        # Lower-case, then keep only alphanumerics and whitespace
        line_clean = re.sub(r'[^a-z0-9\s]+', '', line.lower())
        line_tokens = word_tokenize(line_clean)

        return [token for token in line_tokens if token not in stop_words]

    def find_target_words(self, top_n: int = 50) -> tuple[set, set]:
        """Return the most frequent vocabulary words and their reversed forms.

        Args:
            top_n: how many of the most frequent words to return (default 50).

        Returns:
            ``(target_words, pseudo_words)`` where ``target_words`` is the set
            of the ``top_n`` words of ``self.vocab`` that occur most often in
            ``self.reviews`` and ``pseudo_words`` is the set of those words
            spelled backwards.
        """
        if top_n <= 0:
            return set(), set()

        # One pass over the corpus instead of one list.count() per vocab word.
        counts = Counter(token for review in self.reviews for token in review)

        # Highest count first; ties are broken alphabetically so the selection
        # is reproducible regardless of set iteration order.
        ranked = sorted(self.vocab, key=lambda word: (-counts[word], word))
        target_words = ranked[:top_n]
        # Reverse the target words to get the pseudo words
        pseudo_words = [word[::-1] for word in target_words]

        return set(target_words), set(pseudo_words)

    def replace_target_words(self, target_words: set):
        """Fill ``self.reviews_sampled`` with half of each target word's occurrences reversed.

        For every target word, ``len(occurrences) // 2`` of its positions are
        drawn with ``random.sample`` and the token at each drawn position is
        replaced by the reversed word. ``self.reviews`` is left untouched.

        Args:
            target_words: the words to substitute.
        """
        # Copy each inner list as well: a plain list.copy() would share the
        # token lists, so the substitutions would also overwrite self.reviews.
        self.reviews_sampled = [list(review) for review in self.reviews]

        # Collect (review index, token index) for every target in one pass.
        positions = {target: [] for target in target_words}
        for review_idx, review in enumerate(self.reviews):
            for token_idx, token in enumerate(review):
                if token in positions:
                    positions[token].append((review_idx, token_idx))

        # Sorted iteration keeps the sequence of random draws stable, so
        # seeding `random` gives the same substitutions on every run.
        for target in sorted(positions):
            occurrences = positions[target]
            # Half of the number of occurrences, using integer division
            half = len(occurrences) // 2
            for review_idx, token_idx in random.sample(occurrences, half):
                self.reviews_sampled[review_idx][token_idx] = target[::-1]


In [2]:
def main():
    """Run the pipeline on the ``product_reviews`` directory.

    Reads and preprocesses the corpora, prints the vocabulary size, the number
    of processed lines and the set of target words, then builds the sampled
    corpus with pseudo words substituted. Returns nothing.
    """
    semantics = DistributionalSemantics()
    # specify the directory path to the review files
    raw_corpora = semantics.read_data("product_reviews")
    semantics.preprocess(raw_corpora)

    vocab_size = len(semantics.vocab)
    print(vocab_size)
    review_count = len(semantics.reviews)
    print(review_count)

    # Only the target words are used below; the pseudo words are discarded.
    targets, _pseudo = semantics.find_target_words()
    print(targets)
    semantics.replace_target_words(targets)
# Run the pipeline. main() has no return statement, so `test` is bound to None.
test = main()

6410
4261
{'great', 'camera', 'computer', 'really', 'take', 'quality', 'using', 'could', 'better', 'want', 'micro', 'use', 'router', 'new', 'would', 'dont', 'get', 'well', 'little', 'buy', 'product', 'sound', 'features', 'time', 'first', 'software', 'still', 'one', 'need', 'problem', 'phone', 'good', 'much', 'diaper', 'problems', 'battery', 'work', 'zen', 'easy', 'mp3', 'player', 'also', 'used', 'music', 'even', 'go', 'ipod', 'best', 'like', 'creative'}
