In [1]:
import glob
import os
import random
import re
from collections import Counter

import numpy as np
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from numpy.linalg import svd, matrix_rank
from sklearn.cluster import KMeans


class DistributionalSemantics:
    """Pseudo-word clustering experiment over a directory of review files.

    Pipeline (each step is one method, run in this order):

    1. ``read_data``            - load every ``*.txt`` file except ``README.txt``.
    2. ``preprocess``           - split the files into reviews and tokenise/stem them.
    3. ``find_target_words``    - pick the most frequent stems as target words.
    4. ``replace_target_words`` - swap half of each target's occurrences for its
       reversed spelling (the "pseudo word").
    5. ``contruct_feature_mat`` - build a term-by-review count matrix.
    6. ``cluster``              - reduce with SVD, cluster with KMeans, and report
       how many target/pseudo pairs ended up in the same cluster.
    """

    # Number of pseudo-reviews an unstructured file is cut into by process_raw().
    SPLIT_NUM = 50

    def __init__(self) -> None:
        self.vocab = set()
        self.reviews = []
        self.reviews_sampled = []
        self.target_words = []
        self.psuedo_words = []
        # Term-by-review count matrix; populated by contruct_feature_mat().
        self.feature_mat = None
        # NLTK resources are built lazily on the first process_review() call so
        # they are created once instead of once per review / per line.
        self._stop_words = None
        self._stemmer = None

    def read_data(self, path: str) -> list:
        """Read every ``*.txt`` file directly inside ``path`` except ``README.txt``.

        Files are read in sorted path order so repeated runs see the reviews in
        the same order. A missing README is not an error.

        Returns:
            A list with the full text of each file.
        """
        file_paths = sorted(
            file_path
            for file_path in glob.glob(os.path.join(path, "*.txt"))
            # Compare on the base name so the check does not depend on the
            # path separator glob happens to return.
            if os.path.basename(file_path) != "README.txt"
        )

        corpora = []
        for file_path in file_paths:
            # The context manager closes the handle even if read() fails.
            with open(file_path, "r") as f:
                corpora.append(f.read())

        return corpora

    def preprocess(self, corpora: list):
        """Rebuild ``self.reviews`` and ``self.vocab`` from raw file contents.

        Args:
            corpora: list of raw file texts, as returned by ``read_data``.
        """
        self.reviews = []
        for corpus in corpora:
            self.process_raw(corpus)

        # The vocabulary is the set of distinct tokens across all reviews.
        self.vocab = {token for review in self.reviews for token in review}

    def process_raw(self, raw: str):
        """Split one raw file into reviews and append the processed reviews.

        The text is split on ``"[t]\\n"``. When that yields at most two pieces
        the file is treated as unstructured (the original author notes this
        happens for the iPod and Canon PowerShot files) and its lines are cut
        into ``SPLIT_NUM`` contiguous chunks instead. Empty pieces are dropped
        in both cases, so a file with fewer lines than ``SPLIT_NUM`` no longer
        contributes empty reviews.
        """
        reviews = raw.split("[t]\n")
        if len(reviews) <= 2:
            # One piece: no tag at all. Two pieces: only the text after the
            # single tag is kept. Either way that is the last piece.
            raw_lines = reviews[-1].splitlines()
            chunk = len(raw_lines) // self.SPLIT_NUM

            reviews = []
            for split in range(self.SPLIT_NUM):
                lower = chunk * split
                # The final chunk absorbs the remainder of the integer division.
                upper = None if split == self.SPLIT_NUM - 1 else chunk * (split + 1)
                reviews.append("\n".join(raw_lines[lower:upper]))

        for review in reviews:
            if review != "":
                self.reviews.append(self.process_review(review))

    def process_review(self, review: str) -> list:
        """Turn one review's text into a flat list of stemmed tokens.

        For each line: keep only the text after the first ``"##"`` (if any),
        lower-case it, strip everything except ``a-z``, digits and whitespace,
        tokenise, drop stop words, and Snowball-stem what remains.

        NOTE(review): punctuation is stripped before stop-word filtering, so a
        contraction such as "don't" becomes "dont" and will only be removed if
        that exact spelling is in the stop-word set - presumably why "ive" and
        "im" are added by hand below.
        """
        stop_words = getattr(self, "_stop_words", None)
        if stop_words is None:
            stop_words = set(stopwords.words('english'))
            # Add stopwords not in nltk list, these would appear in top 50 list otherwise
            stop_words.update(["ive", "im"])
            self._stop_words = stop_words

        stemmer = getattr(self, "_stemmer", None)
        if stemmer is None:
            stemmer = SnowballStemmer(language='english')
            self._stemmer = stemmer

        stemmed_reviews = []
        for line in review.splitlines():
            # Lines without the delimiter are used whole.
            _, delim, after = line.partition("##")
            if delim:
                line = after

            line_clean = re.sub(r'[^a-z0-9\s]+', '', line.lower())
            stemmed_reviews.extend(
                stemmer.stem(word)
                for word in word_tokenize(line_clean)
                if word not in stop_words
            )

        return stemmed_reviews

    def find_target_words(self, n_targets: int = 50):
        """Select the ``n_targets`` most frequent tokens as target words.

        Sets ``self.target_words`` (most frequent first; equal counts keep the
        order in which the tokens were first seen) and ``self.psuedo_words``
        (each target word spelled backwards, in the same order).
        """
        # A single counting pass replaces one list.count() scan per vocab word.
        counts = Counter(token for review in self.reviews for token in review)
        self.target_words = [word for word, _ in counts.most_common(n_targets)]
        self.psuedo_words = [word[::-1] for word in self.target_words]

    def replace_target_words(self, seed=None):
        """Replace half of each target word's occurrences with its pseudo word.

        Builds ``self.reviews_sampled`` as an independent copy of
        ``self.reviews`` and, for every target word, overwrites a random half
        (rounded down) of its occurrences with the reversed spelling.
        ``self.reviews`` itself is left untouched.

        Args:
            seed: optional seed for a private random generator, for
                reproducible sampling. When ``None`` the global ``random``
                module state is used.
        """
        rng = random if seed is None else random.Random(seed)

        # Copy the inner lists too: a shallow copy would share them, so the
        # replacements below would also rewrite self.reviews.
        self.reviews_sampled = [list(review) for review in self.reviews]

        for target in self.target_words:
            # Every (review index, token index) position holding the target.
            indices = [
                (i, j)
                for i, review in enumerate(self.reviews)
                for j, token in enumerate(review)
                if token == target
            ]

            half = len(indices) // 2
            for i, j in rng.sample(indices, half):
                self.reviews_sampled[i][j] = target[::-1]

    def contruct_feature_mat(self):
        """Build ``self.feature_mat``: per-review counts of every term.

        Rows follow ``target_words`` then ``psuedo_words``; columns follow
        ``reviews_sampled``. Entry ``[t, r]`` is how often term ``t`` occurs
        in review ``r``.
        """
        term_vec = list(self.target_words) + list(self.psuedo_words)
        # Count each review once, then look terms up (missing terms count 0).
        review_counts = [Counter(review) for review in self.reviews_sampled]
        self.feature_mat = np.array(
            [[counts[term] for counts in review_counts] for term in term_vec]
        )

    def cluster(self, random_state=None):
        """Cluster the term vectors and report target/pseudo agreement.

        Reduces ``self.feature_mat`` to its rank with SVD (rows become
        ``U * S``), runs KMeans with one cluster per target word, prints the
        reduced shape, the cluster labels and the percentage of target words
        that share a cluster with their pseudo word.

        Args:
            random_state: forwarded to ``KMeans`` for reproducible clustering.

        Returns:
            The percentage (0-100) of target/pseudo pairs in the same cluster.

        Raises:
            ValueError: if the feature matrix or target words are not set yet.
        """
        n_terms = len(self.target_words)
        if self.feature_mat is None or n_terms == 0:
            raise ValueError(
                "call find_target_words() and contruct_feature_mat() before cluster()")

        # Only U and the singular values are needed; the reduced SVD avoids
        # building the full right-singular matrix.
        u, s, _ = svd(self.feature_mat, full_matrices=False)
        k = matrix_rank(self.feature_mat)
        # Scaling the first k columns of U by s is U[:, :k] @ diag(s[:k]).
        dense_term = u[:, :k] * s[:k]
        print(dense_term.shape)

        kmeans = KMeans(n_clusters=n_terms, random_state=random_state).fit(dense_term)
        result = kmeans.labels_
        print(result)

        accuracy = self._pair_accuracy(result, n_terms)
        print(accuracy)
        return accuracy

    @staticmethod
    def _pair_accuracy(labels, n_terms: int) -> float:
        """Percentage of the first ``n_terms`` labels equal to their partner.

        Label ``i`` belongs to a target word and label ``i + n_terms`` to its
        pseudo word. The result is matches / total * 100, not matches /
        mismatches, so it stays defined when every pair matches.
        """
        if n_terms == 0:
            return 0.0
        matches = sum(
            1 for i in range(n_terms) if labels[i] == labels[i + n_terms])
        return matches / n_terms * 100


In [2]:
def main(data_dir: str = "product_reviews"):
    """Run the whole pseudo-word experiment and return the pipeline object.

    Prints, in order: the vocabulary size, the number of reviews, the target
    words, the pseudo words, and whatever ``cluster()`` prints.

    Args:
        data_dir: directory containing the review ``.txt`` files.

    Returns:
        The populated ``DistributionalSemantics`` instance, so the caller can
        inspect intermediate results after the run.
    """
    dist_sem = DistributionalSemantics()
    corpora = dist_sem.read_data(data_dir)

    dist_sem.preprocess(corpora)
    print(len(dist_sem.vocab))
    print(len(dist_sem.reviews))

    dist_sem.find_target_words()
    print(dist_sem.target_words)
    print(dist_sem.psuedo_words)

    dist_sem.replace_target_words()
    dist_sem.contruct_feature_mat()
    dist_sem.cluster()

    return dist_sem


# Run the full pipeline when this cell executes; `test` holds whatever main() returns.
test = main()


4706
422
['use', 'get', 'one', 'ipod', 'phone', 'router', 'camera', 'work', 'player', 'like', 'great', 'time', 'batteri', 'problem', 'good', 'diaper', 'dont', 'product', 'would', 'also', 'zen', 'take', 'need', 'comput', 'want', 'realli', 'look', 'featur', 'well', 'go', 'qualiti', 'easi', 'thing', 'buy', 'even', 'first', 'micro', 'instal', 'creativ', 'much', 'review', 'make', 'better', 'softwar', 'pictur', 'littl', 'sound', 'purchas', 'bag', 'music']
['esu', 'teg', 'eno', 'dopi', 'enohp', 'retuor', 'aremac', 'krow', 'reyalp', 'ekil', 'taerg', 'emit', 'irettab', 'melborp', 'doog', 'repaid', 'tnod', 'tcudorp', 'dluow', 'osla', 'nez', 'ekat', 'deen', 'tupmoc', 'tnaw', 'illaer', 'kool', 'rutaef', 'llew', 'og', 'itilauq', 'isae', 'gniht', 'yub', 'neve', 'tsrif', 'orcim', 'latsni', 'vitaerc', 'hcum', 'weiver', 'ekam', 'retteb', 'rawtfos', 'rutcip', 'lttil', 'dnuos', 'sahcrup', 'gab', 'cisum']
(100, 100)
[24 30 10 37  8 33  4 46 23 49 32 44 13 42  0 35 25 14 17 41  1 16 15 47
 32  9 32  9 12 1