<a href="https://colab.research.google.com/github/Mozzer2310/COMP34711-Deep-Learning/blob/main/task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab-generated code: mounts the user's Google Drive at /content/drive.
# Remove or comment out these lines when not running on Google Colab, since
# the google.colab package is only importable there.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
import copy
import glob
import os
import random
import re
from collections import Counter

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import numpy as np
from numpy.linalg import svd, matrix_rank
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Download the NLTK 'punkt' resource, which word_tokenize relies on.
# NOTE(review): newer NLTK releases appear to look up a 'punkt_tab' resource
# instead -- confirm against the NLTK version installed in the runtime.
nltk.download('punkt')


class DistributionalSemantics:
    """Pseudo-word evaluation of context-window word representations.

    The methods are meant to be called in this order; each one stores its
    result on the instance:

    1. ``read_data`` / ``preprocess`` -- load the review files and reduce
       each one to a list of lower-cased, stop-word-free, stemmed tokens.
    2. ``find_target_words`` -- pick the most frequent tokens as target words
       and create a pseudo-word (the reversed string) for each of them.
    3. ``replace_target_words`` -- randomly rewrite half of the occurrences
       of every target word as its pseudo-word.
    4. ``contruct_feature_mat`` -- build one binary context-window feature
       vector per target word and per pseudo-word.
    5. ``cluster`` -- reduce the features with SVD, cluster them and report
       how many target/pseudo-word pairs ended up in the same cluster.
    """

    # Matches every run of characters that is not a lower-case letter, a
    # digit or whitespace (applied after lower-casing).
    _NON_ALNUM = re.compile(r'[^a-z0-9\s]+')
    # Stop words missing from the sklearn list; without them they would
    # appear among the most frequent tokens.
    _EXTRA_STOP_WORDS = ("ive", "im", "dont")
    # File inside the data directory that is not a review corpus.
    _README_NAME = "README.txt"

    def __init__(self) -> None:
        # Set of all known tokens (pseudo-words are added later).
        self.vocab = set()
        # One token list per corpus file.
        self.docs = []
        # Copy of ``docs`` with half of the target words replaced.
        self.docs_sampled = []
        self.target_words = []
        self.psuedo_words = []
        # Ordered view of ``vocab``; defines the feature-matrix columns.
        self.vocab_list = []
        # Binary term/context matrix, set by ``contruct_feature_mat``.
        self.feature_mat = None

    def read_data(self, path: str) -> list:
        """Read every ``*.txt`` file in ``path`` except the README.

        Args:
            path: Directory containing the review files.

        Returns:
            A list with the full text of each file, ordered by file path so
            that repeated runs see the corpora in the same order.
        """
        file_paths = sorted(
            file_path
            for file_path in glob.glob(os.path.join(path, "*.txt"))
            # Filtering (rather than list.remove) also works when the
            # directory has no README at all.
            if os.path.basename(file_path) != self._README_NAME)

        corpora = []
        for file_path in file_paths:
            # The context manager closes the handle even if read() fails.
            with open(file_path, "r") as f:
                corpora.append(f.read())

        return corpora

    def preprocess(self, corpora: list):
        """Tokenise all corpora and rebuild ``docs`` and ``vocab``.

        Args:
            corpora: Raw text of each corpus, e.g. from ``read_data``.
        """
        self.docs = []
        for corpus in corpora:
            self.process_raw(corpus)

        # The vocabulary is every distinct token over all processed docs.
        self.vocab = {word for doc in self.docs for word in doc}

    def process_raw(self, raw: str):
        """Clean, tokenise and stem one corpus and append it to ``docs``.

        Args:
            raw: Full text of one review file.
        """
        stop_words = set(ENGLISH_STOP_WORDS)
        stop_words.update(self._EXTRA_STOP_WORDS)
        # One stemmer per corpus is enough; it keeps no per-line state.
        stemmer = SnowballStemmer(language='english')

        stemmed_doc = []
        for line in raw.splitlines():
            # Drop bare "[t]" tag lines.
            # NOTE(review): only lines that are exactly "[t]" are removed;
            # confirm whether tag lines carrying extra text should go too.
            if line == "[t]":
                continue

            # Keep only the text after the first '##' delimiter, if present.
            _, delimiter, remainder = line.partition("##")
            if delimiter:
                line = remainder

            # Lower-case, then strip everything except letters, digits and
            # whitespace before tokenising.
            line_clean = self._NON_ALNUM.sub('', line.lower())
            stemmed_doc.extend(
                stemmer.stem(token)
                for token in word_tokenize(line_clean)
                if token not in stop_words)

        self.docs.append(stemmed_doc)

    def find_target_words(self, n_targets: int = 50):
        """Select the most frequent tokens and create their pseudo-words.

        Sets ``target_words`` (most frequent first; ties keep the order of
        first appearance in ``docs``), ``psuedo_words`` (each target word
        reversed), adds both to ``vocab`` and refreshes ``vocab_list``.

        Args:
            n_targets: How many target words to select.
        """
        # A single counting pass instead of one list.count() per vocab word.
        counts = Counter(word for doc in self.docs for word in doc)

        self.target_words = [
            word for word, _ in counts.most_common(n_targets)]
        self.psuedo_words = [word[::-1] for word in self.target_words]

        self.vocab.update(self.target_words, self.psuedo_words)
        # Sorted so the feature-matrix column order is reproducible.
        self.vocab_list = sorted(self.vocab)

    def replace_target_words(self):
        """Rebuild ``docs_sampled`` with half of each target word replaced.

        ``docs`` is left untouched.  For every target word, half of its
        occurrences (rounded down) are chosen at random and overwritten with
        the reversed word.
        """
        # Tokens are immutable strings, so copying the inner lists suffices.
        self.docs_sampled = [list(doc) for doc in self.docs]

        for target in self.target_words:
            # (doc index, word index) of every occurrence of the target.
            occurrences = [
                (doc_ind, word_ind)
                for doc_ind, doc in enumerate(self.docs_sampled)
                for word_ind, word in enumerate(doc)
                if word == target]

            pseudo = target[::-1]
            for doc_ind, word_ind in random.sample(
                    occurrences, len(occurrences) // 2):
                self.docs_sampled[doc_ind][word_ind] = pseudo

    def contruct_feature_mat(self, context_window: int = 20):
        """Build the binary term/context matrix in ``feature_mat``.

        Rows are ``target_words`` followed by ``psuedo_words``; columns
        follow ``vocab_list``.  Entry (r, c) is 1 when word c occurs within
        ``context_window`` tokens of any occurrence of term r.  All docs are
        concatenated first, so a window may span a document boundary.

        Args:
            context_window: Number of tokens considered on each side.

        Raises:
            ValueError: If ``context_window`` is negative.
        """
        if context_window < 0:
            raise ValueError("context_window must be non-negative")

        terms = list(self.target_words) + list(self.psuedo_words)

        # Make sure the column order covers the current vocabulary.
        if len(self.vocab_list) != len(self.vocab):
            self.vocab_list = sorted(self.vocab)
        # Word -> column lookup replaces a linear list.index() per word.
        column_of = {}
        for column, word in enumerate(self.vocab_list):
            column_of.setdefault(word, column)

        flat_words = [word for doc in self.docs_sampled for word in doc]
        # Column of every token (None when the token is not in the vocab).
        flat_columns = [column_of.get(word) for word in flat_words]

        # Positions of every term, gathered in a single pass over the text.
        positions = {term: [] for term in terms}
        for position, word in enumerate(flat_words):
            if word in positions:
                positions[word].append(position)

        feature_mat = np.zeros((len(terms), len(self.vocab_list)), dtype=int)
        for row, term in enumerate(terms):
            seen_columns = set()
            for position in positions[term]:
                start = max(0, position - context_window)
                stop = position + context_window + 1
                # Slice around the occurrence so exactly the centre token is
                # left out, even when the window is clipped at either end.
                seen_columns.update(flat_columns[start:position])
                seen_columns.update(flat_columns[position + 1:stop])
            seen_columns.discard(None)
            if seen_columns:
                feature_mat[row, sorted(seen_columns)] = 1

        self.feature_mat = feature_mat

    def cluster(self) -> float:
        """Cluster the term vectors and score the pseudo-word pairing.

        The sparse feature matrix is reduced with a singular value
        decomposition truncated to the matrix rank, then clustered with
        agglomerative clustering (sklearn's default linkage) using one
        cluster per target word.

        Returns:
            Percentage of target words that share a cluster with their
            pseudo-word.

        Raises:
            ValueError: If there are no target words or no feature matrix.
        """
        n_pairs = len(self.target_words)
        if n_pairs == 0 or self.feature_mat is None:
            raise ValueError(
                "find_target_words and contruct_feature_mat must be called "
                "before cluster")

        # Only U and the singular values are needed, so skip the full
        # (vocab x vocab) right-hand factor.
        u, s, _ = svd(self.feature_mat, full_matrices=False)
        k = matrix_rank(self.feature_mat)
        # Equivalent to U_k @ diag(s_k): scale each kept column of U.
        dense_term = u[:, :k] * s[:k]

        clustering = AgglomerativeClustering(
            n_clusters=n_pairs).fit(dense_term)
        labels = clustering.labels_

        # Row i is a target word and row i + n_pairs is its pseudo-word.
        matched = int(np.count_nonzero(
            labels[:n_pairs] == labels[n_pairs:2 * n_pairs]))

        return (matched / n_pairs) * 100


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
def main(data_dir: str = "/content/drive/MyDrive/COMP34711-Deep-Learning/product_reviews",
         n_runs: int = 10,
         context_window: int = 20) -> list:
    """Run the pseudo-word clustering experiment and print its results.

    Args:
        data_dir: Directory that holds the review ``.txt`` files.
        n_runs: Number of times the random replacement, feature
            construction and clustering are repeated.
        context_window: Context window size passed to
            ``contruct_feature_mat``.

    Returns:
        The clustering accuracy (in percent) of every run.
    """
    dist_sem = DistributionalSemantics()
    corpora = dist_sem.read_data(data_dir)

    dist_sem.preprocess(corpora)
    # Vocabulary size and number of documents before pseudo-words are added.
    print(len(dist_sem.vocab))
    print(len(dist_sem.docs))
    dist_sem.find_target_words()
    print(dist_sem.target_words)
    print(dist_sem.psuedo_words)
    # Vocabulary size after the pseudo-words were added.
    print(len(dist_sem.vocab))

    # Each run draws a fresh random half of the target words to replace, so
    # the accuracy varies between runs.
    accuracies = []
    for _ in range(n_runs):
        dist_sem.replace_target_words()
        dist_sem.contruct_feature_mat(context_window=context_window)
        accuracies.append(dist_sem.cluster())

    print(f"Accuracies: {accuracies}")
    print(f"Mean of Accuracies: {np.mean(accuracies)}")
    print(f"Stand Deviation of Accuracies: {np.std(accuracies)}")
    # TODO: hyper-parameter selection (context window size)
    return accuracies


# Only run the experiment when executed as a script or notebook cell, not
# when this code is imported from elsewhere.
if __name__ == "__main__":
    test = main()


4598
9
['use', 'ipod', 'phone', 'router', 'camera', 'work', 'just', 'player', 'like', 'great', 'time', 'batteri', 'problem', 'good', 'diaper', 'product', 'zen', 'need', 'comput', 'want', 'realli', 'look', 'featur', 'qualiti', 'easi', 'thing', 'buy', 'micro', 'instal', 'creativ', 'review', 'make', 'better', 'softwar', 'pictur', 'littl', 'sound', 'bag', 'purchas', 'music', 'song', 'did', 'tri', 'connect', 'mp3', 'set', 'bit', 'new', 'lot', 'doe']
['esu', 'dopi', 'enohp', 'retuor', 'aremac', 'krow', 'tsuj', 'reyalp', 'ekil', 'taerg', 'emit', 'irettab', 'melborp', 'doog', 'repaid', 'tcudorp', 'nez', 'deen', 'tupmoc', 'tnaw', 'illaer', 'kool', 'rutaef', 'itilauq', 'isae', 'gniht', 'yub', 'orcim', 'latsni', 'vitaerc', 'weiver', 'ekam', 'retteb', 'rawtfos', 'rutcip', 'lttil', 'dnuos', 'gab', 'sahcrup', 'cisum', 'gnos', 'did', 'irt', 'tcennoc', '3pm', 'tes', 'tib', 'wen', 'tol', 'eod']
4646
Accuracies: [68.0, 66.0, 68.0, 64.0, 68.0, 66.0, 64.0, 70.0, 70.0, 62.0]
Mean of Accuracies: 66.6
Stand 