Before running the code, **please switch to a GPU runtime** on Colab so that the results are produced faster.<br>
Otherwise the experiments run very slowly and can take more than 10 hours.

In [None]:
pip install nltk

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# NOTE(review): `!cd` runs in a throwaway subshell, so it does not change the
# notebook's working directory (the `!ls` in the next cell still lists
# `drive  sample_data`). The relative corpus paths defined later in this
# notebook rely on the working directory staying where it is.
!cd drive/MyDrive/Colab\ Notebooks
# set path to /content/.... as absolute path instead?

In [None]:
!ls

drive  sample_data


In [None]:
# All imports
import os
import re
import string
import time
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from random import sample

import nltk
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet, words
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer
from nltk.metrics.distance import jaccard_distance, edit_distance
from nltk.tokenize import TweetTokenizer, word_tokenize
from sklearn.cluster import KMeans, AgglomerativeClustering


# for use in removing stop words
nltk.download('stopwords')

# required for pos tagging
nltk.download('averaged_perceptron_tagger')

# required for lemmatization
nltk.download('wordnet')
# required for wordnet
nltk.download('omw-1.4')

print(torch.cuda.device_count())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


1


In [None]:
# English stop-word set; read by process_doc when remove_stopwords is set.
stop_words = set(stopwords.words("english"))

# Snowball stemmer; read by process_doc when stem is set.
stemmer = nltk.SnowballStemmer("english", ignore_stopwords = False)
# NB: corpus_path AND corpus_after_token_reversal SHOULD BE CHANGED TO MATCH
# THE CORPUS LOCATION ON THE SPECIFIC MACHINE.
# Both are relative paths, so they are resolved against the notebook's
# current working directory.

# Folder path for the corpus
corpus_path = "drive/MyDrive/34711-Cwk-S-DeepLearning_Minjun/product_reviews"

# Folder path where the reverse token corpus should be stored
corpus_after_token_reversal = r"drive/MyDrive/34711-Cwk-S-DeepLearning_Minjun/processed_reviews"
# Regex handed to PlaintextCorpusReader to select files; ".*" matches every file name.
file_pattern = r".*"
original_corpus = nltk.corpus.PlaintextCorpusReader(corpus_path, file_pattern)
# Sanity check: list the files that were picked up.
print(original_corpus.fileids())

['Canon_PowerShot_SD500.txt', 'Canon_S100.txt', 'Diaper_Champ.txt', 'Hitachi_router.txt', 'Linksys_Router.txt', 'MicroMP3.txt', 'Nokia_6600.txt', 'ipod.txt', 'norton.txt']


**It should return:**

['Canon_PowerShot_SD500.txt', 'Canon_S100.txt', 'Diaper_Champ.txt', 'Hitachi_router.txt', 'Linksys_Router.txt', 'MicroMP3.txt', 'Nokia_6600.txt', 'ipod.txt', 'norton.txt']

<br>/////////////////////////////<br>
readme.txt is not included in this corpus

In [None]:
# Core utility function for document cleaning
# Works recursively, split the text into sentences/review, then for each 
# sentence/review perform cleaning 
def process_doc(text, remove_punctuation, case_fold, stem,
                remove_stopwords, remove_short_tokens, tokenize_by, manual_remove_list = [],
                remove_nonalphabetical = False):
  """Tokenize and clean a raw document string.

  Args:
    text: raw document text.
    remove_punctuation: drop tokens that are found inside ``string.punctuation``
      (a substring test) or that equal ``"..."`` or ``"]##"``.
    case_fold: lower-case every token.
    stem: stem every token with the module-level ``stemmer``, except tokens
      contained in ``manual_remove_list``.
    remove_stopwords: drop tokens in the module-level ``stop_words`` set and
      the contraction fragments ``n't``, ``'s``, ``'m``, ``'re``, ``'ve``.
    remove_short_tokens: keep only tokens longer than two characters.
    tokenize_by: ``"words"`` returns a flat token list. ``"sentence"``
      (split on ``##``), ``"sentiments"`` (split on tags such as ``[+2]``)
      and ``"reviews"`` (split on ``[t]``) return one cleaned token list per
      segment.
    manual_remove_list: tokens exempt from stemming. The default list is
      never mutated here.
    remove_nonalphabetical: keep only tokens for which ``str.isalpha()`` is
      true. Only applied in ``"words"`` mode (see the note in the body).

  Returns:
    A list of tokens (``"words"``) or a list of token lists (segment modes).

  Raises:
    ValueError: if ``tokenize_by`` is not one of the supported modes.
  """
  # Delimiter pattern per segment-level mode. Raw strings avoid the invalid
  # escape sequences ("\[", "\-") that the non-raw literals contained.
  segment_delimiters = {
    "sentence": r"##",
    "sentiments": r"\[[\-+][0-9]\]",
    "reviews": r"\[t\]",
  }

  if tokenize_by in segment_delimiters:
    splitter = nltk.RegexpTokenizer(segment_delimiters[tokenize_by], gaps = True)
    # NOTE(review): remove_nonalphabetical is not forwarded to the per-segment
    # calls, exactly as before. Forwarding it would drop alphanumeric tokens
    # such as "mp3" from the sentence corpus, which existing callers currently
    # expect to find there. TODO: confirm whether that is intended.
    return [process_doc(segment, remove_punctuation, case_fold, stem,
                        remove_stopwords, remove_short_tokens, "words", manual_remove_list)
            for segment in splitter.tokenize(text)]

  if tokenize_by != "words":
    raise ValueError(
      "tokenize_by must be one of 'words', 'sentence', 'sentiments' or 'reviews', got %r"
      % (tokenize_by,))

  def is_not_punctuation(token):
    return token not in string.punctuation and token != "..." and token != "]##"

  tokens = nltk.WordPunctTokenizer().tokenize(text)
  if (remove_punctuation):
    tokens = [t for t in tokens if is_not_punctuation(t)]

  # Pre-processing of the tokenised words; the order of the steps matters.
  if (case_fold):
    tokens = [t.lower() for t in tokens]
  if (remove_short_tokens):
    tokens = [t for t in tokens if len(t) > 2]
  if (stem):
    tokens = [t if t in manual_remove_list else stemmer.stem(t) for t in tokens]
  if (remove_stopwords):
    contraction_fragments = ("n't", "'s", "'m", "'re", "'ve")
    tokens = [t for t in tokens
              if t not in stop_words and t not in contraction_fragments]
  if (remove_punctuation):
    # Second pass, kept from the original pipeline.
    tokens = [t for t in tokens if is_not_punctuation(t)]
  if (remove_nonalphabetical):
    tokens = [t for t in tokens if t.isalpha()]
  return tokens

def process_corpus(corpus, remove_punctuation:bool, case_fold:bool, stem:bool,
                  remove_stopwords:bool, remove_short_tokens, tokenize_by:str, remove_nonalphabetical):
  """Run ``process_doc`` over every file of ``corpus`` and concatenate the results.

  Args:
    corpus: object exposing ``fileids()`` and ``raw(fileid)`` (e.g. an NLTK
      corpus reader).
    remove_punctuation, case_fold, stem, remove_stopwords, remove_short_tokens,
    tokenize_by, remove_nonalphabetical: forwarded to ``process_doc``.

  Returns:
    One flat list holding the items ``process_doc`` produced for each file,
    in ``fileids()`` order.
  """
  docs = []
  for fileid in corpus.fileids():
    # remove_nonalphabetical must be passed by keyword: positionally it would
    # land in process_doc's manual_remove_list parameter and be silently ignored.
    docs.extend(process_doc(corpus.raw(fileid), remove_punctuation, case_fold,
                            stem, remove_stopwords, remove_short_tokens,
                            tokenize_by,
                            remove_nonalphabetical = remove_nonalphabetical))
  return docs

def most_frequent(words, n, do_print:bool):
  """Return the ``n`` most common items of ``words`` as ``(item, count)`` pairs.

  When ``do_print`` is true, each pair is also printed as ``rank item count``
  with ranks starting at 1.
  """
  top_n = nltk.FreqDist(words).most_common(n)
  if do_print:
    for rank, (token, count) in enumerate(top_n, start = 1):
      print(rank, token, count)
  return top_n

# core function for generating corpus with reversed words
# the corpus of reversed words is stored as files in the path specified by the variable:
# corpus_after_token_reversal
def generate_corpus_half_tokens_reversed(corpus, token_tuple_list, override_folder):
  """Write a copy of ``corpus`` in which about half of the occurrences of the given words are reversed.

  For every ``(word, frequency)`` pair a shuffled 0/1 array of length
  ``frequency`` is built (``int(frequency/2)`` zeros, the rest ones). The i-th
  occurrence of ``word`` in the corpus is written reversed when the i-th
  entry is 1. Each file is tokenized with ``process_doc`` (case-folded, no
  other cleaning) and written, space-joined, under the folder named by the
  module-level ``corpus_after_token_reversal``.

  Args:
    corpus: object exposing ``fileids()`` and ``raw(fileid)``.
    token_tuple_list: iterable of ``(word, frequency)`` pairs. ``frequency``
      must cover the number of occurrences of ``word`` in the corpus,
      otherwise indexing the flag array raises ``IndexError``.
    override_folder: when false and the output folder already exists, nothing
      is done.
  """
  if not override_folder and os.path.exists(corpus_after_token_reversal):
    return
  # makedirs also creates missing parent folders and tolerates an existing one.
  os.makedirs(corpus_after_token_reversal, exist_ok = True)

  # reverse_flags[word][i] == 1  ->  the i-th occurrence of word gets reversed
  reverse_flags = {}
  # occurrences_seen[word] = index of the most recent occurrence met (-1 = none yet)
  occurrences_seen = {}

  for (word, frequency) in token_tuple_list:
    # array with (roughly) equal numbers of zeros and ones, in random order
    flags = np.ones(frequency)
    flags[:int(frequency/2)] = 0
    np.random.shuffle(flags)
    reverse_flags[word] = flags
    occurrences_seen[word] = -1

  for fileid in corpus.fileids():
    # Case-fold only. The keyword makes explicit that the trailing False is
    # remove_nonalphabetical, not manual_remove_list.
    tokens = process_doc(corpus.raw(fileid), False, True, False, False, False, "words",
                         remove_nonalphabetical = False)
    with_reversal = []
    for token in tokens:
      if token in reverse_flags:
        occurrences_seen[token] += 1
        if reverse_flags[token][occurrences_seen[token]] == 1:
          token = token[::-1]
      with_reversal.append(token)

    target_path = os.path.join(corpus_after_token_reversal, fileid)
    # A fileid may contain sub-directories; make sure they exist.
    os.makedirs(os.path.dirname(target_path), exist_ok = True)
    # Context manager closes the file even if the write fails. UTF-8 matches
    # the default encoding NLTK corpus readers use to read the files back.
    with open(target_path, "w", encoding = "utf-8") as out_file:
      out_file.write(" ".join(with_reversal))
  
  # for (word, pointer) in pointers.items():
  #   print (word, len(indecies_per_word[word]) - pointer - 1)

In [None]:
#### Step 1 ####

print("Most frequent 50 tokens in corpus after document cleaning and pre-processing")

# Clean and pre-process the whole corpus into one flat token list.
processed_corpus = process_corpus(original_corpus, True, True, False, True, True, "words", True)

# The 50 most frequent tokens, printed with rank and count.
top_50 = most_frequent(processed_corpus, 50, True)

# Every top-50 token together with its reversed spelling.
cluster_words = {form for (token, _count) in top_50 for form in (token, token[::-1])}

Most frequent 50 tokens in corpus after document cleaning and pre-processing
1 use 353
2 phone 320
3 one 316
4 ipod 314
5 router 313
6 camera 292
7 player 269
8 get 252
9 battery 239
10 like 195
11 great 192
12 quality 176
13 good 176
14 zen 174
15 diaper 171
16 product 166
17 would 158
18 also 156
19 time 145
20 software 145
21 sound 144
22 well 138
23 really 136
24 micro 136
25 features 128
26 computer 128
27 easy 125
28 even 123
29 first 121
30 used 120
31 creative 118
32 much 115
33 better 114
34 champ 113
35 work 112
36 want 107
37 size 105
38 music 105
39 norton 104
40 little 101
41 need 100
42 pictures 99
43 works 99
44 still 97
45 buy 96
46 problem 96
47 mp3 96
48 price 91
49 life 91
50 using 91


Most frequent 50 tokens in corpus after document cleaning and pre-processing<br>
1 use 353<br>
2 phone 320<br>
3 one 316<br>
4 ipod 314<br>
5 router 313<br>
6 camera 292<br>
7 player 269<br>
8 get 252<br>
9 battery 239<br>
10 like 195<br>
11 great 192<br>
12 quality 176<br>
13 good 176<br>
14 zen 174<br>
15 diaper 171<br>
16 product 166<br>
17 would 158<br>
18 also 156<br>
19 time 145<br>
20 software 145<br>
21 sound 144<br>
22 well 138<br>
23 really 136<br>
24 micro 136<br>
25 features 128<br>
26 computer 128<br>
27 easy 125<br>
28 even 123<br>
29 first 121<br>
30 used 120<br>
31 creative 118<br>
32 much 115<br>
33 better 114<br>
34 champ 113<br>
35 work 112<br>
36 want 107<br>
37 size 105<br>
38 music 105<br>
39 norton 104<br>
40 little 101<br>
41 need 100<br>
42 pictures 99<br>
43 works 99<br>
44 still 97<br>
45 buy 96<br>
46 problem 96<br>
47 mp3 96<br>
48 price 91<br>
49 life 91<br>
50 using 91<br>

In [None]:
# cleanning and pre-processing

def clean_all_sentences(corpus_path, stemming, stopwords_removal, manual_remove_list = ["§", "―","•","\t","←","→"]):
  """Load every file under ``corpus_path`` and return its cleaned sentences.

  Each file is read through a ``PlaintextCorpusReader`` (files selected by the
  module-level ``file_pattern``) and passed to ``process_doc`` in
  ``"sentence"`` mode with punctuation removal, case folding and short-token
  removal switched on. ``stemming`` and ``stopwords_removal`` toggle the two
  optional steps; tokens in ``manual_remove_list`` are exempt from stemming.

  Returns:
    A single list of token lists, one per sentence, over all files.
  """
  reader = nltk.corpus.PlaintextCorpusReader(corpus_path, file_pattern)
  return [
    sentence
    for fileid in reader.fileids()
    for sentence in process_doc(reader.raw(fileid), True, True, stemming,
                                stopwords_removal, True, "sentence",
                                manual_remove_list, True)
  ]

# generating word to index and index to word dictonary
def generate_w_to_idx_and_idx_to_w(corpus):
  """Build the vocabulary lookup tables for a tokenized corpus.

  Indices are handed out in order of first appearance, starting at 0.

  Args:
    corpus: iterable of sentences, each an iterable of hashable tokens.

  Returns:
    ``(word_to_idx, idx_to_word)``: two dicts that are inverses of each other.
  """
  word_to_idx = {}
  for sentence in corpus:
    for token in sentence:
      # len() is evaluated before insertion, so a new token gets the next free index
      word_to_idx.setdefault(token, len(word_to_idx))

  idx_to_word = {index: token for token, index in word_to_idx.items()}
  return (word_to_idx, idx_to_word)


def get_context_window_tuples(word_to_idx, sentences, window, key_words):
  """Build ``(context_indices, centre_index)`` pairs for full context windows.

  Only positions with ``window`` tokens available on both sides are used, so
  sentences shorter than ``2 * window + 1`` contribute nothing.

  Args:
    word_to_idx: mapping from token to vocabulary index.
    sentences: iterable of token sequences.
    window: number of context tokens taken on each side of the centre.
    key_words: not used by this function.

  Returns:
    List of ``(context, centre)`` tuples, where ``context`` lists the
    ``2 * window`` neighbouring indices from left to right.
  """
  pairs = []
  for sentence in sentences:
    for centre in range(window, len(sentence) - window):
      centre_idx = word_to_idx[sentence[centre]]
      neighbours = [word_to_idx[sentence[pos]]
                    for pos in range(centre - window, centre + window + 1)
                    if pos != centre]
      pairs.append((neighbours, centre_idx))
  return pairs


# def merge_term_to_term_dicts(term_to_term_dicts):
#   term_to_term = defaultdict(lambda: defaultdict(int))
#   for term_to_term_dict in term_to_term_dicts:
#     for (key_word, freqs) in term_to_term_dict.items():
#       for (value_word, freq) in freqs.items():
#         term_to_term[key_word][value_word] += freq
#   return term_to_term

# Used Skip-Grams Model
def get_skipgrams(sentences, word_to_idx, window, neg_sample_count):
  """Enumerate skip-gram (centre, context) index pairs.

  For every token, each other token at most ``window`` positions away in the
  same sentence yields one pair. Windows are clipped at sentence boundaries.

  Args:
    sentences: iterable of token sequences.
    word_to_idx: mapping from token to vocabulary index.
    window: maximum distance between centre and context token.
    neg_sample_count: not used by this function.

  Returns:
    ``(centres, contexts)``: two parallel lists of vocabulary indices.
  """
  centres = []
  contexts = []
  for sentence in sentences:
    length = len(sentence)
    for pos in range(length):
      first = max(0, pos - window)
      stop = min(length, pos + window + 1)
      neighbours = [word_to_idx[sentence[k]] for k in range(first, stop) if k != pos]
      # repeat the centre index once per neighbour so both lists stay aligned
      centres.extend([word_to_idx[sentence[pos]]] * len(neighbours))
      contexts.extend(neighbours)

  return (centres, contexts)


def get_batches(words, contexts, batch_size):
  """Shuffle the (word, context) pairs and group them into mini-batches.

  Args:
    words: sequence of centre-word indices.
    contexts: sequence of context-word indices, parallel to ``words``.
    batch_size: number of pairs per batch; the last batch may be smaller.

  Returns:
    List of ``(word_tensor, context_tensor)`` tuples built with
    ``torch.from_numpy``. The pairing between a word and its context is
    preserved; only the order of the pairs is randomized.
  """
  total = len(words)
  order = sample(range(0, total), total)
  batches = []

  pending_words, pending_contexts = [], []
  for position, source_idx in enumerate(order, start = 1):
    pending_words.append(words[source_idx])
    pending_contexts.append(contexts[source_idx])
    # flush when the batch is full or the data is exhausted
    if position % batch_size == 0 or position == total:
      batches.append((
        torch.from_numpy(np.array(pending_words)),
        torch.from_numpy(np.array(pending_contexts))
      ))
      pending_words, pending_contexts = [], []

  return batches


def get_x_tensors(x_y_tuples):
  """Convert the first element of every pair into a ``torch.long`` tensor.

  Args:
    x_y_tuples: iterable of indexable pairs whose element 0 is the input.

  Returns:
    List with one long tensor per pair, in input order.
  """
  return [torch.tensor(pair[0], dtype=torch.long) for pair in x_y_tuples]


def get_y_tensors(tuples, num_classes):
  """Wrap the second element of every pair in a one-element tensor.

  Args:
    tuples: iterable of indexable pairs whose element 1 is the target.
    num_classes: not used by this function.

  Returns:
    List with one single-element tensor per pair, in input order.
  """
  return [torch.tensor([pair[1]]) for pair in tuples]
# def get_context_words_count(term_to_term_dict):
#   counts = defaultdict(int)
#   for (key, freqs) in term_to_term_dict.items():
#     for (context, freq) in freqs.items():
#       counts[context] += freq
  
#   return counts


# def get_coocurrences_alpha(context_counts, alpha=0.75):
#   sum = 0
#   for context_count in context_counts.values():
#     sum += pow(context_count, alpha)
#   return sum

# def generate_context_word_mapping(term_to_term_dict):
#   i = 0
#   mapping = {}
#   for (key, freqs) in term_to_term_dict.items():
#     for (context, freq) in freqs.items():
#       if context not in mapping:
#         mapping[context] = i
#         i += 1
#   return mapping

# def calculate_all_coocurrences(count_dict):
#   sum = 0
#   for val in count_dict.values():
#     sum += val
#   return sum

In [None]:
class Word2Vec_Skipgram(nn.Module):
  """Skip-gram model: embedding lookup, linear projection, log-softmax.

  Given a batch of word indices, ``forward`` returns the log-probabilities
  over the vocabulary computed along dimension 1.
  """

  def __init__(self, embedding_size, vocab_size) -> None:
    super().__init__()
    # one trainable vector of length embedding_size per vocabulary entry
    self.embedding_words = nn.Embedding(vocab_size, embedding_size)
    # projects an embedding back onto one score per vocabulary entry
    self.linear = nn.Linear(embedding_size, vocab_size)
    # normalizes the scores along dim 1 into log-probabilities
    self.log_softmax = nn.LogSoftmax(dim = 1)

  def forward(self, words):
    return self.log_softmax(self.linear(self.embedding_words(words)))

In [None]:
def train_skipgram_model(model, epochs, batch_size, learning_rate, print_feature_vector, words, contexts):
  """Train ``model`` in place on (word, context) pairs with Adam and NLL loss.

  Args:
    model: module mapping a batch of word indices to log-probabilities over
      the vocabulary.
    epochs: number of passes over the data; pairs are reshuffled every epoch
      by ``get_batches``.
    batch_size: number of pairs per mini-batch.
    learning_rate: Adam learning rate.
    print_feature_vector: when true, print ``epoch total_loss`` after each epoch.
    words: sequence of centre-word indices.
    contexts: sequence of context-word indices, parallel to ``words``.
  """
  optimizer = optim.Adam(model.parameters(), lr=learning_rate)
  loss_function = nn.NLLLoss()
  for epoch in range(epochs):
    total_loss = 0.0
    # Use the caller's batch_size (it was previously ignored in favour of a
    # hard-coded 300).
    for inputs, targets in get_batches(words=words, contexts=contexts, batch_size=batch_size):
      optimizer.zero_grad()
      inputs, targets = inputs.to(device), targets.to(device)
      y_hat = model(inputs)
      loss = loss_function(y_hat, targets)
      loss.backward()
      optimizer.step()
      # .item() yields a plain float, so the running total does not keep
      # autograd history alive across batches.
      total_loss += loss.item()
    if (print_feature_vector):
      print(epoch, total_loss)

In [None]:
def clustering_get_accuracy(n_clusters, keys, matrix, cluster_method, flag_empty_clusters, print_cluster=False):
  """Cluster the rows of ``matrix`` and score how often a word shares a cluster with its reversal.

  Args:
    n_clusters: number of clusters to form.
    keys: words, parallel to the rows of ``matrix``.
    matrix: one feature vector per key.
    cluster_method: ``"kmeans"``, ``"agglomerative"`` (estimator defaults) or
      ``"agglomerative_complete"`` (complete linkage, cosine distance).
    flag_empty_clusters: print a warning for every cluster without members.
    print_cluster: print the members of every cluster.

  Returns:
    Fraction of keys whose reversed spelling is in the same cluster.

  Raises:
    ValueError: if ``cluster_method`` is not one of the supported names.
  """
  if (cluster_method == "kmeans"):
    cluster_algo = KMeans(n_clusters)
  elif (cluster_method == "agglomerative"):
    cluster_algo = AgglomerativeClustering(n_clusters=n_clusters)
  elif (cluster_method == "agglomerative_complete"):
    try:
      # Newer scikit-learn names the distance parameter `metric`.
      cluster_algo = AgglomerativeClustering(
        n_clusters=n_clusters,
        linkage="complete",
        metric="cosine"
      )
    except TypeError:
      # Older releases only know the former name `affinity`.
      cluster_algo = AgglomerativeClustering(
        n_clusters=n_clusters,
        linkage="complete",
        affinity="cosine"
      )
  else:
    raise ValueError(
      "cluster_method must be 'kmeans', 'agglomerative' or "
      "'agglomerative_complete', got %r" % (cluster_method,))

  cluster_algo.fit(matrix)

  # One bucket per requested cluster (was hard-coded to 50).
  clusters = [set() for _ in range(n_clusters)]
  for key, label in zip(keys, cluster_algo.labels_):
    clusters[label].add(key)

  correct = 0
  for cluster in clusters:
    if (flag_empty_clusters and len(cluster) == 0):
      print("EMPTY CLUSTER DETECTED")
    if (print_cluster):
      print(cluster)
    # a word counts as correct when its reversed form is in the same cluster
    correct += sum(1 for word in cluster if word[::-1] in cluster)
  return correct / len(keys)

In [None]:
def get_target_words_embeddings(target_words, embedding_matrix, word_to_idx):
  """Collect the embedding row of every target word.

  Args:
    target_words: iterable of words; each must be a key of ``word_to_idx``.
    embedding_matrix: indexable by vocabulary index, one row per word.
    word_to_idx: mapping from word to row index.

  Returns:
    ``(keys, matrix)``: the words in iteration order and the parallel list
    of their embedding rows.
  """
  keys = []
  rows = []
  for word in target_words:
    rows.append(embedding_matrix[word_to_idx[word]])
    keys.append(word)
  return (keys, rows)

In [None]:

def run_experiment(iterations, corpus_path, training_epochs, embedding_dims, window_size, cluster_method, learning_rate, stemming, stopwords_removal, print_feature_vector = False):
  """Run the reversed-word clustering experiment several times and summarize the accuracy.

  Each iteration rewrites the corpus with about half of the occurrences of
  the 50 most frequent words reversed, trains a skip-gram model on it,
  clusters the embeddings of those words and their reversals, and scores how
  often a word lands in the same cluster as its reversal.

  Args:
    iterations: number of independent repetitions.
    corpus_path: folder holding the original corpus files.
    training_epochs: skip-gram training epochs per iteration.
    embedding_dims: embedding vector length.
    window_size: skip-gram context window (tokens on each side).
    cluster_method: forwarded to ``clustering_get_accuracy``.
    learning_rate: optimizer learning rate.
    stemming: stem the sentences before training.
    stopwords_removal: remove stop words before training.
    print_feature_vector: print the clusters and accuracy of every iteration.

  Returns:
    ``("Average accuracy:", mean, "Standard diviation:", std)`` over the
    iterations.
  """
  accuracy = np.zeros(iterations)

  # Read the corpus from the folder the caller asked for (the corpus_path
  # argument was previously ignored in favour of a global reader).
  source_corpus = nltk.corpus.PlaintextCorpusReader(corpus_path, file_pattern)

  # Get the 50 most common words for the experiment
  processed_corpus = process_corpus(source_corpus, True, True, False, True, True, "words", True)
  top_50 = most_frequent(processed_corpus, 50, False)

  # every target word together with its reversed spelling
  cluster_words = set()
  for (word, _freq) in top_50:
    cluster_words.add(word)
    cluster_words.add(word[::-1])

  for iteration in range(iterations):
    # reverse half of instances of most common words at random
    generate_corpus_half_tokens_reversed(source_corpus, top_50, True)
    # clean sentences; the target words are exempt from stemming
    sentences = clean_all_sentences(corpus_after_token_reversal, stemming, stopwords_removal, cluster_words)

    # set up data
    (word_to_idx, _idx_to_word) = generate_w_to_idx_and_idx_to_w(sentences)
    vocab_size = len(word_to_idx)
    (words, contexts) = get_skipgrams(sentences, word_to_idx, window_size, 10)

    # train model
    skipgrams_model = Word2Vec_Skipgram(embedding_dims, vocab_size=vocab_size).to(device)
    # 500 is the requested mini-batch size.
    # NOTE(review): confirm train_skipgram_model honours it; the recorded
    # results depend on the effective batch size.
    train_skipgram_model(skipgrams_model, training_epochs, 500, learning_rate, False, words, contexts)

    # get embeddings
    embedding_matrix = skipgrams_model.embedding_words.weight.detach().cpu().numpy()
    (target_words, embeddings) = get_target_words_embeddings(cluster_words, embedding_matrix, word_to_idx)

    # perform clustering: one cluster per target word, so a perfect result
    # pairs every word with its reversal
    accuracy[iteration] = clustering_get_accuracy(len(top_50), target_words, embeddings, cluster_method, True, print_feature_vector)
    if (print_feature_vector):
      print("Iteration", iteration + 1, "Accuracy:", accuracy[iteration])
  return ("Average accuracy:", np.mean(accuracy), "Standard diviation:", np.std(accuracy))

In [None]:
# Sweep the skip-gram window size; every other setting is held fixed.
for window_size in (1, 2, 3, 4, 5, 10):
  print(f"Performance with window size {window_size}:",
        run_experiment(iterations=5, corpus_path=corpus_path, training_epochs=20,
                       embedding_dims=100, window_size=window_size,
                       cluster_method="agglomerative_complete", learning_rate=0.01,
                       stemming=False, stopwords_removal=False,
                       print_feature_vector=False))

These runs took hours to finish, so the results are recorded here as text.<br> 
Performance with window size 1: ('Average accuracy:', 0.8720000000000001, 'Standard diviation:', 0.029933259094191558)<br>
Performance with window size 2: ('Average accuracy:', 0.8560000000000001, 'Standard diviation:', 0.008000000000000007)<br>
Performance with window size 3: ('Average accuracy:', 0.8400000000000001, 'Standard diviation:', 0.055136195008360894)<br>
Performance with window size 4: ('Average accuracy:', 0.788, 'Standard diviation:', 0.06013318551349164)<br>
Performance with window size 5: ('Average accuracy:', 0.716, 'Standard diviation:', 0.06499230723708765)<br>
Performance with window size 10: ('Average accuracy:', 0.5680000000000001, 'Standard diviation:', 0.015999999999999966)

In [None]:
# Sweep the embedding dimension; every other setting is held fixed.
# Labels are listed explicitly because the first one is worded differently.
embedding_dim_runs = [
  ("Performance with word embedding dimension length of 50:", 50),
  ("Performance with embedding dimension length of 100:", 100),
  ("Performance with embedding dimension length of 150:", 150),
  ("Performance with embedding dimension length of 200:", 200),
  ("Performance with embedding dimension length of 300:", 300),
  ("Performance with embedding dimension length of 400:", 400),
]
for label, dims in embedding_dim_runs:
  print(label,
        run_experiment(iterations=5, corpus_path=corpus_path, training_epochs=40,
                       embedding_dims=dims, window_size=1,
                       cluster_method="agglomerative_complete", learning_rate=0.01,
                       stemming=False, stopwords_removal=False,
                       print_feature_vector=False))

Performance with word embedding dimension length of 50: ('Average accuracy:', 0.8200000000000001, 'Standard diviation:', 0.07266360849833982)<br>
Performance with embedding dimension length of 100: ('Average accuracy:', 0.884, 'Standard diviation:', 0.023323807579381222)<br>
Performance with embedding dimension length of 150: ('Average accuracy:', 0.8960000000000001, 'Standard diviation:', 0.01959591794226544)<br>
Performance with embedding dimension length of 200: ('Average accuracy:', 0.868, 'Standard diviation:', 0.04833218389437827)<br>
Performance with embedding dimension length of 300: ('Average accuracy:', 0.86, 'Standard diviation:', 0.03794733192202055)<br>
Performance with embedding dimension length of 400: ('Average accuracy:', 0.8959999999999999, 'Standard diviation:', 0.057131427428342804)<br>

In [None]:
# Compare the four combinations of stemming / stop-word removal.
preprocessing_runs = [
  ("Performance after Stemming: ", True, False),
  ("Performance after removing stopwords:", False, True),
  ("Performance after stemming and removing stopwords:", True, True),
  ("Performance without both pre-processing: ", False, False),
]
for label, use_stemming, use_stopword_removal in preprocessing_runs:
  print(label,
        run_experiment(iterations=5, corpus_path=corpus_path, training_epochs=20,
                       embedding_dims=300, window_size=1,
                       cluster_method="agglomerative_complete", learning_rate=0.01,
                       stemming=use_stemming, stopwords_removal=use_stopword_removal,
                       print_feature_vector=False))

Performance after removing stopwords and stemming: ('Average accuracy:', 0.8160000000000001, 'Standard diviation:', 0.04630334761116089)


Performance after Stemming:  ('Average accuracy:', 0.9039999999999999, 'Standard diviation:', 0.03878143885933062)<br>
Performance after removing stopwords: ('Average accuracy:', 0.7959999999999999, 'Standard diviation:', 0.032)<br>
Performance after stemming and removing stopwords: ('Average accuracy:', 0.8160000000000001, 'Standard diviation:', 0.04630334761116089)<br>
Performance without both pre-processing:  ('Average accuracy:', 0.9400000000000001, 'Standard diviation:', 0.025298221281347004)<br>

In [None]:
# Sweep the number of training epochs; every other setting is held fixed.
for n_epochs in (10, 20, 30, 40):
  print(f"Performance with {n_epochs} Epochs: ",
        run_experiment(iterations=5, corpus_path=corpus_path, training_epochs=n_epochs,
                       embedding_dims=300, window_size=1,
                       cluster_method="agglomerative_complete", learning_rate=0.01,
                       stemming=False, stopwords_removal=False,
                       print_feature_vector=False))

Performance with 10 Epochs:  ('Average accuracy:', 0.892, 'Standard diviation:', 0.02400000000000002)
Performance with 20 Epochs:  ('Average accuracy:', 0.884, 'Standard diviation:', 0.023323807579381222)
Performance with 30 Epochs:  ('Average accuracy:', 0.876, 'Standard diviation:', 0.014966629547095779)
Performance with 40 Epochs:  ('Average accuracy:', 0.8800000000000001, 'Standard diviation:', 0.04898979485566354)


Performance with 10 Epochs:  ('Average accuracy:', 0.892, 'Standard diviation:', 0.02400000000000002)<br>
Performance with 20 Epochs:  ('Average accuracy:', 0.884, 'Standard diviation:', 0.023323807579381222)<br>
Performance with 30 Epochs:  ('Average accuracy:', 0.876, 'Standard diviation:', 0.014966629547095779)<br>
Performance with 40 Epochs:  ('Average accuracy:', 0.8800000000000001, 'Standard diviation:', 0.04898979485566354)<br>

In [None]:
 ### From the experiments, 20 epochs and an embedding dimension of 300 with window_size 1 had the
 ### best performance; that configuration is re-run here with print_feature_vector=True so the
 ### clusters and the accuracy of every iteration are printed.
 print("Best performance: ",
       run_experiment(iterations = 5, corpus_path=corpus_path, training_epochs=20, embedding_dims=300, window_size=1, 
               cluster_method="agglomerative_complete", learning_rate=0.01, stemming=False, stopwords_removal=False, 
               print_feature_vector=True))

{'good', 'doog', 'great', 'taerg'}
{'software', 'erawtfos'}
{'gnisu', 'using'}
{'size', 'ezis'}
{'tcudorp', 'product'}
{'serutcip', 'pictures'}
{'much', 'hcum'}
{'krow', 'work'}
{'ekil', 'like'}
{'even', 'neve'}
{'tsrif', 'first'}
{'desu', 'used'}
{'orcim', 'micro', 'creative', 'evitaerc'}
{'champ', 'pmahc'}
{'esu', 'use'}
{'yub', 'buy'}
{'retteb', 'better'}
{'player', 'reyalp'}
{'dopi', 'ipod'}
{'want', 'tnaw'}
{'easy', 'ysae'}
{'llew', 'well'}
{'also', 'osla'}
{'serutaef', 'features'}
{'llits'}
{'would', 'dluow'}
{'retuor', 'router'}
{'norton', 'notron'}
{'time', 'emit'}
{'cisum', 'music'}
{'yrettab', 'battery'}
{'sound', 'dnuos'}
{'aremac', 'camera'}
{'computer'}
{'teg', 'get'}
{'repaid', 'diaper'}
{'eno', 'one'}
{'little', 'elttil'}
{'price', 'ecirp'}
{'ytilauq', 'quality'}
{'problem', 'melborp'}
{'3pm', 'mp3'}
{'enohp', 'phone'}
{'life', 'efil'}
{'need', 'deen'}
{'really', 'yllaer'}
{'works', 'skrow'}
{'retupmoc'}
{'zen', 'nez'}
{'still'}<br>
Iteration 1 Accuracy: 0.96<br>
{'champ', 'repaid', 'diaper', 'pmahc'}
{'yub', 'get', 'buy'}
{'aremac', 'camera', 'product'}
{'computer', 'retupmoc'}
{'serutcip', 'pictures'}
{'need', 'deen'}
{'gnisu', 'using'}
{'serutaef', 'features'}
{'even', 'neve'}
{'time', 'emit'}
{'krow', 'work'}
{'really', 'yllaer'}
{'much', 'hcum'}
{'easy', 'ysae'}
{'doog', 'great'}
{'ekil', 'like'}
{'little', 'elttil'}
{'llits', 'still'}
{'retteb', 'better'}
{'problem', 'melborp'}
{'want', 'tnaw'}
{'software', 'erawtfos'}
{'size', 'ecirp'}
{'would', 'dluow'}
{'yrettab', 'battery'}
{'price'}
{'desu', 'used'}
{'teg'}
{'ytilauq', 'quality'}
{'llew', 'well'}
{'sound', 'dnuos'}
{'dopi', 'ipod'}
{'life', 'efil'}
{'norton', 'notron'}
{'tsrif', 'first'}
{'player', 'reyalp'}
{'also', 'osla'}
{'cisum', 'music'}
{'retuor', 'router'}
{'3pm', 'mp3'}
{'eno', 'one'}
{'enohp', 'phone'}
{'creative', 'evitaerc'}
{'works', 'skrow'}
{'orcim', 'micro'}
{'esu', 'use'}
{'tcudorp'}
{'good', 'taerg'}
{'zen', 'nez'}
{'ezis'}<br>
Iteration 2 Accuracy: 0.88<br>
{'easy', 'ysae', 'well', 'llew'}
{'want', 'need', 'deen', 'tnaw'}
{'orcim', 'micro', 'creative', 'evitaerc'}
{'software', 'erawtfos'}
{'teg', 'get'}
{'good', 'doog', 'great', 'taerg'}
{'serutcip', 'pictures'}
{'gnisu', 'using'}
{'size', 'ezis'}
{'would', 'dluow'}
{'much', 'hcum'}
{'ekil', 'like'}
{'time', 'emit'}
{'esu', 'use'}
{'serutaef', 'features'}
{'little', 'elttil'}
{'even', 'neve'}
{'really', 'yllaer'}
{'price', 'ecirp'}
{'aremac', 'camera'}
{'sound', 'dnuos'}
{'champ', 'pmahc'}
{'krow', 'work'}
{'eno', 'one'}
{'tsrif', 'first'}
{'player', 'reyalp'}
{'dopi', 'ipod'}
{'3pm', 'mp3'}
{'life', 'efil'}
{'norton', 'notron'}
{'desu', 'used'}
{'problem', 'melborp'}
{'llits'}
{'enohp', 'phone'}
{'retteb', 'better'}
{'product'}
{'computer'}
{'also'}
{'yub', 'buy'}
{'yrettab', 'battery'}
{'repaid', 'diaper'}
{'still'}
{'retuor', 'router'}
{'zen', 'nez'}
{'cisum', 'music'}
{'tcudorp'}
{'works', 'skrow'}
{'retupmoc'}
{'osla'}
{'ytilauq', 'quality'}<br>
Iteration 3 Accuracy: 0.92<br>
{'also', 'osla'}
{'want', 'need', 'deen', 'tnaw'}
{'llits', 'still'}
{'orcim', 'micro', 'creative', 'evitaerc'}
{'serutaef', 'features'}
{'krow', 'work'}
{'gnisu', 'using'}
{'computer', 'retupmoc'}
{'even', 'neve'}
{'much', 'hcum'}
{'teg', 'get'}
{'retteb', 'better'}
{'ekil', 'like'}
{'yub', 'buy'}
{'tsrif', 'first'}
{'esu', 'use'}
{'desu', 'used'}
{'little', 'elttil'}
{'player', 'reyalp'}
{'size', 'ezis'}
{'sound', 'dnuos'}
{'llew', 'well'}
{'really', 'yllaer'}
{'good', 'doog'}
{'software', 'erawtfos'}
{'would', 'dluow'}
{'retuor', 'router'}
{'pictures'}
{'eno', 'one'}
{'champ', 'pmahc'}
{'yrettab', 'battery'}
{'cisum', 'music'}
{'dopi', 'ipod'}
{'enohp', 'phone'}
{'ytilauq', 'quality'}
{'time', 'emit'}
{'works', 'skrow'}
{'price', 'ecirp'}
{'life', 'efil'}
{'repaid', 'diaper'}
{'easy', 'ysae'}
{'zen', 'nez'}
{'tcudorp'}
{'problem', 'melborp'}
{'product'}
{'aremac', 'camera'}
{'norton', 'notron'}
{'3pm', 'mp3'}
{'great', 'taerg'}
{'serutcip'}<br>
Iteration 4 Accuracy: 0.96<br>
{'want', 'need', 'deen', 'tnaw'}
{'champ', 'repaid', 'diaper', 'pmahc'}
{'price', 'ecirp'}
{'good', 'doog', 'great', 'taerg'}
{'also', 'serutcip'}
{'ekil', 'like'}
{'time', 'emit'}
{'dopi', 'ipod'}
{'easy', 'ysae', 'well'}
{'little', 'elttil'}
{'orcim', 'micro', 'creative', 'evitaerc'}
{'much', 'hcum'}
{'serutaef', 'features'}
{'software', 'erawtfos'}
{'dnuos', 'pictures'}
{'computer', 'retupmoc'}
{'would', 'dluow'}
{'desu', 'used'}
{'teg', 'get'}
{'esu', 'use'}
{'life', 'efil'}
{'really', 'yllaer'}
{'tsrif', 'first'}
{'norton', 'notron'}
{'llits'}
{'retteb', 'better'}
{'3pm', 'mp3'}
{'player', 'reyalp'}
{'works', 'skrow'}
{'aremac', 'camera'}
{'eno', 'one'}
{'yub', 'buy'}
{'enohp', 'phone'}
{'product'}
{'size', 'ezis'}
{'llew'}
{'ytilauq', 'quality'}
{'even', 'neve'}
{'still'}
{'yrettab', 'battery'}
{'retuor', 'router'}
{'gnisu'}
{'osla'}
{'problem', 'melborp'}
{'using'}
{'krow', 'work'}
{'zen', 'nez'}
{'sound'}
{'cisum', 'music'}
{'tcudorp'}<br>
Iteration 5 Accuracy: 0.86<br>
Best performance:  ('Average accuracy:', 0.916, 'Standard diviation:', 0.04079215610874227)

In [None]:
 # Re-run the 20-epoch / 300-dimension / window_size=1 configuration with
 # stemming and stop-word removal switched on, and print the summary that
 # run_experiment returns.
 # NOTE(review): the original note here was cut off after "had the" --
 # presumably "best performance"; confirm against the earlier experiment cells.
 print(
     "Best performance with preprocessing: ",
     run_experiment(
         corpus_path=corpus_path,
         iterations=5,
         training_epochs=20,
         embedding_dims=300,
         window_size=1,
         learning_rate=0.01,
         cluster_method="agglomerative_complete",
         stemming=True,
         stopwords_removal=True,
         print_feature_vector=True,
     ),
 )

{'elttil', 'really'}
{'micro', 'orcim', 'evitaerc', 'creative'}
{'llits', 'price'}
{'use', 'esu', 'using'}
{'erawtfos', 'software'}
{'pmahc', 'champ', 'diaper', 'repaid'}
{'retupmoc', 'computer'}
{'yllaer', 'still'}
{'player', 'reyalp'}
{'tnaw', 'ekil'}
{'like', 'used'}
{'get', 'yub', 'buy'}
{'skrow', 'works', 'krow', 'work'}
{'problem', 'melborp'}
{'enohp', 'phone'}
{'llew', 'well'}
{'features', 'serutaef'}
{'sound', 'dnuos'}
{'first', 'tsrif'}
{'retteb', 'better'}
{'ipod', 'dopi'}
{'camera', 'aremac'}
{'serutcip', 'pictures'}
{'quality', 'ytilauq', 'doog'}
{'time'}
{'even', 'neve'}
{'osla', 'also'}
{'hcum', 'much'}
{'would', 'dluow'}
{'nez', 'zen'}
{'notron', 'norton'}
{'router', 'retuor'}
{'life', 'efil'}
{'3pm', 'mp3'}
{'product', 'tcudorp'}
{'little', 'one'}
{'deen'}
{'size', 'ezis'}
{'yrettab', 'battery'}
{'want'}
{'ecirp'}
{'ysae', 'easy'}
{'gnisu'}
{'emit'}
{'good', 'great', 'taerg'}
{'eno'}
{'need'}
{'music', 'cisum'}
{'desu'}
{'teg'}
Iteration 1 Accuracy: 0.74
{'pmahc', 'cham

{'elttil', 'really'}
{'micro', 'orcim', 'evitaerc', 'creative'}
{'llits', 'price'}
{'use', 'esu', 'using'}
{'erawtfos', 'software'}
{'pmahc', 'champ', 'diaper', 'repaid'}
{'retupmoc', 'computer'}
{'yllaer', 'still'}
{'player', 'reyalp'}
{'tnaw', 'ekil'}
{'like', 'used'}
{'get', 'yub', 'buy'}
{'skrow', 'works', 'krow', 'work'}
{'problem', 'melborp'}
{'enohp', 'phone'}
{'llew', 'well'}
{'features', 'serutaef'}
{'sound', 'dnuos'}
{'first', 'tsrif'}
{'retteb', 'better'}
{'ipod', 'dopi'}
{'camera', 'aremac'}
{'serutcip', 'pictures'}
{'quality', 'ytilauq', 'doog'}
{'time'}
{'even', 'neve'}
{'osla', 'also'}
{'hcum', 'much'}
{'would', 'dluow'}
{'nez', 'zen'}
{'notron', 'norton'}
{'router', 'retuor'}
{'life', 'efil'}
{'3pm', 'mp3'}
{'product', 'tcudorp'}
{'little', 'one'}
{'deen'}
{'size', 'ezis'}
{'yrettab', 'battery'}
{'want'}
{'ecirp'}
{'ysae', 'easy'}
{'gnisu'}
{'emit'}
{'good', 'great', 'taerg'}
{'eno'}
{'need'}
{'music', 'cisum'}
{'desu'}
{'teg'}<br>
Iteration 1 Accuracy: 0.74<br>
{'pmahc', 'champ', 'diaper', 'repaid'}
{'3pm', 'music', 'cisum', 'mp3'}
{'elttil', 'price', 'ecirp'}
{'osla', 'also'}
{'gnisu', 'use', 'esu'}
{'krow', 'work'}
{'ekil', 'need'}
{'problem', 'melborp'}
{'want', 'deen'}
{'llits', 'still'}
{'retteb', 'better'}
{'great', 'taerg'}
{'features', 'serutaef'}
{'nez', 'zen', 'player'}
{'ipod', 'dopi'}
{'good', 'doog'}
{'skrow', 'works'}
{'reyalp', 'product'}
{'hcum', 'much'}
{'retupmoc', 'computer'}
{'would', 'dluow'}
{'get', 'teg'}
{'life', 'efil'}
{'first', 'tsrif'}
{'evitaerc', 'creative'}
{'size', 'ezis'}
{'using'}
{'serutcip', 'pictures'}
{'yllaer', 'really'}
{'emit', 'time'}
{'ysae', 'easy'}
{'llew', 'well'}
{'enohp', 'phone'}
{'neve'}
{'camera', 'aremac'}
{'eno', 'one'}
{'yub', 'buy'}
{'router', 'retuor'}
{'used', 'desu'}
{'yrettab', 'battery'}
{'tnaw'}
{'even'}
{'tcudorp'}
{'erawtfos', 'software'}
{'sound', 'dnuos'}
{'quality', 'ytilauq'}
{'micro', 'orcim'}
{'notron', 'norton'}
{'like'}
{'little'}<br>
Iteration 2 Accuracy: 0.84<br>
{'hcum', 'also', 'much'}
{'dopi', 'player', 'reyalp'}
{'gnisu', 'retupmoc', 'computer'}
{'llits', 'really'}
{'great', 'llew', 'well', 'taerg'}
{'emit', 'time'}
{'champ', 'product'}
{'price', 'ecirp'}
{'neve', 'ipod'}
{'get', 'teg'}
{'micro', 'orcim', 'evitaerc', 'creative'}
{'features', 'desu'}
{'nez', 'zen'}
{'elttil', 'little'}
{'yub', 'buy'}
{'sound', 'dnuos'}
{'skrow', 'works', 'work'}
{'good', 'doog'}
{'yllaer', 'osla'}
{'want', 'tnaw'}
{'use', 'esu'}
{'serutcip', 'pictures'}
{'camera', 'aremac'}
{'first', 'tsrif'}
{'problem', 'melborp'}
{'still'}
{'retteb', 'better'}
{'would', 'dluow'}
{'3pm', 'mp3'}
{'like', 'ekil'}
{'erawtfos', 'software'}
{'notron', 'norton'}
{'life', 'efil'}
{'size', 'ezis'}
{'quality', 'ytilauq'}
{'even'}
{'pmahc', 'diaper', 'repaid'}
{'eno', 'one'}
{'yrettab', 'battery'}
{'router', 'retuor'}
{'ysae', 'easy'}
{'serutaef'}
{'using'}
{'deen'}
{'enohp', 'phone'}
{'krow'}
{'used'}
{'need'}
{'tcudorp'}
{'music', 'cisum'}<br>
Iteration 3 Accuracy: 0.76<br>
{'pmahc', 'champ', 'diaper', 'repaid'}
{'yub', 'still', 'buy'}
{'want', 'need'}
{'serutcip', 'pictures'}
{'nez', 'zen', 'much'}
{'like', 'hcum'}
{'good', 'great', 'taerg'}
{'features', 'serutaef'}
{'ipod', 'dopi'}
{'get', 'teg'}
{'gnisu', '3pm', 'mp3'}
{'tnaw', 'ekil'}
{'eno', 'one'}
{'product', 'tcudorp'}
{'used', 'desu'}
{'would', 'dluow'}
{'elttil', 'little'}
{'price', 'ecirp'}
{'ysae', 'easy'}
{'micro', 'orcim'}
{'neve', 'really'}
{'problem', 'melborp'}
{'osla', 'also'}
{'emit', 'time'}
{'first', 'tsrif'}
{'erawtfos', 'software'}
{'quality', 'ytilauq'}
{'size', 'ezis'}
{'skrow', 'works', 'work'}
{'llew', 'well'}
{'notron', 'norton'}
{'sound', 'dnuos'}
{'music', 'cisum'}
{'retteb', 'better'}
{'player', 'reyalp'}
{'use', 'esu'}
{'yrettab', 'battery'}
{'retupmoc', 'computer'}
{'evitaerc', 'creative'}
{'enohp', 'phone'}
{'life', 'efil'}
{'llits'}
{'camera', 'aremac'}
{'krow'}
{'even'}
{'router', 'retuor'}
{'doog'}
{'using'}
{'deen'}
{'yllaer'}<br>
Iteration 4 Accuracy: 0.8<br>
{'like', 'ekil'}
{'sound', 'dnuos'}
{'llits', 'elttil'}
{'emit', 'used', 'deen', 'ecirp'}
{'pmahc', 'champ', 'diaper', 'repaid'}
{'still', 'price'}
{'get', 'teg'}
{'good', 'great', 'doog', 'taerg'}
{'even', 'neve'}
{'retteb', 'better'}
{'skrow', 'works', 'work'}
{'product', 'tcudorp'}
{'problem', 'melborp'}
{'erawtfos', 'software'}
{'router', 'retuor'}
{'features', 'serutaef'}
{'osla', 'really', 'also'}
{'enohp', 'phone'}
{'use', 'esu', 'using'}
{'life', 'efil'}
{'want'}
{'retupmoc', 'computer'}
{'serutcip', 'pictures'}
{'yub', 'buy'}
{'eno', 'one'}
{'ysae', 'easy'}
{'would', 'dluow'}
{'llew', 'well'}
{'evitaerc', 'creative'}
{'first', 'tsrif'}
{'camera', 'aremac'}
{'gnisu'}
{'3pm', 'mp3'}
{'quality', 'ytilauq'}
{'little'}
{'time'}
{'player', 'reyalp'}
{'hcum', 'much'}
{'yrettab', 'battery'}
{'desu'}
{'size', 'ezis'}
{'yllaer'}
{'krow'}
{'notron', 'norton'}
{'tnaw'}
{'nez', 'zen'}
{'need'}
{'ipod', 'dopi'}
{'micro', 'orcim'}
{'music', 'cisum'}<br>
Iteration 5 Accuracy: 0.8<br>
Best performance with preprocessing:  ('Average accuracy:', 0.7879999999999999, 'Standard diviation:', 0.03487119154832538)