<a href="https://colab.research.google.com/github/JonasVerbickas/test-jupyter/blob/main/NLP_CW2_Task_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
import torch
import os
import re
import collections
import random
from gensim.models import Word2Vec

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
porter = nltk.PorterStemmer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
PATH_TO_REVIEWS = "/content/drive/MyDrive/Colab Notebooks/product_reviews"

In [None]:
file_contents = ""
for filename in os.listdir(PATH_TO_REVIEWS):
  # skip readme
  if filename == 'README.txt':
    continue
  # append contents of other files to file_contents string
  starting_corpus_size = len(file_contents)
  with open(os.path.join(PATH_TO_REVIEWS, filename)) as f:
    file_contents += f.read()
  print("After appending", filename, "corpus sized increased to", starting_corpus_size, "->", len(file_contents))

After appending Nokia_6600.txt corpus sized increased to 0 -> 56093
After appending norton.txt corpus sized increased to 56093 -> 95013
After appending Linksys_Router.txt corpus sized increased to 95013 -> 151947
After appending MicroMP3.txt corpus sized increased to 151947 -> 259727
After appending Diaper_Champ.txt corpus sized increased to 259727 -> 294831
After appending Hitachi_router.txt corpus sized increased to 294831 -> 325078
After appending Canon_S100.txt corpus sized increased to 325078 -> 353887
After appending Canon_PowerShot_SD500.txt corpus sized increased to 353887 -> 378520
After appending ipod.txt corpus sized increased to 378520 -> 436566


In [None]:
list_of_sentences = nltk.tokenize.sent_tokenize(file_contents)
list_of_sentences[:5]

["[t]\nphone[+3][u]##I've had this beauty for nearly 2 months now and I truely love it.",
 'battery life[-2]##The only disappointment so far has been battery life.',
 "battery life[-2][u]##Mine generally requires a charege every 48 hours or so and I don't really talk on it that much.",
 '##Why is this phone so great?',
 '##Simple.']

## 1. Preprocessing


In [None]:
def textPreprocessing(list_of_sentences):
  output = []
  for text_as_string in list_of_sentences:
    processed_sentence = []
    case_folded = text_as_string.casefold()
    tokenized = nltk.tokenize.word_tokenize(case_folded)
    for token in tokenized:
      if token in nltk.corpus.stopwords.words('english'):
        continue
      # if token doesn't contain alphanumeric characters
      if re.match('^\W+$', token):
        continue
      # if token is encodes semantic information
      if re.match('^[+-]\d$', token):
        continue
      stemmed = porter.stem(token)
      processed_sentence.append(stemmed)
    output.append(processed_sentence)
  return output

In [None]:
tokenized_and_preprocessed = textPreprocessing(list_of_sentences)
tokenized_and_preprocessed[:5]

[['phone', 'u', "'ve", 'beauti', 'nearli', '2', 'month', 'trueli', 'love'],
 ['batteri', 'life', 'disappoint', 'far', 'batteri', 'life'],
 ['batteri',
  'life',
  'u',
  'mine',
  'gener',
  'requir',
  'chareg',
  'everi',
  '48',
  'hour',
  "n't",
  'realli',
  'talk',
  'much'],
 ['phone', 'great'],
 ['simpl']]

## 2. Pseudowords

In [None]:
def getTop50Words(list_of_tokenized_sentences):
  # unwrap the sublists
  list_of_all_tokens = [token for sent in list_of_tokenized_sentences for token in sent]
  top_50_tokens = collections.Counter(list_of_all_tokens).most_common()[:50]
  # top_50_tokens has a format of ('word', number_of_occurances)
  # in our case we only need to check if tokens match and don't need the number of occurances
  top_50_without_freq = [t[0] for t in top_50_tokens]
  return top_50_without_freq

In [None]:
def replaceTopWordsWithPseudo(list_of_tokenized_sentences, top_50_without_freq):
  output = []
  for list_of_tokenized_words in list_of_tokenized_sentences:
    pseudoworded_sentence = []
    for token in list_of_tokenized_words:
      if token not in top_50_without_freq:
        continue
      # reverse 50% of occurances
      if random.uniform(0, 1) > 0.5:
        pseudoworded_sentence.append(token[::-1])
      else:
        pseudoworded_sentence.append(token)
    if len(pseudoworded_sentence) > 0:
      output.append(pseudoworded_sentence)
  return output
      

In [None]:
top_50_words = getTop50Words(tokenized_and_preprocessed)
top_50_words

['use',
 "n't",
 "'s",
 'phone',
 'u',
 'router',
 'one',
 'get',
 'ipod',
 'camera',
 'player',
 'work',
 'batteri',
 'diaper',
 'like',
 'product',
 'great',
 'time',
 'featur',
 'problem',
 'good',
 'would',
 'look',
 'zen',
 'qualiti',
 "'ve",
 'instal',
 'also',
 'sound',
 'take',
 'need',
 'softwar',
 'comput',
 'pictur',
 'want',
 'realli',
 'micro',
 'go',
 'well',
 'even',
 'thing',
 'easi',
 'buy',
 "'m",
 'creativ',
 'first',
 'review',
 'make',
 'much',
 'bag']

In [None]:
half_replaced_with_pseudo = replaceTopWordsWithPseudo(tokenized_and_preprocessed,
                                                      top_50_words)
half_replaced_with_pseudo[:5]

[['enohp', 'u', "'ve"],
 ['batteri', 'batteri'],
 ['batteri', 'u', "n't", 'illaer', 'much'],
 ['phone', 'taerg'],
 ['phone']]

## 3. D-dimensional vector to encode top 50

In [120]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=2)

In [121]:
w2v_model.build_vocab(half_replaced_with_pseudo, progress_per=10000)

In [122]:
w2v_model.train(half_replaced_with_pseudo, total_examples=w2v_model.corpus_count, epochs=1000, report_delay=1)

(830583, 10296000)

In [123]:
# prepare model for inference
w2v_model.init_sims(replace=True)

In [124]:
w2v_model.wv.most_similar(positive=["batteri"])

[('irettab', 0.9293215274810791),
 ("s'", 0.8212628364562988),
 ('og', 0.81708824634552),
 ('one', 0.7709792852401733),
 ('doog', 0.769716739654541),
 ('reyalp', 0.7568912506103516),
 ("m'", 0.7543979287147522),
 ('rutaef', 0.750193178653717),
 ('dluow', 0.746510922908783),
 ('much', 0.7316174507141113)]

## 4. Apply clustering

In [125]:
class ClustererWrapper:
  def __init__(self, cluster_fn, classify_fn):
    self.cluster_fn = cluster_fn
    self.classify_fn = classify_fn

In [126]:
def testClusteringAlgo(matrix, clusterer):
  clusterer.cluster_fn(matrix)
  correct_clusterings = 0
  incorrect_clusterings = 0
  for word in top_50_words:
    pseudoword = word[::-1]
    word_vector = w2v_model.wv[word]
    pseudoword_vector = w2v_model.wv[pseudoword] 
    word_class = clusterer.classify_fn(word_vector)
    pseudoword_class = clusterer.classify_fn(pseudoword_vector)
    if word_class == pseudoword_class:
      correct_clusterings += 1
    else:
      print("Incorrectly classified:")
      incorrect_clusterings += 1
    
  print("correct_clusterings", correct_clusterings)
  print("incorrect_clusterings", incorrect_clusterings)

In [127]:
matrix = [] 
for word in top_50_words:
  matrix.append(w2v_model.wv[word])
  matrix.append(w2v_model.wv[word[::-1]])

## 5. Check whether word and its corresponding pseudoword are grouped together

In [129]:
clusterer = nltk.cluster.KMeansClusterer(2, nltk.cluster.euclidean_distance, avoid_empty_clusters=True)
wrapped_clusterer = ClustererWrapper(cluster_fn=lambda matrix: clusterer.cluster(matrix, True),
                                     classify_fn=lambda word: clusterer.classify(word))
testClusteringAlgo(matrix, wrapped_clusterer)

correct_clusterings 47
incorrect_clusterings 3
