In [18]:
import os
import copy
import string
import time
import random
import numpy as np
import gensim

from sklearn.cluster import KMeans
import nltk
nltk.download('words')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')


from gensim.models import Word2Vec
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet, words
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer
from nltk.metrics.distance import jaccard_distance, edit_distance
from nltk.tokenize import TweetTokenizer, word_tokenize

from google.colab import drive
drive.mount('/content/drive')


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
file_path = "drive/MyDrive/34711-Cwk-S-DeepLearning_Minjun/product_reviews/product_reviews"
files = os.listdir(file_path)

In [21]:


translational_table = str.maketrans(
    "",
    "",
    (string.punctuation) + "§―•\t←→",
)

correct_words = words.words()
stop_words = set(stopwords.words("english"))

wordnet_lemmatizer = WordNetLemmatizer()
# tweet_tokenizer = TweetTokenizer()


def pos_tagger(nltk_tag):
    """
    Take a POS Tag, and return a wordnet equivalent tag to use for lemmatization
    """
    if nltk_tag == None:
        return None

    if nltk_tag.startswith("J"):
        return wordnet.ADJ
    elif nltk_tag.startswith("V"):
        return wordnet.VERB
    elif nltk_tag.startswith("N"):
        return wordnet.NOUN
    elif nltk_tag.startswith("R"):
        return wordnet.ADV
    else:
        return None


final_processed_reviews = []
processed_words = []
for filename in files:
    with open(file_path + "/" + filename, "r", encoding="utf-8-sig") as file:
        raw_text = file.read()

    raw_lines = [
        line for line in raw_text.lower().replace("/", " ").split("\n") if line != ""
    ]

    reviews = []
    t_separate = []
    for i in range(len(raw_lines)):
        sentence = raw_lines[i].split("##")

        if len(sentence) < 2:
            reviews.append(copy.deepcopy(t_separate))
            t_separate = []
        else:
            t_separate.append(sentence[1])

        if i == len(raw_lines) - 1:
            reviews.append(copy.deepcopy(t_separate))

    reviews = [review for review in reviews if len(review) > 0]

    for review in reviews:
        processed_lines = []
        for line in review:
            tokens_with_punctuations = word_tokenize(line)

            ############ POS TAGGING AND LEMMATIZATION ############
            tokens_tags = pos_tag(tokens_with_punctuations)

            # Preparing to lemmatize.
            # Changing from POS Tags to WordNet Tags
            wordnet_tags = [(x[0], pos_tagger(x[1])) for x in tokens_tags]

            # Lemmatize with WordNet Lemmatizer
            lemmatized_tokens = [
                "" if tag is None else wordnet_lemmatizer.lemmatize(word, tag)
                for word, tag in wordnet_tags
            ]
            ####################################################################

            # Removing stopwords
            lemmatized_uni_tokens_without_sw = [
                word for word in lemmatized_tokens if not word in stop_words
            ]

            lemmatized_without_punct = [
                word.translate(translational_table)
                for word in lemmatized_uni_tokens_without_sw
                if word != ""
                and word != "'s"
                and word != "'m"
                and word != "'re"
                and word != "'ve"
                and word != "n't"
            ]

            lemmatized_without_punct = [
                token
                for token in lemmatized_without_punct
                if token != "" and token.isnumeric() == False
            ]

            processed_lines += lemmatized_without_punct
        processed_words += processed_lines
        final_processed_reviews += [processed_lines]

dic = {}
for word in processed_words:
    if word in dic:
        dic[word] += 1
    else:
        dic[word] = 1

# Sorting
sorted_words_counts = sorted(
    [[key, value] for key, value in dic.items()], key=lambda val: val[1], reverse=True
)

# Acquiring top 50 words and reversed 50 words
top_50_words_counts = []
i = 0
while len(top_50_words_counts) < 50:
    if "".join(reversed(sorted_words_counts[i][0])) != sorted_words_counts[i][0]:
        top_50_words_counts.append(
            (sorted_words_counts[i][0], sorted_words_counts[i][1])
        )
    i += 1

top_50_words = [item[0] for item in top_50_words_counts]
reverse_top_50_words = ["".join(reversed(word)) for word in top_50_words]

# Dictionary for top 50 words
# Tracking counts with ->     word:count
top_50_words_counts = {item[0]: item[1] for item in top_50_words_counts}

replace_list = []
for word in top_50_words:
    li = list(range(top_50_words_counts[word]))
    random.shuffle(li)
    replace_list.append(li[: top_50_words_counts[word] // 2])
print(len(replace_list[49]))

replaced_processed_reviews = copy.deepcopy(final_processed_reviews)

for target in range(len(top_50_words)):
    count = 0
    for r in range(len(replaced_processed_reviews)):
        for s in range(len(replaced_processed_reviews[r])):
            if replaced_processed_reviews[r][s] == top_50_words[target]:
                if count in replace_list[target]:
                    replaced_processed_reviews[r][s] = reverse_top_50_words[target]
                count += 1

# print(replaced_processed_reviews)

# print("***" * 100)
# print(top_50_words)

sg_model = Word2Vec(
    replaced_processed_reviews, min_count=1, vector_size=100, window=5, sg=1
)
# print(sg_model.wv["esu"])
# print(sg_model.wv["use"])

feature_vecs = []

one_hundred_words = top_50_words + reverse_top_50_words

for word in one_hundred_words:
    feature_vec = sg_model.wv.get_vector(word)
    feature_vecs.append(feature_vec)

percentages = []
for i in range(10):
    kmeans = KMeans(n_clusters=50, n_init='auto').fit(feature_vecs)

    labels = kmeans.labels_
    print(labels)

    # print(labels)
    p = 0
    for i in range(50):
        if labels[i] == labels[i + 50]:
            p += 1
    p = p / 50 * 100
    # print("correct pair",p,"percentage")
    percentages.append(p)
percentages = np.array(percentages)
mean = np.mean(percentages)
print(mean)
std = np.std(percentages)
print(std)


47


TypeError: ignored