# Preparing data

In [None]:
import nltk
# Fetch the NLTK resources requested by this notebook: the stop-word list,
# the 'punkt' data (presumably for sentence tokenization) and WordNet.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
from nltk.corpus import wordnet

In [None]:
def get_synonyms(ofword):
  """Return the lemma names of every WordNet synset of `ofword`.

  Names are collected synset by synset in the order WordNet yields them and
  are not de-duplicated, so the same name may appear more than once.
  """
  from nltk.corpus import wordnet

  return [
      lemma.name()
      for synset in wordnet.synsets(ofword)
      for lemma in synset.lemmas()
  ]

In [None]:
def get_antonyms(ofword):
  """Return the first antonym name of each lemma of `ofword` that has one.

  Every lemma of every WordNet synset of `ofword` is inspected; lemmas
  without antonyms are skipped. The result is not de-duplicated.
  """
  from nltk.corpus import wordnet

  first_antonyms = []
  for synset in wordnet.synsets(ofword):
    for lemma in synset.lemmas():
      opposites = lemma.antonyms()
      if opposites:
        # Only the first listed antonym of a lemma is kept.
        first_antonyms.append(opposites[0].name())
  return first_antonyms

In [None]:
# Example: show the WordNet lemma names collected for "active".
synonyms = get_synonyms("active")
print (synonyms)

In [None]:
# Load the seed vague words, one per line.
with open('vagueWords.txt') as vague_word_list_file:
    vague_word_list = vague_word_list_file.read().splitlines()

# Expand every seed word with its lower-cased synonyms, then its antonyms.
vague_word_set = set()
for lookup in (get_synonyms, get_antonyms):
  for vagueword in vague_word_list:
    vague_word_set.update(related.lower() for related in lookup(vagueword))

# Remove unnecessary words (absent ones are simply ignored).
remove_word_list = ['adept']
vague_word_set.difference_update(remove_word_list)

# Add words as-is, without expanding them into synonyms.
addtional_word_list = ['didnt', 'doesnt']
vague_word_set.update(addtional_word_list)

# From here on both names hold the same words as a sorted list.
vague_word_set = sorted(vague_word_set)
vague_word_list = list(vague_word_set)

print('No. of vague words =', len(vague_word_list))

In [None]:
import json

# Every line of reviews.json is parsed as its own JSON document; only the
# text stored under "_source" -> "review" is kept.
with open('reviews.json') as file:
  reviews = [json.loads(line)["_source"]["review"] for line in file]

print("Total No. of reviews =", len(reviews))

# Sentence Tokenization

In [None]:
from nltk.tokenize import sent_tokenize

In [None]:
# De-duplicate the lower-cased sentences found across all reviews.
sentences_set = {
    sentence.lower()
    for review in reviews
    for sentence in sent_tokenize(review)
}
# Sort instead of relying on the set's iteration order: string hashing is
# randomized per interpreter run, so list(set) would hand the later steps a
# different sentence order after every kernel restart.
sentences = sorted(sentences_set)

# Removing stop words

In [None]:
def get_stopwords():
  """Build the stop-word set used to filter review tokens.

  The set is the union of NLTK's English stop words, the lower-cased lines
  of 'stopwords.txt' and a hard-coded list of extra words.
  """
  nltk.download('stopwords')
  from nltk.corpus import stopwords as nltk_stopwords

  stop_set = set(nltk_stopwords.words('english'))

  # One extra stop word per line of the project's own list.
  with open('stopwords.txt') as stop_word_list_file:
    stop_set.update(line.lower() for line in stop_word_list_file.read().splitlines())

  additional_words = [
      "hi", "ok", "am", "would", "i'm", "im", "ill", "cant", "else", "youd",
      "otherwise", "due", "youre", "ive", "havent", "hasnt", "hadnt", "didnt",
      "could", "doesnt", "may", "wouldnt", "dont", "cant", "could", "every",
      "anyone", "say", "isnt", "arent", "also", "cannot", "itll", "lets",
      "youll", "aspacingtopmini", "hello", "theres", "itthe", "shes", "hes",
      "another", "etc",
  ]
  stop_set.update(additional_words)
  return stop_set

In [None]:
import re
from collections import defaultdict
# From here on `reviews` holds the de-duplicated sentences, stripped of every
# character that is neither a word character nor whitespace.
reviews = [re.sub(r'[^\w\s]','',str(item)) for item in sentences]
stopwords = get_stopwords()

# Tokenize on whitespace and drop stop words.
texts = []
for document in reviews:
    kept_words = []
    for word in document.lower().split():
        if word not in stopwords:
            kept_words.append(word)
    texts.append(kept_words)

# Count how often each remaining token occurs over the whole corpus.
frequency = defaultdict(int)
for text in texts:
    for token in text:
         frequency[token] += 1

# Keep only tokens that occur more than once.
texts = [[token for token in text if frequency[token] > 1] for text in texts]


Transforming reviews into an input data text file

In [None]:
def write_inputstxt(inputs):
  """Write tokenized documents to "input.txt", one document per line.

  Args:
    inputs: iterable of token lists; the tokens of each list are joined with
      single spaces to form one line.

  The file is truncated before writing, so running this again does not stack
  duplicate copies of the corpus in the file.
  """
  with open("input.txt", "w", encoding="utf-8") as output_file:
    for tokens in inputs:
      output_file.write(' '.join(tokens) + "\n")
write_inputstxt(texts)

# Choosing the number of topics

In [None]:
# Kept as a string: it is passed as a command-line argument to the clustering
# tool and interpolated into the path of its output directory.
cluster_count = "15"

Running the Brown Clustering algorithm

In [None]:
import subprocess
# check_call raises CalledProcessError on a non-zero exit status, so a failed
# clustering run stops the notebook here instead of surfacing later as a
# missing output file. On success it returns 0, like subprocess.call.
subprocess.check_call(["/static/brown-cluster-master/wcluster", "--text", "/content/input.txt", "--c", cluster_count])

Reading the results of the Brown Clustering algorithm 

In [None]:
import pandas as pd

# NA detection is switched off so that tokens such as "null" or "nan" stay
# ordinary words instead of being parsed as missing values.
df = pd.read_csv(
    f'/content/input-c{cluster_count}-p1.out/paths',
    sep='\t',
    header=None,
    names=['cluster', "word", "frequency"],
    dtype={'cluster': str, 'word': str, "frequency": int},
    index_col=False,
    keep_default_na=False,
    na_filter=False,
)
# Most frequent words first.
df = df.sort_values(["frequency"], ascending=False)

In [None]:
df.head()  # Show a sample of the data (first five rows).

In [None]:
# Unique cluster ids, followed by how many there are.
unique_clusters = df.cluster.unique()
print(unique_clusters, len(unique_clusters))

Calculating the word frequencies in a topic

In [None]:
def get_words(cluster, df):
  """Map each word of one cluster to its frequency.

  Args:
    cluster: value to match against the frame's "cluster" column.
    df: frame with "cluster", "word" and "frequency" columns.

  Returns:
    dict of word -> frequency for the matching rows, in row order.
  """
  in_cluster = df["cluster"] == cluster
  return df.loc[in_cluster].set_index("word")["frequency"].to_dict()

In [None]:
print(get_words("000", df).items())  # Example: (word, frequency) pairs of cluster "000".

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import json
import math
import re # We clean text using regex
import csv # To read the csv
from collections import defaultdict # For accumlating values
import matplotlib.colors as mcolors
from collections import Counter

# Start with loading all necessary libraries
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# % matplotlib inline

# Word cloud

In [None]:
import matplotlib.pyplot as plt
# One word cloud per cluster, with words sized by their frequency.
for cluster_number, cluster_id in enumerate(df.cluster.unique(), start=1):
    plt.figure()
    cloud = WordCloud(max_font_size=50, max_words=100, background_color="white")
    cloud_image = cloud.fit_words(get_words(cluster_id, df))
    plt.imshow(cloud_image, interpolation="bilinear")
    plt.axis("off")
    plt.title("Cluster #" + str(cluster_number))
    plt.show()


Preparing pandas dataframe for processing

In [None]:
def get_topics(df, top_n=20):
  """Collect the leading words of every cluster as a list of topics.

  Args:
    df: frame with "cluster", "word" and "frequency" columns.
    top_n: maximum number of (word, frequency) pairs kept per cluster;
      defaults to 20.

  Returns:
    A list of (topic_index, pairs) tuples, one per unique cluster in order of
    first appearance in `df`. `pairs` holds the first `top_n`
    (word, frequency) pairs of that cluster in the frame's row order.
  """
  return [
      (topic_index, list(get_words(cluster, df).items())[:top_n])
      for topic_index, cluster in enumerate(df.cluster.unique())
  ]

topics = get_topics(df)

# Calculating the Vagueness degree

In [None]:
def sum_of_frequency_of_words_in_topic(topic_index, topics):
  """Add up the frequencies of all words listed for one topic.

  `topics[topic_index][1]` is read as a sequence of (word, frequency)
  entries; an empty sequence gives 0.
  """
  return sum(entry[1] for entry in topics[topic_index][1])

def sum_of_frequency_of_vague_words_in_topic(vague_word_list, topic_index, topics):
  """Add up the frequencies of a topic's words that are in `vague_word_list`.

  `topics[topic_index][1]` is read as a sequence of (word, frequency)
  entries; entries whose word is not in `vague_word_list` are ignored.
  """
  return sum(
      entry[1]
      for entry in topics[topic_index][1]
      if entry[0] in vague_word_list
  )

def vagueness_degree(vague_word_list):
  """Print, for every topic, its vague and total frequency counts and ratio.

  Reads the module-level `topics`. The ratio is printed as a percentage
  rounded to two decimals; topics at or above 10% get a trailing "vague".
  """
  for i, _ in enumerate(topics):
    total_count = sum_of_frequency_of_words_in_topic(i, topics)
    vague_count = sum_of_frequency_of_vague_words_in_topic(vague_word_list, i, topics)
    percentage = round(vague_count / total_count * 10000) / 100
    percentage_str = "percentage: " + str(percentage) + "%"

    fields = [
        "topic",
        f'{i:<2}',
        f'{"vague: " + str(vague_count):<15}',
        f'{"all: " + str(total_count):<15}',
        f'{percentage_str:<20}',
    ]
    if percentage >= 10.00:
      fields.append("vague")
    print(*fields)


vagueness_degree(vague_word_list)


In [None]:
def plot_topic_percentage(topics):
  """Draw a bar chart of the vagueness percentage of every topic.

  Topics are numbered from 1 on the x axis. The percentage is the share of a
  topic's summed word frequencies that belongs to words of the module-level
  `vague_word_list`, rounded to two decimals.
  """
  topic_numbers = list(range(1, len(topics) + 1))
  percentages = [
      round(
          sum_of_frequency_of_vague_words_in_topic(vague_word_list, i, topics)
          / sum_of_frequency_of_words_in_topic(i, topics)
          * 100,
          2,
      )
      for i in range(len(topics))
  ]

  plt.yticks(np.arange(0, 100, 10))
  plt.xticks(np.arange(1, len(topics) + 1, 1))
  plt.bar(topic_numbers, percentages)
  plt.ylabel('Vagueness Percentage')
  plt.xlabel('Topics')
  plt.show()
plot_topic_percentage(topics)

# Words in vague topics

In [None]:
def print_vague_topic_words(vague_word_list):
  """Print the words and frequencies of every topic that counts as vague.

  Reads the module-level `topics`. A topic counts as vague when words from
  `vague_word_list` make up at least 10% of its summed word frequencies.
  """
  for i, topic in enumerate(topics):
    total_count = sum_of_frequency_of_words_in_topic(i, topics)
    vague_count = sum_of_frequency_of_vague_words_in_topic(vague_word_list, i, topics)
    percentage = round(vague_count / total_count * 10000) / 100
    if percentage < 10.00:
      continue

    print("\ntopic", i)
    for entry in topic[1]:
      print("             " + f'{entry[0]:<14}', entry[1])
print_vague_topic_words(vague_word_list)

Bar plot of top 20 words with their weights

In [None]:
from matplotlib.pyplot import figure

def plot_words_in_topic(topics):
  """Draw one bar chart per topic showing the frequency of each listed word.

  Each topic gets its own 20x6 figure; words label the x axis in the order
  they are stored and topics are numbered from 1.
  """
  for i, topic in enumerate(topics):
    figure(figsize=(20, 6), dpi=80)

    words = [entry[0] for entry in topic[1]]
    frequencies = [entry[1] for entry in topic[1]]
    positions = list(range(1, len(words) + 1))

    plt.xticks(positions, words)
    plt.bar(positions, frequencies, width=.2)
    plt.ylabel('Word frequency')
    plt.xlabel('Topic #' + str(i + 1))
    plt.show()
    # Two blank lines separate consecutive charts in the output.
    print()
    print()
plot_words_in_topic(topics)

# Dendrogram

In [None]:
def cluster_labels(df):
  """Return the first row of every cluster, in the frame's current row order.

  Args:
    df: frame with a "cluster" column.

  Returns:
    A new frame holding, for each cluster, the first row in which it appears.
    It carries an extra `row_number` column (1-based position of the row
    within its cluster), which is 1 on every returned row.

  The column is added to a copy, so the caller's frame is left unchanged.
  """
  numbered = df.copy()
  numbered['row_number'] = df.groupby(['cluster']).cumcount() + 1
  return numbered[numbered['row_number'] == 1]
cluster_labels(df)

In [None]:
class TreeNode:
    """Binary-tree node holding a key, a value and links to its neighbours."""

    def __init__(self, key="", val="", left=None, right=None, parent=None):
        # key/val default to empty strings; all three links default to None.
        self.key, self.val = key, val
        self.right, self.left = right, left
        self.parent = parent

    def __repr__(self):
        return 'k: {}, v: {}'.format(self.key, self.val)

def get_items(df):
  """Map each value of the "cluster" column to the "word" on the same row.

  When a cluster appears on several rows, the word of its last row wins.
  """
  return dict(zip(df['cluster'], df['word']))

# cluster -> word taken from the first row of each cluster.
items = get_items(cluster_labels(df))
# Root of the tree that the cluster keys are inserted into.
root = TreeNode("root")


def get_TreeNode(tree_node, key, value):
    """Walk from `tree_node` along `key`, creating nodes as needed, and store `value`.

    Each character of `key` selects a branch: "0" descends to the right
    child, any other character to the left child. A missing child is created
    with the key prefix consumed so far and linked back to its parent.

    Returns:
        The node reached after the last character, with its `val` set to
        `value` and its `key` set to the full `key`.
    """
    node = tree_node
    for depth, bit in enumerate(key, start=1):
        side = "right" if bit == "0" else "left"
        child = getattr(node, side)
        if child is None:
            child = TreeNode(key=key[:depth], parent=node)
            setattr(node, side, child)
        node = child
    node.val = value
    node.key = key
    return node


# Insert every cluster key into the tree, storing its word on the final node.
for key, value in items.items():
    get_TreeNode(root, key, value)


def parent_list_graphviz_str(parents):
    """Render a node chain as a Graphviz edge path, last list element first.

    Nodes with an empty `val` are labelled "key"; the others "key:val". The
    quoted labels are joined with ' -> '.
    """
    labels = [
        f'"{node.key}"' if node.val == "" else f'"{node.key}:{node.val}"'
        for node in reversed(parents)
    ]
    return ' -> '.join(labels)


def get_parent_list(current):
    """Return `current` followed by its ancestors, nearest parent first.

    The chain ends at the first node whose `parent` is None.
    """
    chain = [current]
    ancestor = current.parent
    while ancestor is not None:
        chain.append(ancestor)
        ancestor = ancestor.parent
    return chain

# Graphviz labels grouped by rank: key 999 collects the leaf labels, other
# keys (added on demand) are the key lengths of non-leaf nodes.
ranks = {999: []}


def append_to_ranks(ranks, current):
    """File the Graphviz label of `current` under its rank in `ranks`.

    The node keyed "root" is skipped. A node without children is appended to
    `ranks[999]` as "key:val"; any other node is appended as "key" under the
    length of its key, creating that list on first use.
    """
    if current.key == "root":
        return

    is_leaf = current.left is None and current.right is None
    if is_leaf:
        ranks[999].append(f'"{current.key}:{current.val}"')
        return

    ranks.setdefault(len(current.key), []).append(f'"{current.key}"')


graphviz_source = ['strict digraph { node [shape=box]']
graphviz_source.append('graph [splines=ortho]')

# Depth-first walk over the tree: record each node's rank and, for every node
# that carries a value, emit the edge path from the root down to it.
stack = [root]
while stack:
    current = stack.pop()
    append_to_ranks(ranks, current)

    if current.val != "":
        parents = get_parent_list(current)
        graphviz_source.append(parent_list_graphviz_str(parents))

    if current.right is not None:
        stack.append(current.right)

    if current.left is not None:
        stack.append(current.left)

# One "rank = same" group per rank key, in ascending key order.
for value in sorted(ranks.keys()):
  graphviz_source.append('{rank = same; ' + "; ".join(ranks[value]) + '}')

# Close the digraph before printing so the printed text is the complete source.
graphviz_source.append("}")

# The loop variable must not be called `str`: at notebook level that would
# rebind the builtin and break every later (or re-run) str(...) call.
for source_line in graphviz_source:
    print(source_line)

In [None]:
from graphviz import Source

In [None]:
# Join the collected lines into one Graphviz source text; the bare `src`
# on the last line is there so the notebook displays the object.
src = Source('\n'.join(graphviz_source))
src