In [1]:
import nltk
import re
import numpy as np
from sklearn.cluster import KMeans
import string
import os
from nltk.tokenize import *
from sklearn.feature_extraction.text import CountVectorizer
from nltk import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Collect the body of every "review/text:" record in foods.txt into reviews[].
# The body is everything after the FIRST colon of the line, so reviews that
# themselves contain ':' (times, URLs, "Note: ...") are kept whole rather than
# being cut off at their first internal colon.
REVIEW_PREFIX = "review/text:"

reviews = []
# errors='replace' substitutes U+FFFD for undecodable bytes instead of raising.
# Iterating the handle streams the file line by line (no full in-memory copy),
# and `with` closes it even if reading fails part-way through.
with open('foods.txt','r', encoding = 'utf-8', errors = 'replace') as file:
    for line in file:
        if line.startswith(REVIEW_PREFIX):
            # Keep the text after the prefix and drop the line terminator.
            reviews.append(line[len(REVIEW_PREFIX):].replace('\n', ''))

In [3]:
# Normalise every review in place: lower-case it, turn HTML line breaks into
# spaces, collapse each run of non-letter characters (digits, punctuation,
# whitespace) into a single space, and trim the ends.
pattern = re.compile('[^a-z]+')   # compiled once instead of once per review
for i, raw_review in enumerate(reviews):
    cleaned = raw_review.lower()
    # Must run before the non-letter pass; otherwise the tag would leave the
    # stray word "br" behind.
    cleaned = re.sub('<br />', ' ', cleaned)
    # Digits are non-letters too, so this single pass also removes them.
    cleaned = pattern.sub(' ', cleaned)
    # Strip LAST: the substitutions above turn leading/trailing punctuation
    # into spaces, which an earlier strip() would leave in place.
    reviews[i] = cleaned.strip()

In [4]:
# Fit a bag-of-words model on the cleaned reviews; the vocabulary it learns
# (the distinct tokens the vectorizer extracts) is kept as a plain list in L.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(reviews)
L = [*vectorizer.get_feature_names_out()]


In [5]:
# Load the custom stop-word list from longstopwords.txt, one word per line.
# Each line is stripped of surrounding whitespace (a trailing blank would
# otherwise keep the entry from ever matching a token) and empty lines are
# skipped so that '' never ends up in the list.
# `with` closes the file even if reading raises.
with open('longstopwords.txt','r') as file2:
    long_stopword_list = [word for word in (line.strip() for line in file2) if word]

In [6]:
# Re-fit the bag-of-words model, this time telling the vectorizer to discard
# every entry of long_stopword_list; W holds the resulting vocabulary.
vectorizer = CountVectorizer(stop_words = long_stopword_list)
x = vectorizer.fit_transform(reviews)
W = [*vectorizer.get_feature_names_out()]




In [7]:
# Concatenate all cleaned reviews into one space-separated string, split it
# into tokens with NLTK, and wrap the token list in an nltk.Text.
# (Stop words are still present at this point.)
tokens = word_tokenize(" ".join(reviews))
text = nltk.Text(tokens)

In [8]:
# Drop stop words from the token stream.
# Membership is tested against a set: `text` spans every review, so scanning
# the stop-word list linearly for each token would dominate the runtime.
stopword_set = set(long_stopword_list)
text = [w for w in text if w not in stopword_set]

In [9]:
# Count how often each remaining token occurs, then keep the 500 most
# frequent as (word, count) pairs in top500.

dist = FreqDist(text)
top500 = dist.most_common(500)

In [10]:
# Keep just the word from every (word, count) pair, preserving the order of
# top500.
top500words = [word for word, _count in top500]

In [11]:
# Turn every review into a TF-IDF vector over the fixed vocabulary
# top500words; V has one row per review.
# NOTE(review): the later lookup top500words[j] relies on column j of V
# corresponding to top500words[j] — confirm against the vectorizer's
# `vocabulary` semantics.
vectorizer = TfidfVectorizer(vocabulary = top500words)
V = vectorizer.fit_transform(reviews)

In [12]:
# Cluster the TF-IDF review vectors into 10 groups with k-means, capped at
# 20 iterations per run.
# NOTE: this cell is slow.
# random_state pins the centroid initialisation so the clusters (and the
# words derived from them afterwards) are identical on every run; without it
# each fresh execution produces a different clustering.
kmeans = KMeans(n_clusters = 10, max_iter = 20, random_state = 0).fit(V)

In [13]:
# For each cluster centre, rank the vocabulary columns from largest to
# smallest weight: row k of `centroid` holds the column indices of cluster k's
# centre in descending order of value.
centroid = np.argsort(kmeans.cluster_centers_)[:, ::-1]

In [14]:
# For each of the 10 clusters, take its five highest-ranked column indices and
# pair the corresponding word with its overall count from top500. The result
# is one flat list: five (word, count) tuples per cluster, cluster by cluster.
wordss = [
    (top500words[column], top500[column][1])
    for cluster in range(10)
    for column in centroid[cluster, :5]
]

In [15]:
# Write the 500 most frequent (word, count) pairs to top500.txt, one pair per
# line, using each tuple's str() form.
# `with` guarantees the file is flushed and closed even if a write fails.
with open("top500.txt", "w") as file3:
    for element in top500:
        file3.write(str(element) + "\n")

In [16]:
# Write the representative words of each cluster to clusterWords.txt.
# wordss holds five (word, count) tuples per cluster in order, so a newline
# after every fifth entry puts one cluster on each line.
# `with` guarantees the file is flushed and closed even if a write fails.
with open("clusterWords.txt", "w") as file4:
    for position, element in enumerate(wordss, start=1):
        file4.write(str(element) + " ")
        if position % 5 == 0:
            file4.write("\n")