# Document clustering and topic modeling

## Set up environment

In [1]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [6]:
file = drive.CreateFile({'id':'192JMR7SIqoa14vrs7Z9BXO3iK89pimJL'})
file.GetContentFile('data.tsv')   

# Load data

In [7]:
import numpy as np
import pandas as pd
import nltk
import gensim

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
# Load data into dataframe
df = pd.read_csv('data.tsv', sep='\t', header=0, error_bad_lines=False)

b'Skipping line 8704: expected 15 fields, saw 22\nSkipping line 16933: expected 15 fields, saw 22\nSkipping line 23726: expected 15 fields, saw 22\n'
b'Skipping line 85637: expected 15 fields, saw 22\n'
b'Skipping line 132136: expected 15 fields, saw 22\nSkipping line 158070: expected 15 fields, saw 22\nSkipping line 166007: expected 15 fields, saw 22\nSkipping line 171877: expected 15 fields, saw 22\nSkipping line 177756: expected 15 fields, saw 22\nSkipping line 181773: expected 15 fields, saw 22\nSkipping line 191085: expected 15 fields, saw 22\nSkipping line 196273: expected 15 fields, saw 22\nSkipping line 196331: expected 15 fields, saw 22\n'
b'Skipping line 197000: expected 15 fields, saw 22\nSkipping line 197011: expected 15 fields, saw 22\nSkipping line 197432: expected 15 fields, saw 22\nSkipping line 208016: expected 15 fields, saw 22\nSkipping line 214110: expected 15 fields, saw 22\nSkipping line 244328: expected 15 fields, saw 22\nSkipping line 248519: expected 15 fields,

In [9]:
df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,3653882,R3O9SGZBVQBV76,B00FALQ1ZC,937001370,"Invicta Women's 15150 ""Angel"" 18k Yellow Gold ...",Watches,5,0,0,N,Y,Five Stars,Absolutely love this watch! Get compliments al...,2015-08-31
1,US,14661224,RKH8BNC3L5DLF,B00D3RGO20,484010722,Kenneth Cole New York Women's KC4944 Automatic...,Watches,5,0,0,N,Y,I love thiswatch it keeps time wonderfully,I love this watch it keeps time wonderfully.,2015-08-31
2,US,27324930,R2HLE8WKZSU3NL,B00DKYC7TK,361166390,Ritche 22mm Black Stainless Steel Bracelet Wat...,Watches,2,1,1,N,Y,Two Stars,Scratches,2015-08-31
3,US,7211452,R31U3UH5AZ42LL,B000EQS1JW,958035625,Citizen Men's BM8180-03E Eco-Drive Stainless S...,Watches,5,0,0,N,Y,Five Stars,"It works well on me. However, I found cheaper ...",2015-08-31
4,US,12733322,R2SV659OUJ945Y,B00A6GFD7S,765328221,Orient ER27009B Men's Symphony Automatic Stain...,Watches,4,0,0,N,Y,"Beautiful face, but cheap sounding links",Beautiful watch face. The band looks nice all...,2015-08-31


In [10]:
# Remove missing value
df.dropna(subset=['review_body'],inplace=True)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 960056 entries, 0 to 960203
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   marketplace        960056 non-null  object
 1   customer_id        960056 non-null  int64 
 2   review_id          960056 non-null  object
 3   product_id         960056 non-null  object
 4   product_parent     960056 non-null  int64 
 5   product_title      960054 non-null  object
 6   product_category   960056 non-null  object
 7   star_rating        960056 non-null  int64 
 8   helpful_votes      960056 non-null  int64 
 9   total_votes        960056 non-null  int64 
 10  vine               960056 non-null  object
 11  verified_purchase  960056 non-null  object
 12  review_headline    960049 non-null  object
 13  review_body        960056 non-null  object
 14  review_date        960052 non-null  object
dtypes: int64(5), object(10)
memory usage: 117.2+ MB


In [15]:
# substract review_body as trainning set
data = df.loc[:, 'review_body'].tolist()

# Tokenizing and stemming

In [17]:
# Use nltk's English stopwords.
stopwords = nltk.corpus.stopwords.words('english')
stopwords.append("'s")
stopwords.append("'m")
stopwords.append("n't")
stopwords.append("br")

print ("We use " + str(len(stopwords)) + " stop-words from nltk library.")
print (stopwords[:20])

We use 183 stop-words from nltk library.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


In [19]:
from nltk.stem.snowball import SnowballStemmer
import re

stemmer = SnowballStemmer("english")

# tokenization and stemming
def tokenization_and_stemming(text):
    tokens = []
    # exclude stop words and tokenize the document, generate a list of string 
    for word in nltk.word_tokenize(text):
        if word.lower() not in stopwords:
            tokens.append(word.lower())

    filtered_tokens = []
    
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
            
    # stemming
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [20]:
tokenization_and_stemming(data[0])

['absolut',
 'love',
 'watch',
 'get',
 'compliment',
 'almost',
 'everi',
 'time',
 'wear',
 'dainti']

# TF-IDF

In [22]:
tfidf_model = TfidfVectorizer(max_df=0.99, max_features=1000,
                                 min_df=0.01, stop_words='english',
                                 use_idf=True, tokenizer=tokenization_and_stemming, ngram_range=(1,2))

tfidf_matrix = tfidf_model.fit_transform(data) #fit the vectorizer to synopses

print ("In total, there are " + str(tfidf_matrix.shape[0]) + \
      " reviews and " + str(tfidf_matrix.shape[1]) + " terms.")

In total, there are 960056 reviews and 411 terms.


In [23]:
# check the parameters
tfidf_model.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 0.99,
 'max_features': 1000,
 'min_df': 0.01,
 'ngram_range': (1, 2),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': 'english',
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': <function __main__.tokenization_and_stemming>,
 'use_idf': True,
 'vocabulary': None}

In [24]:
# words
tf_selected_words = tfidf_model.get_feature_names()

In [48]:
tf_selected_words[:10]

["'d",
 'abl',
 'absolut',
 'absolut love',
 'accur',
 'actual',
 'adjust',
 'ago',
 'alarm',
 'alreadi']

In [26]:
Eftfidf_matrix

<960056x411 sparse matrix of type '<class 'numpy.float64'>'
	with 12161247 stored elements in Compressed Sparse Row format>

#K means clustering

In [27]:
# k-means clustering
from sklearn.cluster import KMeans

num_clusters = 5

# number of clusters
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

In [34]:
product = { 'review': df[:].review_body, 'cluster': clusters}
frame = pd.DataFrame(product, columns = ['review', 'cluster'])

In [35]:
frame.head(10)

Unnamed: 0,review,cluster
0,Absolutely love this watch! Get compliments al...,0
1,I love this watch it keeps time wonderfully.,0
2,Scratches,3
3,"It works well on me. However, I found cheaper ...",3
4,Beautiful watch face. The band looks nice all...,3
5,"i love this watch for my purpose, about the pe...",0
6,"for my wife and she loved it, looks great and ...",4
7,I was about to buy this thinking it was a Swis...,3
8,Watch is perfect. Rugged with the metal &#34;B...,4
9,Great quality and build.<br />The motors are r...,4


In [36]:
print ("Number of reviews included in each cluster:")
frame['cluster'].value_counts().to_frame()

Number of reviews included in each cluster:


Unnamed: 0,cluster
3,636542
4,115873
0,95776
1,71673
2,40192


In [37]:
km.cluster_centers_

array([[0.00126937, 0.00207204, 0.02137691, ..., 0.00171569, 0.01176861,
        0.00680251],
       [0.00172826, 0.0017194 , 0.00143768, ..., 0.00207161, 0.00490479,
        0.00162052],
       [0.00066598, 0.00049239, 0.00041813, ..., 0.00074147, 0.00305293,
        0.00040923],
       [0.00440864, 0.00537962, 0.00499399, ..., 0.0043641 , 0.01905546,
        0.00401179],
       [0.0018309 , 0.00230686, 0.00440453, ..., 0.00289153, 0.01171859,
        0.00268264]])

In [38]:
km.cluster_centers_.shape

(5, 411)

In [39]:
print ("<Document clustering result by K-means>")

#km.cluster_centers_ denotes the importances of each items in centroid.
#We need to sort it in decreasing-order and get the top k items.
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

Cluster_keywords_summary = {}
for i in range(num_clusters):
    print ("Cluster " + str(i) + " words:", end='')
    Cluster_keywords_summary[i] = []
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        Cluster_keywords_summary[i].append(tf_selected_words[ind])
        print (tf_selected_words[ind] + ",", end='')
    print ()
    
    cluster_reviews = frame[frame.cluster==i].review.tolist()
    print ("Cluster " + str(i) + " reviews (" + str(len(cluster_reviews)) + " reviews): ")
    print (", ".join(cluster_reviews))
    print ()

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# Topic modeling-Latent dirichilet allocation

In [40]:
# Use LDA for clustering
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=5)

In [41]:
from sklearn.feature_extraction.text import CountVectorizer
# LDA requires integer values
tfidf_model_lda = CountVectorizer(max_df=0.99, max_features=500,
                                 min_df=0.01, stop_words='english',
                                 tokenizer=tokenization_and_stemming, ngram_range=(1,1))

tfidf_matrix_lda = tfidf_model_lda.fit_transform(data) #fit the vectorizer to synopses

print ("In total, there are " + str(tfidf_matrix_lda.shape[0]) + \
      " reviews and " + str(tfidf_matrix_lda.shape[1]) + " terms.")

In total, there are 960056 reviews and 355 terms.


In [42]:
# document topic matrix for tfidf_matrix_lda
lda_output = lda.fit_transform(tfidf_matrix_lda)
print(lda_output.shape)
print(lda_output)

(960056, 5)
[[0.02577426 0.0253714  0.89787495 0.02589161 0.02508778]
 [0.04159326 0.04068506 0.8365683  0.04087485 0.04027852]
 [0.10091865 0.59840914 0.10000113 0.10061433 0.10005674]
 ...
 [0.0337492  0.03359651 0.32025948 0.57886293 0.03353189]
 [0.94962853 0.01263307 0.01258087 0.01259939 0.01255814]
 [0.83255018 0.01867164 0.01833728 0.01860771 0.11183319]]


In [43]:
# topics and words matrix
topic_word = lda.components_
print(topic_word.shape)
print(topic_word)

(5, 355)
[[6.69970885e+03 9.16206696e+03 4.95247617e+02 ... 1.82911719e+03
  6.46071456e+03 7.22216334e+04]
 [2.62265043e+03 7.39052073e+03 2.34170038e+03 ... 3.06627920e+04
  9.15650958e+02 7.57097904e+02]
 [1.51779039e+02 3.86296606e+02 1.79015451e+04 ... 1.88111181e+02
  7.14239074e+02 8.75032723e+03]
 [6.23160020e+03 1.78810765e+03 6.38296546e+02 ... 4.84468634e+04
  4.00598071e+03 1.26219670e+04]
 [1.17526148e+03 2.55008056e+02 6.21031400e+00 ... 3.86831163e+04
  1.35041470e+03 2.99745005e+01]]


In [44]:
# column names
topic_names = ["Topic" + str(i) for i in range(lda.n_components)]

# index names
doc_names = ["Doc" + str(i) for i in range(len(data))]

df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topic_names, index=doc_names)

# get dominant topic for each document
topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['topic'] = topic

df_document_topic.head(10)

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,topic
Doc0,0.03,0.03,0.9,0.03,0.03,2
Doc1,0.04,0.04,0.84,0.04,0.04,2
Doc2,0.1,0.6,0.1,0.1,0.1,1
Doc3,0.74,0.03,0.17,0.03,0.03,0
Doc4,0.11,0.31,0.01,0.5,0.08,3
Doc5,0.03,0.03,0.4,0.51,0.03,3
Doc6,0.03,0.03,0.88,0.03,0.03,2
Doc7,0.52,0.41,0.03,0.03,0.03,0
Doc8,0.01,0.01,0.14,0.82,0.01,3
Doc9,0.58,0.02,0.35,0.02,0.02,0


In [45]:
df_document_topic['topic'].value_counts().to_frame()

Unnamed: 0,topic
2,245359
3,242986
0,229354
4,144158
1,98199


In [46]:
# topic word matrix
print(lda.components_)
# topic-word matrix
df_topic_words = pd.DataFrame(lda.components_)

# column and index
df_topic_words.columns = tfidf_model_lda.get_feature_names()
df_topic_words.index = topic_names

df_topic_words.head()

[[6.69970885e+03 9.16206696e+03 4.95247617e+02 ... 1.82911719e+03
  6.46071456e+03 7.22216334e+04]
 [2.62265043e+03 7.39052073e+03 2.34170038e+03 ... 3.06627920e+04
  9.15650958e+02 7.57097904e+02]
 [1.51779039e+02 3.86296606e+02 1.79015451e+04 ... 1.88111181e+02
  7.14239074e+02 8.75032723e+03]
 [6.23160020e+03 1.78810765e+03 6.38296546e+02 ... 4.84468634e+04
  4.00598071e+03 1.26219670e+04]
 [1.17526148e+03 2.55008056e+02 6.21031400e+00 ... 3.86831163e+04
  1.35041470e+03 2.99745005e+01]]


Unnamed: 0,'d,abl,absolut,accur,actual,adjust,ago,alarm,alreadi,alway,amaz,amazon,anoth,anyon,anyth,appear,arm,arriv,ask,attract,automat,away,awesom,bad,band,batteri,beat,beauti,believ,best,better,bezel,big,bigger,birthday,bit,black,blue,bought,box,...,thought,time,timepiec,timex,took,tool,total,tri,turn,type,uniqu,updat,use,usual,valu,ve,wait,want,warranti,watch,water,way,wear,week,weight,went,white,wife,wind,wish,wo,wonder,wore,work,worn,worri,worth,wrist,wrong,year
Topic0,6699.70885,9162.066958,495.247617,12793.442206,8127.920944,5205.538318,15793.882462,28684.185051,7893.942085,8902.598843,687.551303,30962.353162,23822.495594,2426.249166,6376.764351,4529.460966,1797.664697,11064.259566,5569.037246,1543.322482,7754.072948,7556.278594,377.274721,10581.088667,33082.065665,77833.020478,1639.356668,812.150586,3668.29972,5534.964162,8795.124239,1652.574095,1083.742983,27.454617,30.708307,3902.218435,931.769617,110.064975,33273.071614,7790.556552,...,8018.762753,163928.731161,1594.235444,15248.175662,14552.280033,2.225328,4106.884161,22070.424764,12493.077796,3152.241193,56.355065,11589.560882,69232.39521,3756.349433,1388.269855,28546.678411,5558.311479,23088.91436,17623.956998,382184.132838,28465.95638,13441.65918,22579.049726,37041.09995,379.17167,10762.533972,135.847815,1757.16216,11470.462756,3958.466774,6950.627399,2420.089343,9243.804723,101806.967113,6416.09765,3682.30676,6783.572764,1829.117185,6460.714562,72221.633355
Topic1,2622.650433,7390.520732,2341.700381,7016.599423,6306.69684,30026.254135,144.932367,0.766292,2801.703622,2278.846298,2226.5512,5029.393127,3703.782281,1519.595031,1542.94167,5950.571129,2459.222153,938.018001,593.57754,4624.355572,13770.356831,1990.590733,575.574873,1399.698665,71917.239084,463.160664,953.946748,12720.155472,1945.91577,3053.828446,6978.403626,16575.877548,9893.06514,1129.905845,0.397821,13093.113829,21387.340628,12329.621057,2189.756013,7357.880627,...,2423.690076,34918.534158,4617.338279,5.721112,7057.237936,18397.168311,1708.690963,5824.379506,4695.519967,3849.516217,2757.951874,1134.650105,25917.759754,2345.986944,2229.582152,6894.857597,642.712108,7492.026312,25.628815,203724.519781,1489.723022,8727.375136,12852.565097,3799.1149,2214.583671,1037.594779,12919.034576,141.035217,3900.879601,3894.397408,2852.290717,1496.234817,441.249214,8202.017965,2414.721866,1491.123517,5438.485611,30662.792025,915.650958,757.097904
Topic2,151.779039,386.296606,17901.545142,155.879518,1259.979773,9.727331,866.903385,0.200918,2496.672815,4793.881241,13571.647108,20704.188632,6928.38327,6908.023228,968.705527,352.880725,115.816819,16809.336978,4263.202537,1465.961524,11.083697,866.824026,15341.867677,6.557924,631.84776,3.719092,2339.119859,59127.459727,1935.144897,10959.5433,7951.348698,0.20293,4193.841653,106.068418,13466.582928,64.815908,221.333224,225.892535,49330.299906,14880.755745,...,3380.321529,25937.637821,2269.623128,2.832265,1649.114249,0.200885,3133.947865,2.669074,8.040278,149.406135,3167.099955,0.25869,317.583627,242.383519,6795.44161,102.747458,4451.65911,12870.044661,34.475345,268816.345305,0.219242,2804.396381,16963.029886,438.650896,381.205541,1504.031967,320.116665,21909.264697,3.018297,1597.133125,301.078804,6011.903112,1020.865004,16410.845617,1503.109909,519.61188,9146.860678,188.111181,714.239074,8750.327227
Topic3,6231.600203,1788.107647,638.296546,8093.912056,9557.458694,930.896314,1172.198282,294.922016,2590.743247,10297.44415,2232.11474,1033.900817,7378.632614,5908.151664,7284.447048,2991.171796,6854.267668,20.521567,2691.065912,6931.220651,1182.508038,1559.587443,4724.432082,2002.566884,64512.558332,290.776149,6937.930382,5199.017478,1835.690273,7996.795754,10646.293011,718.107914,50657.227086,5805.242979,2.109001,14015.954404,17522.998489,9387.316978,14806.285441,538.55794,...,7848.539265,58726.903058,4371.554515,11850.200853,210.32345,0.200674,640.993416,3716.75276,1896.835653,3290.598955,4421.11225,595.45142,24693.781372,3623.318169,1298.444129,29242.150589,223.236082,31019.369332,0.776711,456759.134668,970.703842,11113.002106,97060.653147,3449.874777,18417.392473,590.773916,7581.545018,620.334287,318.637536,5538.496204,4110.790123,1617.984122,959.923074,22144.547006,11087.997861,5440.1895,2401.261003,48446.863357,4005.980709,12621.967014
Topic4,1175.261475,255.008056,6.210314,680.166797,5522.943748,920.583901,1.083504,141.925722,237.938231,418.229469,292.135649,2138.164262,2155.706241,839.98091,400.141404,7385.915385,882.028662,4606.863888,143.116764,2268.139771,51.978485,1139.719205,17.850648,7945.087859,48010.289159,0.323618,2.646343,1115.216737,850.949341,2060.868338,15312.830425,57.237514,10331.123138,6661.328141,0.201943,13344.897425,2492.558043,359.104454,5531.587026,9348.249137,...,6924.686378,6581.193802,636.248635,330.070107,151.044332,0.204801,770.483594,807.773896,512.526306,1138.2375,56.480856,20.078903,10234.480036,454.961935,8559.262254,61.565946,13.081221,4811.645334,1.162131,112192.867408,2547.397515,5966.567197,611.702144,181.259477,2156.646644,139.065366,712.455926,1765.203639,2.00181,3172.506489,445.212956,597.788606,1.157985,10451.622299,54.072714,329.768343,5856.819943,38683.116252,1350.414698,29.9745


In [47]:
# print top n keywords for each topic
def print_topic_words(tfidf_model, lda_model, n_words):
    words = np.array(tfidf_model.get_feature_names())
    topic_words = []
    # for each topic, we have words weight
    for topic_words_weights in lda_model.components_:
        top_words = topic_words_weights.argsort()[::-1][:n_words]
        topic_words.append(words.take(top_words))
    return topic_words

topic_keywords = print_topic_words(tfidf_model=tfidf_model_lda, lda_model=lda, n_words=15)        

df_topic_words = pd.DataFrame(topic_keywords)
df_topic_words.columns = ['Word '+str(i) for i in range(df_topic_words.shape[1])]
df_topic_words.index = ['Topic '+str(i) for i in range(df_topic_words.shape[0])]
df_topic_words

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,watch,time,work,batteri,year,use,day,replac,set,month,need,week,like,bought,new
Topic 1,watch,band,hand,link,dial,time,face,wrist,second,adjust,remov,use,make,case,pin
Topic 2,watch,love,great,look,gift,beauti,price,bought,purchas,excel,recommend,perfect,husband,product,got
Topic 3,watch,look,like,nice,wear,realli,great,band,face,time,big,easi,light,wrist,love
Topic 4,good,watch,look,qualiti,strap,nice,band,price,fit,wrist,like,product,expect,leather,size


# Report

**Motivation:**

This project did clustering on customers' reviews about watches and applied LDA and KNN algorithms to build model to assign the review to its relative topic group.


**Step1: Text preprocessing**
* Exclude stop words and text tokenization(1&2 gram) using NLTK tools


**Step2: Feature engineering**
* Applied TF-IDF to transfer text into feature matrix


**Step3: Model training**
* 5 means clutsering was used to divide those comments into 5 groups
* Build LDA model to do clusterig and find the top keywords for each topic

