In [1]:
import numpy as np 
import pandas as pd
import plotly.express as px
pd.options.plotting.backend = 'plotly'

import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/matteowork/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matteowork/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Cleaning

In [2]:
reviews = pd.read_csv('filtered-reviews.csv')

In [3]:
reviews.describe()

Unnamed: 0,review_id,user_id,business_id,text,date
count,327819,327819,327819,327819,327819
unique,327819,228555,237,327238,327552
top,l3Wk_mvAog6XANIuGQ9C7Q,CfX4sTIFFNaRchNswqhVfg,2BMk_drsikKWslJCXmQtjQ,Omg!There's food was good!Ryan is awesome away...,2014-06-22 00:49:26
freq,1,57,2023,7,3


In [4]:
# Check for null values
reviews.isna().sum()

review_id      0
user_id        0
business_id    0
text           0
date           0
dtype: int64

In [5]:
# Check datatypes
print(reviews.dtypes, '\n')

# Change date to datetime, leave other attributes
reviews['date'] = pd.to_datetime(reviews['date'])

# Print new datatypes
print(reviews.dtypes)

review_id      object
user_id        object
business_id    object
text           object
date           object
dtype: object 

review_id              object
user_id                object
business_id            object
text                   object
date           datetime64[ns]
dtype: object


# Clustering

In [6]:
# Find a sample of reviews that belong to a specific business
business_series = pd.Series(reviews["business_id"].unique())
business_x = business_series[0]
reviews_x = reviews[reviews["business_id"] == business_x].reset_index(drop=True)

## OG Clusters

In [100]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [102]:
#instantiate CountVectorizer() 
cv=CountVectorizer() 

# this steps generates word counts for the words in your docs 
word_count_vector=cv.fit_transform(reviews_x['text'])
word_count_vector.shape

(1421, 7849)

In [108]:
# count matrix 
count_vector=cv.transform(reviews_x['text']) 

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
tfidf_transformer.fit(word_count_vector)

# tf-idf scores 
tf_idf_vector=tfidf_transformer.transform(count_vector)

feature_names = cv.get_feature_names_out()

pd.DataFrame(tf_idf_vector.T[])

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [130]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=10, init='random', max_iter=100, n_init=1, verbose=1)
df = tf_idf_vector
km.fit(df)

Initialization complete
Iteration 0, inertia 2304.522932482682
Iteration 1, inertia 1234.093097610763
Iteration 2, inertia 1227.297287867837
Iteration 3, inertia 1224.5643734054377
Iteration 4, inertia 1223.6778013249325
Iteration 5, inertia 1223.2024834283038
Iteration 6, inertia 1222.8858992740838
Iteration 7, inertia 1222.7933933227655
Iteration 8, inertia 1222.756089859988
Iteration 9, inertia 1222.7354582994262
Iteration 10, inertia 1222.7294167269315
Iteration 11, inertia 1222.7239286169574
Iteration 12, inertia 1222.7184783816338
Converged at iteration 12: strict convergence.


KMeans(init='random', max_iter=100, n_clusters=10, n_init=1, verbose=1)

### Distribution of Reviews Across Clusters

In [131]:
pd.Series(km.labels_).plot(kind='hist', title='Distribution of Reviews Across Clusters')

In [106]:
km.cluster_centers_

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## New Clusters

In [7]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [8]:
# Import re, nltk, and WordNetLemmatizer
import re
import nltk
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [9]:
def remove_stopwords(reviews):
       # Stopword removal, converting uppercase into lower case, and lemmatization
       stopwords = nltk.corpus.stopwords.words('english')
       lemmatizer = WordNetLemmatizer()
       nltk.download('stopwords')
       data_without_stopwords = []
       for i in range(0, reviews.shape[0]):
              doc = re.sub('[^a-zA-Z]', ' ', reviews['text'][i])
              doc = doc.lower()
              doc = doc.split()
              doc = [lemmatizer.lemmatize(word)
                     for word in doc if not word in set(stopwords)]
              doc = ' '.join(doc)
              data_without_stopwords.append(doc)

       return data_without_stopwords

data_without_stopwords = remove_stopwords(reviews_x)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matteowork/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
vectorizer = TfidfVectorizer() 
vectors = vectorizer.fit_transform(data_without_stopwords)

# Print how many rows and columns of the TF-IDF matrix consists
print("n_samples: %d, n_features: %d" % vectors.shape)

# Instantiate clustering algo and fit it to the vectors
km = KMeans(n_clusters=10, init='random', max_iter=100, n_init=1, verbose=0)
km.fit(vectors)

n_samples: 1421, n_features: 6859


KMeans(init='random', max_iter=100, n_clusters=10, n_init=1)

### Review Distribution by Cluster

In [112]:
review_counts = pd.Series(km.labels_).value_counts()
review_counts

8    263
2    210
1    160
3    159
5    135
4    131
0    129
6     84
9     77
7     73
dtype: int64

In [13]:
pd.Series(km.labels_).plot(kind='hist', title='Distribution of Reviews Across Clusters')

### Create TFIDF Matrix

In [64]:
def matrix_from_vec(vectors):
    # Select the first five documents from the data set
    tf_idf = pd.DataFrame(vectors.todense())  
    tf_idf.columns = vectorizer.get_feature_names()
    tfidf_matrix = tf_idf.T
    tfidf_matrix.columns = ['review'+ str(i) for i in range(1, tf_idf.shape[0]+1)]
    tfidf_matrix['count'] = tfidf_matrix.sum(axis=1)

    # Top 10 words 
    tfidf_matrix = tfidf_matrix.sort_values(by ='count', ascending=False)
    tfidf_matrix = tfidf_matrix.drop('count', axis=1)
    return tfidf_matrix

# Print the first 10 words
tfidf_matrix = matrix_from_vec(vectors) 
tfidf_matrix.iloc[:, :10]


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



Unnamed: 0,review1,review2,review3,review4,review5,review6,review7,review8,review9,review10
food,0.0,0.043135,0.000000,0.117932,0.000000,0.0,0.000000,0.047657,0.000000,0.000000
place,0.0,0.093278,0.062133,0.000000,0.000000,0.0,0.000000,0.051527,0.033743,0.042741
wait,0.0,0.107655,0.000000,0.147164,0.000000,0.0,0.043394,0.178408,0.000000,0.000000
coffee,0.0,0.052702,0.070210,0.144087,0.083185,0.0,0.084973,0.116451,0.076259,0.000000
menu,0.0,0.153154,0.000000,0.000000,0.000000,0.0,0.000000,0.056402,0.147740,0.046785
...,...,...,...,...,...,...,...,...,...,...
inclusion,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
separately,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
believed,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
tweet,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000


### Top Words by Cluster

In [355]:
def top_words(matrix, n):
    assert n >= 1
    lables = pd.Series(km.labels_)
    top_matrix = pd.DataFrame([
        matrix.iloc[:, lables[lables == i].index].mean(axis=1)
        for i in range(lables.unique().shape[0])
    ]).T

    top_matrix = top_matrix.apply(
        lambda x: x.sort_values(ascending=False).index, axis=0)
        
    return top_matrix.reset_index(drop=True).T.apply(
        lambda x: ' '.join(x.to_list()[:n]), axis=1)

top_words(tfidf_matrix, 5)

0    drink bar balcony square jackson
1        food service good nice staff
2      ghost upstairs food room table
3        great place food service new
4      shrimp cheese crepe goat bread
5         table time one server would
6      fun super food atmosphere lady
7       best ever meal muriel orleans
8        brunch jazz sunday egg great
9      soup turtle lunch good amazing
dtype: object

### Highest Cosine Similarity


In [388]:
from sklearn.metrics.pairwise import cosine_similarity

def find_targets(matrix):
    # create a similatity_matrix
    similatity_matrix = cosine_similarity(matrix.T, matrix.T)
    # fill the diagonial with 0 since it will skew results
    np.fill_diagonal(similatity_matrix, 0)
    similatity_matrix = pd.DataFrame(similatity_matrix)
    # find the targets 
    targets = similatity_matrix.idxmax()
    # find max values
    values = similatity_matrix.apply(lambda x: x.max())
    return targets, values

targets, values = find_targets(tfidf_matrix)
targets, values

(0       1732
 1       1871
 2       1101
 3       1254
 4        720
         ... 
 2003    1391
 2004     213
 2005     990
 2006     156
 2007    1250
 Length: 2008, dtype: int64,
 0       0.336504
 1       0.177481
 2       0.232919
 3       0.281658
 4       0.301150
           ...   
 2003    0.308653
 2004    0.253117
 2005    0.194374
 2006    0.208138
 2007    0.133592
 Length: 2008, dtype: float64)

### Dimensionality Reduction 

In [377]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

def dim_reduce(matrix, n=2):
    # Step 1: Scale the data
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(matrix)

    # Step 2: Perform Singular Value Decomposition (SVD)
    svd = TruncatedSVD(n_components=n)
    reduced_data = svd.fit_transform(scaled_data)
    return pd.DataFrame(reduced_data, columns=['x', 'y'])


dim_reduce(km.cluster_centers_)

Unnamed: 0,x,y
0,8.129188,107.945764
1,-3.586141,-8.117628
2,-21.194085,-9.995714
3,-16.07961,-11.484207
4,-21.039541,-12.619156
5,-15.748072,-8.582816
6,130.598848,-18.852817
7,-15.751159,-16.038937
8,-25.859266,-11.336142
9,-19.470163,-10.918349


# Output Data

## 1. Cluster Overviews 
This dataframe will be used to generate a plot where a review clusters are ploted on two dimensions 
### Features
* Business ID
* Cluster Number (index)
* Centroid X
* Centroid Y
* Top Words
* Review Volume 
* Business Total 

In [378]:
cluster_summaries = []

for i in range(10):
    business_x = business_series[i]
    reviews_x = reviews[reviews["business_id"] == business_x].reset_index(drop=True)

    data_without_stopwords = remove_stopwords(reviews_x)

    vectorizer = TfidfVectorizer() 
    vectors = vectorizer.fit_transform(data_without_stopwords)

    # Instantiate clustering algo and fit it to the vectors
    km = KMeans(n_clusters=10, init='random', max_iter=100, n_init=1, verbose=0)
    km.fit(vectors)

    tfidf_matrix = matrix_from_vec(vectors) 

    coordinates = dim_reduce(km.cluster_centers_)

    cluster_summary = pd.DataFrame().assign(
        business_id = np.repeat(business_x, review_counts.shape[0]),
        review_vol = pd.Series(km.labels_).value_counts(),
        x = coordinates['x'],
        y = coordinates['y'],
        top_words = top_words(tfidf_matrix, 5),
    )
    cluster_summary['cluster_n'] = cluster_summary.index
    cluster_summaries.append(cluster_summary)

cluster_summaries = pd.concat(cluster_summaries)
cluster_summaries

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matteowork/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matteowork/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matteowork/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matteowork/nltk_data...
[nltk_da

Unnamed: 0,business_id,review_vol,x,y,top_words,cluster_n
0,EQ-TZ2eeD_E0BHuvoaeG5Q,80,-18.890580,-14.518375,portion food good size weird,0
1,EQ-TZ2eeD_E0BHuvoaeG5Q,568,156.561678,-3.430511,food coffee menu time milktooth,1
2,EQ-TZ2eeD_E0BHuvoaeG5Q,174,-16.670979,-18.355185,place great food atmosphere brunch,2
3,EQ-TZ2eeD_E0BHuvoaeG5Q,63,-19.019869,-17.787254,lamb food seating place never,3
4,EQ-TZ2eeD_E0BHuvoaeG5Q,67,-18.970382,-11.374129,always wait food love time,4
...,...,...,...,...,...,...
5,Zi-F-YvyVOK0k5QD7lrLOg,181,-13.681275,83.369940,square jackson lunch muriel soup,5
6,Zi-F-YvyVOK0k5QD7lrLOg,24,-21.361290,-16.972717,attitude food start awful management,6
7,Zi-F-YvyVOK0k5QD7lrLOg,182,-7.242404,-10.442259,pork chop double cut best,7
8,Zi-F-YvyVOK0k5QD7lrLOg,173,-12.053303,-5.452086,cheese crepe goat shrimp crawfish,8


In [380]:
cluster_summaries.to_csv('cluster-summaries.csv', index=False)

In [379]:
import plotly.express as px

d = cluster_summary[:]
d['review_vol'] = d['review_vol']
d['top_words'] = d['top_words'].apply(lambda x: ' '.join(x[:5]))
fig = px.scatter(d, x='x', y='y', size='review_vol', color='cluster_n')
fig.update_layout(hovermode="closest")
fig

## 2. Node-Link On Clusters
Nodes linked by closest cosine similarity 
### Features
* Business ID
* Review ID
* Cluster Number
* Source
* Target
* Value
* Top Word

In [401]:
node_link_data = []

for i in range(10):
    business_x = business_series[i]
    reviews_x = reviews[reviews["business_id"] == business_x].reset_index(drop=True)

    data_without_stopwords = remove_stopwords(reviews_x)

    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(data_without_stopwords)

    # Instantiate clustering algo and fit it to the vectors
    km = KMeans(n_clusters=10, init='random',
                max_iter=100, n_init=1, verbose=0)
    km.fit(vectors)

    tfidf_matrix = matrix_from_vec(vectors)

    targets, values = find_targets(tfidf_matrix)

    out = pd.DataFrame().assign(
        business_id=np.repeat(business_x, tfidf_matrix.shape[1]),
        review_id=reviews_x['review_id'],
        cluster_n=km.labels_,
        source=np.arange(tfidf_matrix.shape[1]),
        target=targets,
        value=values,
        top_word=tfidf_matrix.apply(lambda x: ' '.join(x.sort_values().iloc[0:3].index), axis=0).to_list()
    )
    
    node_link_data.append(out)

node_link_data = pd.concat(node_link_data)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matteowork/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matteowork/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matteowork/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matteowork/nltk_data...
[nltk_da

In [402]:
node_link_data.to_csv('node-link-data.csv', index=False)
node_link_data

Unnamed: 0,business_id,review_id,cluster_n,source,target,value,top_word
0,EQ-TZ2eeD_E0BHuvoaeG5Q,l3Wk_mvAog6XANIuGQ9C7Q,1,0,1230,0.160280,food frosted asset
1,EQ-TZ2eeD_E0BHuvoaeG5Q,940tqxFO4Pwg_KMg4Y4Z5g,3,1,893,0.244556,ring frosted asset
2,EQ-TZ2eeD_E0BHuvoaeG5Q,S-H-Ao17MEYH9cLpvevbnQ,0,2,1336,0.229324,food frosted asset
3,EQ-TZ2eeD_E0BHuvoaeG5Q,qNwMUWWtHiTrno15lcX_vw,3,3,1087,0.232635,ring frosted asset
4,EQ-TZ2eeD_E0BHuvoaeG5Q,3jnhCyBDfJ8cWewtlUZaFg,1,4,534,0.190492,food frosted asset
...,...,...,...,...,...,...,...
2003,Zi-F-YvyVOK0k5QD7lrLOg,Du5_62mUlRrdpWuYZnlZQw,3,2003,1391,0.308653,www regain cashed
2004,Zi-F-YvyVOK0k5QD7lrLOg,sCw_OD-BQBLg_bFLMwgipQ,1,2004,213,0.253117,www regain cashed
2005,Zi-F-YvyVOK0k5QD7lrLOg,QLM02QpOrrEH-2ekFxQl4Q,2,2005,990,0.194374,food regain cashed
2006,Zi-F-YvyVOK0k5QD7lrLOg,gBgAGQ2qkIytTDmAUF1mRg,4,2006,156,0.208138,food regain cashed


In [403]:
business_series[0:10]

0    EQ-TZ2eeD_E0BHuvoaeG5Q
1    nRKndeZLQ3eDL10UMwS2rQ
2    S2Ho8yLxhKAa26pBAm6rxA
3    ltBBYdNzkeKdCNPDAsxwAA
4    Zx7n8mdt8OzLRXVzolXNhQ
5    j8JOZvfeHEfUWq3gEz6ABQ
6    I6L0Zxi5Ww0zEWSAVgngeQ
7    EtKSTHV5Qx_Q7Aur9o4kQQ
8    oQ5CPRt0R3AzFvcjNOqB1w
9    Zi-F-YvyVOK0k5QD7lrLOg
dtype: object