The cluster map (scatter plot) is a visualization of the clusters formed by the K-Means algorithm on the app reviews dataset. The purpose of the cluster map is to help you understand the distribution and relationships among the data points in a lower-dimensional space, making it easier to interpret the results of the clustering process.

In this specific case, the scatter plot displays the data points in a 3-dimensional space, which is achieved by using PCA for dimensionality reduction. Each point in the plot represents a single app review, and the color of the point indicates the cluster it belongs to, as determined by the (Mini-Batch) K-Means clustering algorithm.

The cluster map allows you to visually assess the quality of the clustering, identify any patterns or groupings, and evaluate how well the clusters separate the reviews based on their content. By examining the cluster map, you can gain insights into the structure of your data and identify relationships or similarities among the reviews that may not be immediately apparent through other means of analysis.

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from plotly.offline import plot

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import plotly.io as pio

def save_graph_html(fig, filename):
    """Write a Plotly figure to ``filename`` as an HTML ``<div>`` fragment.

    Args:
        fig: Figure object handed to ``plotly.offline.plot``.
        filename: Path of the file to create (an existing file is overwritten).
    """
    # output_type='div' asks plot() for the markup as a string rather than
    # having it write a file itself.
    plot_div = plot(fig, output_type='div')

    # Write as UTF-8 explicitly: the markup can contain arbitrary review text,
    # and the platform default encoding (e.g. cp1252 on Windows) may not be
    # able to represent it.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(plot_div)


# Fetch the NLTK data used in this notebook: the stop-word lists (read through
# nltk.corpus.stopwords) and WordNet (used by WordNetLemmatizer).
# NOTE(review): preprocess_text also calls nltk.word_tokenize, which presumably
# needs a tokenizer model that is not downloaded here - confirm it is installed.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Harsh
[nltk_data]     Clean\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Harsh
[nltk_data]     Clean\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
# Text preprocessing

# Lazily-filled cache of the NLTK helpers shared by every preprocess_text call.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return ``(lemmatizer, stop_words)``, building them on first use only."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        # A frozenset gives O(1) membership tests; stopwords.words() returns a
        # list and re-reads the corpus each time it is called.
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _TEXT_RESOURCES['lemmatizer'], _TEXT_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalize one review for vectorization.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``; tokens
    that are not purely alphanumeric or that are English stop words are
    dropped, and the remaining tokens are lemmatized.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    lemmatizer, stop_words = _get_text_resources()
    tokens = nltk.word_tokenize(text.lower())
    kept = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    ]
    return ' '.join(kept)

In [28]:
# review_score = int(input("Enter the review score: "))

df = pd.read_csv('../../Warehouse/Reviews/app_reviews_1.csv')

# Keep a reproducible 30% sample. Sampling happens BEFORE preprocessing so the
# (slow) text cleaning only runs on rows that are actually kept; the selected
# rows do not depend on the column contents, so the result is unchanged.
df = df.sample(frac=0.3, replace=False, random_state=1)

# Preprocess the data (missing reviews become empty strings first)
df['content'] = df['content'].fillna('').apply(preprocess_text)

# Feature extraction: TF-IDF over unigrams and bigrams
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
X = tfidf.fit_transform(df['content'])

In [29]:
# Save the fitted TF-IDF vectorizer to disk
filename = 'tfidf1.sav'
# The context manager closes (and therefore flushes) the file once the dump
# finishes instead of leaving the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(tfidf, model_file)

In [30]:
# Dimensionality reduction: project the TF-IDF features onto 3 principal
# components. The projection is only used for plotting below.
pca = PCA(n_components=3, random_state=42)
# NOTE(review): densifying the TF-IDF matrix can need a lot of memory when the
# vocabulary is large.
X_pca = pca.fit_transform(X.toarray())  # convert sparse matrix to dense

# Clustering algorithm (now 6 clusters instead of 2); it is fitted on the
# TF-IDF matrix X, not on the 3-component projection.
clustering = MiniBatchKMeans(n_clusters=6, random_state=42)
y_pred = clustering.fit_predict(X)

# Save the model to a pickle file; the context manager closes the handle so
# the file is flushed to disk.
with open('minibatch_kmeans.pkl', 'wb') as model_file:
    pickle.dump(clustering, model_file)

# Combine data into a single DataFrame
combined_df = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2', 'PCA3'])
combined_df['Cluster'] = y_pred
# .values drops df's index so the texts are assigned by position
combined_df['Content'] = df['content'].values

# frac=1 keeps every row and only shuffles the order; lower it to plot a subset
df_sample = combined_df.sample(frac=1, random_state=42)

# Extract sample data
X_sample = df_sample[['PCA1', 'PCA2', 'PCA3']].values
y_sample = df_sample['Cluster'].values

# Visualize the clusters in 3D
fig = px.scatter_3d(x=X_sample[:, 0], y=X_sample[:, 1], z=X_sample[:, 2], 
                    color=y_sample, title="Clusters Visualization",
                    hover_data=[df_sample['Content'].tolist()])
save_graph_html(fig, 'clusters3D.html')
print(len(X_sample))
print(len(df_sample.index))
fig.show()





11776
11776


In [31]:
# Export the cluster figure as an HTML page inside the report site
report_page = "../../fam-report-site/public/App-Analytics/clustering3D.html"
pio.write_html(fig, file=report_page)

In [None]:
# Analyze the clusters: attach each review's cluster label, then list the ten
# most frequent (preprocessed) review texts per cluster.
df['cluster'] = y_pred
# Take the cluster count from the fitted model; `best_n_clusters` is not
# defined in the visible cells of this notebook.
for i in range(clustering.n_clusters):
    print(f"Cluster {i}:")
    print(df[df['cluster'] == i]['content'].value_counts().head(10))

Cluster 0:
payment issue                 16
payment problem               13
payment                        7
payment received               7
upi payment                    6
payment processing             6
payment failed                 6
slow payment                   5
payment slow                   4
payment processing problem     3
Name: content, dtype: int64
Cluster 1:
useful                                                                                                                                                                      12
getting otp                                                                                                                                                                  6
useful app                                                                                                                                                                   5
app useful                                                                                     