In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import plotly.express as px
from sklearn.decomposition import PCA
import pandas as pd

In [8]:
data = pd.read_csv("../../fampay_instagram_comments_all.csv")
data = data.dropna(subset=['text'])

In [9]:
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_df=0.8, min_df=2, stop_words="english")

# Transform the text data into a matrix of TF-IDF features
tfidf_matrix = vectorizer.fit_transform(data["text"])

In [10]:
# Choose the number of clusters
num_clusters = 5

# Perform K-means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
data["cluster"] = kmeans.fit_predict(tfidf_matrix)



In [13]:
# Perform PCA to reduce the dimensionality of the data for visualization
pca = PCA(n_components=10)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())

# Create a new DataFrame with the PCA results and cluster labels
clustered_data = pd.DataFrame(reduced_data, columns=["PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10"])
clustered_data["cluster"] = data["cluster"]

# Create a scatter plot of the PCA results, colored by cluster
fig = px.scatter_3d(
    clustered_data,
    x="PC1",
    y="PC2",
    z="PC3",
    color="cluster",
    symbol="cluster",
    width=800,
    height=800,
    hover_name=data["text"],
    hover_data=["cluster"],
)
fig.update_traces(marker=dict(size=5, line=dict(width=2, color="DarkSlateGrey")), selector=dict(mode="markers"))
fig.show()

In [14]:
# Add the PCA columns to the original DataFrame
data["PC1"] = clustered_data["PC1"]
data["PC2"] = clustered_data["PC2"]
data["PC3"] = clustered_data["PC3"]
data["PC4"] = clustered_data["PC4"]
data["PC5"] = clustered_data["PC5"]
data["PC6"] = clustered_data["PC6"]
data["PC7"] = clustered_data["PC7"]
data["PC8"] = clustered_data["PC8"]
data["PC9"] = clustered_data["PC9"]
data["PC10"] = clustered_data["PC10"]

# Save the DataFrame as a CSV file
data.to_csv("clustered_data.csv", index=False)

In [15]:
data

Unnamed: 0,username,text,shortcode,cluster,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
0,svs_legacy,I liked that fampay bag,CrXogaGgxAL,1,-0.045651,0.201694,-0.006705,0.003082,0.012218,-0.035150,-0.011554,-0.011729,7.464985e-07,-0.020400
1,lakshay_870_,I want fampay baggggggg 😢,CrXogaGgxAL,1,-0.059291,0.452150,-0.000017,0.018772,0.016053,-0.044525,-0.014928,-0.000554,8.173664e-03,-0.049928
2,hasnainsheikh70,Logo and it's animation are best no need to ch...,CrXogaGgxAL,1,-0.039554,-0.013292,-0.026533,-0.017497,-0.008986,0.003767,-0.010883,0.004699,-2.616415e-04,-0.028892
3,aaru_s_g,*No fampay employees were hurt during this reel*,CrXogaGgxAL,1,-0.044502,0.170670,-0.008749,0.000733,0.010879,-0.032764,-0.011388,-0.012151,-1.070688e-03,-0.019365
4,mr.psykovsky,My Fampay Not Working 🥲 Some Error Occurred Sh...,CrXogaGgxAL,1,-0.046784,0.202594,-0.009648,0.000971,0.008767,-0.030260,-0.014421,-0.006131,7.100156e-04,-0.022401
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33328,torqh4x,Why I m not getting my QR code ..after update 💀,CopN45aJzLd,1,,,,,,,,,,
33329,_mbasta21,Wheree is watch giveaway????@fampay.in,CopN45aJzLd,1,,,,,,,,,,
33330,_priyam_37,❤️,CopN45aJzLd,1,,,,,,,,,,
33331,sajid_9346_x,Dear fampay how to earn money in your app with...,CopN45aJzLd,1,,,,,,,,,,
