In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import plotly.express as px
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.decomposition import TruncatedSVD
import plotly.io as pio
import plotly.graph_objs as go
import numpy as np
from plotly.graph_objs import Scatter, Layout, Figure
import random

In [25]:
data = pd.read_csv("../../fampay_instagram_comments_all_new.csv")
data = data.dropna(subset=['text'])

In [26]:
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_df=0.8, min_df=2, stop_words="english")

# Transform the text data into a matrix of TF-IDF features
tfidf_matrix = vectorizer.fit_transform(data["text"])

# Choose the number of clusters
num_clusters = 5

# Perform K-means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
data["cluster"] = kmeans.fit_predict(tfidf_matrix)

# Perform dimensionality reduction with TruncatedSVD
svd = TruncatedSVD(n_components=10)
reduced_data = svd.fit_transform(tfidf_matrix)

# Create a new DataFrame with the SVD results and cluster labels
clustered_data = pd.DataFrame(reduced_data, columns=["PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10"])
clustered_data["cluster"] = data["cluster"]





In [27]:
# Reduce the number of data points by selecting a random subset
sample_size = 100000  # Adjust this value based on your computer's capabilities
random_indices = np.random.choice(data.index, size=sample_size, replace=False)
sampled_data = data.loc[random_indices]

# Update the TF-IDF matrix and clustering
tfidf_matrix = vectorizer.fit_transform(sampled_data["text"])
sampled_data["cluster"] = kmeans.fit_predict(tfidf_matrix)

# Update the dimensionality reduction
reduced_data = svd.fit_transform(tfidf_matrix)

# Update the clustered_data DataFrame
clustered_data = pd.DataFrame(reduced_data, columns=["PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10"])
clustered_data["cluster"] = sampled_data["cluster"]

# Create a 2D scatter plot of the SVD results, colored by cluster
fig = go.Figure()

for cluster_label in clustered_data["cluster"].unique():
    cluster_data = clustered_data[clustered_data["cluster"] == cluster_label]
    fig.add_trace(go.Scatter(
        x=cluster_data["PC1"],
        y=cluster_data["PC2"],
        mode="markers",
        name=f"Cluster {cluster_label}",
        text=sampled_data.loc[cluster_data.index, "text"],
        marker=dict(size=5, line=dict(width=2, color="DarkSlateGrey")),
    ))

fig.update_layout(
    title="2D Scatter Plot of Clusters",
    xaxis_title="PC1",
    yaxis_title="PC2",
    width=800,
    height=800,
)

pio.write_html(fig, file='Clustering.html', auto_open=True)
fig.show()





In [28]:
# Reduce the number of data points by selecting a random subset
sample_size = 100000  # Set this value to 100000
random_indices = random.sample(range(len(data)), sample_size)
sampled_data = data.iloc[random_indices]

# Update the TF-IDF matrix and clustering
tfidf_matrix = vectorizer.fit_transform(sampled_data["text"])
sampled_data["cluster"] = kmeans.fit_predict(tfidf_matrix)

# Update the dimensionality reduction
reduced_data = svd.fit_transform(tfidf_matrix)

# Update the clustered_data DataFrame
clustered_data = pd.DataFrame(reduced_data, columns=["PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10"])
clustered_data["cluster"] = sampled_data["cluster"]

# Create a 3D scatter plot of the SVD results, colored by cluster
fig = px.scatter_3d(
    clustered_data,
    x="PC1",
    y="PC2",
    z="PC3",
    color="cluster",
    symbol="cluster",
    width=800,
    height=800,
    hover_name=sampled_data["text"],
    hover_data=["cluster"],
)
fig.update_traces(marker=dict(size=5, line=dict(width=2, color="DarkSlateGrey")), selector=dict(mode="markers"))
pio.write_html(fig, file='Clustering3D.html', auto_open=True)
fig.show()





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [38]:
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_df=0.8, min_df=2, stop_words="english")

# Transform the text data into a matrix of TF-IDF features
tfidf_matrix = vectorizer.fit_transform(data["text"])

# Choose the number of clusters
num_clusters = 30

# Perform K-means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
data["cluster"] = kmeans.fit_predict(tfidf_matrix)

# Perform dimensionality reduction with TruncatedSVD
svd = TruncatedSVD(n_components=30)
reduced_data = svd.fit_transform(tfidf_matrix)

# Create a new DataFrame with the SVD results and cluster labels
# clustered_data = pd.DataFrame(reduced_data, columns=["PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10", "PC11", "PC12", "PC13", "PC14", "PC15"])
clustered_data["cluster"] = data["cluster"]





In [43]:
import os

# Determine the number of clusters
num_clusters = len(clustered_data["cluster"].unique())

# Create the "clusters" directory if it doesn't exist
dir_name = f"clusters_{num_clusters}"
if not os.path.exists(dir_name):
    os.makedirs(dir_name)

# Save a sample of each cluster to a text file
for i, cluster_label in enumerate(clustered_data["cluster"].unique()):
    cluster_data = clustered_data[clustered_data["cluster"] == cluster_label]
    sample_indices = cluster_data.sample(min(40, len(cluster_data)), random_state=42).index
    with open(f"{dir_name}/{i}_cluster_{cluster_label}_sample.txt", "w") as outfile:
        for index in sample_indices:
            username = data.loc[index, "username"]
            shortcode = data.loc[index, "shortcode"]
            comment = data.loc[index, "text"]
            outfile.write(f"{comment}, {username}, {shortcode}\n")
