In [1]:
!pip install umap-learn


Collecting umap-learn
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m916.1 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.10.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: umap-learn, pynndescent
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.3-py3-none-any.whl size=82807 sha256=cc73927ad997dfd4b83f97c4d6f1d2dc298af3cce860e326a73b089a67ea994c
  Stored in directory: /root/.cache/pip/wheels/a0/e8/c6/a37ea663620bd5200ea1ba0907ab3c217042c1d035ef606acc
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone
  Created wheel for p

In [2]:
!pip install --upgrade scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.3.0


In [18]:
from sklearn.cluster import HDBSCAN
import os
import json

import pandas as pd
from google.colab import drive
import umap
import matplotlib.pyplot as plt
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score, silhouette_score


In [5]:
drive.mount('/content/drive')

#input_path="/content/drive/MyDrive/TFM/Instagram.Post.json"
EMB_PATHS="/content/drive/MyDrive/TFM/Embeddings/ImgEmbeddings/ALIGNEmbeddings.csv"

data = pd.read_csv(EMB_PATHS, header=None)
ids = data.iloc[:, 0]
features = data.iloc[:, 1:]

Mounted at /content/drive


In [26]:
FOLDER_PATH = "/content/drive/MyDrive/TFM/Embeddings/ImgEmbeddings/Images"
img_ids = []
for folder_name in os.listdir(FOLDER_PATH):

  folder_path = os.path.join(FOLDER_PATH, folder_name)

  if os.path.isdir(folder_path):
    img_ids.append(folder_name)

In [27]:
TEXT_PATH = "/content/drive/MyDrive/TFM/Embeddings/Posts.json"
captions = {}
with open(TEXT_PATH, "r") as input:
  for line in input:
    post = json.loads(line)
    captions[post["_id"]] = post["caption"]

In [28]:
def evaluate_clustering(title, features, labels):
    """
    Calculate clustering evaluation metrics for a given clustering result and print the results.

    Parameters:
    - title: A string, title for the clustering result.
    - features: Features used for clustering (numpy array or DataFrame).
    - labels: Cluster labels assigned by the clustering algorithm.
    """
    try:
      davies_bouldin = davies_bouldin_score(features, labels)
    except:
      davies_bouldin = 'Not applicable'

    try:
      calinski_harabasz = calinski_harabasz_score(features, labels)
    except:
      calinski_harabasz = 'Not applicable'

    try:
      silhouette = silhouette_score(features, labels)
    except:
      silhouette = 'Not applicable'
    try:
      n_clusters = len(set(labels))
    except:
      n_clusters = 'Not applicable'

    print(f"{title} Clustering Evaluation:")
    print(f"Davies-Bouldin Score: {davies_bouldin}")
    print(f"Calinski-Harabasz Score: {calinski_harabasz}")
    print(f"Silhouette Score: {silhouette}\n")
    print(f"Number of clusters: {n_clusters}\n")

In [29]:
def generate_clusters(message_embeddings,
                    n_neighbors,
                    n_components,
                    min_cluster_size,
                    random_state = None):
  """
  Generate HDBSCAN cluster object after reducing embedding dimensionality with UMAP
  """

  umap_embeddings = (umap.UMAP(n_neighbors=n_neighbors,
                              n_components=n_components,
                              metric='cosine',
                              random_state=random_state)
                          .fit_transform(message_embeddings))

  clusters = HDBSCAN(min_samples = min_cluster_size,
                              metric='euclidean').fit_predict(umap_embeddings)

  return clusters, umap_embeddings

In [40]:
n_neighbors = 100
n_components = 3
min_cluster_size = 500
cluster_labels3, red_embeddings3 = generate_clusters(features,
          n_neighbors, n_components, min_cluster_size)
evaluate_clustering(f"HBDBSCAN-{n_neighbors}_{n_components}_{min_cluster_size}", features, cluster_labels3)



HBDBSCAN-100_3_500 Clustering Evaluation:
Davies-Bouldin Score: 0.9920219927797796
Calinski-Harabasz Score: 722.3548224944866
Silhouette Score: 0.20502413243888137

Number of clusters: 3



In [42]:
text = []
for i in img_ids:
  text.append(captions[i])

In [56]:
df = pd.DataFrame({'Image ID': img_ids, 'Text': text, 'Cluster Label': cluster_labels3,
                   'dim0':red_embeddings3[:,0],'dim1':red_embeddings3[:,1],'dim2':red_embeddings3[:,2]})


In [57]:
csv_file = "/content/drive/MyDrive/TFM/GoodClusters.csv"

df.to_csv(csv_file, index=False)


In [79]:
JSONPANTH = "/content/drive/MyDrive/TFM/GoodClusters.json"
with open(JSONPANTH, "w") as out:
  for i in range(len(img_ids)):
    d = {"_id":img_ids[i],"caption":text[i], "cluster_label":int(cluster_labels3[i]),
        "embeddings": [float(j) for j in list(red_embeddings3[i,:])]}
    out.write(json.dumps(d))
    out.write("\n")


In [76]:
d

{'_id': '6475d0a7967e4199e325bcbf',
 'caption': '#elfuturoesprovida #noalabortosialavida  #noalabortosialavida💙 #noalabortosialavida♥ #feminismohipocrita #ideologiadegenero #serprovida #hipocresiafeminista #ideologiadegenero  #feministashipócritas #patriarcado #elpatriarcadosevaacaer #provida #feminismo #abortoesviolencia #femiorcos #acanoserindenadie  #sialavida #aborto #pañueloverde💚 #trapoverde #niunamenos #noalaborto #abortolegal #salvemoslasdosvidas  #conmishijosnotemetas #conabortonotevoto #seraderogada',
 'cluster_label': 0,
 'embeddings': [7.188345909118652, 7.561276435852051, -7.417057991027832]}