In [1]:
from wildlife_datasets import datasets, splits
import torchvision.transforms as T
import wildlife_tools.data.__init__ as tools
import timm
from wildlife_tools.features import DeepFeatures
from sklearn.metrics import adjusted_rand_score
from active_semi_clustering.semi_supervised.pairwise_constraints import PCKMeans, COPKMeans, MKMeans, MPCKMeans, MPCKMeans, RCAKMeans
from active_semi_clustering.semi_supervised.labeled_data import KMeans
from active_semi_clustering.active.pairwise_constraints import ExampleOracle, ExploreConsolidate, MinMax
from sklearn import metrics
from active_semi_clustering.active.pairwise_constraints import ExampleOracle, ExploreConsolidate, MinMax
import numpy as np
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances
from itertools import combinations
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


## Carga del dataset

In [2]:
# Cargamos el dataset de tortugas
dataset_path = 'data/SeaTurtleID2022'
datasets.SeaTurtleID2022.get_data(dataset_path)
metadata = datasets.SeaTurtleID2022(dataset_path)
data_df = metadata.df

DATASET SeaTurtleID2022: DOWNLOADING STARTED.
DATASET SeaTurtleID2022: EXTRACTING STARTED.
DATASET SeaTurtleID2022: FINISHED.



In [2]:
# Cargamos el dataset de tortugas
dataset_path = 'data/CowDataset'
datasets.CowDataset.get_data(dataset_path)
metadata = datasets.CowDataset(dataset_path)
data_df = metadata.df

DATASET CowDataset: DOWNLOADING STARTED.
You are trying to download an already downloaded dataset.
        This message may have happened to due interrupted download or extract.
        To force the download use the `force=True` keyword such as
        get_data(..., force=True) or download(..., force=True).
        


## Transformaciones

In [9]:
transform = T.Compose([T.Resize(size=(384, 384)),
                              T.ToTensor(), 
                              T.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])
dataset = tools.WildlifeDataset(metadata=data_df, root="./data/SeaTurtleID2022", transform=transform)

## Extracción de características

In [10]:
backboneDescriptor = timm.create_model("hf-hub:BVRA/MegaDescriptor-L-384", pretrained=True, num_classes=0)
extractorDescritor = DeepFeatures(backboneDescriptor)
outputFeaturesDescritor = extractorDescritor(dataset)

  1%|▉                                                            | 1/69 [04:41<5:18:33, 281.08s/it]


RuntimeError: DataLoader worker (pid(s) 5176) exited unexpectedly

In [5]:
outputClasses = data_df['identity'].to_numpy()

## Extracción del numero de clusters

In [6]:
numberOfCluster = data_df['identity'].nunique()

## Refinamiento de Caracteristicas Extraidas

In [9]:
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_visualized = tsne.fit_transform(outputFeaturesDescritor)

In [10]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_visualized)

## Extracción de restricciones

In [11]:
def density_based_threshold_must(X, percentile=20):
    # Entrena un modelo de vecinos más cercanos
    nbrs = NearestNeighbors(n_neighbors=numberOfCluster).fit(X)
    distances, _ = nbrs.kneighbors(X)
    
    # Selecciona las distancias al vecino más cercano (primer vecino, excepto el punto en sí)
    avg_distances = distances[:, 1]  # Distancia al vecino más cercano real
    return np.percentile(avg_distances, percentile)  # Percentil bajo para zonas densas

# Aplica la función al dataset
threshold_must = density_based_threshold_must(X_scaled)
print("Threshold must-link basado en densidad:", threshold_must)

Threshold must-link basado en densidad: 0.0051595037686212675


In [12]:
def density_based_threshold(X, percentile=80):
    nbrs = NearestNeighbors(n_neighbors=13).fit(X)
    distances, _ = nbrs.kneighbors(X)
    avg_distances = distances[:, -1]  # Distancia al vecino más lejano entre los 10 más cercanos
    return np.percentile(avg_distances, percentile)

threshold_cannot = density_based_threshold(X_scaled)
print("Threshold cannot-link basado en densidad:", threshold_cannot)

Threshold cannot-link basado en densidad: 0.17314173210384964


In [16]:
def generate_constraints(X, y, threshold_must, threshold_cannot):
    distances = euclidean_distances(X)
    must_link = []
    cannot_link = []
    
    # Recorre combinaciones de pares de puntos
    for i, j in combinations(range(len(y)), 2):
        if distances[i, j] <= threshold_must and y[i] == y[j]:  # misma etiqueta, distancia corta
            must_link.append((i, j))
        elif distances[i, j] >= threshold_cannot and y[i] != y[j]:  # diferente etiqueta, distancia larga
            cannot_link.append((i, j))
    
    return must_link, cannot_link 

must_link, cannot_link = generate_constraints(X_scaled, outputClasses, threshold_must, threshold_cannot)

print("Restricciones must-link:", must_link[:5])  # Muestra las primeras 5 restricciones must-link
print("Restricciones cannot-link:", cannot_link[:5])  # Muestra las primeras 5 restricciones cannot-link

Restricciones must-link: [(6, 399), (10, 1102), (18, 22), (19, 869), (23, 1389)]
Restricciones cannot-link: [(0, 1), (0, 3), (0, 4), (0, 5), (0, 6)]


In [18]:
# TODO implement your own oracle that will, for example, query a domain expert via GUI or CLI
oracle = ExampleOracle(outputClasses, max_queries_cnt=20)

active_learner = MinMax(n_clusters=numberOfCluster)
active_learner.fit(outputFeaturesDescritor, oracle=oracle)
pairwise_constraints = active_learner.pairwise_constraints_

## Evaluaciones

## Supervisado

In [17]:
# 1. Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(outputFeaturesDescritor, outputClasses, test_size=0.2, random_state=42)

# 2. Escalar los datos
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Entrenar un clasificador (por ejemplo, SVM)
clf = SVC(kernel='linear', random_state=42)
clf.fit(X_train_scaled, y_train)

# 4. Predecir en los datos de prueba
y_pred = clf.predict(X_test_scaled)

# 5. Evaluar el rendimiento
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9932659932659933
Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00        20
           2       1.00      1.00      1.00        21
           3       1.00      1.00      1.00        15
           4       0.96      1.00      0.98        26
           5       1.00      1.00      1.00        27
           6       0.96      1.00      0.98        26
           7       1.00      0.95      0.98        22
           8       1.00      0.96      0.98        25
           9       1.00      1.00      1.00        24
          10       1.00      1.00      1.00        28
          11       1.00      1.00      1.00        18
          12       1.00      1.00      1.00        26
          13       1.00      1.00      1.00        19

    accuracy                           0.99       297
   macro avg       0.99      0.99      0.99       297
weighted avg       0.99      0.99      0.99       297



## Semi Supervisado

In [19]:
n_runs = 10  # Número de repeticiones
rand_scores = []

for _ in range(n_runs):
    clusterer = PCKMeans(n_clusters=numberOfCluster, max_iter=1000, w=0)
    clusterer.fit(X_scaled, ml=pairwise_constraints[0], cl=pairwise_constraints[1])
    score = adjusted_rand_score(outputClasses, clusterer.labels_)
    rand_scores.append(score)

mean_rand_score = np.mean(rand_scores)
std_rand_score = np.std(rand_scores)

print(f"Rand Score Promedio: {mean_rand_score:.4f} ± {std_rand_score:.4f}")

Rand Score Promedio: 0.6473 ± 0.0362


In [20]:
n_runs = 10  # Número de repeticiones
rand_scores = []

for _ in range(n_runs):
    clusterer = PCKMeans(n_clusters=numberOfCluster, max_iter=100, w=1)
    clusterer.fit(X_scaled, outputClasses, ml=must_link, cl=cannot_link)
    score = adjusted_rand_score(outputClasses, clusterer.labels_)
    rand_scores.append(score)

mean_rand_score = np.mean(rand_scores)
std_rand_score = np.std(rand_scores)

print(f"Rand Score Promedio: {mean_rand_score:.4f} ± {std_rand_score:.4f}")

Rand Score Promedio: 0.9874 ± 0.0084


In [21]:
homogeneity = homogeneity_score(outputClasses, clusterer.labels_)
completeness = completeness_score(outputClasses, clusterer.labels_)
v_measure = v_measure_score(outputClasses, clusterer.labels_)

print(f"Homogeneidad: {homogeneity:.4f}")
print(f"Completitud: {completeness:.4f}")
print(f"V-Measure: {v_measure:.4f}")

Homogeneidad: 1.0000
Completitud: 1.0000
V-Measure: 1.0000


In [None]:
n_runs = 10  # Número de repeticiones
rand_scores = []

for _ in range(n_runs):
    clusterer = PCKMeans(n_clusters=numberOfCluster, max_iter=100, w=1)
    clusterer.fit(X_scaled, ml=must_link, cl=cannot_link)
    score = adjusted_rand_score(outputClasses, clusterer.labels_)
    rand_scores.append(score)

mean_rand_score = np.mean(rand_scores)
std_rand_score = np.std(rand_scores)

print(f"Rand Score Promedio: {mean_rand_score:.4f} ± {std_rand_score:.4f}")

In [35]:
n_runs = 10  # Número de repeticiones
rand_scores = []

for _ in range(n_runs):
    clusterer = COPKMeans(n_clusters=numberOfCluster, max_iter=100)
    clusterer.fit(X_scaled, outputClasses, ml=must_link, cl=cannot_link)
    score = adjusted_rand_score(outputClasses, clusterer.labels_)
    rand_scores.append(score)

mean_rand_score = np.mean(rand_scores)
std_rand_score = np.std(rand_scores)


print(f"Rand Score Promedio: {mean_rand_score:.4f} ± {std_rand_score:.4f}")

EmptyClustersException: 

In [None]:
homogeneity = homogeneity_score(outputClasses, clusterer.labels_)
completeness = completeness_score(outputClasses, clusterer.labels_)
v_measure = v_measure_score(outputClasses, clusterer.labels_)

print(f"Homogeneidad: {homogeneity:.4f}")
print(f"Completitud: {completeness:.4f}")
print(f"V-Measure: {v_measure:.4f}")