In [None]:
import os
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd

import matplotlib.pyplot as plt
from PIL import Image
import torch
from torchvision import models, transforms

import json
from IPython.display import clear_output

In [None]:
# Preprocessing pipeline applied to every image before feature extraction:
# resize to 224x224, convert to a tensor, then normalise each colour channel.
# NOTE(review): the mean/std values look like the ImageNet statistics used by
# torchvision's pretrained models — confirm.
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]
_PREPROCESS_STEPS = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_CHANNEL_MEAN, std=_CHANNEL_STD),
]
transform = transforms.Compose(_PREPROCESS_STEPS)

# Load and preprocess an image
def load_and_preprocess_image(img_path):
    """Read an image file and turn it into a one-image batch.

    Parameters
    ----------
    img_path : path-like
        Location of the image file to open.

    Returns
    -------
    The output of the module-level ``transform`` applied to the RGB version
    of the image, with a leading batch dimension added by ``unsqueeze(0)``.
    """
    # Image.open is lazy: the context manager guarantees the underlying file
    # handle is released even if decoding fails. convert() returns a fully
    # loaded copy, so it stays usable after the file is closed.
    with Image.open(img_path) as img:
        rgb_img = img.convert('RGB')
    return transform(rgb_img).unsqueeze(0)

# Use a pretrained model to extract the features of one image
def extract_features(model, img_path, device):
    """Run ``model`` on a single image and return its output as a flat array.

    Parameters
    ----------
    model : callable
        Network applied to the preprocessed image batch.
    img_path : path-like
        Image to load through ``load_and_preprocess_image``.
    device : torch device
        Device the input batch is moved to before the forward pass.

    Returns
    -------
    numpy.ndarray
        The model output moved to the CPU and flattened to one dimension.
    """
    batch = load_and_preprocess_image(img_path)
    batch = batch.to(device)
    # Inference only: no gradient bookkeeping is needed.
    with torch.no_grad():
        output = model(batch)
    output_on_cpu = output.cpu()
    return output_on_cpu.numpy().flatten()

def charge_feature(model, device, image_paths):
    """Extract one L2-normalised feature vector per image.

    Parameters
    ----------
    model, device
        Forwarded unchanged to ``extract_features``.
    image_paths : iterable of paths
        Images to process, in the order the rows should appear.

    Returns
    -------
    numpy.ndarray
        One row per image, each row scaled to unit L2 norm by
        ``sklearn.preprocessing.normalize``.
    """
    rows = []
    for path in image_paths:
        rows.append(extract_features(model, path, device))
    stacked = np.array(rows)
    return normalize(stacked, norm='l2')

def cluster_features(similarity_matrix, eps = 0.07, min_samples = 2):
    """Cluster items with DBSCAN using ``1 - similarity`` as the distance.

    Parameters
    ----------
    similarity_matrix : array-like
        Pairwise similarities between the items.
    eps : float
        DBSCAN neighbourhood radius, expressed as a distance.
    min_samples : int
        DBSCAN ``min_samples`` parameter.

    Returns
    -------
    numpy.ndarray
        The labels returned by ``DBSCAN.fit_predict`` (DBSCAN marks noise
        samples with -1).
    """
    # Floating-point rounding can push a similarity marginally above 1, which
    # would make the distance negative; a precomputed distance matrix must be
    # non-negative, so clamp at 0 instead of relying on every caller to do it.
    distances = np.clip(1 - np.asarray(similarity_matrix), 0, None)
    dbscan = DBSCAN(metric='precomputed', eps=eps, min_samples=min_samples)
    labels = dbscan.fit_predict(distances)
    return labels

def find_indices(similarity_matrix, threshold=1):
    """Return the positions whose value is strictly greater than ``threshold``.

    Parameters
    ----------
    similarity_matrix : 2-D array-like
        Matrix to scan.
    threshold : number
        Exclusive lower bound; only entries ``> threshold`` are reported.

    Returns
    -------
    list of (int, int)
        ``(row, column)`` pairs in row-major order.
    """
    above = np.asarray(similarity_matrix) > threshold
    # np.argwhere scans in row-major order, matching a nested row/column loop
    # without the quadratic Python-level iteration.
    return [(int(row), int(col)) for row, col in np.argwhere(above)]

def compute_average_distance(features):
    """Mean Euclidean distance from each row to all rows of ``features``.

    The distance of a row to itself (zero) is part of the mean.

    Parameters
    ----------
    features : numpy.ndarray
        2-D array with one feature vector per row.

    Returns
    -------
    list
        One mean distance per row, in row order.
    """
    return [
        np.mean(np.linalg.norm(features - row, axis=1))
        for row in features
    ]

def compute_centers(features, labels, image_paths):
    """Choose one representative image per cluster label.

    For every distinct value in ``labels`` (whatever it is, so a noise label
    gets an entry too), the representative is the member whose mean distance
    to the members of its own cluster is the smallest.

    Parameters
    ----------
    features : numpy.ndarray
        One feature vector per row.
    labels : numpy.ndarray
        Cluster label of each row of ``features``.
    image_paths : sequence
        Image path of each row of ``features``.

    Returns
    -------
    dict
        Maps each label to the path of its representative image.
    """
    centers = {}
    for label in np.unique(labels):
        # Positions, in the full arrays, of the members of this cluster.
        member_positions = np.where(labels == label)[0]
        mean_distances = compute_average_distance(features[member_positions])
        best_member = member_positions[np.argmin(mean_distances)]
        centers[label] = image_paths[best_member]
    return centers

def convert_dict(dic):
    """Return a copy of ``dic`` whose keys are converted with ``str``.

    Values are kept as they are (not copied).
    """
    stringified = {}
    for key, value in dic.items():
        stringified[str(key)] = value
    return stringified

def cluster_video(dataset, model, device, video, return_similarities=False):
    """Cluster the images of one video and keep one image per cluster.

    Parameters
    ----------
    dataset : str
        Directory that contains one sub-directory per video.
    model, device
        Feature extractor and device, forwarded to ``charge_feature``.
    video : str
        Name of the video sub-directory. Its first three characters select
        the DBSCAN radius (0.05 for 'c+n' and 'bft', 0.1 otherwise).
    return_similarities : bool
        When true, the similarity matrix is appended to the returned tuple.

    Returns
    -------
    final_imgs : list
        Paths of the images labelled -1 followed by one representative path
        per cluster.
    clusters : dict
        Cluster label (as a string) -> list of image paths.
    count_track : dict
        Counts and mean pairwise similarity for the video.
    similarity_matrix : numpy.ndarray
        Only when ``return_similarities`` is true.

    Raises
    ------
    ValueError
        If the video directory contains no jpg/jpeg/png file.
    """
    channel = video[:3]
    eps = 0.05 if channel in ['c+n', 'bft'] else 0.1

    image_dir = os.path.join(dataset, video)
    image_paths = np.sort([
        os.path.join(image_dir, fname)
        for fname in os.listdir(image_dir)
        if fname.endswith(('jpg', 'jpeg', 'png'))
    ])
    if len(image_paths) == 0:
        # Fail with a clear message instead of an obscure shape error later.
        raise ValueError(f"No images found in {image_dir}")

    features = charge_feature(model, device, image_paths)
    similarity_matrix = cosine_similarity(features)
    # Rounding can push similarities marginally above 1. Clamp those entries
    # and their mirrored counterparts so that 1 - similarity is never negative.
    above_one = similarity_matrix > 1
    similarity_matrix[above_one | above_one.T] = 1

    labels = cluster_features(similarity_matrix, eps=eps)
    representative_images = compute_centers(features, labels, image_paths)
    enum_clust = np.unique(labels)
    dic_clust = {i:[] for i in enum_clust}
    for i, label in enumerate(labels):
        dic_clust[label].append(image_paths[i])

    # Mean of the off-diagonal entries (the diagonal is taken to be all ones).
    nb_images = len(similarity_matrix)
    nb_pairs = nb_images**2 - nb_images
    if nb_pairs:
        mean_similarity = (np.sum(similarity_matrix) - nb_images) / nb_pairs
    else:
        # A single image has no pair: undefined rather than a 0/0 warning.
        mean_similarity = float('nan')

    # Images labelled -1 are all kept; every other cluster contributes only
    # its representative.
    if -1 in dic_clust:
        final_imgs = dic_clust[-1].copy()
    else:
        final_imgs = []
    for label in representative_images:
        if label != -1:
            final_imgs.append(representative_images[label])

    count_track = {
        "original_nb_img": len(image_paths),
        "reduced_nb_img": len(final_imgs),
        # Count real clusters only; subtracting 1 unconditionally
        # under-counted whenever no image carried the -1 label.
        "nb_clusters": int(np.count_nonzero(enum_clust != -1)),
        "nb_outliers": len(dic_clust[-1]) if -1 in dic_clust else 0,
        "mean_similarity": mean_similarity
        }
    if return_similarities:
        return final_imgs, convert_dict(dic_clust), count_track, similarity_matrix
    else:
        return final_imgs, convert_dict(dic_clust), count_track


def main(dataset, video_list=None):
    """Run ``cluster_video`` on every video and save the results as JSON.

    Parameters
    ----------
    dataset : str
        Directory containing one sub-directory per video.
    video_list : iterable of str, optional
        Videos to process; defaults to every entry of ``dataset``.

    Returns
    -------
    img_list : dict
        video -> retained image paths.
    clusters : dict
        video -> cluster dictionary.
    metadata : dict
        video -> statistics dictionary.
    sims : list
        ``[video, similarity_matrix]`` pairs for the processed videos.
    error_videos : list
        Videos whose processing raised an exception.

    Side effects
    ------------
    Writes ``img_list2.json`` and ``clusters2.json`` to the working directory.
    """
    img_list = {}
    clusters = {}
    metadata = {}
    error_videos = []
    if video_list is None:
        video_list = os.listdir(dataset)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # NOTE(review): recent torchvision releases deprecate `pretrained=` in
    # favour of `weights=` — switch once the installed version is confirmed.
    model = models.resnet50(pretrained=True)
    model = torch.nn.Sequential(*list(model.children())[:-1])  # Drop the final FC layer
    model = model.to(device)
    model.eval()
    sims = []
    for i,video in enumerate(video_list):
        try:
            final_imgs, dic_clust, count_track, sim = cluster_video(dataset, model, device, video, return_similarities=True)
        except Exception as exc:
            # Best effort: record the failure and carry on with the next
            # video. Catching Exception (not a bare except) lets
            # KeyboardInterrupt stop the run, and the cause is reported.
            print(f"Error with video {video}: {exc!r}")
            error_videos.append(video)
            continue
        img_list[video] = final_imgs
        clusters[video] = dic_clust
        metadata[video] = count_track
        sims.append([video, sim])

        print(f"Video {video} ok : {count_track}")
        print(f"Progression : {i+1}/{len(video_list)}")
        clear_output(wait=True)

    with open('img_list2.json', 'w') as f:
        json.dump(img_list, f)

    with open('clusters2.json', 'w') as f:
        json.dump(clusters, f)

    return img_list, clusters, metadata, sims, error_videos

        


In [None]:
# `main` only creates the device and model as locals, so build them at
# notebook scope here (same construction as in `main`) before calling
# `cluster_video` directly.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = models.resnet50(pretrained=True)
model = torch.nn.Sequential(*list(model.children())[:-1])  # Drop the final FC layer
model = model.to(device)
model.eval()

video = "bft_20230620T184222"
final_imgs, dic_clust, count_track = cluster_video("/home/msouda/Datasets/true_anonymized", model, device, video)
print(f"Video {video} ok : {count_track}")

In [None]:
# Run the pipeline on every entry of the dataset directory
# (`video_list` is left to its default, i.e. os.listdir(dataset)).
imgs, clusters, metadata, sims, error_videos = main('/home/msouda/Datasets/true_anonymized')

In [None]:
# Display the retained image paths per video, as returned by main().
imgs

In [None]:
# Display the videos for which cluster_video raised an exception.
error_videos

In [None]:
# Number of collected similarity entries.
len(sims)

In [None]:
# Save one similarity matrix per video. np.save does not create missing
# directories, so make sure the output folder exists first.
os.makedirs("cluster_similarities", exist_ok=True)
# Each entry of `sims` is a [video name, similarity matrix] pair.
for channel, sim in sims:
    np.save(f"cluster_similarities/{channel}_sim.npy", sim)

In [None]:
# Histogram of the pairwise similarities of each video.
for channel, sim in sims:
    # Strict upper triangle = every unordered pair once, diagonal excluded;
    # taken with one vectorised index instead of a nested Python loop.
    pair_sims = np.asarray(sim)[np.triu_indices(len(sim), k=1)]
    plt.hist(pair_sims, bins=100)
    plt.title(channel)
    plt.show()

In [None]:
# Persist the per-video clustering statistics as JSON.
with open('metadata_clusters.json', 'w') as metadata_file:
    json.dump(metadata, metadata_file)

In [None]:
# Show the per-video statistics as a table (one row per video), ordered by
# the number of outliers.
pd.DataFrame.from_dict(metadata, orient='index').sort_values(by='nb_outliers')

In [None]:
# Load the video metadata, keep the videos whose duration is below 150 and
# reduce `video_id` to the last component of its '/'-separated value.
metadata_videos = (
    pd.read_csv('metadata_videos.csv')
    .sort_values(by='duration')
    .query('duration<150')
    .assign(
        video_id=lambda frame: frame['video_id'].apply(
            lambda full_id: full_id.split('/')[-1]
        )
    )
)
metadata_videos

In [None]:
# Re-run the pipeline on the selected videos only. main() returns five
# values, so all five must be unpacked (three names raise ValueError).
imgs, clusters, metadata, sims, error_videos = main('/home/msouda/Datasets/true_anonymized', video_list=metadata_videos['video_id'].values)

In [None]:
# Statistics table ordered by number of outliers; `a` is reused by the cells below.
a = pd.DataFrame.from_dict(metadata, orient='index').sort_values(by='nb_outliers')
# Scatter of the number of clusters against the mean similarity. The
# commented-out lines are alternative y-axes (outliers, original image count).
#plt.plot(a["mean_similarity"], a["nb_outliers"], 'o', label='Outliers')
plt.plot(a["mean_similarity"], a["nb_clusters"], 'o', label='Clusters')
#plt.plot(a["mean_similarity"], a["original_nb_img"], 'o', label='Original nb images')
plt.legend()
plt.show()

In [None]:
# Reorder the table from the most to the least self-similar video.
a = a.sort_values(by='mean_similarity', ascending=False)

In [None]:
# Video names in the current row order of `a`.
a.index

In [None]:
# For every video of `a` (in row order), recompute the similarity matrix and
# keep only the entries below the diagonal, i.e. each pair of images once.
# NOTE(review): requires notebook-level `model` and `device`; `main` only
# creates them as locals.
sims = []
for x in a.index:
    outputs = cluster_video('/home/msouda/Datasets/true_anonymized', model, device, x, return_similarities=True)
    sim_matrix = outputs[3]
    nb_images = len(sim_matrix)
    sim = [
        sim_matrix[row][col]
        for col in range(nb_images)
        for row in range(col + 1, nb_images)
    ]
    sims.append(sim)

In [None]:
# Display the statistics table.
a

In [None]:
# One histogram of pairwise similarities per video, preceded by its statistics.
for i, x in enumerate(a.index):
    print(a.loc[x, 'mean_similarity'], a.loc[x, 'nb_outliers'], a.loc[x, 'nb_clusters'], a.loc[x, 'original_nb_img'])
    # `a` has no 'sims' column; the similarities live in the `sims` list,
    # which was filled by iterating over `a.index`, so position i matches x.
    plt.hist(sims[i], bins=100, alpha=0.5, label=x)
    plt.legend()
    plt.show()

In [None]:
# Gather three statistics per video (in `metadata` order) and plot each one
# rescaled to [0, 1] with min-max scaling so the curves share a single axis.
ms, n_cl, n_out = (
    np.array([metadata[video][key] for video in metadata])
    for key in ("mean_similarity", "nb_clusters", "nb_outliers")
)
for series, series_name in (
    (ms, "mean_similarity"),
    (n_cl, "nb_clusters"),
    (n_out, "nb_outliers"),
):
    lowest = np.min(series)
    plt.plot((series-lowest)/(np.max(series)-lowest), label=series_name)
plt.legend()
plt.show()

In [None]:
# Scatter of the number of clusters against the mean similarity.
plt.plot(ms, n_cl, 'o')
plt.show()

In [None]:
# Display the current value of `video`.
video

In [None]:
# Use a pretrained model (ResNet-50 is used here)
# NOTE(review): in the visible cells `image_paths` is only ever assigned as a
# local inside cluster_video(), so this cell presumably depends on state from
# a cell that is no longer shown — TODO confirm / recreate it.


print(device)
print(image_paths)
print(model)

In [None]:
# NOTE(review): `image_paths` is not assigned at notebook scope in the
# visible cells — TODO confirm where it comes from.
image_paths

In [None]:
# NOTE(review): `features_normalized` is only a local of charge_feature() in
# the visible cells — presumably it should be the result of
# charge_feature(model, device, image_paths); TODO confirm.
features_normalized.shape

In [None]:
# Use PCA to reduce the dimensionality (optional but recommended for large datasets)
pca = PCA(n_components=50)
features_reduced = pca.fit_transform(features_normalized)
# Re-normalise each reduced vector to unit L2 norm.
features_reduced = normalize(features_reduced, norm='l2')


In [None]:
# Display the PCA-reduced, re-normalised features.
features_reduced

In [None]:
# Shape of the reduced feature matrix.
features_reduced.shape

In [None]:
# Check whether any component of the normalised features exceeds 1.
(features_normalized>1).any()

In [None]:
# Compute the cosine similarity matrix
similarity_matrix = cosine_similarity(features_normalized)

# NOTE(review): this repeats the `find_indices` definition from the helper
# cell near the top of the notebook; the later definition shadows the earlier.
def find_indices(similarity_matrix, threshold=1):
    """Return the positions whose value is strictly greater than ``threshold``.

    Parameters
    ----------
    similarity_matrix : 2-D array-like
        Matrix to scan.
    threshold : number
        Exclusive lower bound; only entries ``> threshold`` are reported.

    Returns
    -------
    list of (int, int)
        ``(row, column)`` pairs in row-major order.
    """
    above = np.asarray(similarity_matrix) > threshold
    # np.argwhere scans in row-major order, matching a nested row/column loop
    # without the quadratic Python-level iteration.
    return [(int(row), int(col)) for row, col in np.argwhere(above)]

# Clamp to 1 every similarity that is strictly above 1, so that the
# distance 1 - similarity used below is not negative for those entries.
x =find_indices(similarity_matrix)
for i, j in x:
    #print(f"Value on index {i} and {j} is {similarity_matrix[i][j]}")
    similarity_matrix[i][j] = 1

# Apply DBSCAN using the cosine similarity as the distance measure
dbscan = DBSCAN(metric='precomputed', eps=0.07, min_samples=2)
labels = dbscan.fit_predict(1 - similarity_matrix)  # 1 - similarity gives the distance

# Show the results
for i, label in enumerate(labels):
    print(f"Image {image_paths[i]} is in cluster {label}")


In [None]:

# Display the cluster -> image paths dictionary.
dic_clust

In [None]:
# NOTE(review): this repeats the `compute_average_distance` definition from
# the helper cell near the top of the notebook and shadows it.
def compute_average_distance(features):
    """Mean Euclidean distance from each row to all rows of ``features``.

    The distance of a row to itself (zero) is part of the mean.

    Parameters
    ----------
    features : numpy.ndarray
        2-D array with one feature vector per row.

    Returns
    -------
    list
        One mean distance per row, in row order.
    """
    return [
        np.mean(np.linalg.norm(features - row, axis=1))
        for row in features
    ]

In [None]:
# Pick one representative image per label with the shared helper. The inline
# loop assigned a notebook-level variable named `cluster_features`, which
# overwrote the `cluster_features` function and broke any later call to
# cluster_video().
# NOTE(review): `features` is not assigned at notebook scope in the visible
# cells — presumably the normalised feature matrix; TODO confirm.
representative_images = compute_centers(features, labels, image_paths)
representative_images

In [None]:
# Show every image of one randomly chosen cluster and flag its representative.
# NOTE(review): `enum_clust` is only a local of cluster_video() in the visible
# cells. Also, if `dic_clust` is the dictionary returned by cluster_video(),
# its keys are strings (convert_dict), so indexing it with a label taken from
# `enum_clust` would presumably raise KeyError — TODO confirm.
# No random seed is set, so the chosen cluster changes between runs.
i = np.random.randint(0, len(enum_clust))
list_img = dic_clust[enum_clust[i]]
for path in list_img:
    if path == representative_images[enum_clust[i]]:
        print(f"Representative image for cluster {enum_clust[i]}")
    img = Image.open(path)
    plt.imshow(img)
    plt.show()

In [None]:

# Sanity check: retained images, number of representatives minus one, and
# number of images stored under key -1.
# NOTE(review): if `dic_clust` is the dictionary returned by cluster_video(),
# its keys are strings, so `dic_clust[-1]` would presumably need to be
# `dic_clust['-1']` — TODO confirm.
print(len(final_imgs))
print(len(representative_images)-1)
print(len(dic_clust[-1]))

In [None]:
# Add the representative of cluster 0 to the retained images.
# NOTE(review): this mutates `final_imgs` in place, so re-running the cell
# appends the same path again.
final_imgs.append(representative_images[0])
final_imgs

In [None]:
# Number of retained images.
len(final_imgs)

In [None]:
# Print the representative image of every label except -1.
for i in representative_images:
    if i!=-1:
        print(representative_images[i])