In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from scipy.spatial.distance import cdist
import heapq
from collections import defaultdict
from sklearn.cluster import OPTICS
import os
import math
from itertools import compress
import seaborn as sns
import warnings
from collections import defaultdict
from collections import Counter
import warnings

warnings.filterwarnings("ignore", category=RuntimeWarning)

def get_dominant_parent_cluster(child_id, generation, population_df, crossover_df, cluster_map_prev_gen, pop_data):
    """
    Return the previous-generation cluster an individual should inherit.

    Resolution order:
      1. If the individual's ID already exists in generation ``generation - 1``
         (an elite carried over unchanged), return the cluster it belonged to
         there, or ``None`` if it was not in any cluster.
      2. Otherwise look up its parents in ``crossover_df`` (first row whose
         ``Child1`` or ``Child2`` equals the individual's ID). If no row matches,
         return ``None``.
      3. If both parents have a cluster and both parent vectors are known,
         return the cluster of the parent that is closer (Euclidean distance,
         ties go to ``Parent1``). Otherwise return the cluster of whichever
         parent has one, preferring ``Parent1``; ``None`` if neither does.

    Parameters
    ----------
    child_id : int
        Positional index of the individual inside generation ``generation``
        (used with ``.iloc`` on the generation's rows and as index into ``pop_data``).
    generation : int
        Value of the ``"generation"`` column the individual lives in. The
        previous generation is taken to be ``generation - 1``.
    population_df : pandas.DataFrame
        Must contain ``"id"`` and ``"generation"`` columns; all remaining
        columns are treated as the coordinate vector.
    crossover_df : pandas.DataFrame
        Must contain ``"Parent1"``, ``"Parent2"``, ``"Child1"``, ``"Child2"``.
    cluster_map_prev_gen : dict
        Maps a cluster ID to a container of member IDs (tested with ``in``).
    pop_data : indexable
        Coordinate vectors of generation ``generation``; ``pop_data[child_id]``
        is the individual's vector.

    Returns
    -------
    The selected cluster ID, or ``None`` when no cluster can be determined.
    """

    def cluster_of(individual_id):
        # First cluster (in dict order) that lists the individual, else None.
        for cluster_id, members in cluster_map_prev_gen.items():
            if individual_id in members:
                return cluster_id
        return None

    # ID -> coordinate vector for every individual of the previous generation.
    prev_rows = population_df[population_df["generation"] == generation - 1]
    prev_vectors = dict(zip(
        prev_rows["id"],
        prev_rows.drop(columns=["id", "generation"]).to_numpy(),
    ))

    # Read the ID straight from the column so an integer ID is not upcast to
    # float by a mixed-dtype row Series.
    curr_rows = population_df[population_df["generation"] == generation]
    child_global_id = curr_rows["id"].iloc[child_id]

    # Already present one generation earlier -> elite: keep its old cluster.
    if child_global_id in prev_vectors:
        return cluster_of(child_global_id)

    # Locate the crossover record that produced this individual.
    matches = crossover_df[
        (crossover_df["Child1"] == child_global_id) | (crossover_df["Child2"] == child_global_id)
    ]
    if matches.empty:
        return None

    record = matches.iloc[0]
    parent1 = record["Parent1"]
    parent2 = record["Parent2"]

    parent1_cluster = cluster_of(parent1)
    parent2_cluster = cluster_of(parent2)

    # Only one (or no) parent is clustered: nothing to compare.
    if parent1_cluster is None:
        return parent2_cluster
    if parent2_cluster is None:
        return parent1_cluster

    parent1_vector = prev_vectors.get(parent1)
    parent2_vector = prev_vectors.get(parent2)
    if parent1_vector is None or parent2_vector is None:
        # Distances cannot be computed; fall back to the first parent.
        return parent1_cluster

    # Both parents are clustered and known: the closer parent wins.
    child_vector = pop_data[child_id]
    d1 = np.linalg.norm(child_vector - parent1_vector)
    d2 = np.linalg.norm(child_vector - parent2_vector)
    return parent1_cluster if d1 <= d2 else parent2_cluster

def schwefel(coordinates):
    """
    Schwefel benchmark value for a coordinate vector.

    Computes ``418.9829 * n - sum(x_i * sin(sqrt(|x_i|)))`` where ``n`` is
    ``len(coordinates)``. The terms are accumulated left to right.
    """
    oscillation = 0
    for value in coordinates:
        oscillation = oscillation + value * math.sin(math.sqrt(abs(value)))
    offset = 418.9829 * len(coordinates)
    return offset - oscillation

def sphere(x):
    """Sphere benchmark: the sum of the squared components of ``x``."""
    squared_components = [component ** 2 for component in x]
    total = sum(squared_components)
    return total

def _assign_fresh_ids(curr_clusters_members, id_mapping, global_id_counter):
    """Give every current cluster that has no mapping yet a brand-new global ID (in dict order)."""
    for cid in curr_clusters_members:
        if cid not in id_mapping:
            id_mapping[cid] = global_id_counter
            global_id_counter += 1
    return global_id_counter


def _map_by_jaccard(prev_clusters_members, curr_clusters_members):
    """Map current clusters to previous IDs by best positive Jaccard overlap of their member sets."""
    # Every previous cluster proposes itself to the ONE current cluster it
    # overlaps best with. Starting at 0 means a previous cluster that shares no
    # member with any current cluster proposes nothing, so its ID is not handed
    # to an unrelated cluster.
    suggestions = defaultdict(list)
    for prev_id, prev_members in prev_clusters_members.items():
        best_score = 0.0
        best_curr_id = None
        for curr_id, curr_members in curr_clusters_members.items():
            union = len(prev_members | curr_members)
            if union == 0:
                continue
            jaccard = len(prev_members & curr_members) / union
            if jaccard > best_score:
                best_score = jaccard
                best_curr_id = curr_id
        if best_curr_id is not None:
            suggestions[best_curr_id].append((prev_id, best_score))

    # Each previous ID appears in exactly one proposal list, so a current
    # cluster simply takes its highest-scoring proposal (first one on ties).
    return {
        curr_id: max(proposals, key=lambda proposal: proposal[1])[0]
        for curr_id, proposals in suggestions.items()
    }


def _map_by_parent_votes(gen_index, prev_clusters_members, curr_clusters_members,
                         clusters_array, pop_data, pop_data_df, crossover_data_df):
    """Map current clusters to previous IDs by majority vote over their members' parent clusters."""
    # For every clustered individual, find the historical cluster it descends
    # from and collect those IDs per current cluster.
    cluster_to_past_ids = defaultdict(list)
    for ident, cluster in enumerate(clusters_array):
        if cluster == -1:
            continue
        dominant_parent_cluster = get_dominant_parent_cluster(
            child_id=ident,
            generation=gen_index,
            population_df=pop_data_df,
            crossover_df=crossover_data_df,
            cluster_map_prev_gen=prev_clusters_members,
            pop_data=pop_data
        )
        if dominant_parent_cluster is not None:
            cluster_to_past_ids[cluster].append(dominant_parent_cluster)

    # Each current cluster requests its most frequent historical ID; ties are
    # broken by the historical cluster's size.
    requests = defaultdict(list)
    for curr_cluster_id, past_ids in cluster_to_past_ids.items():
        counter = Counter(past_ids)
        top = counter.most_common()
        max_count = top[0][1]
        top_candidates = [cid for cid, cnt in top if cnt == max_count]
        best_candidate = max(
            top_candidates,
            key=lambda cid: len(prev_clusters_members.get(cid, set()))
        )
        requests[best_candidate].append(
            (curr_cluster_id, counter[best_candidate], len(curr_clusters_members[curr_cluster_id]))
        )

    # Resolve collisions: most votes wins, then the larger current cluster,
    # then the earlier request. Losers stay unmapped and get fresh IDs later.
    id_mapping = {}
    for wanted_id, contenders in requests.items():
        winner = min(contenders, key=lambda c: (-c[1], -c[2]))
        id_mapping[winner[0]] = wanted_id
    return id_mapping


def map_and_remap_clusters(
    gen_index,
    prev_clusters_members, curr_clusters_members,
    clusters_array, pop_data,
    global_id_counter,
    use_jaccard=True,
    pop_data_df=None,
    crossover_data_df=None
):
    """
    Translate the local cluster labels of one generation into persistent global IDs.

    For ``gen_index == 0`` every current cluster receives a fresh ID. For later
    generations, current clusters inherit the ID of a previous cluster either by
    Jaccard overlap of member sets (``use_jaccard=True``) or by a majority vote
    over the parent clusters of their members (``use_jaccard=False``, which
    calls ``get_dominant_parent_cluster``). Clusters that inherit nothing get a
    fresh ID.

    Parameters
    ----------
    gen_index : int
        0 for the first processed generation. In the parent-vote mode it is also
        forwarded as ``generation`` to ``get_dominant_parent_cluster``.
    prev_clusters_members : dict or None
        Global cluster ID -> set of member IDs of the previous generation.
        Not read when ``gen_index == 0``.
    curr_clusters_members : dict
        Local cluster label -> set of member IDs of the current generation.
    clusters_array : sequence
        Local label per individual; ``-1`` marks noise.
    pop_data : numpy.ndarray
        Coordinate vectors, aligned with ``clusters_array``.
    global_id_counter : int
        Next unused global ID. If it is still 0 when ``gen_index > 0`` it is
        initialised to one past the largest previous ID (0 when the previous
        generation had no clusters at all).
    use_jaccard : bool
        Selects the mapping strategy (see above).
    pop_data_df, crossover_data_df :
        Only used when ``use_jaccard`` is False; forwarded to
        ``get_dominant_parent_cluster``.

    Returns
    -------
    tuple
        ``(new_clusters_array, new_centroids, new_cluster_members, global_id_counter)``:
        the relabelled array (noise stays ``-1``), the centroids of the
        non-empty clusters ordered by ascending global ID, the member sets keyed
        by global ID, and the updated counter.
    """
    if gen_index == 0:
        # Nothing to inherit from: every cluster gets a fresh ID below.
        id_mapping = {}
    else:
        if global_id_counter == 0:
            # `default=-1` keeps this working when the previous generation
            # consisted of noise only and therefore has no cluster IDs.
            global_id_counter = max(prev_clusters_members, default=-1) + 1

        if use_jaccard:
            id_mapping = _map_by_jaccard(prev_clusters_members, curr_clusters_members)
        else:
            id_mapping = _map_by_parent_votes(
                gen_index, prev_clusters_members, curr_clusters_members,
                clusters_array, pop_data, pop_data_df, crossover_data_df
            )

    global_id_counter = _assign_fresh_ids(curr_clusters_members, id_mapping, global_id_counter)

    # Relabel every individual; labels without a mapping (noise) become -1.
    new_clusters_array = np.array([id_mapping.get(cl, -1) for cl in clusters_array])

    # Centroids of the non-empty clusters, ordered by global ID.
    new_centroids = []
    for gid in sorted(set(id_mapping.values())):
        member_data = pop_data[new_clusters_array == gid]
        if len(member_data) > 0:
            new_centroids.append(np.mean(member_data, axis=0))
    new_centroids = np.array(new_centroids)

    # Member sets re-keyed by global ID.
    new_cluster_members = {
        id_mapping[curr_id]: members for curr_id, members in curr_clusters_members.items()
    }

    return new_clusters_array, new_centroids, new_cluster_members, global_id_counter


def compute_inertia(data, labels, centroids):
    """
    Within-cluster sum of squared distances.

    For each point whose label is not ``-1`` the squared Euclidean distance to
    ``centroids[label]`` is added to the total; points labelled ``-1`` (noise)
    contribute nothing.
    """
    total = 0.0
    for idx, sample in enumerate(data):
        assigned = labels[idx]
        if assigned != -1:
            offset = sample - centroids[assigned]
            total += np.sum(offset ** 2)
    return total


import numpy as np
from sklearn.neighbors import NearestNeighbors

def run(dataset, min_samples, step, fintess_method, clustering_method_name, evol_name, crossover=None):
    """
    Cluster selected generations of a logged evolutionary run with OPTICS and
    write an HTML report with plots.

    Parameters
    ----------
    dataset :
        Passed to ``pd.read_csv``. The table must contain ``"generation"`` and
        ``"id"`` columns; every remaining column is used as a coordinate.
    min_samples :
        Forwarded to ``OPTICS(min_samples=...)``.
    step :
        Stride used to pick generations; the first and the last generation are
        always included.
    fintess_method :
        Callable applied to each individual's coordinate vector for the fitness
        box plots. Its ``__name__`` is part of the output path.
    clustering_method_name :
        Only used as a component of the output path.
    evol_name :
        Component of the output path. ``"GA"`` additionally loads ``crossover``
        and switches ID tracking to the parent-based mapping; any other value
        uses the Jaccard mapping and adds the stability / Jaccard report sections.
    crossover :
        Passed to ``pd.read_csv`` when ``evol_name == "GA"``; unused otherwise.

    Side effects
    ------------
    Creates ``html_report/<evol_name>/<fitness name>/<clustering_method_name>/images``,
    saves PNG figures there, writes ``report.html`` one level above, prints the
    report path, and installs a process-wide "ignore" filter for ``UserWarning``.
    Returns ``None``.
    """
    output_dir = os.path.join("html_report", evol_name, fintess_method.__name__, clustering_method_name)
    images_dir = os.path.join(output_dir, "images")
    os.makedirs(images_dir, exist_ok=True)
    warnings.simplefilter("ignore", category=UserWarning)


    # NOTE(review): the closing </body></html> tags are never appended to html_parts.
    html_parts = [
        "<html><head><meta charset='utf-8'><title>Optics clustering Report</title></head><body>",
        "<h1>Optics shlukování – Vývoj</h1>"
    ]
    centroid_history = []
    df = pd.read_csv(dataset)
    crossover_df = None
    if evol_name == "GA":
        crossover_df = pd.read_csv(crossover)

    unique_generations = sorted(df["generation"].unique())

    # First generation, every `step`-th generation, and the last one;
    # dict.fromkeys drops duplicates while keeping the order.
    selected_generations = list(dict.fromkeys(
        [unique_generations[0]] + unique_generations[step-1::step] + [unique_generations[-1]]
    ))

    # The 2-D projection is fitted once on the first generation and reused for
    # every scatter plot.
    first_gen_data = df[df["generation"] == unique_generations[0]].drop(columns=["generation", "id"]).values
    pca = PCA(n_components=2)
    pca.fit(first_gen_data)
    id_cluster_map = defaultdict(list)
    cluster_counts = []
    # NOTE(review): cluster_size_history is filled below but never read in this function.
    cluster_size_history = []
    inertia_history = []
    cluster_members_by_gen = {}
    global_id_counter = 0


    for gen_ind, gen in enumerate(selected_generations):

        pop_data = df[df["generation"] == gen].drop(columns=["generation", "id"]).values
        reduced_data = pca.transform(pop_data)

        max_distance = np.max(cdist(pop_data, pop_data))
        # NOTE(review): `eps` is assigned but not used; the search below derives
        # max_eps from `epx_mult` instead.
        eps = 0.35 * max_distance

        # Grid search over max_eps (as a fraction of the population diameter) and
        # xi; stop at the first combination that labels fewer than 50 % of the
        # points as noise. If none qualifies, the last fitted model is kept.
        clustering = None
        percent_outliers = None
        for epx_mult in np.arange(0.05, 0.5, 0.05):
            for xi_value in np.arange(0.01, 0.20, 0.01):
                clustering = OPTICS(
                    min_samples=min_samples,      # minimum number of points in a dense region
                    max_eps=epx_mult*max_distance,     # maximum neighbourhood radius considered
                    xi=xi_value,           # steepness parameter for automatic cluster extraction
                    metric='euclidean' # distance metric
                )

                # Fit the model
                clustering.fit(pop_data)
                num_outliers = np.sum(clustering.labels_ == -1)
                percent_outliers = num_outliers / len(pop_data) * 100
                if(percent_outliers < 50):
                    break
            if(percent_outliers < 50):
                break


        # Resulting labels (-1 = outlier)
        clusters = clustering.labels_

        # Centroids and inertia are computed on the local (not yet remapped) labels.
        centroids = []
        for i in range(min(clusters), max(clusters)+1):
            if i == -1:
                continue
            members = pop_data[clusters == i]
            if len(members) > 0:
                centroid = np.mean(members, axis=0)
                centroids.append(centroid)
        # NOTE(review): the history stores these pre-remap centroids; the list
        # position therefore reflects the local label order, not the global ID.
        centroid_history.append(centroids)
        inertia = compute_inertia(pop_data, clusters, centroids)
        inertia_history.append(inertia)



        num_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
        cluster_counts.append(num_clusters)
        ids = df[df["generation"] == gen]["id"].values
        sizes = []
        for i in range(-1, max(clusters)+1):
            sizes.append(np.sum(clusters == i))
        cluster_size_history.append(sizes)

        # === Remap local labels to persistent IDs (no separate global map) ===
        prev_clusters_members = cluster_members_by_gen[selected_generations[gen_ind - 1]] if gen_ind > 0 else None
        curr_clusters_members = defaultdict(set)
        for ident, cl in zip(ids, clusters):
            if cl != -1:
                curr_clusters_members[cl].add(ident)

        # Non-GA runs are tracked by Jaccard overlap of member IDs; GA runs by the
        # clusters of each child's parents. The first generation keeps its local labels.
        # NOTE(review): `gen_ind` (the position in selected_generations) is what
        # map_and_remap_clusters forwards as the generation number in GA mode —
        # confirm that it equals `gen` for the datasets in use.
        if evol_name != "GA":
            if prev_clusters_members is not None:
                clusters, centroids, curr_clusters_members, global_id_counter = map_and_remap_clusters(
                    gen_ind,
                    prev_clusters_members,
                    curr_clusters_members,
                    clusters,
                    pop_data,
                    global_id_counter,
                    True
                )
        else:
            if prev_clusters_members is not None:
                clusters, centroids, curr_clusters_members, global_id_counter = map_and_remap_clusters(
                    gen_ind,
                    prev_clusters_members,
                    curr_clusters_members,
                    clusters,
                    pop_data,
                    global_id_counter,
                    False,
                    df,
                    crossover_df
                )



        cluster_members_by_gen[gen] = curr_clusters_members

        # === Only now record the (remapped) label of every individual ===
        for ident, cl in zip(ids, clusters):
            id_cluster_map[ident].append(cl)



        # === Scatter plot of the population in the PCA plane ===
        plt.figure(figsize=(6, 6))
        scatter = plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=clusters, cmap='viridis', alpha=0.7)

        handles, _ = scatter.legend_elements(prop="colors")
        legend_labels = [f"Shluk {cl}" for cl in sorted(set(clusters))]  # actual cluster IDs as labels

        plt.legend(handles, legend_labels, title="Shluky", loc='upper center', bbox_to_anchor=(0.5, -0.12),
                ncol=5, fontsize=8, frameon=False)
        plt.title(f"Generace {gen} - Optics Shlukování")
        plt.xlabel("PCA komponent 1")
        plt.ylabel("PCA komponent 2")
        plt.tight_layout()
        scatter_path = os.path.join(images_dir, f"scatter_gen_{gen:03d}.png")
        plt.savefig(scatter_path)
        plt.close()

        html_parts.append(f"<h2>Generace {gen}</h2>")
        html_parts.append(f"<img src='images/{os.path.basename(scatter_path)}' width='600'><br>")


        # === Bar chart of cluster sizes ===
        # Unique labels and their sizes
        unique_labels = sorted(set(clusters))
        cluster_sizes = [np.sum(clusters == label) for label in unique_labels]

        # Keep only non-empty clusters
        filtered = [(label, size) for label, size in zip(unique_labels, cluster_sizes) if size > 0]
        if not filtered:
            filtered_labels, filtered_sizes = [], []
        else:
            filtered_labels, filtered_sizes = zip(*filtered)

        # Bars are placed at consecutive positions rather than at the label values
        plt.figure(figsize=(6, 4))
        x_pos = range(len(filtered_labels))  # [0, 1, 2, ...]
        bars = plt.bar(x_pos, filtered_sizes, color='skyblue', edgecolor='black')

        # Tick labels below the axis
        plt.xticks(ticks=x_pos, labels=[f"Shluk {label}" for label in filtered_labels], rotation=90)

        plt.ylabel("Počet jedinců")
        plt.title(f"Histogram velikostí shluků - generace {gen}")
        plt.grid(True, axis='y')
        plt.tight_layout()

        # Save the figure
        hist_path = os.path.join(images_dir, f"histogram_gen_{gen:03d}.png")
        plt.savefig(hist_path)
        plt.close()

        # Add it to the HTML report
        html_parts.append(f"<img src='images/{os.path.basename(hist_path)}' width='600'><br>")


        # === Cluster stability between consecutive selected generations ===
        if gen_ind > 0 and evol_name != "GA":
            previous_gen = selected_generations[gen_ind - 1]
            current_gen = gen

            cluster_transitions = defaultdict(lambda: defaultdict(int))  # prev_cluster -> current_cluster -> count

            # Only individuals that already have more than `gen_ind` recorded
            # labels are counted.
            for ident, history in id_cluster_map.items():
                if len(history) > gen_ind:
                    prev_label = history[gen_ind - 1]
                    curr_label = history[gen_ind]
                    cluster_transitions[prev_label][curr_label] += 1

            html_parts.append(f"<h3>Stabilita mezi generací {previous_gen} → {current_gen}:</h3>")
            html_parts.append("<pre>")
            for prev_label, curr_counts in cluster_transitions.items():
                total = sum(curr_counts.values())
                for curr_label, count in curr_counts.items():
                    percent = count / total * 100 if total > 0 else 0
                    html_parts.append(f"  Shluk {prev_label} → {curr_label}: {count} jedinců ({percent:.1f} %)")
            html_parts.append("</pre>")

        # === Jaccard-index mapping between the previous and the current generation ===
        if gen_ind > 0 and evol_name != "GA":
            gen_a = selected_generations[gen_ind - 1]
            gen_b = gen

            clusters_a = cluster_members_by_gen[gen_a]
            clusters_b = cluster_members_by_gen[gen_b]

            html_parts.append(f"<h3>Jaccardovo mapování a overlap: Generace {gen_a} → {gen_b}</h3>")
            html_parts.append("<pre>")

            for ca, members_a in clusters_a.items():
                best_jaccard = 0
                best_cb = None

                for cb, members_b in clusters_b.items():
                    intersection = len(members_a & members_b)
                    union = len(members_a | members_b)
                    jaccard = intersection / union if union > 0 else 0
                    # NOTE(review): this `overlap` is recomputed for the best match
                    # below; the value computed here is not used.
                    overlap = len(members_a & members_b) / min(len(members_a), len(members_b))

                    # Strictly greater than 0: a cluster without any shared
                    # member is reported as having no transition.
                    if jaccard > best_jaccard:
                        best_jaccard = jaccard
                        best_cb = cb

                if best_cb is not None:
                    intersection = len(members_a & clusters_b[best_cb])
                    denom = min(len(members_a), len(clusters_b[best_cb]))
                    overlap = intersection / denom if denom > 0 else 0
                    html_parts.append(f" - Shluk {ca} → {best_cb} (Jaccard: {best_jaccard:.2f}, Overlap: {overlap:.2f})")
                else:
                    html_parts.append(f" - Shluk {ca} → žádný vhodný přechod")

            html_parts.append("</pre>")
        
        # === Centroid shift between the previous and the current generation ===
        # Centroids are paired by their position in centroid_history.
        if gen_ind > 0:
            html_parts.append(f"<h4>Posun centroidů mezi generací {selected_generations[gen_ind - 1]} → {gen}:</h4>")
            html_parts.append("<pre>")
            centroids_a = centroid_history[gen_ind - 1]
            centroids_b = centroid_history[gen_ind]
            for j in range(min(len(centroids_a), len(centroids_b))):
                dist = np.linalg.norm(centroids_a[j] - centroids_b[j])
                html_parts.append(f" - Shluk {j}: {dist:.4f}")
            html_parts.append("</pre>")

        # === Box plot of fitness values per cluster ===
        gen_data = df[df["generation"] == gen]
        pop_values = gen_data.drop(columns=["generation", "id"]).values
        fitness_values = np.array([fintess_method(ind) for ind in pop_values])

        # Unique cluster IDs (noise, -1, is left out)
        cluster_ids = sorted(set(clusters) - {-1})

        # One list of fitness values per cluster
        cluster_fitness = {cl: [] for cl in cluster_ids}

        # Distribute the fitness values into their clusters
        for i, cl in enumerate(clusters):
            if cl in cluster_fitness:
                cluster_fitness[cl].append(fitness_values[i])

        # Draw the box plot
        plt.figure(figsize=(8, 5))
        plt.boxplot([cluster_fitness[cl] for cl in cluster_ids],
                    labels=[f"Shluk {cl}" for cl in cluster_ids])
        plt.xticks(rotation=90)
        plt.xlabel("Shluk")
        plt.ylabel("Fitness")
        plt.title(f"Distribuce fitness hodnot – generace {gen}")
        plt.grid(True)
        plt.tight_layout()
        box_path = os.path.join(images_dir, f"boxplot_gen_{gen:03d}.png")
        plt.savefig(box_path)
        plt.close()
        html_parts.append(f"<img src='images/{os.path.basename(box_path)}' width='600'><br>")



    # === Number of clusters over time ===
    # From here on `gen` still holds the last selected generation, which is
    # what ends up in the summary file names.
    plt.figure(figsize=(8, 4))
    plt.plot(selected_generations, cluster_counts, marker='o')
    plt.xlabel("Generace")
    plt.ylabel("Počet shluků")
    plt.title("Vývoj počtu shluků v čase")
    plt.grid(True)
    plt.tight_layout()
    time_cluster_path = os.path.join(images_dir, f"time_cluster_gen_{gen:03d}.png")
    plt.savefig(time_cluster_path)
    plt.close()
    html_parts.append(f"<img src='images/{os.path.basename(time_cluster_path)}' width='600'><br>")

    # === Cluster sizes over time (indexed by the persistent global ID) ===
    all_global_ids = sorted({cid for gen_map in cluster_members_by_gen.values() for cid in gen_map})
    global_id_to_index = {cid: i for i, cid in enumerate(all_global_ids)}

    # Rows = selected generations, columns = global cluster IDs
    cluster_size_array = np.zeros((len(selected_generations), len(all_global_ids)))

    for gen_idx, gen in enumerate(selected_generations):
        gen_clusters = cluster_members_by_gen[gen]
        for gid, members in gen_clusters.items():
            idx = global_id_to_index[gid]
            cluster_size_array[gen_idx, idx] = len(members)

    plt.figure(figsize=(12, 5))
    for j, gid in enumerate(all_global_ids):
        plt.plot(selected_generations, cluster_size_array[:, j], label=f"Shluk {gid}")
    plt.xlabel("Generace")
    plt.ylabel("Počet jedinců")
    plt.title("Vývoj velikosti shluků v čase")
    plt.grid(True)
    plt.tight_layout()
    size_cluster_path = os.path.join(images_dir, f"size_cluster_gen_{gen:03d}.png")
    plt.savefig(size_cluster_path)
    plt.close()
    html_parts.append(f"<img src='images/{os.path.basename(size_cluster_path)}' width='1200'><br>")


    # === Heat map of all cluster sizes over time ===
    plt.figure(figsize=(14, 6))
    sns.heatmap(cluster_size_array.T, cmap="viridis", cbar_kws={"label": "Počet jedinců"})
    plt.xlabel("Generace")
    plt.ylabel("Globální ID shluku")
    plt.title("Heatmapa vývoje velikosti shluků v čase")
    plt.tight_layout()
    size_cluster_heatmap = os.path.join(images_dir, f"Size_cluster_heatmap_gen_{gen:03d}.png")
    plt.savefig(size_cluster_heatmap)
    plt.close()
    html_parts.append(f"<img src='images/{os.path.basename(size_cluster_heatmap)}' width='1200'><br>")

    # With more than 50 clusters, additionally emit one heat map per chunk of
    # `chunk_size` clusters.
    num_clusters = cluster_size_array.shape[1]
    chunk_size = 50  # adjust as needed
    num_chunks = (num_clusters + chunk_size - 1) // chunk_size  # ceiling division

    if num_clusters > 50:
        for i in range(num_chunks):
            start = i * chunk_size
            end = min(start + chunk_size, num_clusters)

            # Actual global IDs covered by this chunk
            chunk_ids = all_global_ids[start:end]
            chunk_array = cluster_size_array[:, start:end].T  # one row per cluster

            plt.figure(figsize=(14, 6))
            sns.heatmap(chunk_array, cmap="viridis", 
                        cbar_kws={"label": "Počet jedinců"},
                        yticklabels=chunk_ids)  # real IDs on the Y axis
            plt.xlabel("Generace")
            plt.ylabel("Globální ID shluku")
            plt.title(f"Heatmapa vývoje velikosti shluků (Clustery {chunk_ids[0]}–{chunk_ids[-1]})")
            plt.tight_layout()

            heatmap_path = os.path.join(images_dir, f"size_cluster_heatmap_{chunk_ids[0]:03d}_{chunk_ids[-1]:03d}.png")
            plt.savefig(heatmap_path)
            plt.close()

            html_parts.append(f"<img src='images/{os.path.basename(heatmap_path)}' width='1200'><br>")



    # === Mean pairwise distance between centroids over time ===
    # Generations with fewer than two centroids are recorded as 0.
    average_intercentroid_distances = []
    for centroids in centroid_history:
        if len(centroids) < 2:
            average_intercentroid_distances.append(0)
            continue
        dists = cdist(centroids, centroids)
        upper_triangle = dists[np.triu_indices_from(dists, k=1)]
        avg_dist = np.mean(upper_triangle)
        average_intercentroid_distances.append(avg_dist)

    plt.figure(figsize=(8, 4))
    plt.plot(selected_generations, average_intercentroid_distances, marker='o', color='purple')
    plt.xlabel("Generace")
    plt.ylabel("Průměrná vzdálenost mezi centroidy")
    plt.title("Vývoj vzdáleností mezi centroidy")
    plt.grid(True)
    plt.tight_layout()
    distance_centroid_path = os.path.join(images_dir, f"distance_centroid_gen_{gen:03d}.png")
    plt.savefig(distance_centroid_path)
    plt.close()
    html_parts.append(f"<img src='images/{os.path.basename(distance_centroid_path)}' width='600'><br>")


    # === WCSS (inertia) over time ===
    plt.figure(figsize=(8, 4))
    plt.plot(selected_generations, inertia_history, marker='o')
    plt.xlabel("Generace")
    plt.ylabel("WCSS (inercie)")
    plt.title("Vývoj kvality shluků (inercie) – Optics")
    plt.grid(True)
    plt.tight_layout()
    wcss_path = os.path.join(images_dir, f"wcss_gen_{gen:03d}.png")
    plt.savefig(wcss_path)
    plt.close()
    html_parts.append(f"<img src='images/{os.path.basename(wcss_path)}' width='600'><br>")

    # Write the assembled report next to the images directory.
    html_path = os.path.join(output_dir, "report.html")
    with open(html_path, "w", encoding="utf-8") as f:
        f.write("\n".join(html_parts))

    print(f"✅ HTML report byl uložen do: {html_path}")

In [8]:
# DE population log, sphere fitness; every generation is clustered (step=1).
run(
    dataset="population_log_DE_sphere.csv",
    min_samples=4,
    step=1,
    fintess_method=sphere,
    clustering_method_name="Optics",
    evol_name="DE",
)

✅ HTML report byl uložen do: html_report\DE\sphere\Optics\report.html


In [9]:
# DE population log, Schwefel fitness; every generation is clustered (step=1).
run(
    dataset="population_log_DE_schwefel.csv",
    min_samples=4,
    step=1,
    fintess_method=schwefel,
    clustering_method_name="Optics",
    evol_name="DE",
)

✅ HTML report byl uložen do: html_report\DE\schwefel\Optics\report.html


In [6]:
# GA population log, sphere fitness; the crossover log enables parent-based cluster tracking.
run(
    dataset="population_log_GA_sphere.csv",
    min_samples=4,
    step=1,
    fintess_method=sphere,
    clustering_method_name="Optics",
    evol_name="GA",
    crossover="crossover_log_sphere.csv",
)

✅ HTML report byl uložen do: html_report\GA\sphere\Optics\report.html


In [8]:
# GA population log, Schwefel fitness; the crossover log enables parent-based cluster tracking.
run(
    dataset="population_log_GA_schwefel.csv",
    min_samples=4,
    step=1,
    fintess_method=schwefel,
    clustering_method_name="Optics",
    evol_name="GA",
    crossover="crossover_log_schwefel.csv",
)

✅ HTML report byl uložen do: html_report\GA\schwefel\Optics\report.html


In [3]:
# PSO population log, sphere fitness; every generation is clustered (step=1).
run(
    dataset="population_log_PSO_sphere.csv",
    min_samples=4,
    step=1,
    fintess_method=sphere,
    clustering_method_name="Optics",
    evol_name="PSO",
)

✅ HTML report byl uložen do: html_report\PSO\sphere\Optics\report.html


In [4]:
# PSO population log, Schwefel fitness; every generation is clustered (step=1).
run(
    dataset="population_log_PSO_schwefel.csv",
    min_samples=4,
    step=1,
    fintess_method=schwefel,
    clustering_method_name="Optics",
    evol_name="PSO",
)

✅ HTML report byl uložen do: html_report\PSO\schwefel\Optics\report.html


In [5]:
import warnings
def runing(dataset, min_samples, step, fintess_method, clustering_method_name, evol_name):
    """
    Grid-search OPTICS hyper-parameters on selected generations of a population log.

    For every selected generation, OPTICS is fitted once per combination of
    ``max_eps`` (a fraction of the largest pairwise distance in that generation),
    ``min_samples`` and ``xi``. Combinations that produce at least 3 clusters with
    at most 50 % outliers are printed together with the cluster sizes; lines with
    at most 25 % outliers are wrapped in ``**``.

    Parameters
    ----------
    dataset : str
        Path to a CSV file with "generation" and "id" columns; every other column
        is used as a coordinate of the individual.
    min_samples : int
        Currently not used: the search always sweeps ``min_samples`` over 3..6.
        Kept so existing calls keep working.
    step : int
        Sampling stride over the sorted generations; the first and last
        generation are always included.
    fintess_method, clustering_method_name, evol_name
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    None
        Results are only printed.
    """
    df = pd.read_csv(dataset)

    # NOTE: this modifies the process-wide warning filter, so UserWarnings stay
    # silenced after the function returns.
    warnings.simplefilter("ignore", category=UserWarning)

    unique_generations = sorted(df["generation"].unique())

    # dict.fromkeys drops duplicates (e.g. when the stride already hits the
    # first or last generation) while preserving order.
    selected_generations = list(dict.fromkeys(
        [unique_generations[0]] + unique_generations[step-1::step] + [unique_generations[-1]]
    ))

    # The search grid does not depend on the generation, so build it once.
    eps_fractions = np.arange(0.05, 0.8, 0.05)
    min_samples_grid = range(3, 7)
    xi_grid = np.arange(0.01, 0.21, 0.01)

    for gen in selected_generations:
        pop_data = df[df["generation"] == gen].drop(columns=["generation", "id"]).values
        total_points = len(pop_data)

        # Largest pairwise distance in this generation; max_eps is scaled by it.
        max_distance = np.max(cdist(pop_data, pop_data))

        # Label the output so hits from different generations can be told apart.
        print(f"=== Generace {gen} ===")

        for eps_fraction in eps_fractions:
            # A distinct loop name keeps the `min_samples` argument from being clobbered.
            for candidate_min_samples in min_samples_grid:
                for xi in xi_grid:
                    clustering = OPTICS(
                        min_samples=candidate_min_samples,
                        max_eps=eps_fraction * max_distance,
                        xi=xi,
                        metric='euclidean'
                    )
                    clustering.fit(pop_data)

                    # No finite reachability at all: skip this combination.
                    if np.all(np.isinf(clustering.reachability_)):
                        continue

                    labels = clustering.labels_
                    num_outliers = np.sum(labels == -1)
                    percent_outliers = num_outliers / total_points * 100

                    # Sorted cluster ids (label -1 excluded) and their sizes.
                    cluster_ids, cluster_sizes = np.unique(labels[labels != -1], return_counts=True)
                    num_clusters = len(cluster_ids)

                    if num_clusters >= 3 and percent_outliers <= 50:
                        star = "**" if percent_outliers <= 25 else ""
                        print(f"{star}min_samples={candidate_min_samples}, xi={xi:.2f}, eps={eps_fraction:.2f} → {num_clusters} clusterů, {percent_outliers:.1f}% outlierů{star}")
                        for cl, count in zip(cluster_ids, cluster_sizes):
                            print(f"  - Cluster {cl}: {count} bodů")
                        print()

In [41]:
# OPTICS parameter sweep on the DE population log for `sphere`, with step=10.
# The min_samples argument (3) does not restrict the sweep: runing always tries min_samples 3..6.
runing("population_log_DE_sphere.csv", 3, 10, sphere, "Optics", "DE")

min_samples=4, xi=0.07, eps=0.35 → 3 clusterů, 38.0% outlierů
  - Cluster 0: 23 bodů
  - Cluster 1: 33 bodů
  - Cluster 2: 6 bodů

min_samples=4, xi=0.08, eps=0.35 → 3 clusterů, 38.0% outlierů
  - Cluster 0: 23 bodů
  - Cluster 1: 33 bodů
  - Cluster 2: 6 bodů

min_samples=4, xi=0.09, eps=0.35 → 3 clusterů, 38.0% outlierů
  - Cluster 0: 23 bodů
  - Cluster 1: 33 bodů
  - Cluster 2: 6 bodů

min_samples=4, xi=0.10, eps=0.35 → 3 clusterů, 35.0% outlierů
  - Cluster 0: 23 bodů
  - Cluster 1: 33 bodů
  - Cluster 2: 9 bodů

min_samples=4, xi=0.11, eps=0.35 → 3 clusterů, 35.0% outlierů
  - Cluster 0: 23 bodů
  - Cluster 1: 33 bodů
  - Cluster 2: 9 bodů

min_samples=4, xi=0.12, eps=0.35 → 3 clusterů, 35.0% outlierů
  - Cluster 0: 23 bodů
  - Cluster 1: 33 bodů
  - Cluster 2: 9 bodů

min_samples=4, xi=0.13, eps=0.35 → 3 clusterů, 35.0% outlierů
  - Cluster 0: 23 bodů
  - Cluster 1: 33 bodů
  - Cluster 2: 9 bodů

min_samples=4, xi=0.14, eps=0.35 → 3 clusterů, 35.0% outlierů
  - Cluster 0: 23 bod