In [None]:
import pytorch_lightning as pl

from collections import defaultdict

import pandas as pd

from tqdm.auto import tqdm
from tqdm import trange

import sys

import numpy as np
from functools import partial

from torch.nn.parallel import DistributedDataParallel as DDP
from src.diffusion_model_discrete import DiscreteDenoisingDiffusion

In [None]:
# Put the sibling graph_diversity_problems directory on the import path.
# NOTE(review): presumably this is where the generation/base/utils/analysis/distances
# modules imported in the following cells live - confirm.
sys.path.append("../graph_diversity_problems/")

In [None]:
from generation import get_initial_graphs, precompute_assets_for_generated_graphs
from base import DiversityBaseClass
from utils import read_pickle

from analysis import draw_graphs
import networkx as nx
import random

from distances import DISTANCE_TO_FUNC
from pathlib import Path
from joblib import Parallel, delayed

from distances import ProgressParallel

from typing import List, Tuple, Any, Callable, Dict, Optional

from itertools import combinations
from base import GraphObject

In [None]:
from joblib import Parallel, delayed

from tqdm import trange

import plotly.express as px

import json

In [None]:

# path = input()
# print(path)
# model = DiscreteDenoisingDiffusion.load_from_checkpoint(path, map_location="cpu")
# model.visualization_tools = None


In [None]:
# def get_state_dict_from_other_run(ckpt_path):
#     model = DiscreteDenoisingDiffusion.load_from_checkpoint(ckpt_path, map_location="cpu").model
#     return model.state_dict()

In [None]:
def get_graph_objects(adjacencies=None,
                      get_initial_graphs_func=None,
                      distances_set=None):
    """Build lists of ``GraphObject`` per distance name from exactly one graph source.

    Args:
        adjacencies: ready graphs handed to ``get_initial_graphs`` as
            ``maybe_ready_graphs``; the ``"user"`` entry of its result is used.
        get_initial_graphs_func: zero-argument callable; the ``"mix"`` entry of
            its result is used. ``distances_set`` is not consulted on this path.
        distances_set: distance names forwarded to ``get_initial_graphs`` when
            ``adjacencies`` is given. ``None`` selects
            ``{"netLSD_heat", "netLSD_wave", "GCD", "Portrait"}``.

    Returns:
        dict mapping each distance name to a list of ``GraphObject``.

    Raises:
        ValueError: if both sources or neither source is supplied.
    """
    if distances_set is None:
        # Built per call so the default is never a shared mutable object.
        distances_set = {"netLSD_heat", "netLSD_wave", "GCD", "Portrait"}

    if adjacencies is not None and get_initial_graphs_func is None:
        graph_with_computed_descriptors = get_initial_graphs(
            config={"initial_graphs": "user"},
            threads=12,
            distances_set=distances_set,
            samples=None,
            nodes_number=16,
            orca_path="../graph_diversity_problems/orca/",
            equal_sizes=True,
            maybe_ready_graphs=adjacencies,
            greedy_graphs_objects_per_distance=None,
        )["user"]
    elif adjacencies is None and get_initial_graphs_func is not None:
        graph_with_computed_descriptors = get_initial_graphs_func()["mix"]
    else:
        raise ValueError("Exactly one of adjacencies or get_initial_graphs_func should be specified!")

    # Every item is unpacked as a (graph, identification, entity) triple.
    return {
        distance: [
            GraphObject(_entity=e, identification=i, _graph=g)
            for g, i, e in graph_label_entity
        ]
        for distance, graph_label_entity in graph_with_computed_descriptors.items()
    }

In [None]:
def count_pairwise_energy(graphs: List["GraphObject"],
                          distance_function: Callable[[Any, Any], float],
                          n_jobs: int = 12):
    """Evaluate ``distance_function`` on every unordered pair of ``graphs``.

    Args:
        graphs: objects to compare; each pair from ``itertools.combinations``
            is evaluated once.
        distance_function: called as ``distance_function(first, second)``.
        n_jobs: worker count handed to ``ProgressParallel`` (default 12, the
            value that used to be hard-coded).

    Returns:
        ``(values, mean)``: the per-pair values as a NumPy array, in
        ``combinations`` order, and their mean.

    Note:
        Fewer than two graphs produce no pairs, so the mean is not meaningful
        in that case.
    """
    pairs = combinations(graphs, 2)

    distances = ProgressParallel(n_jobs=n_jobs)(
        delayed(distance_function)(e_1, e_2) for e_1, e_2 in pairs
    )

    distances = np.array(distances)
    return distances, distances.mean()

In [None]:
def energy_distance(x: GraphObject, y: GraphObject, distance_name:str):
    """Return the reciprocal of the named distance between the two graphs' entities.

    The function registered under ``distance_name`` in ``DISTANCE_TO_FUNC`` is
    applied to ``x.entity`` and ``y.entity``; 1e-6 is added before inverting so
    that a zero distance does not divide by zero.
    """
    metric = DISTANCE_TO_FUNC[distance_name]
    separation = metric(x.entity, y.entity)
    return 1 / (separation + 1e-6)

# Energy callables with the distance name already bound.
GCD = partial(energy_distance, distance_name="GCD")
PORTRAIT = partial(energy_distance, distance_name="Portrait")

In [None]:
def load_graphs_and_get_volumes(run_dir, graphs_file, distance_name, distance_func):
    """Return the mean pairwise energy of all graphs and of the first 100, cached per run.

    The two values are stored in ``run_dir / "volumes_cache.json"``. If that
    file exists it is returned as is and ``graphs_file`` is not read.

    Args:
        run_dir: ``pathlib.Path`` of the run directory that holds the cache.
        graphs_file: file passed to ``np.load`` on a cache miss.
        distance_name: key requested from ``get_graph_objects``.
        distance_func: pairwise function passed to ``count_pairwise_energy``.

    Returns:
        ``(vol_1k, vol_1hundred)``: mean over all loaded graphs and over
        ``graph_objects[:100]``.
    """
    run_dir_cache_file = run_dir / "volumes_cache.json"

    if run_dir_cache_file.is_file():
        with open(run_dir_cache_file) as cache_handle:
            volumes_cache_dict = json.load(cache_handle)
        return volumes_cache_dict["vol_1k"], volumes_cache_dict["vol_1hundred"]

    # NOTE(review): allow_pickle=True unpickles the file content; only load trusted files.
    thousand_graphs = np.load(graphs_file, allow_pickle=True)
    graph_objects = get_graph_objects(thousand_graphs, distances_set={distance_name})[distance_name]

    _, vol_1k = count_pairwise_energy(graph_objects, distance_function=distance_func)
    _, vol_1hundred = count_pairwise_energy(graph_objects[:100], distance_function=distance_func)

    # Plain floats: identical type on cache hit and miss, and always JSON-serialisable.
    vol_1k, vol_1hundred = float(vol_1k), float(vol_1hundred)

    with open(run_dir_cache_file, "w") as cache_handle:
        json.dump(dict(vol_1k=vol_1k, vol_1hundred=vol_1hundred), cache_handle)

    return vol_1k, vol_1hundred

def get_df_from_digress_iterations(root: Path, distance_func, distance_name:str):
    """Collect the fitness values of every run directory directly under ``root``.

    A sub-directory counts as a run only if it contains
    ``final_graphs_greedy.npy``. Its step is the integer after the last ``_``
    in the directory name, plus one.

    Args:
        root: directory whose immediate children are scanned.
        distance_func: forwarded to ``load_graphs_and_get_volumes``.
        distance_name: forwarded to ``load_graphs_and_get_volumes``.

    Returns:
        DataFrame with columns ``Step``, ``Fitness of 1k graphs`` and
        ``Fitness of 100 graphs``, sorted by ``Step``.

    Raises:
        ValueError: if a directory that does contain the graphs file has a
            name that does not end in ``_<integer>``.
    """
    data = []
    for run_dir in root.glob("./*/"):
        graphs_file = run_dir / "final_graphs_greedy.npy"

        # Test for the graphs file BEFORE parsing the name, so unrelated
        # entries such as ".ipynb_checkpoints" are skipped and not parsed.
        if not graphs_file.is_file():
            continue

        run_number = int(run_dir.name.split("_")[-1]) + 1

        vol_1k, vol_1hundred = load_graphs_and_get_volumes(run_dir, graphs_file, distance_name, distance_func)

        data.append([run_number, vol_1k, vol_1hundred])

    df = pd.DataFrame(data, columns=["Step", "Fitness of 1k graphs", "Fitness of 100 graphs"]).sort_values(by="Step")

    return df

In [None]:
from itertools import combinations

In [None]:
def get_dataframe_for_digress_runs(runs_roots_dict: dict[str, list[Path]],
                                   distance_name:str,
                                   zero_step_filename_location: Optional[str]=None,
                                   ):
    """Assemble one table with the fitness values of several labelled groups of runs.

    Args:
        runs_roots_dict: maps a label to the root directories of its runs. Each
            root is scanned with ``get_df_from_digress_iterations`` and tagged
            with the label and with its position in the list as ``ID``.
        distance_name: distance whose energy is evaluated.
        zero_step_filename_location: optional path of the graphs the runs
            started from. Their values are added as ``Step`` 0 for every
            (label, ID) pair and cached in ``<that path>.json``.

    Returns:
        DataFrame sorted by ``Step`` with a fresh index. With nothing to report
        an empty frame with the columns ``Label, ID, Step, Fitness of 1k
        graphs, Fitness of 100 graphs`` is returned.
    """
    _columns = ["Label", "ID", "Step", "Fitness of 1k graphs", "Fitness of 100 graphs"]

    dataframes: list[pd.DataFrame] = []

    distance_function = partial(energy_distance, distance_name=distance_name)

    _labels_to_ids = defaultdict(list)

    for runs_label, runs_roots in runs_roots_dict.items():

        for i, run_root in enumerate(runs_roots):

            run_df = get_df_from_digress_iterations(root=run_root, distance_func=distance_function, distance_name=distance_name)
            run_df["ID"] = i
            run_df["Label"] = runs_label

            dataframes.append(run_df)

            _labels_to_ids[runs_label].append(i)

    if zero_step_filename_location is not None:
        # str() lets a pathlib.Path be passed as well as a plain string.
        cache_file: Path = Path(str(zero_step_filename_location) + ".json")

        if cache_file.is_file():
            with open(cache_file) as cache_handle:
                cache = json.load(cache_handle)
            vol_1k, vol_1hundred = cache["vol_1k"], cache["vol_1hundred"]
        else:
            # Computed here and not through load_graphs_and_get_volumes(Path("./"), ...):
            # that helper caches in "./volumes_cache.json", one file shared by every
            # distance, so a later distance would silently reuse an earlier one's values.
            # NOTE(review): allow_pickle=True unpickles the file content; only load trusted files.
            zero_step_graphs = np.load(zero_step_filename_location, allow_pickle=True)
            zero_step_objects = get_graph_objects(zero_step_graphs, distances_set={distance_name})[distance_name]

            _, vol_1k = count_pairwise_energy(zero_step_objects, distance_function=distance_function)
            _, vol_1hundred = count_pairwise_energy(zero_step_objects[:100], distance_function=distance_function)
            vol_1k, vol_1hundred = float(vol_1k), float(vol_1hundred)

            with open(cache_file, "w") as cache_handle:
                json.dump(dict(vol_1k=vol_1k, vol_1hundred=vol_1hundred), cache_handle)

        zero_step_data = [
            [_label, _id, 0, vol_1k, vol_1hundred]
            for _label, _ids in _labels_to_ids.items()
            for _id in _ids
        ]

        dataframes.append(pd.DataFrame(data=zero_step_data, columns=_columns))

    if not dataframes:
        # pd.concat raises on an empty list; return an empty, correctly shaped table.
        return pd.DataFrame(columns=_columns)

    dataframe: pd.DataFrame = pd.concat(dataframes).sort_values(by=["Step"]).reset_index(drop=True)

    return dataframe



In [None]:
# Root directories of the GCD runs, grouped under the label each group gets in the table.
# NOTE(review): absolute, user-specific paths - this cell only runs on the original machine.
gcd_roots = {"Memory_preserving": [Path("/home/fvelikon/projects/DiGress/digress_iterations/GCD_preserve_memory")],
             "Simple": [Path("/home/fvelikon/projects/DiGress/digress_iterations/GCD")]}

# The step-0 baseline is switched off for GCD: its argument is commented out.
gcd_dataframes = get_dataframe_for_digress_runs(gcd_roots, "GCD", 
                                                # zero_step_filename_location="/home/fvelikon/projects/DiGress/GCD_greedy_1k_graphs.pkl",
                                                )

In [None]:
gcd_dataframes.query("Label == 'Memory_preserving'")

In [None]:
gcd_dataframes

In [None]:
# Fitness of the first 100 graphs per step, one line per run label.
px.line(gcd_dataframes, x="Step", y=["Fitness of 100 graphs"],
        color="Label",
        markers=True,
        )

In [None]:
# Portrait runs; only the "Simple" group is active, the memory-preserving one is commented out.
# NOTE(review): absolute, user-specific paths.
portrait_roots = {
            # "Memory_preserving": [Path("/home/fvelikon/projects/DiGress/digress_iterations/Portrait_memory_preserving")],
             "Simple": [Path("/home/fvelikon/projects/DiGress/digress_iterations/Portrait")],
             }

# The file given as zero_step_filename_location supplies the step-0 row of every run.
portrait_dataframes = get_dataframe_for_digress_runs(portrait_roots, "Portrait", 
                                                zero_step_filename_location="/home/fvelikon/projects/DiGress/Portrait_greedy_1k_graphs.pkl")

In [None]:
# Both fitness columns against the step; splitting by label is disabled (color is commented out).
px.line(portrait_dataframes, x="Step", y=["Fitness of 100 graphs", "Fitness of 1k graphs"],
        # color="Label",
        markers=True,
        title="Portrait",
        )

In [None]:
# netLSD_heat runs. NOTE(review): the commented-out entry below still points at a
# Portrait directory, apparently left over from copying the Portrait cell.
heat_roots = {
            # "Memory_preserving": [Path("/home/fvelikon/projects/DiGress/digress_iterations/Portrait_memory_preserving")],
             "Simple": [Path("/home/fvelikon/projects/DiGress/digress_iterations/heat")],
             }

# The file given as zero_step_filename_location supplies the step-0 row of every run.
heat_dataframes = get_dataframe_for_digress_runs(heat_roots, "netLSD_heat", 
                                                zero_step_filename_location="/home/fvelikon/projects/DiGress/netLSD_heat_greedy_1k_graphs.pkl")

In [None]:
# Both fitness columns against the step for the netLSD_heat runs.
px.line(heat_dataframes, x="Step", y=["Fitness of 100 graphs", "Fitness of 1k graphs"],
        # color="Label",
        markers=True,
        title="netLSD_heat",
        )

In [None]:
heat_dataframes

In [None]:
# netLSD_wave runs. NOTE(review): the commented-out entry below still points at a
# Portrait directory, apparently left over from copying the Portrait cell.
wave_roots = {
            # "Memory_preserving": [Path("/home/fvelikon/projects/DiGress/digress_iterations/Portrait_memory_preserving")],
             "Simple": [Path("/home/fvelikon/projects/DiGress/digress_iterations/wave")],
             }

# The file given as zero_step_filename_location supplies the step-0 row of every run.
wave_dataframes = get_dataframe_for_digress_runs(wave_roots, "netLSD_wave", 
                                                zero_step_filename_location="/home/fvelikon/projects/DiGress/netLSD_wave_greedy_1k_graphs.pkl")
# Show the table explicitly, because only the last expression (the figure) is rendered automatically.
display(wave_dataframes)
px.line(wave_dataframes, x="Step", y=["Fitness of 100 graphs", "Fitness of 1k graphs"],
        # color="Label",
        markers=True,
        title="netLSD_wave",
        )

In [None]:
# Only the first 100 saved graphs are used, so slice the loaded array BEFORE
# converting to networkx; the remaining graphs were converted and then discarded.
# NOTE(review): absolute, user-specific path.
last_graphs_gcd = [
    nx.from_numpy_array(g)
    for g in np.load("/home/fvelikon/projects/DiGress/digress_iterations/GCD_preserve_memory/digress_run_9/final_graphs_greedy.npy")[:100]
]

# Densest graph first; the complements keep that order.
graph_sorted_by_density = sorted(last_graphs_gcd, key=nx.density, reverse=True)
complements = list(map(nx.complement, graph_sorted_by_density))

In [None]:
draw_graphs(last_graphs_gcd)

In [None]:
draw_graphs(graph_sorted_by_density)

In [None]:
draw_graphs(complements)

In [None]:
# Locations of the graph sets compared below.
# NOTE(review): diffusion_input_graphs_path is only referenced from commented-out code in the visible cells.
diffusion_input_graphs_path = "./GCD_train_val_1k_each.npz" #"./GCD_greedy_1k_graphs.pkl"

diffusion_input_graphs_path_1 = "./GCD_greedy_1k.npy"
diffusion_input_graphs_path_2 = "/home/fvelikon/projects/DiGress/graphs_1k_train_val_as_part_of_train/greedy_graphs_1k_v1.npy" 

# final_graphs = np.load("./graphs_1k_train_val_as_part_of_train_RUN_2/final_graphs_greedy.pkl", allow_pickle=True)["GCD"]

# Graphs saved by the RUN_3 experiment. NOTE(review): allow_pickle=True - only load trusted files.
final_graphs = np.load("/home/fvelikon/projects/DiGress/graphs_1k_train_val_as_part_of_train_RUN_3/final_graphs_greedy.npy", allow_pickle=True)

In [None]:
# diffusion_input_graphs = np.load(diffusion_input_graphs_path, allow_pickle=True)["train"]

# Load the two input graph sets from the paths defined in the previous cell.
input_graphs_1 = np.load(diffusion_input_graphs_path_1, allow_pickle=True)
input_graphs_2 = np.load(diffusion_input_graphs_path_2, allow_pickle=True)


In [None]:
# generated_graphs = read_pickle("./graphs/train_eval_test_1k_each/final_graphs_SUB_greedy.pkl")["GCD"]

In [None]:
# generated_graph_objects = get_graph_objects(model_10k_graphs, distances_set={"GCD"})

# generated_graph_objects = get_graph_objects(generated_graphs, distances_set={"GCD"})
# Wrap each graph set as GraphObjects for the GCD distance; each result is a dict keyed by "GCD".
input_graphs_1_graph_objects = get_graph_objects(input_graphs_1, distances_set={"GCD"})
input_graphs_2_graph_objects = get_graph_objects(input_graphs_2, distances_set={"GCD"})
# NOTE(review): "final_grap_objects" (sic) is the spelling that later cells rely on.
final_grap_objects = get_graph_objects(final_graphs, distances_set={"GCD"})

In [None]:
# all_graphs = read_pickle("./generated_graphs_by_model_GCD_1M.pkl")

In [None]:
# input_graph_objects = get_graph_objects(diffusion_input_graphs, distances_set={"GCD"})
# generated_graphs = get_graph_objects()

In [None]:
# _, vol_1 = count_pairwise_energy(input_graphs_1_graph_objects["GCD"], distance_function=GCD)

# print(vol_1)
# _, vol_2 = count_pairwise_energy(input_graphs_2_graph_objects["GCD"], distance_function=GCD)

# print(vol_2)

# Mean pairwise GCD energy over ALL final graphs.
_, vol_3 = count_pairwise_energy(final_grap_objects["GCD"], distance_function=GCD)

print(vol_3)


In [None]:
# _, vol_1 = count_pairwise_energy(input_graphs_1_graph_objects["GCD"][:100], distance_function=GCD)

# print(vol_1)
# _, vol_2 = count_pairwise_energy(input_graphs_2_graph_objects["GCD"][:100], distance_function=GCD)

# print(vol_2)

# Same measurement restricted to the first 100 final graphs (overwrites vol_3 from the previous cell).
_, vol_3 = count_pairwise_energy(final_grap_objects["GCD"][:100], distance_function=GCD)

print(vol_3)


In [None]:
draw_graphs([nx.from_numpy_array(G) for G in final_graphs[:100]])

In [None]:
# Mean pairwise GCD energy of 10 random 1000-graph subsets of `all_graphs`.
# NOTE(review): in the visible cells `all_graphs` is only assigned in a commented-out
# line, so it has to be loaded before this cell can run on a fresh kernel.
sampled_volumes = []

for i in range(10):
    generated_graphs_sampled = get_graph_objects(random.sample(all_graphs, 1000), distances_set={"GCD"})["GCD"]
    _, vol = count_pairwise_energy(generated_graphs_sampled, distance_function=GCD)

    print(vol)

    sampled_volumes.append(vol)


# The array must be the object that .mean()/.std() are called on: a plain list has
# neither, and the old misspelt `volumns` left `volumes` as a list.
volumes = np.array(sampled_volumes)

volumes.mean(), volumes.std()

In [None]:
# Draw 100 graphs picked at random from model_10k_graphs.
# NOTE(review): model_10k_graphs is not assigned anywhere in the visible cells - confirm where it comes from.
random_100_graphs = [nx.from_numpy_array(g) for g in random.sample(model_10k_graphs, 100)]
draw_graphs(random_100_graphs)

## 2. Greedy sampling

In [None]:
def sample_greedy_from_graphobjects_of_certain_distance(graph_objects: List["GraphObject"],
                                                        distance_function: Callable[[Any, Any], float],
                                                        final_set_size: int = 100,
                                                        super_greedy=False,
                                                        n_jobs: int = 12,
                                                        ):
    """Greedily pick ``final_set_size`` graphs with a low summed pairwise ``distance_function`` value.

    Two strategies are available:

    * ``super_greedy=False``: shuffle the pool, take a random seed graph, split
      the rest of the shuffled pool into disjoint batches and take from every
      batch the candidate whose summed value against the graphs chosen so far
      is smallest.
    * ``super_greedy=True``: start from ``graph_objects[0]`` and repeatedly add,
      out of all graphs not chosen yet, the one with the smallest accumulated
      value against everything chosen so far.

    Args:
        graph_objects: pool to choose from.
        distance_function: called as ``distance_function(chosen, candidate)``;
            smaller totals are preferred.
        final_set_size: number of graphs to return.
        super_greedy: selects the strategy described above.
        n_jobs: worker count for ``joblib.Parallel`` (default 12, the value
            that used to be hard-coded).

    Returns:
        List with ``final_set_size`` distinct elements of ``graph_objects``.

    Raises:
        ValueError: if ``final_set_size`` is smaller than 1, the pool is empty,
            or the pool is too small for the requested set.
    """
    if final_set_size < 1:
        raise ValueError(f"final_set_size must be at least 1, got {final_set_size}")
    if len(graph_objects) == 0:
        raise ValueError("graph_objects must not be empty")

    if super_greedy:
        return _sample_super_greedy(graph_objects, distance_function, final_set_size, n_jobs)
    return _sample_greedy_from_random_batches(graph_objects, distance_function, final_set_size, n_jobs)


def _sample_greedy_from_random_batches(graph_objects, distance_function, final_set_size, n_jobs):
    """Random seed graph, then the lowest-energy candidate out of each disjoint random batch."""
    pool_size = len(graph_objects)
    indices = np.random.permutation(range(pool_size))

    # The seed comes from the first `offset` shuffled positions; batches start after them.
    offset = 2
    resulting_set = [graph_objects[random.choice(indices[:offset])]]

    if final_set_size == 1:
        # Nothing to add; this also avoids dividing by (final_set_size - 1) == 0 below.
        return resulting_set

    competitors_per_sample = pool_size // (final_set_size - 1)
    last_batch_start = offset + competitors_per_sample * (final_set_size - 2)
    if competitors_per_sample == 0 or last_batch_start >= pool_size:
        # Fail before any work is done, not with a reshape error on an empty batch.
        raise ValueError(
            f"{pool_size} graphs are not enough to draw {final_set_size - 1} non-empty candidate batches"
        )

    with Parallel(n_jobs=n_jobs) as workers:
        for i in range(final_set_size - 1):
            batch_start = offset + competitors_per_sample * i
            candidate_indices = indices[batch_start:batch_start + competitors_per_sample]
            candidates = [graph_objects[k] for k in candidate_indices]

            energies = np.array(workers(
                delayed(distance_function)(already_chosen_graph, candidate)
                for candidate in candidates
                for already_chosen_graph in resulting_set
            ))

            # One row per candidate, one column per already chosen graph.
            total_energy = energies.reshape(len(candidates), -1).sum(1)

            resulting_set.append(candidates[total_energy.argmin()])

    return resulting_set


def _sample_super_greedy(graph_objects, distance_function, final_set_size, n_jobs):
    """Start from the first graph and always add the remaining graph with the lowest accumulated energy."""
    if final_set_size > len(graph_objects):
        raise ValueError(
            f"cannot choose {final_set_size} distinct graphs out of {len(graph_objects)}"
        )

    resulting_set = [graph_objects[0]]
    graphs = graph_objects[1:]

    accumulated_energy = np.zeros(len(graphs))

    with Parallel(n_jobs=n_jobs) as workers:
        for _ in trange(final_set_size - 1):
            # Only the newest member has to be evaluated; older contributions are already summed.
            energies = np.array(workers(
                delayed(distance_function)(resulting_set[-1], candidate) for candidate in graphs
            ))

            accumulated_energy += energies.reshape(len(graphs), -1).sum(1)

            best_index = accumulated_energy.argmin()
            resulting_set.append(graphs[best_index])

            # Infinity, not a finite penalty: a single energy can be as large as ~1e6,
            # so the former +1e5 could be overtaken and the same graph chosen twice.
            accumulated_energy[best_index] = np.inf

    return resulting_set

In [None]:
def generate_greedy_graphs_for_generated_set(graph_objects_dict: Dict[str, List["GraphObject"]],
                                             greedy_set_size:int,
                                             number_of_repeats:int=5,
                                             super_greedy:bool=False,
                                             ):
    """Run the greedy sampler several times per distance and keep the best set.

    Args:
        graph_objects_dict: maps a distance name to the list of graph objects
            to sample from.
        greedy_set_size: size of every greedily chosen set.
        number_of_repeats: how many times the sampler runs per distance.
        super_greedy: forwarded to the sampler.

    Returns:
        ``(final_graphs, overall_table, distance_fitness_average_std)``:

        * ``final_graphs``: distance name -> ``.graph`` of every member of the
          kept set;
        * ``overall_table``: one column per distance, one row per repeat, with
          the mean pairwise energy of each repeat;
        * ``distance_fitness_average_std``: mean and std of those values per
          distance.
    """
    table = defaultdict(list)
    final_graphs = {}
    M = number_of_repeats

    for distance_name, graph_objects_list in graph_objects_dict.items():
        distance_func = partial(energy_distance, distance_name=distance_name)

        # The value is an energy (reciprocal distance) and the sampler minimises it,
        # so the best repeat is the one with the LOWEST mean energy.
        best_energy = float("inf")
        for i in range(M):
            greedy_chosen_graphs = sample_greedy_from_graphobjects_of_certain_distance(graph_objects_list,
                                                                                        distance_function=distance_func,
                                                                                        super_greedy=super_greedy,
                                                                                        final_set_size=greedy_set_size,
                                                                                        )

            _, fitness = count_pairwise_energy(greedy_chosen_graphs, distance_func)

            # The first repeat is always kept, so a result exists even if no
            # comparison succeeds (for example when the mean is NaN).
            if distance_name not in final_graphs or fitness < best_energy:
                final_graphs[distance_name] = [g.graph for g in greedy_chosen_graphs]
                best_energy = fitness

            table[distance_name].append(fitness)
            print(f"{distance_name} - {i+1}/{M}")

        print(f"{distance_name} - done")

    overall_table = pd.DataFrame.from_dict(table)

    # Long format (one row per repeat) for the per-distance statistics.
    data = [[f, d] for d, array in table.items() for f in array]
    df = pd.DataFrame(data, columns=["fitness", "distance"])
    distance_fitness_average_std = df.groupby("distance").aggregate(["mean", "std"])

    return final_graphs, overall_table, distance_fitness_average_std

In [None]:
# generate_greedy_graphs_for_generated_set iterates its argument with .items(), so it
# needs the whole {distance name: graph objects} dict and not the bare "GCD" list.
# NOTE(review): in the visible cells `all_graphs` is only assigned in a commented-out line.
final_graphs, overall_table, distance_fitness_average_std = generate_greedy_graphs_for_generated_set(
    graph_objects_dict=get_graph_objects(all_graphs, distances_set={"GCD"}),
    greedy_set_size=1000,
    number_of_repeats=5,
    super_greedy=False,
)

display(overall_table)
display(distance_fitness_average_std)

In [None]:
# Single super-greedy run that keeps 100 graphs per distance.
# NOTE(review): generated_graph_objects is only assigned in commented-out lines of the
# visible cells; also note that final_graphs from the previous cell is overwritten here.
final_graphs, overall_table_super_greedy, distance_fitness_average_std_super_greedy = generate_greedy_graphs_for_generated_set(
    graph_objects_dict=generated_graph_objects,
    greedy_set_size=100,
    number_of_repeats=1,
    super_greedy=True,
)

display(overall_table_super_greedy)
display(distance_fitness_average_std_super_greedy)

## 3. Greedy sampling from random graph models for training and validation

In [None]:
import json

from generation import get_initial_graphs

In [None]:
?get_initial_graphs

In [None]:
# Read only the "models" section of the shared configuration; the context manager
# closes the file, which the bare open() call never did.
with open("../graph_diversity_problems/configs/config_final.json") as config_file:
    models_configurations = json.load(config_file)["models"]

In [None]:
# Settings for generating graphs with get_initial_graphs (used by the partial in the next cell).
NUM_OF_GRAPHS = 10000
# NOTE(review): NUM_OF_GRAPHS_GREEDY is not used in the visible cells; "greedy_sampling_size"
# below is set to NUM_OF_GRAPHS - confirm which of the two was intended.
NUM_OF_GRAPHS_GREEDY = 1000

NUM_OF_NODES = 16

THREADS = 12

DISTANCES = {"GCD"}

greedy_graphs_objects_per_distance = None
orca_path="../graph_diversity_problems/orca/"

equal_sizes=True
maybe_ready_graphs=None

config = {"initial_graphs": ["mix"], 
          "models": models_configurations,
          "greedy_sampling_size": NUM_OF_GRAPHS,
          }


In [None]:
# Zero-argument generator: get_initial_graphs with every setting from the config cell bound,
# in the form get_graph_objects expects for its get_initial_graphs_func argument.
get_initial_graphs_partial = partial(get_initial_graphs,
                             config=config,
                             threads=THREADS,
                             distances_set=DISTANCES,
                             greedy_graphs_objects_per_distance=greedy_graphs_objects_per_distance,
                             samples=NUM_OF_GRAPHS,
                             nodes_number=NUM_OF_NODES,
                             orca_path=orca_path,
                             equal_sizes=equal_sizes,
                             maybe_ready_graphs=maybe_ready_graphs
                             
                             )

In [None]:
def create_greedy_initial_set_from_mix(generation_func,
                                       distance_name="GCD",
                                       num_of_samples=1000,
                                       super_greedy=True,
                                       ):
    """Generate graphs with ``generation_func`` and greedily keep ``num_of_samples`` of them.

    Args:
        generation_func: zero-argument callable handed to ``get_graph_objects``
            as ``get_initial_graphs_func``.
        distance_name: distance whose greedily chosen graphs are returned.
        num_of_samples: size of the greedy set.
        super_greedy: forwarded to ``generate_greedy_graphs_for_generated_set``.

    Returns:
        The list of graphs chosen for ``distance_name`` (a single greedy run).

    Raises:
        KeyError: if no graphs were produced for ``distance_name``.
    """
    # Use the requested distance itself and not the notebook-level DISTANCES constant,
    # so the function does not depend on a global that may hold a different distance.
    initial_graph_objects = get_graph_objects(get_initial_graphs_func=generation_func,
                                              distances_set={distance_name})

    final_graphs, _, _ = generate_greedy_graphs_for_generated_set(
                graph_objects_dict=initial_graph_objects,
                greedy_set_size=num_of_samples,
                number_of_repeats=1,
                super_greedy=super_greedy,
            )

    if distance_name not in final_graphs:
        raise KeyError(
            f"No graphs were produced for distance {distance_name!r}; available: {sorted(final_graphs)}"
        )

    return final_graphs[distance_name]
    

In [None]:
initial_graphs_1 = create_greedy_initial_set_from_mix(get_initial_graphs_partial)
initial_graphs_1

In [None]:
initial_graphs_2 = create_greedy_initial_set_from_mix(get_initial_graphs_partial)

In [None]:
# Splits that are written to disk in the next cell.
# NOTE(review): "valid" and "test" are the very same graphs (initial_graphs_2) - confirm this is intended.
graphs_set = {
    "train": initial_graphs_1,
    "valid": initial_graphs_2,
    "test": initial_graphs_2,
}

In [None]:
np.savez(
    "GCD_train_val_1k_each",
    **graphs_set
)

In [None]:
"train" in np.load("./GCD_train_val_1k_each.npz", allow_pickle=True)

## 3.1 Get 1k graphs for training and for testing independently from the same random graph model

In [None]:
graphs = []

# NOTE(review): despite its name this partial binds samples=NUM_OF_GRAPHS and has exactly
# the same arguments as get_initial_graphs_partial; the "100" is only applied through
# num_of_samples in the loop below.
get_initial_graphs_partial_with_size_100 = partial(get_initial_graphs,
                             config=config,
                             threads=THREADS,
                             distances_set=DISTANCES,
                             greedy_graphs_objects_per_distance=greedy_graphs_objects_per_distance,
                             samples=NUM_OF_GRAPHS,
                             nodes_number=NUM_OF_NODES,
                             orca_path=orca_path,
                             equal_sizes=equal_sizes,
                             maybe_ready_graphs=maybe_ready_graphs

                             )

# 10 independent greedy sets of 100 graphs each, collected into one list.
for i in range(10):
    # create_greedy_initial_set_from_mix returns the list of graphs itself, so it must
    # not be unpacked into three names (that raised ValueError).
    _graphs = create_greedy_initial_set_from_mix(get_initial_graphs_partial_with_size_100, num_of_samples=100)

    graphs.extend(_graphs)