In [None]:
import torch
import matplotlib.pyplot as plt
# local
import sys
if '..' not in sys.path:
    sys.path.insert(0, '..')

from datasets.import_dataset import import_dataset
from trainer import Trainer
import clamiter as ci
from utils.plotting import *

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'device = {device}')
from datasets.simulations import create_sbm

%load_ext autoreload
%autoreload 2

# Distance Experiments

In [None]:
# Plot the 'find_d' distance-experiment results for the squirrel dataset.
plot_distance_experiment(
    0.01,
    'find_d',
    ds_name='squirrel',
    range=300,
)

In [None]:
'''Experiment: for every dataset we used find the best d and see that the log cut converges.'''

import os
import numpy as np


# For every (dataset, model, d) combination: train a fresh Trainer on a clone
# of the dataset and store the resulting curves under
# results/distance/find_d/<ds_name>/<model_name>/d_<d>/.
for ds_name in ['squirrel']:
    for model_name in ['ieclam']:
        ds = import_dataset(ds_name)

        # Overrides handed to Trainer via config_triplets_to_change.
        # Each entry is a 3-item list; presumably [config section, key, new value]
        # -- confirm against Trainer. Commented entries are disabled overrides.
        config_triplets = [
            ['feat_opt', 'n_iter', 15000],
            # ['feat_opt', 'lr', 0.00005],
            # ['prior_opt', 'n_iter', 1500],
            # ['prior_opt', 'lr', 0.0000005],
            # ['back_forth', 'n_back_forth', 30],
            # ['back_forth', 'first_func_in_fit', 'feat_opt'],
        ]

        for d in [0.0005, 0.001, 0.0001]:
            printd(f'{d=}')
            # The dataset is cloned so that every run starts from the same input.
            trainer = Trainer(
                model_name=model_name,
                task='distance',
                device=device,
                config_triplets_to_change=config_triplets,
                dataset=ds.clone(),
            )
            log_likelihoods, test_accs, val_accs = trainer.train(
                d=d,
                plot_every=-1,
                init_feats=True,
                init_type='small_gaus',
                verbose=False,
                acc_every=100,
                verbose_in_funcs=False,
            )

            # Divide the log likelihoods by num_nodes**2.
            log_likelihoods = np.array(log_likelihoods) / (ds.num_nodes ** 2)

            dir_path = f'results/distance/find_d/{ds_name}/{model_name}/d_{d}'
            # exist_ok=True replaces the separate exists() check, which was
            # redundant and open to a check-then-create race.
            os.makedirs(dir_path, exist_ok=True)
            torch.save(test_accs['log_cut'], dir_path + '/logcuts.pt')
            torch.save(log_likelihoods, dir_path + '/log_likelihoods.pt')
            torch.save(val_accs['l2'], dir_path + '/l2s.pt')

            # Drop the trainer's dataset and, when a prior exists, its model
            # before the next run (presumably to free memory -- confirm).
            del trainer.data
            if trainer.clamiter.prior is not None:
                del trainer.clamiter.prior.model
            


In [None]:
'''Experiment: for every dataset we used find the best d and see that the log cut converges.'''

import os
import numpy as np


# For every (dataset, model, d) combination: train a fresh Trainer on a clone
# of the dataset and store the resulting curves under
# results/distance/find_d/<ds_name>/<model_name>/d_<d>/.
for ds_name in ['photo', 'sbm3x3HalfDiag']:
    for model_name in ['ieclam']:
        ds = import_dataset(ds_name)

        # Overrides handed to Trainer via config_triplets_to_change.
        # Each entry is a 3-item list; presumably [config section, key, new value]
        # -- confirm against Trainer. Commented entries are disabled overrides.
        config_triplets = [
            ['feat_opt', 'n_iter', 5000],
            ['feat_opt', 'lr', 0.00005],
            # ['prior_opt', 'n_iter', 1500],
            # ['prior_opt', 'lr', 0.0000005],
            # ['back_forth', 'n_back_forth', 30],
            # ['back_forth', 'first_func_in_fit', 'feat_opt'],
        ]

        for d in [0.0005, 0.001, 0.0001]:
            printd(f'{d=}')
            # The dataset is cloned so that every run starts from the same input.
            trainer = Trainer(
                model_name=model_name,
                task='distance',
                device=device,
                config_triplets_to_change=config_triplets,
                dataset=ds.clone(),
            )
            log_likelihoods, test_accs, val_accs = trainer.train(
                d=d,
                plot_every=-1,
                init_feats=True,
                init_type='small_gaus',
                verbose=False,
                acc_every=100,
                verbose_in_funcs=False,
            )

            # Divide the log likelihoods by num_nodes**2.
            log_likelihoods = np.array(log_likelihoods) / (ds.num_nodes ** 2)

            dir_path = f'results/distance/find_d/{ds_name}/{model_name}/d_{d}'
            # exist_ok=True replaces the separate exists() check, which was
            # redundant and open to a check-then-create race.
            os.makedirs(dir_path, exist_ok=True)
            torch.save(test_accs['log_cut'], dir_path + '/logcuts.pt')
            torch.save(log_likelihoods, dir_path + '/log_likelihoods.pt')
            torch.save(val_accs['l2'], dir_path + '/l2s.pt')

            # Drop the trainer's dataset and, when a prior exists, its model
            # before the next run (presumably to free memory -- confirm).
            del trainer.data
            if trainer.clamiter.prior is not None:
                del trainer.clamiter.prior.model

In [None]:
# Plot the 'train_to_overfit' distance-experiment results (squirrel, ieclam).
plot_distance_experiment(
    0.01,
    'train_to_overfit',
    'squirrel',
    'ieclam',
    4900,
)

#? Experimental insight: the log-likelihood optimization can drive entries of the approximation matrix to 1, in which case the log cut is infinite. So the step size is also a limiting factor...

In [None]:
'''Experiment: try and get overfitting in the log cut distance. If we train the model for long enough, we expect that the log cut will start increasing because of overfitting so we are training it for a very long time.'''
# Imported here so the cell does not depend on imports made in other cells.
import os
from datetime import datetime

import numpy as np


model_name = 'ieclam'
ds = import_dataset('squirrel')
d = 0.005
n_iter = 45000
# Overrides handed to Trainer via config_triplets_to_change; only the
# feat_opt / n_iter entry is active, the commented entries are disabled.
config_triplets = [
    ['feat_opt', 'n_iter', n_iter],
    # ['feat_opt', 'lr', 0.00005],
    # ['prior_opt', 'n_iter', 1500],
    # ['prior_opt', 'lr', 0.0000005],
    # ['back_forth', 'n_back_forth', 30],
    # ['back_forth', 'first_func_in_fit', 'feat_opt'],
]

trainer = Trainer(
    model_name=model_name,
    task='distance',
    device=device,
    config_triplets_to_change=config_triplets,
    dataset=ds.clone(),
)

try:
    log_likelihoods, test_accs, val_accs = trainer.train(
        d=d,
        plot_every=-1,
        init_feats=True,
        init_type='small_gaus',
        verbose=False,
        acc_every=100,
        verbose_in_funcs=False,
    )

    # Divide the log likelihoods by num_nodes**2.
    log_likelihoods = np.array(log_likelihoods) / (ds.num_nodes ** 2)

    # Save only once train() has returned. Saving from the `finally` block
    # meant that a failed or interrupted run either hit a NameError (hiding the
    # real exception) or wrote stale results left over from an earlier cell
    # into a new timestamped directory.
    dir_path = f'results/distance/train_to_overfit/squirrel/ieclam/d_{d}/n_iter_{n_iter}/{datetime.now().strftime("%Y-%m-%d_%H-%M")}'
    os.makedirs(dir_path, exist_ok=True)

    torch.save(test_accs['log_cut'], dir_path + '/logcuts')
    torch.save(log_likelihoods, dir_path + '/log_likelihoods')
    torch.save(val_accs['l2'], dir_path + '/l2s')
finally:
    # Cleanup runs whether or not training succeeded; any exception raised by
    # train() still propagates after this block.
    del trainer.data
    if trainer.clamiter.prior is not None:
        del trainer.clamiter.prior.model

    #todo: print every 1000 iterations
    #todo: find where it gets infinity (and what,,,)


In [None]:
# Plot the recorded log-cut values, skipping the first 400 entries.
log_cut_history = test_accs['log_cut']
plt.plot(log_cut_history[400:])

# SBM

In [None]:
# Load the two stored 3x3 SBM datasets used by the cells below.
ds_sbm_halfdiag = import_dataset('sbm3x3HalfDiag')
ds_sbm_halfcenter = import_dataset('sbm3x3HalfCenter')

# Disabled: adjacency plots of the two loaded datasets.
# plot_adj(ds_sbm_halfcenter.edge_index, ax=axes[0])
# plot_adj(ds_sbm_halfdiag.edge_index, ax=axes[1])

# Build a new SBM with p_comm all 0.0 and p_bipart all 0.5, then plot the
# first returned value.
prob_adj_3X3, y = create_sbm(
    70,
    p_comm=[0.0] * 3,
    p_bipart=[0.5] * 3,
)
plot_adj(prob_adj_3X3)


### Lorentz Inner

#### IEClam

In [None]:
# IEClam on the half-diagonal SBM with feat_opt / n_iter overridden to 15000.
# The commented entries are disabled overrides.
config_triplets = [
    ['feat_opt', 'n_iter', 15000],
    # ['feat_opt', 'lr', 0.00003],
    # ['prior_opt', 'lr', 0.0000005],
    # ['back_forth', 'n_back_forth', 5],
    # ['back_forth', 'first_func_in_fit', 'feat_opt'],
]

# The trainer receives its own clone of the dataset.
trainer_halfdiag_ieclam = Trainer(
    dataset=ds_sbm_halfdiag.clone(),
    model_name='ieclam',
    task='distance',
    config_triplets_to_change=config_triplets,
    device=device,
)

(
    losses_halfdiag_ieclam,
    logcut_halfdiag_ieclam,
    l2_halfdiag_ieclam,
) = trainer_halfdiag_ieclam.train(
    d=0.01,
    init_type='small_gaus',
    init_feats=True,
    acc_every=500,
    plot_every=-1,
    verbose=False,
    verbose_in_funcs=False,
)

# TODO (not urgent): move the contents of train into fit and then call trainer.clamiter.fit(). Also add an option for the trainer to do cross-validation.

#### PieClam

In [None]:
# PieClam on the half-diagonal SBM with tuned overrides.
# Commented entries are disabled overrides.
config_triplets = [
    # ['feat_opt', 'n_iter', 200],
    ['clamiter_init', 'dim_feat', 6],
    ['feat_opt', 'lr', 0.0001],
    # ['prior_opt', 'n_iter', 150],
    ['prior_opt', 'lr', 0.00001],
    ['back_forth', 'n_back_forth', 5],
    # ['back_forth', 'first_func_in_fit', 'feat_opt'],
]

# Pass a clone, as every other Trainer cell in this notebook does. The original
# passed ds_sbm_halfdiag itself, so anything the trainer does to its dataset
# could carry over into the later cells that reuse ds_sbm_halfdiag.
trainer_halfdiag_pieclam = Trainer(
    model_name='pieclam',
    task='distance',
    device=device,
    config_triplets_to_change=config_triplets,
    dataset=ds_sbm_halfdiag.clone(),
)

losses_halfdiag_pieclam, logcut_halfdiag_pieclam, l2_halfdiag_pieclam = trainer_halfdiag_pieclam.train(
    d=0.2,
    plot_every=1,
    init_feats=True,
    init_type='small_gaus',
    verbose=False,
    verbose_in_funcs=False,
    node_size_factor=5,
    draw_edges=False,
)

# Drop the trainer's dataset and prior model once training is done
# (presumably to free memory -- confirm).
del trainer_halfdiag_pieclam.data
del trainer_halfdiag_pieclam.clamiter.prior.model

### Inner Prod

#### BIGCLAM

In [None]:
# BigClam on the half-diagonal SBM with no configuration overrides.
# Every candidate override below is disabled.
config_triplets = [
    # ['feat_opt', 'n_iter', 2000],
    # ['feat_opt', 'lr', 0.00003],
    # ['prior_opt', 'n_iter', 1500],
    # ['prior_opt', 'lr', 0.0000005],
    # ['back_forth', 'n_back_forth', 50],
    # ['back_forth', 'first_func_in_fit', 'feat_opt'],
]

# The trainer receives its own clone of the dataset.
trainer_halfdiag_bigclam = Trainer(
    dataset=ds_sbm_halfdiag.clone(),
    model_name='bigclam',
    task='distance',
    config_triplets_to_change=config_triplets,
    device=device,
)

(
    losses_halfdiag_bigclam,
    logcut_halfdiag_bigclam,
    l2_halfdiag_bigclam,
) = trainer_halfdiag_bigclam.train(
    d=0.2,
    init_type='small_gaus',
    init_feats=True,
    plot_every=-1,
    verbose=False,
    verbose_in_funcs=False,
)

# Drop the trainer's dataset attribute after training.
del trainer_halfdiag_bigclam.data

#### PClam

In [None]:
# PClam on the half-diagonal SBM: clamiter_init / dim_feat set to 6 and
# back_forth / n_back_forth set to 1. Commented entries are disabled overrides.
config_triplets = [
    ['clamiter_init', 'dim_feat', 6],
    # ['feat_opt', 'n_iter', 2000],
    # ['feat_opt', 'lr', 0.00003],
    # ['prior_opt', 'n_iter', 1500],
    # ['prior_opt', 'lr', 0.0000005],
    ['back_forth', 'n_back_forth', 1],
    # ['back_forth', 'first_func_in_fit', 'feat_opt'],
]

# The trainer receives its own clone of the dataset.
trainer_halfdiag_pclam = Trainer(
    dataset=ds_sbm_halfdiag.clone(),
    model_name='pclam',
    task='distance',
    config_triplets_to_change=config_triplets,
    device=device,
)

(
    losses_halfdiag_pclam,
    logcut_halfdiag_pclam,
    l2_halfdiag_pclam,
) = trainer_halfdiag_pclam.train(
    d=0.2,
    init_type='small_gaus',
    init_feats=True,
    plot_every=1,
    verbose=False,
    verbose_in_funcs=False,
)

# Drop the trainer's dataset and prior model after training.
del trainer_halfdiag_pclam.data
del trainer_halfdiag_pclam.clamiter.prior.model

### Lorentz Inner

#### IEClam

In [None]:
# IEClam on the half-diagonal SBM with no configuration overrides.
# Every candidate override below is disabled.
config_triplets = [
    # ['feat_opt', 'n_iter', 2000],
    # ['feat_opt', 'lr', 0.00003],
    # ['prior_opt', 'n_iter', 1500],
    # ['prior_opt', 'lr', 0.0000005],
    # ['back_forth', 'n_back_forth', 50],
    # ['back_forth', 'first_func_in_fit', 'feat_opt'],
]

# The trainer receives its own clone of the dataset.
trainer_halfdiag_ieclam = Trainer(
    dataset=ds_sbm_halfdiag.clone(),
    model_name='ieclam',
    task='distance',
    config_triplets_to_change=config_triplets,
    device=device,
)

(
    losses_halfdiag_ieclam,
    logcut_halfdiag_ieclam,
    l2_halfdiag_ieclam,
) = trainer_halfdiag_ieclam.train(
    d=0.2,
    init_type='small_gaus',
    init_feats=True,
    plot_every=-1,
    verbose=False,
    verbose_in_funcs=False,
)

# Drop the trainer's dataset attribute after training.
del trainer_halfdiag_ieclam.data


#### PieClam

In [None]:
# PieClam on the half-diagonal SBM with no configuration overrides.
# Every candidate override below is disabled.
config_triplets = [
    # ['feat_opt', 'n_iter', 2000],
    # ['feat_opt', 'lr', 0.00003],
    # ['prior_opt', 'n_iter', 1500],
    # ['prior_opt', 'lr', 0.0000005],
    # ['back_forth', 'n_back_forth', 50],
    # ['back_forth', 'first_func_in_fit', 'feat_opt'],
]

# The trainer receives its own clone of the dataset.
trainer_halfdiag_pieclam = Trainer(
    dataset=ds_sbm_halfdiag.clone(),
    model_name='pieclam',
    task='distance',
    config_triplets_to_change=config_triplets,
    device=device,
)

(
    losses_halfdiag_pieclam,
    logcut_halfdiag_pieclam,
    l2_halfdiag_pieclam,
) = trainer_halfdiag_pieclam.train(
    d=0.2,
    init_type='small_gaus',
    init_feats=True,
    plot_every=-1,
    verbose=False,
    verbose_in_funcs=False,
)

# Drop the trainer's dataset and prior model after training.
del trainer_halfdiag_pieclam.data
del trainer_halfdiag_pieclam.clamiter.prior.model

## HalfCenter

### Lorentz Inner

#### IEClam

In [None]:
# IEClam on the half-center SBM with no configuration overrides.
# Every candidate override below is disabled.
config_triplets = [
    # ['feat_opt', 'n_iter', 2000],
    # ['feat_opt', 'lr', 0.00003],
    # ['prior_opt', 'n_iter', 1500],
    # ['prior_opt', 'lr', 0.0000005],
    # ['back_forth', 'n_back_forth', 50],
    # ['back_forth', 'first_func_in_fit', 'feat_opt'],
]

# The trainer receives its own clone of the dataset.
trainer_halfcenter_ieclam = Trainer(
    dataset=ds_sbm_halfcenter.clone(),
    model_name='ieclam',
    task='distance',
    config_triplets_to_change=config_triplets,
    device=device,
)

(
    losses_halfcenter_ieclam,
    logcut_halfcenter_ieclam,
    l2_halfcenter_ieclam,
) = trainer_halfcenter_ieclam.train(
    d=0.2,
    init_type='small_gaus',
    init_feats=True,
    plot_every=-1,
    verbose=False,
    verbose_in_funcs=False,
)

# Drop the trainer's dataset attribute after training.
del trainer_halfcenter_ieclam.data

#### PieClam

In [None]:
# PieClam on the half-center SBM: feat_opt / lr, prior_opt / lr and
# back_forth / n_back_forth are overridden. Commented entries are disabled.
config_triplets = [
    # ['feat_opt', 'n_iter', 200],
    ['feat_opt', 'lr', 0.0001],
    # ['prior_opt', 'n_iter', 150],
    ['prior_opt', 'lr', 0.00001],
    ['back_forth', 'n_back_forth', 5],
    # ['back_forth', 'first_func_in_fit', 'feat_opt'],
]

# The trainer receives its own clone of the dataset.
trainer_halfcenter_pieclam = Trainer(
    dataset=ds_sbm_halfcenter.clone(),
    model_name='pieclam',
    task='distance',
    config_triplets_to_change=config_triplets,
    device=device,
)

(
    losses_halfcenter_pieclam,
    logcut_halfcenter_pieclam,
    l2_halfcenter_pieclam,
) = trainer_halfcenter_pieclam.train(
    d=0.2,
    init_type='small_gaus',
    init_feats=True,
    plot_every=-1,
    verbose=False,
    verbose_in_funcs=False,
)

# Drop the trainer's dataset and prior model after training.
del trainer_halfcenter_pieclam.data
del trainer_halfcenter_pieclam.clamiter.prior.model

### Inner Prod

#### BIGCLAM

In [None]:
# BigClam on the half-center SBM with no configuration overrides.
# Every candidate override below is disabled.
config_triplets = [
    # ['feat_opt', 'n_iter', 2000],
    # ['feat_opt', 'lr', 0.00003],
    # ['prior_opt', 'n_iter', 1500],
    # ['prior_opt', 'lr', 0.0000005],
    # ['back_forth', 'n_back_forth', 50],
    # ['back_forth', 'first_func_in_fit', 'feat_opt'],
]

# The trainer receives its own clone of the dataset.
trainer_halfcenter_bigclam = Trainer(
    model_name='bigclam',
    task='distance',
    device=device,
    config_triplets_to_change=config_triplets,
    dataset=ds_sbm_halfcenter.clone(),
)

losses_halfcenter_bigclam, logcut_halfcenter_bigclam, l2_halfcenter_bigclam = trainer_halfcenter_bigclam.train(
    d=0.2,
    plot_every=-1,
    init_feats=True,
    init_type='small_gaus',
    verbose=False,
    verbose_in_funcs=False,
)

del trainer_halfcenter_bigclam.data
# clamiter.prior can be None (the sweep cells in this notebook check for it,
# and the half-diagonal BigClam cell never touches a prior), so the deletion
# is guarded rather than unconditional, which would raise AttributeError.
if trainer_halfcenter_bigclam.clamiter.prior is not None:
    del trainer_halfcenter_bigclam.clamiter.prior.model

#### PClam

In [None]:
# PClam on the half-center SBM with iteration counts, learning rates and
# n_back_forth overridden. The commented entry is a disabled override.
config_triplets = [
    ['feat_opt', 'n_iter', 2000],
    ['feat_opt', 'lr', 0.00005],
    ['prior_opt', 'n_iter', 1500],
    ['prior_opt', 'lr', 0.0000005],
    ['back_forth', 'n_back_forth', 30],
    # ['back_forth', 'first_func_in_fit', 'feat_opt'],
]

# The trainer receives its own clone of the dataset.
trainer_halfcenter_pclam = Trainer(
    dataset=ds_sbm_halfcenter.clone(),
    model_name='pclam',
    task='distance',
    config_triplets_to_change=config_triplets,
    device=device,
)

(
    losses_halfcenter_pclam,
    logcut_halfcenter_pclam,
    l2_halfcenter_pclam,
) = trainer_halfcenter_pclam.train(
    d=0.2,
    init_type='small_gaus',
    init_feats=True,
    plot_every=5,
    verbose=False,
    verbose_in_funcs=False,
)

# Drop the trainer's dataset and prior model after training.
del trainer_halfcenter_pclam.data
del trainer_halfcenter_pclam.clamiter.prior.model

# LogCut Convergence

## Find LL and LC Curve for Different d

In [None]:
import os
import torch
import matplotlib.pyplot as plt

from utils.plotting import plot_distance_experiment

In [None]:
'''Experiment: Find the best d. since the log cut distance is defined by the minimal d, we look to see which d gives the minimal value. In any case, the true log cut distance will always be smaller than what is calculated.'''

import os

# np is used below; import it here instead of relying on an earlier cell.
import numpy as np


model_name = 'ieclam'
ds_name = 'sbm3x3HalfDiag'

ds = import_dataset(ds_name)

# Overrides handed to Trainer via config_triplets_to_change; only the
# feat_opt / n_iter entry is active, the commented entries are disabled.
config_triplets = [
    ['feat_opt', 'n_iter', 15000],
    # ['feat_opt', 'lr', 0.00005],
    # ['prior_opt', 'n_iter', 1500],
    # ['prior_opt', 'lr', 0.0000005],
    # ['back_forth', 'n_back_forth', 30],
    # ['back_forth', 'first_func_in_fit', 'feat_opt'],
]

for d in [0.05, 0.01, 0.005]:
    # The dataset is cloned so that every run starts from the same input.
    trainer = Trainer(
        model_name=model_name,
        task='distance',
        device=device,
        config_triplets_to_change=config_triplets,
        dataset=ds.clone(),
    )
    log_likelihoods, test_accs, val_accs = trainer.train(
        d=d,
        plot_every=-1,
        init_feats=True,
        init_type='small_gaus',
        verbose=False,
        acc_every=100,
        verbose_in_funcs=False,
    )

    # Divide the log likelihoods by num_nodes**2.
    log_likelihoods = np.array(log_likelihoods) / (ds.num_nodes ** 2)

    # Build the path from model_name (same value as the previously hard-coded
    # 'ieclam') so the directory follows the model actually trained.
    dir_path = f'results/distance/find_d/{ds_name}/{model_name}/d_{d}'
    # exist_ok=True replaces the separate exists() check.
    os.makedirs(dir_path, exist_ok=True)
    torch.save(test_accs['log_cut'], dir_path + '/logcuts')
    torch.save(log_likelihoods, dir_path + '/log_likelihoods')
    torch.save(val_accs['l2'], dir_path + '/l2s')

    # Drop the trainer's dataset and, when a prior exists, its model before the
    # next run (presumably to free memory -- confirm).
    del trainer.data
    if trainer.clamiter.prior is not None:
        del trainer.clamiter.prior.model
    

#todo: make one run of 50000 iterations and see if the log cut converges.

