In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import json
import pandas as pd
import numpy as np
import os
from string import ascii_letters

In [2]:
import sys
# Put ./src at the front of the module search path; this must run before the
# project imports below (pso, sa, utils, metrics, main presumably live there — confirm).
sys.path.insert(0, './src')
from pso import PSOFeatureSelection
from sa import SimulatedAnnealing
from utils import OptimizationAlgorithm
from metrics import Metrics, SammonError, KruskalStress, EditDistance
from tqdm import tqdm, trange
# NOTE(review): numpy and pandas are already imported in the first cell.
import numpy as np
import pandas as pd
import datetime

import argparse
import yaml
from main import run_analysis, get_data_i
from utils import SubsetterArgparser
from utils import load_metrics_file
from typing import List, Tuple

import torch
import pickle
import gc
# NOTE(review): os is already imported in the first cell.
import os

from icecream import install

# icecream's install() — presumably makes ic() usable in every cell without an
# explicit import (see the icecream documentation to confirm).
install()

In [3]:
# Load the metrics file. Besides the data, `load_metrics_file` returns an index
# ordering (bound to `indices_size_descending`) that a later cell uses to pick
# one repository by position.
data_path = os.path.join("data", "function_metrics.csv")
data, indices_size_descending = load_metrics_file(data_path)
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing at the first `.to(device)`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# Position within `indices_size_descending` of the repository to analyse.
repo_index = 50

In [5]:
# Translate the chosen position into a dataset index, then fetch that entry's
# 'function' data and move it to the compute device in a single expression.
index = indices_size_descending[repo_index]
repo_data = get_data_i(data, index, 'function').to(device)

In [6]:
# Display the dimensions of the loaded repository tensor as a sanity check.
repo_data.shape

torch.Size([5133, 69])

In [7]:
K = 15           # number of randomly sampled columns kept for the experiment
subset_size = 5  # size of each candidate subset drawn from those K columns

# Fail fast on inconsistent sizes: slicing the permutation with an oversized K
# would silently yield fewer than K columns, and K-length masks built later
# would then no longer match the data.
n_columns = repo_data.shape[1]
if not 0 < subset_size <= K <= n_columns:
    raise ValueError(
        "Need 0 < subset_size <= K <= n_columns, got "
        f"subset_size={subset_size}, K={K}, n_columns={n_columns}"
    )

# Generate K unique random column indices (fixed seed for reproducibility)
torch.manual_seed(43)
column_indices = torch.randperm(n_columns)[:K]

# Select the K columns from the tensor
subset_repo = repo_data[:, column_indices]

In [8]:
# Display the dimensions after column sampling; the second dimension should equal K.
subset_repo.shape

torch.Size([5133, 15])

## Enumerate all subsets to find the best one

In [9]:
import itertools
from metrics import SammonError

# Error function built on the K-column data; each candidate subset is passed
# to it as a boolean mask over those K columns.
sammon = SammonError(subset_repo)

# Exhaustive search: score every combination of `subset_size` columns out of K.
subset_to_loss = []
for subset in itertools.combinations(range(K), subset_size):
    # Create the mask on the same device as the data rather than hard-coding
    # 'cuda', so the cell also works when the data lives on the CPU.
    mask_tensor = torch.zeros(K, dtype=torch.bool, device=subset_repo.device)
    mask_tensor[list(subset)] = True
    loss = sammon.compute(mask_tensor)
    subset_to_loss.append((list(subset), loss))

# Sort ascending by loss so the lowest-error subsets come first.
subset_to_loss.sort(key=lambda x: x[1].item())

In [10]:
# Show the 20 lowest-loss subsets (the list was sorted ascending by loss above).
subset_to_loss[:20]

[([0, 2, 4, 10, 14], tensor(0.0872, device='cuda:0')),
 ([2, 4, 9, 10, 14], tensor(0.0878, device='cuda:0')),
 ([0, 2, 4, 9, 14], tensor(0.0885, device='cuda:0')),
 ([2, 4, 6, 10, 14], tensor(0.0926, device='cuda:0')),
 ([2, 4, 6, 9, 14], tensor(0.0939, device='cuda:0')),
 ([0, 2, 9, 10, 14], tensor(0.0970, device='cuda:0')),
 ([0, 4, 9, 10, 14], tensor(0.1010, device='cuda:0')),
 ([2, 6, 9, 10, 14], tensor(0.1033, device='cuda:0')),
 ([4, 6, 9, 10, 14], tensor(0.1075, device='cuda:0')),
 ([1, 2, 4, 9, 14], tensor(0.1100, device='cuda:0')),
 ([0, 2, 4, 9, 10], tensor(0.1121, device='cuda:0')),
 ([0, 2, 4, 10, 13], tensor(0.1148, device='cuda:0')),
 ([1, 2, 4, 10, 14], tensor(0.1156, device='cuda:0')),
 ([2, 4, 10, 13, 14], tensor(0.1156, device='cuda:0')),
 ([2, 4, 10, 12, 14], tensor(0.1165, device='cuda:0')),
 ([0, 1, 2, 9, 14], tensor(0.1175, device='cuda:0')),
 ([2, 4, 9, 12, 14], tensor(0.1182, device='cuda:0')),
 ([2, 4, 7, 9, 14], tensor(0.1183, device='cuda:0')),
 ([2, 4, 7, 10

In [12]:
# `run_analysis` is already imported from `main` in the imports cell (with
# ./src on sys.path). Importing it again as `src.main` would load the same file
# a second time under a different module name, so the redundant import is dropped.
pbar = trange(0, 1)
# Run PSO feature selection on the K-column data with the Sammon error,
# asking for subsets of `subset_size` metrics.
selected_metrics_arr, optimums_arr = run_analysis(
    subset_repo,
    pbar,
    metric_subset_size=subset_size,
    error_function=SammonError,
    method=PSOFeatureSelection,
    seed=42,
    max_iter=10,
    n_runs=3,
    num_particles=10,
)


  0%|                                                      | 0/1 [02:02<?, ?it/s, N_metrics: 5; Run: 3/3; Opt: 0.087751][A

  0%|                                                                    | 0/1 [00:00<?, ?it/s, N_metrics: 5; Run: 1/3;][A
  0%|                                                      | 0/1 [00:00<?, ?it/s, N_metrics: 5; Run: 2/3; Opt: 0.130293][A
  0%|                                                      | 0/1 [00:01<?, ?it/s, N_metrics: 5; Run: 3/3; Opt: 0.087751][A

In [13]:
# Display the optimum values returned by run_analysis (appears to hold one value per run — confirm).
optimums_arr

[0.1302931159734726, 0.08775129169225693, 0.12569130957126617]

In [13]:
# NOTE(review): judging by the displayed output, each element holds one list of
# 15 values per run, so this slice trims the list of runs rather than the values
# inside a run — confirm that this is the intended axis.
[runs[:subset_size] for runs in selected_metrics_arr]

[[[0.6277675628662109,
   0.3613280951976776,
   0.7820790410041809,
   0.7051138877868652,
   0.3471134305000305,
   0.4170715808868408,
   1.1624526977539062,
   0.7161104679107666,
   0.9573154449462891,
   0.5850797891616821,
   0.7505536079406738,
   0.37281668186187744,
   0.4622475206851959,
   0.1166602373123169,
   0.4569224715232849],
  [0.7954239845275879,
   0.3523367941379547,
   1.3883907794952393,
   0.14424923062324524,
   0.02078908681869507,
   0.393947958946228,
   0.566846489906311,
   0.7683454751968384,
   0.35238927602767944,
   0.3195575475692749,
   0.6609660387039185,
   0.532672107219696,
   1.0118502378463745,
   0.7494090795516968,
   0.8064063787460327],
  [-0.03348433971405029,
   0.32701823115348816,
   0.9217746257781982,
   0.2141723930835724,
   0.08300352096557617,
   0.5461053848266602,
   0.6119529008865356,
   0.2175091952085495,
   0.14072826504707336,
   0.75847989320755,
   0.08367013931274414,
   0.46088171005249023,
   0.7224437594413757,
   