In [1]:
# Auto-reload imported modules before each cell runs, so edits to the project's
# helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
import pickle
from common.utils.pickling import pickle_read, pickle_write
from common.utils.misc import *
import matplotlib.pyplot as plt
import networkx as nx
from prepare_submission_data import pkl_to_tar
import re
from collections import defaultdict
import random
from sklearn.cluster import KMeans

In [3]:
# Plan:
#+ load val embeddings
#+ cluster val embeddings
#+ assign avg loss to each cluster
#+ load train embeddings
#+ assign train embeddings to clusters
#+ check if every cluster has representation
# divide train embeddings by percentiles of distance from cluster centers
# sample from each cluster - decide from which percentile, and assign a weight to each cluster to control how many samples it contributes


#### load val embeddings and loss

In [4]:
# loss
# Per-sample validation losses of the trained oracle model.
# `val` is used below through its `path` and `loss` columns.
p_val_loss = '/mnt/ext/shared/Projects/GNNetworkingChallenge/trained_oracle_models/6.33/losses_09-6.33/val_sample_loss_09-6.33.csv'
val = load_sample_loss_csv(p_val_loss)

In [5]:
# embeddings
# The pickle holds a dict with (at least) 'paths' and 'embeddings'.
p_val_emb = '/mnt/ext/shared/Projects/GNNetworkingChallenge/trained_oracle_models/6.33/sample_embeddings_09-6.33/val_min_max_mean.pkl'
val_emb_data = pickle_read(p_val_emb)

# The embeddings must be row-aligned with the loss table before they are combined.
paths_match = (val.path == val_emb_data['paths'])
assert paths_match.all()

emb_val = val_emb_data['embeddings']

#### load train embeddings

In [6]:
p_tr_emb = '/mnt/ext/shared/Projects/GNNetworkingChallenge/trained_oracle_models/6.33/sample_embeddings_09-6.33/train_min_max_mean.pkl'
emb_tr = pickle_read(p_tr_emb)
# path -> row position in the full (unfiltered) train embedding matrix
p2e_tr = {p: i for i, p in enumerate(emb_tr['paths'])}

tr = pd.DataFrame({'path': emb_tr['paths']})
# Dataset name is taken from the 8th '/'-separated path component.
# NOTE(review): this assumes a fixed directory depth for all sample paths - confirm.
tr['dset'] = tr.path.str.split('/').str[7]

# filter desired samples
# `.copy()` makes `tr` an independent frame: later cells add columns to it
# ('c', 'cdist'), which on a boolean-mask slice triggers SettingWithCopyWarning.
tr = tr[tr.dset.isin(['1', '10', '11', '15', '5', '8', 'hard1', 'hard2', 'hard3', 'hard4', 'hard5'])].copy()
# `tr` keeps the original RangeIndex labels, i.e. row positions in the full
# embedding matrix, so they can be used directly to select the matching rows.
emb_tr = emb_tr['embeddings'][tr.index.values]
assert len(tr) == len(emb_tr), 'train table and embeddings are not row-aligned'
tr.dset.unique(), emb_tr.shape

(array(['1', '10', '11', '15', '5', '8', 'hard1', 'hard2', 'hard3',
        'hard4', 'hard5'], dtype=object),
 (134347, 288))

#### cluster val embeddings

In [7]:
nclusters = 15
# Fixed seed: without it cluster ids, centres, and every subset derived from
# them change on each run, so the saved config could not be reproduced.
kmeans = KMeans(n_clusters=nclusters, random_state=0)
# assign to each val sample its cluster
val['c'] = kmeans.fit_predict(emb_val)
# assign avg loss to each cluster
clusters = val.groupby('c').loss.mean().to_frame()
# number of validation samples per cluster
clusters['nval'] = val.groupby('c').size()

#### assign train embeddings to val clusters

In [8]:
# Distance of every train sample to every val-cluster centre
# (one row per sample, one column per cluster).
trcd = kmeans.transform(emb_tr)

# Each sample belongs to its nearest centre ...
nearest_cluster = trcd.argmin(axis=1)
tr['c'] = nearest_cluster
# ... and its distance to that centre is the row-wise minimum.
tr['cdist'] = trcd.min(axis=1)

# number of train samples that fell into each val cluster
train_counts = tr.groupby('c').size()
clusters['ntrain'] = train_counts

In [9]:
# Per-cluster summary (mean val loss, #val samples, #train samples), transposed for a compact view.
clusters.T

c,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
loss,6.368943,2.650684,1.078728,7.10079,11.359393,13.822597,5.02397,4.664109,1.604214,18.740255,1.049908,5.37105,2.210074,16.746688,2.351897
nval,10.0,21.0,7.0,13.0,12.0,5.0,7.0,2.0,2.0,11.0,12.0,6.0,7.0,5.0,10.0
ntrain,15506.0,213.0,8233.0,2185.0,11586.0,5302.0,29670.0,14409.0,226.0,1265.0,4.0,3101.0,19051.0,9520.0,14076.0


In [10]:
# Every val cluster must have at least one train sample assigned to it.
# A cluster with no train samples gets no count (NaN), which also fails `> 0`.
assert((clusters.ntrain > 0).all())

#### select top closest from each group

In [11]:
# number of closest-to-centre samples kept per cluster
ntop = 20
# Order all train samples by distance to their centre first; groupby keeps the
# row order inside each group, so `head` then yields the closest ones.
tr_by_dist = tr.sort_values('cdist')
top_samples = tr_by_dist.groupby('c').head(ntop)
# regroup the selection; clusters with fewer than `ntop` samples keep them all
gp = top_samples.groupby('c')
# size of each resulting group
gp.size()

c
0     20
1     20
2     20
3     20
4     20
5     20
6     20
7     20
8     20
9     20
10     4
11    20
12    20
13    20
14    20
dtype: int64

#### Experiment 1

In [12]:
def shuffle_by_group(df, field):
    """Reorder `df` so that groups of equal `field` appear in random order.

    Rows sharing a `field` value stay contiguous and keep their relative
    order; only the order of the groups themselves is randomised, using the
    global `random` module state.

    Returns a new DataFrame built by concatenating the shuffled groups.
    """
    chunks = []
    for _key, chunk in df.groupby(field):
        chunks.append(chunk)
    random.shuffle(chunks)
    return pd.concat(chunks)


In [16]:
# Compact tag for the datasets in use, e.g. 'hard1' -> 'h1'.
dataname = '_'.join(c.replace('hard','h') for c in sorted(tr.dset.unique()))
name = f'ncls_{nclusters}_top_{ntop}_data_{dataname}'
save_root = Path('/mnt/ext-10g/users/yakovl/dev/GNNetworkingChallenge/subset_training/unshuffled_order') / 'kmeans' / name
# parents=True: the intermediate 'kmeans' directory may not exist yet (a bare
# mkdir() would raise FileNotFoundError); exist_ok=True makes re-runs a no-op.
save_root.mkdir(parents=True, exist_ok=True)
save_root

PosixPath('/mnt/ext-10g/users/yakovl/dev/GNNetworkingChallenge/subset_training/unshuffled_order/kmeans/ncls_15_top_20_data_1_10_11_15_5_8_h1_h2_h3_h4_h5')

In [17]:
# Record everything needed to inspect or reproduce this subset selection.
config = {}
# selection parameters
config.update({
    'nclusters': nclusters,
    'ntop': ntop,
})
# which train data was considered
config.update({
    'data': tr.dset.unique().tolist(),
    'data_samples': len(tr),
})
# clustering result: per-cluster stats and the fitted centres
config.update({
    'clusters': clusters,
    'centers': kmeans.cluster_centers_,
})
# provenance of the inputs
config.update({
    'tr_embeddings': p_tr_emb,
    'val_embeddings': p_val_emb,
    'val_loss': p_val_loss,
})
# the candidate pool the experiments sample from
config['top_samples'] = top_samples

pickle_write(save_root / 'config.pkl', config)

In [18]:
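# Experiments:
# - cluster sampling weight is loss**exp, for exp in (0, .3, .5, 1, 1.5);
#   exp = 0 gives every cluster equal weight
# - crossed with two orderings: shuffle individual samples | shuffle whole cluster groups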

In [22]:
# Build `ntries` random training subsets for every (ordering, exponent) pair.
# Each subset has `nsubset` distinct samples drawn from `top_samples`:
# one guaranteed sample per cluster, the rest allocated to clusters with
# probability proportional to (mean val loss) ** exp.
gp = top_samples.groupby('c')
ntries = 10
nsubset = 100  # samples per generated subset

# Seed both RNGs in use so the generated subsets are reproducible:
# numpy's global state drives `multinomial`, `choice` and pandas `.sample`;
# the `random` module drives `shuffle_by_group`.
seed = 0
random.seed(seed)
np.random.seed(seed)

iexp = 0
for shuffle in ['shuf_smp', 'shuf_gp']:
    for exp in [0, .3, .5, 1, 1.5]:
        # calculate cluster weights (depends only on `exp`, so done once per setting)
        wloss = clusters.loss ** exp
        p = wloss / wloss.sum()

        for itry in range(ntries):
            # take one from each cluster
            first = gp.sample(1)
            new100 = [first]
            nrest = nsubset - len(first)

            # how many extra samples each cluster gets, keyed by cluster id so the
            # pairing with groups does not rely on both iterating in the same order
            nsmp = pd.Series(np.random.multinomial(nrest, p), index=clusters.index)

            # sample from the groups
            for c, g in gp:
                # Exclude the row already taken above; otherwise the same sample
                # could be drawn twice and the subset would contain duplicates.
                pool = g.drop(index=first.index, errors='ignore')
                new100.append(pool.sample(min(int(nsmp.loc[c]), len(pool))))

            new100 = pd.concat(new100)
            # Small clusters may not be able to supply their full quota; top up
            # with random unused samples from the whole pool.
            # np.random.choice raises ValueError if fewer unused samples remain than needed.
            remaining = nsubset - len(new100)
            if remaining > 0:
                taken = set(new100.index)
                unused = [ii for ii in top_samples.index if ii not in taken]
                irest = np.random.choice(unused, remaining, replace=False)
                new100 = pd.concat((new100, top_samples.loc[irest]))

            if shuffle == 'shuf_smp':
                # random order of individual samples
                new100 = new100.sample(frac=1)
            elif shuffle == 'shuf_gp':
                # keep each cluster contiguous, randomise cluster order
                new100 = shuffle_by_group(new100, 'c')
            else:
                raise ValueError(f'unknown shuffle mode: {shuffle}')

            isave_root = save_root / f'{iexp:02d}_exp_{exp}_{shuffle}' / f'{itry:02d}'
            isave_root.mkdir(parents=True, exist_ok=True)
            save_list(new100, isave_root)
        iexp += 1
