In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import os
import scipy.sparse as sparse
from test import *

from itertools import chain
from pathlib import *
from math import ceil
from sqlalchemy import create_engine
from pandarallel import pandarallel

from test import *
from helper import *
from tree_weight import *
from time import time
from main import make_graph_from_tree
from multiprocessing import *

from typing import List
%matplotlib inline

In [None]:
# Start pandarallel with one worker fewer than the number of CPUs reported
# by multiprocessing.cpu_count().  The count is clamped to at least 1 so a
# single-core host does not ask for zero workers.
pandarallel.initialize(nb_workers=max(1, cpu_count() - 1))

In [15]:
# Paths used by this notebook (image_folder is presumably where figures
# are written -- it is not used in the visible cells).
image_folder = 'graphs/'
tree = 'benchmark_models/grid-colouring/trees/4_8.sqlite'

# Load the 'info' and 'nodes' tables through the project helper `to_df`
# and key both of them by NodeID.
info_df, nodes_df = (
    to_df(tree, table_name).set_index('NodeID')
    for table_name in ('info', 'nodes')
)

# Rows whose Status is anything other than 3.
valid_df = nodes_df.loc[nodes_df['Status'] != 3]

print(nodes_df.shape)

(322477, 12)


In [None]:
def make_dfs_ordering_new(nodes_df: pd.DataFrame, root: int = 0) -> list:
    """
    Return list of nodeids in the order they were entered by
    the depth-first search algorithm.

    Rows with ``Status == 3`` are ignored.  Among siblings, the child with
    the smallest ``Alternative`` value is entered first.

    Parameters
    ----------
    nodes_df : pd.DataFrame
        Frame indexed by node id with at least the columns ``ParentID``,
        ``Alternative`` and ``Status``.
    root : int, default 0
        Node id at which the traversal starts.

    Returns
    -------
    list
        Node ids in depth-first entry order, starting with ``root``.

    Raises
    ------
    AssertionError
        If the traversal does not reach exactly the rows of ``nodes_df``
        whose ``Status`` is not 3.
    """
    valid_df = nodes_df[nodes_df['Status'] != 3]

    # Build the parent -> children lookup in a single pass instead of
    # re-scanning the whole frame for every visited node (which made the
    # traversal quadratic in the number of nodes).  Sorting by descending
    # Alternative first means each child list can be consumed with pop(),
    # yielding the lowest Alternative first.  A stable sort keeps ties in
    # a deterministic order.
    by_alternative = valid_df.sort_values('Alternative', ascending=False,
                                          kind='mergesort')
    children_of = {}
    for node_id, parent_id in zip(by_alternative.index.to_list(),
                                  by_alternative['ParentID'].to_list()):
        children_of.setdefault(parent_id, []).append(node_id)

    dfs_ordering = [root]
    # Stack of "children still to visit" lists, one per node on the
    # current root-to-node path.
    boundary = [children_of.pop(root, [])]

    # run simulated dfs on tree
    while boundary:
        if not boundary[-1]:
            boundary.pop()
            continue

        nxt = boundary[-1].pop()
        dfs_ordering.append(nxt)
        # pop() hands each child list out exactly once.
        boundary.append(children_of.pop(nxt, []))

    assert set(dfs_ordering) == set(valid_df.index), \
        'DFS from the root did not reach exactly the nodes with Status != 3'
    return dfs_ordering

In [None]:
def calculate_subtree_size_new(nodes_df):
    """
    Compute, for every node whose ``Status`` is not 3, the number of such
    nodes in its subtree (the node itself included, so a leaf has size 1).

    Sizes are accumulated bottom-up: a node is folded into its parent only
    once all of its own children have been folded into it, so every parent
    receives contributions from children at any depth.  The input frame is
    not modified.

    Parameters
    ----------
    nodes_df : pd.DataFrame
        Frame indexed by node id with at least the columns ``ParentID`` and
        ``Status``.  The ``ParentID`` links are assumed to form a tree (or
        forest); a ``ParentID`` that is not a retained node id marks a root.

    Returns
    -------
    pd.Series
        Integer series named ``SubtreeSize`` indexed like the retained
        (``Status != 3``) rows of ``nodes_df``.
    """
    valid_df = nodes_df[nodes_df['Status'] != 3]
    node_ids = valid_df.index.to_list()

    # Every node counts itself.
    sizes = dict.fromkeys(node_ids, 1)

    # Keep only links whose parent is itself a retained node; this drops
    # the root's sentinel parent id and any self-reference.
    parent_of = {
        node: parent
        for node, parent in zip(node_ids, valid_df['ParentID'].to_list())
        if parent in sizes and parent != node
    }

    # Number of children that still have to report their size to a node.
    pending = dict.fromkeys(node_ids, 0)
    for parent in parent_of.values():
        pending[parent] += 1

    # Nodes without retained children are complete from the start.
    ready = [node for node in node_ids if pending[node] == 0]
    while ready:
        node = ready.pop()
        parent = parent_of.get(node)
        if parent is None:
            continue
        sizes[parent] += sizes[node]
        pending[parent] -= 1
        if pending[parent] == 0:
            ready.append(parent)

    return pd.Series([sizes[node] for node in node_ids],
                     index=valid_df.index, name='SubtreeSize', dtype='int64')

In [18]:
# Bottom-up computation of SubtreeSize (node itself + all descendants) for
# every node with Status != 3; nodes with Status == 3 end up with size 0.
valid_df = nodes_df[nodes_df['Status'] != 3].copy()
valid_df['SubtreeSize'] = np.nan
valid_df['HasNotSubtreeSize'] = True

start = valid_df[valid_df['Status'].isin({0, 1})] # start with leaves
valid_df.loc[start.index, 'SubtreeSize'] = 1
valid_df.loc[start.index, 'HasNotSubtreeSize'] = False

while valid_df['HasNotSubtreeSize'].any():
    # candidate parents: parents of the nodes sized in the previous round
    parent_idx = np.unique(valid_df.loc[start.index, 'ParentID'].values)
    children = valid_df[valid_df['ParentID'].isin(parent_idx)]
    # filter out parent with unexplored children
    # (per parent: number of children that have no subtree size yet)
    unsized_children = children.groupby('ParentID')['HasNotSubtreeSize'].sum()
    parent_idx = unsized_children[unsized_children == 0].index
    # ignore parent ids that are not nodes themselves (e.g. the root's parent)
    parent_idx = parent_idx.intersection(valid_df.index)
    if parent_idx.empty:
        # No parent became ready although nodes are still unsized: without
        # this guard the loop would spin forever.
        raise RuntimeError(
            'SubtreeSize propagation stalled: '
            f"{int(valid_df['HasNotSubtreeSize'].sum())} node(s) cannot be reached from the leaves"
        )
    child_sizes = children[children['ParentID'].isin(parent_idx)]\
                    .groupby('ParentID')['SubtreeSize'].sum()
    valid_df.loc[parent_idx, 'SubtreeSize'] = 1 + child_sizes
    valid_df.loc[parent_idx, 'HasNotSubtreeSize'] = False
    start = valid_df.loc[parent_idx, :]

# Plain column assignment (rather than .loc[:, col] = ...) so the column is
# replaced and really ends up with an integer dtype.
nodes_df['SubtreeSize'] = (
    valid_df['SubtreeSize'].reindex(nodes_df.index).fillna(0).astype(int)
)

In [17]:
# Snapshot of the current SubtreeSize column, kept for comparison against a
# later recomputation.  .copy() is required: without it `orig` can share
# data with nodes_df and silently follow in-place updates of the column.
orig = nodes_df['SubtreeSize'].copy()

In [21]:
# Number of nodes whose current SubtreeSize differs from the saved `orig`
# values; 0 means both computations agree.
(nodes_df['SubtreeSize'] != orig).sum()

0

In [None]:
# Sanity check: shift random child weights so that the weights of all
# children of one parent sum to k.
k = 0.9
res_df = valid_df.iloc[1:, :].copy()  # skip the first row (presumably the root -- TODO confirm)
res_df['Weight'] = np.random.random(res_df.shape[0])

# Select the Weight column *before* aggregating so the other columns
# (possibly non-numeric) are never aggregated.
sibling_weights = res_df.groupby('ParentID')['Weight']
parent_mean = sibling_weights.mean()
parent_count = sibling_weights.count()

# Broadcast the per-parent statistics back onto the children.
res_df['Mean'] = res_df['ParentID'].map(parent_mean)
res_df['Count'] = res_df['ParentID'].map(parent_count)

# For n siblings: sum(w - mean + k/n) = sum(w) - n*mean + k = k.
res_df['Weight'] = res_df['Weight'] - res_df['Mean'] + k / res_df['Count']
j = res_df.groupby('ParentID')['Weight'].sum()
# Parents whose children do not sum to k (expected to be empty).
j[abs(j - k) > 1e-10]

In [None]:
# Per-node weight computed from the size of the parent's domain.
# NOTE(review): `parent_domain_size` and the 'HasUnequalSplit' column are not
# defined in the visible part of the notebook -- this cell only runs if an
# earlier (hidden or deleted) cell created them.
# Re-index by ParentID (dropping the first row) so that the assignment of
# `parent_domain_size` below aligns on the parent id.
weights = valid_df[['ParentID', 'HasUnequalSplit']].reset_index().set_index('ParentID').iloc[1:,:]
weights.loc[:, 'ParentDomainSize'] = parent_domain_size
weights = weights.reset_index().set_index('NodeID')
# With D = ParentDomainSize: weight is 1/D when HasUnequalSplit is 0/False
# and 1 - 1/D when it is 1/True (assumes HasUnequalSplit is 0/1 -- TODO confirm).
weights['Weight'] = 1 / weights['ParentDomainSize'] + weights['HasUnequalSplit'] * (1 - 2 / weights['ParentDomainSize'])
# NOTE(review): `columns` is not defined in the visible part of the notebook,
# and the result of drop() is not assigned, so `weights` itself is unchanged
# by this line -- confirm which columns were meant to be removed.
weights.drop(columns)

In [None]:
# Display the `weights` frame.
weights

In [None]:
# Display `domains`.
# NOTE(review): `domains` is not created in the visible part of the notebook.
domains

In [None]:
# For every row except the first, look up the row's `label` inside its
# `Info` entry and record the length of what is found there.  The rows are
# processed with pandarallel's parallel_apply.
rows_after_first = domains.iloc[1:, :]
nodeSplitVarDomain = rows_after_first.parallel_apply(
    lambda row: len(row['Info'][row['label']]),
    axis=1,
)

In [None]:
# Add a 'parentLabel' column produced by the project helper
# get_parent_column (presumably each node's parent's 'label' -- TODO confirm).
domains['parentLabel'] = get_parent_column('label', domains)

In [None]:
# Display `domains` again, now including the 'parentLabel' column.
domains

In [None]:
# Print the (rows, columns) shape of the info table.
print(info_df.shape)