In [None]:
import sys 
from pathlib import Path

# Resolve the parent of the working directory and put it on sys.path so the
# local `fge` package can be imported from this notebook.
main_path = Path('..').resolve()
sys.path.append(str(main_path))

from fge import FeatureInteractionTree
import pickle
from collections import defaultdict
import pandas as pd

# Dataset names iterated over by the cells below.
dataset_names = ['adult', 'boston', 'california', 'titanic']

In [None]:
from sklearn.metrics import r2_score, accuracy_score, mean_squared_error

# Maps a dataset's `task_type` to the metric used to score predictions.
# NOTE(review): mean_squared_error is lower-is-better while accuracy_score is
# higher-is-better, so score differences have opposite meaning per task type —
# confirm that downstream gap computations account for this.
objective_dict = {
    'reg': mean_squared_error, 
    'binary': accuracy_score,
}

In [None]:
# Score each dataset's cached EBM on the test split and record the difference
# between the stored model score and the EBM score.
# The first row of `results` is the header used when building the DataFrame.
results = [['ds', 'score', 'ebm_score', 'ebm_gap']]
models_dir = main_path / 'cache' / 'models'
for ds in dataset_names:
    model_file = models_dir / f'{ds}.pickle'
    model_ebm_file = models_dir / f'{ds}_ebm.pickle'
    model_data = pickle.loads(model_file.read_bytes())
    model_ebm_data = pickle.loads(model_ebm_file.read_bytes())

    dataset, score = model_data['dataset'], model_data['score']
    ebm = model_ebm_data['ebm']
    X_test, y_test = dataset.data['X_test'], dataset.data['y_test']

    # Metric is chosen from the dataset's task type ('reg' or 'binary').
    eval_fn = objective_dict[dataset.task_type]
    y_pred = ebm.predict(X_test)
    ebm_score = eval_fn(y_test, y_pred)
    gap = score - ebm_score
    results.append([ds, score, ebm_score, gap])

In [None]:
# Build the score table; the first row of `results` holds the column names.
df = pd.DataFrame(results[1:], columns=results[0])

In [None]:
# Read the saved head-10 experiment table and keep, per dataset, the first
# row's linear and polynomial baseline gaps.
df_results = pd.read_csv('../exp_results_head10.csv')
df_results = df_results.groupby('ds').first().loc[:, ['linear_gap', 'polynomial_gap']].reset_index()

In [None]:
# Attach the baseline gaps, then derive a second table in which every gap
# column is replaced by `score - gap` and renamed without the `_gap` suffix.
df = pd.merge(df, df_results, how='inner', on='ds')
gap_to_score_name = {'ebm_gap': 'ebm', 'linear_gap': 'linear', 'polynomial_gap': 'polynomial'}
df_origin = (
    df.assign(**{gap_col: df['score'] - df[gap_col] for gap_col in gap_to_score_name})
      .rename(columns=gap_to_score_name)
      .drop(columns=['ebm_score'])
)

In [None]:
# Display the merged gap table.
df

In [None]:
# Display the derived score table.
df_origin

In [None]:
# Collect every experiment checkpoint into one flat table (`df_data`) and keep
# the stored trees keyed by "<dataset>_<experiment file stem>".
dict_data = defaultdict(list)
trees = {}
base_res = {}
dataset_names = ['adult', 'boston', 'california', 'titanic']

for ds in dataset_names:
    print(f'Working on {ds}...')

    exp_files = list((main_path / 'checkpoints' / 'exps' / ds).glob('*.pickle'))

    for exp_p in exp_files:
        # `Path.stem` removes exactly the trailing ".pickle". The previous
        # `name.rstrip('.pickle')` stripped any trailing run of the characters
        # '.', 'p', 'i', 'c', 'k', 'l', 'e', which can eat into the name itself.
        exp_name = exp_p.stem

        # File stems have four '-'-separated parts; the last '_'-separated
        # tokens of each part carry the method name and its numeric setting.
        score_method, args_n_search, args_select, args_filter = exp_name.split('-')
        n_search = int(args_n_search.split('_')[-1])
        select_method, select_ratio = args_select.split('_')[-2:]
        filter_method, filter_ratio = args_filter.split('_')[-2:]
        select_ratio = float(select_ratio)
        filter_ratio = float(filter_ratio)

        with exp_p.open('rb') as file:
            data = pickle.load(file)

        # Baseline results are kept once per dataset (first experiment seen).
        if base_res.get(ds) is None:
            base_res[ds] = data['base_results']

        tree = data['tree']
        # assumes data['time'] looks like "<minutes> m, <seconds> s" — TODO confirm
        t = [float(part) for part in data['time'].replace(' m', '').replace(' s', '').split(', ')]
        elapsed_seconds = 60*t[0] + t[1]
        origin_score = data['origin_score']

        dict_data['ds'].append(ds)
        dict_data['score_method'].append(score_method)
        dict_data['n_search'].append(n_search)
        dict_data['select_method'].append(select_method)
        dict_data['select_ratio'].append(select_ratio)
        dict_data['filter_method'].append(filter_method)
        dict_data['filter_ratio'].append(filter_ratio)
        dict_data['time'].append(elapsed_seconds)
        dict_data['tree_gap'].append(tree.gap)
        trees[ds + '_' + exp_name] = tree

        # Each baseline value is indexable; element 0 is its score.
        for k, v in data['base_results'].items():
            dict_data[f'{k}_gap'].append(origin_score - v[0])
        dict_data['origin_score'].append(origin_score)

df_data = pd.DataFrame(dict_data)

In [None]:
# Persist the collected trees (pickle) and the flat experiment table (CSV).
Path('../checkpoints/trees.pickle').write_bytes(pickle.dumps(trees))
df_data.to_csv('../exp_results.csv', index=False)

In [None]:
# Keep, per dataset, the ten experiments with the smallest `tree_gap`.
# `.copy()` detaches the selection from `df_data`, so adding the `exp` column
# below does not raise SettingWithCopyWarning.
df_head10 = df_data.sort_values(['ds', 'tree_gap']).groupby('ds').head(10).copy()

# Rebuild the key under which each tree was stored in `trees`.
# NOTE(review): this relies on str(float) reproducing the ratio text used in
# the checkpoint file names (e.g. '0.5', not '0.50') — confirm.
df_head10['exp'] = (
    df_head10['ds'] + '_' + df_head10['score_method']
    + '-beam_' + df_head10['n_search'].astype(str)
    + '-select_' + df_head10['select_method'].astype(str) + '_' + df_head10['select_ratio'].astype(str)
    + '-filter_' + df_head10['filter_method'].astype(str) + '_' + df_head10['filter_ratio'].astype(str)
)
df_head10 = df_head10.set_index('exp')
df_head10.to_csv('../exp_results_head10.csv')

# Per dataset: a list of (experiment key, tree) pairs, plus the baseline
# results stored under 'base_<dataset>'.
trees_head10 = defaultdict(list)
for ds in dataset_names:
    for k in df_head10.index[df_head10['ds'] == ds]:
        trees_head10[ds].append((k, trees[k]))
    trees_head10[f'base_{ds}'] = base_res[ds]
with open('../checkpoints/trees_head10.pickle', 'wb') as file:
    pickle.dump(trees_head10, file)

In [None]:
# Load the feature-description table for the adult dataset and show its
# columns. The cell previously displayed `df.columns`, i.e. the columns of an
# unrelated frame, while the following cells index `df_desc` by column name.
df_desc = pd.read_csv('../cache/datasets/adult_desc.csv')
df_desc.columns

In [None]:
# Unique features that have at least one of 'encoded' / 'data' / 'detail' filled in.
has_any_detail = df_desc[['encoded', 'data', 'detail']].notna().any(axis=1)
detail_list = df_desc.loc[has_any_detail, 'feature'].drop_duplicates().values
detail_list

In [None]:
# Feature whose description rows are inspected below.
detail = 'Sex'

In [None]:
# Rows describing the chosen feature, with missing cells replaced by ''.
df_detail = df_desc.loc[df_desc['feature'] == detail, ['encoded', 'data', 'detail']].fillna('')
df_detail.to_dict()

In [None]:
import pandas as pd
import numpy as np
import hvplot.pandas

# Dataset order read by the plotting helpers `boxplot` and `q25dist`.
dataset_names = ['adult', 'titanic', 'boston', 'california']
# Reload the flat experiment table from disk; this rebinds `df`.
df = pd.read_csv('../exp_results.csv')

In [None]:
# Summary statistics of `tree_gap` per dataset.
df.groupby('ds')['tree_gap'].describe()

In [None]:
# For each dataset, print the share of runs whose tree_gap is at or above
# that dataset's 75th percentile.
q75_per_ds = df.groupby('ds')['tree_gap'].quantile(0.75)
for ds, thres in q75_per_ds.items():
    df_temp = df.loc[df['ds'] == ds]
    n_at_or_above = (df_temp.loc[:, 'tree_gap'] >= thres).sum()
    p = n_at_or_above / len(df_temp)
    print(f'{ds}: {p*100:.2f}% of data in `{ds}` larger than q75={thres:.4f}')

In [None]:
def boxplot(df, exp_name, target, names=None):
    """Build one box-plot panel per dataset and combine them into one layout.

    Args:
        df: Frame with a 'ds' column plus the `exp_name` and `target` columns.
        exp_name: Column whose values define the boxes within each panel.
        target: Numeric column plotted on the y axis.
        names: Datasets to plot, in order. Defaults to the notebook-level
            `dataset_names`.

    Returns:
        The panels combined with `+=`, one per dataset.

    Raises:
        ValueError: If there are no datasets to plot.
    """
    names = dataset_names if names is None else names
    panels = []
    for ds in names:
        df_temp = df.loc[df['ds'] == ds].sort_values(exp_name)
        # The y range runs from the minimum to the 90th percentile.
        lower = df_temp[target].quantile(0.00)
        upper = df_temp[target].quantile(0.90)
        # Pad by 10% of the magnitude. Using abs() keeps the padding pointing
        # outwards for negative values; `q - q/10` shrank the range for them.
        ylim = (lower - abs(lower)/10, upper + abs(upper)/10)
        panels.append(
            df_temp.hvplot.box(
                y=target, by=exp_name, title=f'{ds}', subplots=True, height=400, width=400
            ).opts(shared_axes=False, ylim=ylim)
        )
    if not panels:
        raise ValueError('boxplot() needs at least one dataset name')

    box_plot = panels[0]
    for panel in panels[1:]:
        box_plot += panel
    return box_plot

def q25dist(df, exp_name, target, names=None):
    """Box-plot the lower quartile of `target` per dataset, grouped by `exp_name`.

    For every dataset only the rows whose `target` is at or below that
    dataset's 25th percentile are plotted; the panel title reports how many
    rows were kept.

    Args:
        df: Frame with a 'ds' column plus the `exp_name` and `target` columns.
        exp_name: Column whose values define the boxes within each panel.
        target: Numeric column plotted on the y axis.
        names: Datasets to plot, in order. Defaults to the notebook-level
            `dataset_names`.

    Returns:
        The panels combined with `+=`, one per dataset.

    Raises:
        ValueError: If there are no datasets to plot.
    """
    names = dataset_names if names is None else names
    panels = []
    for ds in names:
        df_temp = df.loc[(df['ds'] == ds)].sort_values(exp_name)
        df_lower = df_temp.loc[df_temp[target] <= df_temp[target].quantile(0.25)]
        title = f'{ds}-Lower Quartile Distribution: {len(df_lower)}/{len(df_temp)} data'
        panels.append(
            df_lower.hvplot.box(
                y=target, by=exp_name, height=400, width=500, title=title
            ).opts(shared_axes=True, yformatter='%.4f')
        )
    if not panels:
        raise ValueError('q25dist() needs at least one dataset name')

    vio_plot = panels[0]
    for panel in panels[1:]:
        vio_plot += panel
    return vio_plot

In [None]:
# tree_gap grouped by score_method
exp_name = 'score_method'
target = 'tree_gap'
boxplot(df, exp_name, target)

In [None]:
# tree_gap grouped by n_search
exp_name = 'n_search'
target = 'tree_gap'
boxplot(df, exp_name, target)

In [None]:
# tree_gap grouped by select_method
exp_name = 'select_method'
target = 'tree_gap'
boxplot(df, exp_name, target)

In [None]:
# tree_gap grouped by filter_method
exp_name = 'filter_method'
target = 'tree_gap'
boxplot(df, exp_name, target)

In [None]:
# tree_gap grouped by select_ratio
exp_name = 'select_ratio'
target = 'tree_gap'
boxplot(df, exp_name, target)

In [None]:
# run time grouped by select_ratio
exp_name = 'select_ratio'
target = 'time'
boxplot(df, exp_name, target)

In [None]:
# tree_gap grouped by filter_ratio
exp_name = 'filter_ratio'
target = 'tree_gap'
boxplot(df, exp_name, target)

In [None]:
# run time grouped by filter_ratio
exp_name = 'filter_ratio'
target = 'time'
boxplot(df, exp_name, target)

In [None]:
# Inspect the stored reference scores.
df['origin_score']

In [None]:
# Inspect the current dataset order.
dataset_names

In [None]:
# Per-run ratio (tree_gap + origin_score) / time, negated for one group of
# datasets. The column is initialised as float (0.0, not 0): the values
# assigned below are floats, and writing floats into an int64 column triggers
# pandas' incompatible-dtype deprecation.
df['tree_score/time'] = 0.0

# NOTE(review): the grouping pairs 'adult' with 'boston' and 'california' with
# 'titanic'. If the sign flip is meant to separate higher-is-better from
# lower-is-better metrics, confirm each dataset's task type — the pairing
# looks like the first/last two entries of an older `dataset_names` order.
# ['adult', 'boston']
idx = df['ds'].isin(['adult', 'boston'])
df.loc[idx, 'tree_score/time'] = (df['tree_gap'] + df['origin_score']) / df['time']
# ['california', 'titanic']
idx = df['ds'].isin(['california', 'titanic'])
df.loc[idx, 'tree_score/time'] = -1*(df['tree_gap'] + df['origin_score']) / df['time']

In [None]:
# tree_score/time grouped by select_ratio
exp_name = 'select_ratio'
target = 'tree_score/time'
boxplot(df, exp_name, target)

In [None]:
# tree_score/time grouped by filter_ratio
exp_name = 'filter_ratio'
target = 'tree_score/time'
boxplot(df, exp_name, target)

In [None]:
# Rebuild the per-dataset top-10 table (smallest tree_gap first) in memory.
# NOTE(review): this reads `df_data`, which is only defined by the collection
# cell further up — it is not reloaded in this part of the notebook.
# `.copy()` detaches the selection so adding `exp` does not raise
# SettingWithCopyWarning.
df_head10 = df_data.sort_values(['ds', 'tree_gap']).groupby('ds').head(10).copy()
df_head10['exp'] = (
    df_head10['ds'] + '_' + df_head10['score_method']
    + '-beam_' + df_head10['n_search'].astype(str)
    + '-select_' + df_head10['select_method'].astype(str) + '_' + df_head10['select_ratio'].astype(str)
    + '-filter_' + df_head10['filter_method'].astype(str) + '_' + df_head10['filter_ratio'].astype(str)
)
df_head10 = df_head10.set_index('exp')

In [None]:
import sys 
from pathlib import Path

# Resolve the parent of the working directory and put it on sys.path so the
# local `fge` package can be imported.
main_path = Path('..').resolve()
sys.path.append(str(main_path))

from fge import FeatureInteractionTree
import pickle
from collections import defaultdict
import pandas as pd
import numpy as np

from fge.utils import flatten

dataset_names = ['adult', 'boston', 'california', 'titanic']

# Top-10 experiment table, indexed by the experiment key column 'exp'.
df_head10 = pd.read_csv('../exp_results_head10.csv').set_index('exp')

# NOTE(review): the cell that writes trees_head10.pickle saves it under
# '../checkpoints/', while this reads from '../cache/' — confirm which
# directory is current.
with open('../cache/trees_head10.pickle', 'rb') as file:
    trees_head10 = pickle.load(file)

In [None]:
# Export the coefficients of each dataset's top trees in two layouts:
#   coef_<ds>.csv  - one row per experiment, coefficients and names as lists
#   coef2_<ds>.csv - two rows per experiment (names, then coefficients), one
#                    numbered column per coefficient
# Each tree is wrapped once and feeds both tables. The accumulators get their
# own names so the notebook-level `df` is not rebound to a defaultdict.
for ds in dataset_names:
    print(ds)
    coef_rows = defaultdict(list)
    coef_pair_rows = defaultdict(list)
    for exp, raw_tree in trees_head10[ds]:
        fitted_tree = FeatureInteractionTree(raw_tree)
        gap = fitted_tree.root.gap
        coef = fitted_tree.coef.round(6).tolist()
        coef_name = list(fitted_tree.node_name2f_name.values())

        coef_rows['exp'].append(exp)
        coef_rows['gap'].append(gap)
        coef_rows['coef'].append(coef)
        coef_rows['feature(simple)'].append(list(fitted_tree.node_name2f_name.keys()))
        coef_rows['feature(name)'].append(coef_name)

        coef_pair_rows['exp'] += [exp]*2
        coef_pair_rows['gap'] += [gap]*2
        # NOTE(review): every tree of a dataset must yield the same number of
        # coefficients, otherwise the columns end up with unequal lengths and
        # the DataFrame constructor raises — confirm this holds.
        for i, (n, c) in enumerate(zip(coef_name, coef)):
            coef_pair_rows[i] += [n, c]
    pd.DataFrame(coef_rows).to_csv(f'../exp_results/coef_{ds}.csv', index=False)
    pd.DataFrame(coef_pair_rows).to_csv(f'../exp_results/coef2_{ds}.csv', index=False)

In [None]:
from itertools import permutations
# NOTE(review): this star import presumably supplies g_abs,
# g_abs_only_interaction and g_ratio used by load_cache — confirm against
# fge.functions and prefer importing those names explicitly.
from fge.functions import *
from sklearn.metrics import accuracy_score, mean_squared_error

# Neither constant is referenced elsewhere in the visible cells.
TOOLS = ['siv', 'linear', 'tree', 'ebm']
DS_NAMES = ['adult', 'boston', 'california', 'titanic']

def load_cache(dataset_names, cache_path: Path, exp_idx: int = 0):
    """Load, per dataset, the objects needed to compare the explanation tools.

    Reads `trees_head10.pickle` and `models/<ds>.pickle` below `cache_path`.

    Args:
        dataset_names: Dataset names to load.
        cache_path: Directory holding `trees_head10.pickle` and `models/`.
        exp_idx: Position of the experiment to take from each dataset's list
            in `trees_head10`. Defaults to 0 (the first entry).

    Returns:
        A dict keyed by dataset name. Each value is a dict with the keys
        'siv', 'linear', 'tree', 'ebm', 'metric', 'set-pre' and 'set-post'.

    Raises:
        KeyError: If the score method encoded in the experiment key is not one
            of 'g_abs', 'g_abs_interaction' or 'g_ratio'.
    """
    score_methods = {
        'g_abs': g_abs,
        'g_abs_interaction': g_abs_only_interaction,
        'g_ratio': g_ratio,
    }
    task_dict = {
        'reg': mean_squared_error,
        'binary': accuracy_score,
    }
    res = defaultdict(dict)

    # NOTE(review): pickle.load executes code embedded in the file; only load
    # caches produced by this project.
    with open(cache_path / 'trees_head10.pickle', 'rb') as file:
        trees_head10 = pickle.load(file)

    for ds in dataset_names:
        with open(cache_path / f'models/{ds}.pickle', 'rb') as file:
            data = pickle.load(file)

        exp, tree = trees_head10[ds][exp_idx]
        dataset = data['dataset']

        # Experiment keys look like "<ds>_<score_method>-...". Remove the
        # dataset prefix by slicing: `str.lstrip(f'{ds}_')` strips any leading
        # run of those characters and can eat into the score-method name.
        score_key = exp.split('-')[0]
        prefix = f'{ds}_'
        if score_key.startswith(prefix):
            score_key = score_key[len(prefix):]
        if score_key not in score_methods:
            raise KeyError(f'unknown score method {score_key!r} in experiment {exp!r}')
        score_fn = score_methods[score_key]

        # tools
        res[ds]['siv'] = score_fn(data['siv'])
        base_res = trees_head10[f'base_{ds}']
        res[ds]['linear'] = base_res['linear']
        res[ds]['tree'] = FeatureInteractionTree(tree)
        res[ds]['ebm'] = 'EBM'
        res[ds]['metric'] = task_dict[dataset.task_type]

        # dataset: two consecutive 16-row slices of the test split
        X = dataset.data['X_test']
        y = dataset.data['y_test']
        res[ds]['set-pre'] = (X.iloc[:16].reset_index(drop=True), y[:16])
        res[ds]['set-post'] = (X.iloc[16:32].reset_index(drop=True), y[16:32])

    return res

In [None]:
# Load the comparison cache for all four datasets.
dataset_names = ['adult', 'boston', 'california', 'titanic']
cache_path = Path('../cache/')
cache = load_cache(dataset_names, cache_path)

In [None]:
# Work with the adult dataset's entry.
ds_cache = cache['adult']

In [None]:
# The 'linear' entry unpacks into a (score, model) pair.
tool = 'linear'
score, model = ds_cache[tool]

In [None]:
# Dimensionality of the intercept of the model's last step.
model[-1].intercept_.ndim

In [None]:
# Wrap the tree at position `index` of the first dataset only; the loop
# breaks after its first iteration.
index = 0
for ds in dataset_names:
    tree = FeatureInteractionTree(trees_head10[ds][index][1])
    break

In [None]:
# Feature names attached to the tree's nodes.
list(tree.node_name2f_name.values())

In [None]:
# Render the tree's coefficients as a LaTeX display equation
# "Y = c0 x name0 + c1 x name1 ...". The `+` format flag writes an explicit
# sign on every term; without it positive terms were concatenated with no
# operator between them, which reads as a product.
terms = [
    f' {c:+.4f}\\times \\text{{{n}}}'
    for c, n in zip(tree.coef, list(tree.node_name2f_name.values()))
]
s = '$$Y=' + ''.join(terms) + '$$'

In [None]:
import pygraphviz
from anytree import RenderTree, LevelGroupOrderIter

In [None]:
# Take the first stored (experiment key, tree) pair of the adult dataset.
# `tree` stays the raw stored object; it is only wrapped for display here.
exp, tree = trees_head10['adult'][0]
# tree = FeatureInteractionTree(tree)
FeatureInteractionTree(tree)

In [None]:
# Build the figure through the wrapper's own plotly renderer.
fig = FeatureInteractionTree(tree).show_tree(typ='plotly')

In [None]:
# Save the figure as a standalone HTML file in the working directory.
fig.write_html('test.html')

In [None]:
# Coefficients of the last step of the root node's model; a 2-D array is
# reduced to its first row.
coef = tree.root.model[-1].coef_
coef = coef[0] if coef.ndim == 2 else coef
# Mapping returned by the model's second step; integer keys are counted as
# original features below. NOTE(review): presumably the remaining keys are
# nested feature combinations — confirm against the transformer.
feature_names = tree.root.model[1].get_feature_names()
n_origin_features = len(tuple(filter(lambda x: isinstance(x, int), feature_names.keys())))
# Node name -> index into `coef`. Names containing '+' are placed after the
# original features using the node's `k`; other names are used as integers.
combs2index = {}
for *_, node in RenderTree(tree.root):
    coef_idx = node.k + n_origin_features - 1 if '+' in node.name else int(node.name) 
    combs2index[node.name] = coef_idx
# Node name -> display name: integer keys become their string form, other keys
# are flattened and joined with '+', and their values are joined with '*'.
node_name2f_name = dict([(str(k), v) if isinstance(k, int) else ('+'.join(map(str, flatten(k))), '*'.join(map(str, flatten(v)))) for k, v in feature_names.items()])

In [None]:
def _get_fs_str(node):
    """Return a short display string for the features attached to `node`.

    The node's entry in `node_name2f_name` is split on '*'. With more than two
    parts only the first and last are shown around '...'; otherwise the parts
    are joined with ' * '.
    """
    parts = node_name2f_name[node.name].split('*')
    if len(parts) > 2:
        return f'{parts[0]} * ... * {parts[-1]}'
    return ' * '.join(parts)

def _fmt(fs_str, coef):
    s = '< '
    s+=f''s+=f''# if node.interaction != 0.0:#     children_interaction = np.sum([child.interaction for child in node.children])#     s += f''# if node.gap is not None:#     s += f''returns+'
{fs_str}
coef = {coef:.4f}
interaction={node.interaction - children_interaction:.4f}
gap={node.gap:.6f}
 >'

def _get_coef(node):
    """Return the coefficient for `node`, found via `combs2index[node.name]`."""
    return coef[combs2index[node.name]]

def _get_node_color_key(node):
    """Return the sign of the node's coefficient as a style key.

    -1 for a negative coefficient, 1 for a positive one, 0 otherwise.
    """
    value = _get_coef(node)
    # int() keeps the result a plain Python int whatever type `value` has.
    return int(value > 0.0) - int(value < 0.0)

In [None]:
# Named colours used by the graph styles below.
colors = {
    'blue': '#85a8ed', 'red': '#ed8585', 'black': '#000000', 'green': '#9ced85'
}
# Style key -> colour name: 1 uses red, 0 black, -1 blue.
_key_to_color = {1: 'red', 0: 'black', -1: 'blue'}
# Graphviz attributes per style key; only the colour differs between keys.
show_kwargs = {
    'node': {
        key: {'fontname': 'Arial', 'fontsize': 12, 'color': colors[name], 'shape': 'box'}
        for key, name in _key_to_color.items()
    },
    'edge': {
        key: {'color': colors[name], 'arrowsize': 0.5, 'headclip': True, 'tailclip': True}
        for key, name in _key_to_color.items()
    },
}

In [None]:
# Build a directed Graphviz graph with one node per tree node and one edge
# from each node to its parent.
G = pygraphviz.AGraph(directed=True)
G.graph_attr['rankdir'] = 'BT'
G.graph_attr["ordering"] = 'out'
# NOTE(review): this layout call runs before any node has been added.
G.layout(prog='neato')

# Every edge uses the style stored under key 0.
neutral_edge_style = show_kwargs['edge'][0]
for *_, node in RenderTree(tree.root):
    node_id = combs2index.get(node.name)
    fs_str = _get_fs_str(node)
    c = _get_coef(node)
    key = _get_node_color_key(node)
    # The coefficient and feature string are also stored as node attributes
    # so they can be read back from the graph later.
    G.add_node(
        node_id,
        label=_fmt(fs_str, c), coef=c, fs_str=fs_str,
        **show_kwargs['node'][key]
    )
    if node.parent is None:
        continue
    G.add_edge(node_id, combs2index.get(node.parent.name), **neutral_edge_style)
    # G.add_subgraph([node for node in G.nodes() if '*' not in node], rank='same')

In [None]:
from io import BytesIO
from PIL import Image as PILImage

In [None]:
# Render the graph with the 'dot' layout program into an in-memory PNG and
# open it with PIL for inline display.
imgbuf = BytesIO()
G.draw(imgbuf, format='png', prog='dot')
img = PILImage.open(imgbuf)
img

In [None]:
import networkx as nx

In [None]:
# Level index -> names of the nodes on that level, in level order from the root.
depth2node = {
    level: [member.name for member in group]
    for level, group in enumerate(LevelGroupOrderIter(tree.root))
}
# Number of levels in the tree.
depth = len(depth2node)
# Inverse lookup: node name -> level index.
node2depth = {
    name: level
    for level, names in depth2node.items()
    for name in names
}
node2depth

In [None]:
# Convert the graph's edges to integer pairs, rebuild them as a networkx
# DiGraph and compute node positions with the 'dot' layout program.
edges = [tuple(map(int, e)) for e in G.edges()]
G_nx = nx.from_edgelist(edgelist=edges, create_using=nx.DiGraph)
pos = nx.nx_agraph.pygraphviz_layout(G_nx, prog='dot')
print(pos)

In [None]:
# Quick preview with every coordinate negated.
nx.draw(G_nx, {n: tuple(map(lambda x: -x, c)) for n, c in pos.items()})

In [None]:
# Node id -> [x, y, attributes], where the attributes (coefficient, feature
# string, colour) are read back from the Graphviz node.
position_info = {}
for node, coor in pos.items():
    gv_attr = G.get_node(node).attr
    attr = dict(
        weight=float(gv_attr['coef']),
        name=gv_attr['fs_str'],
        color=gv_attr['color'],
    )
    position_info[node] = [*coor, attr]
position_info

In [None]:
# Node coordinates; y is plotted as 2*depth - y.
Xn = [info[0] for info in position_info.values()]
Yn = [2*depth - info[1] for info in position_info.values()]
# Edge polylines: a None after each segment breaks the line between edges.
Xe = []
Ye = []
for src, dst in edges:
    Xe.extend([position_info[src][0], position_info[dst][0], None])
    Ye.extend([2*depth - position_info[src][1], 2*depth - position_info[dst][1], None])

In [None]:
import plotly.graph_objects as go

# One trace draws all edges as grey line segments.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=Xe,
        y=Ye,
        mode='lines',
        name='edges',
        line=dict(color='rgb(200,200,200)', width=1.5),
        hoverinfo='none',
        showlegend=False
    )
)

# Size of the rectangle drawn for every node, in layout units.
width = 30
height = 20

# Corner coordinates of a closed rectangle centred on the given position.
rec_fn_x = lambda x_pos: [x_pos-width/2, x_pos+width/2, x_pos+width/2, x_pos-width/2, x_pos-width/2]
rec_fn_y = lambda y_pos: [y_pos-height/2, y_pos-height/2, y_pos+height/2, y_pos+height/2, y_pos-height/2]

# One filled rectangle per node, coloured by the node's stored colour.
for n_id, x_coor, y_coor in zip(position_info.keys(), Xn, Yn):
    attr = position_info[n_id][-1]
    fig.add_trace(
        go.Scatter(
            x=rec_fn_x(x_coor),
            y=rec_fn_y(y_coor),
            mode='lines',
            line=dict(color='rgb(255,255,255)', width=1),
            name=attr['name'],
            fill='toself',
            fillcolor=attr['color'],
            # Hover text: name and coefficient on separate lines. The literal
            # was split over two source lines (a syntax error) because the
            # `<br>` separator had been lost; it is restored here.
            text=f"{attr['name']}<br>{attr['weight']:.4f}",
            # hovertext='%{name}<br><br>coef: %{text}',
            hoverinfo='text',
            showlegend=False
        )
    )

In [None]:
def make_annotations(pos, font_size=10, font_color='rgb(0,0,0)'):
    """Build one plotly text annotation per node.

    Args:
        pos: Node layout, either a mapping of node id to `[x, y, attrs]` or a
            sequence of such entries. `attrs['name']` is the annotation text.
            Mapping keys may be any ids; entries are read with `.values()`
            instead of being indexed by 0..len(pos)-1, which failed for ids
            that are not consecutive.
        font_size: Font size of the annotation text.
        font_color: Font colour of the annotation text.

    Returns:
        A list of annotation dicts. y is placed at `2*depth - y`, using the
        notebook-level `depth`.
    """
    entries = pos.values() if hasattr(pos, 'values') else pos
    annotations = []
    for entry in entries:
        annotations.append(
            dict(
                text=entry[-1]['name'],
                x=entry[0], y=2*depth-entry[1],
                xref='x1', yref='y1',
                font=dict(color=font_color, size=font_size),
                showarrow=False
            )
        )
    return annotations

In [None]:
# Preview a shortened label for node 22: the ' * ... * ' separator becomes
# '~' followed by a line break. The literal was split over two source lines
# because the `<br>` tag had been lost; it is restored here.
# NOTE(review): the node id 22 is hard-coded — confirm it exists in `position_info`.
position_info[22][-1]['name'].replace('* ... *', '~<br>')

In [None]:
# Shared axis settings: hide the axis line, zero line, grid and tick labels.
axis = {
    'showline': False,
    'zeroline': False,
    'showgrid': False,
    'showticklabels': False,
}

# Title, node labels and overall look of the figure.
layout_options = dict(
    title='Tree',
    annotations=make_annotations(position_info),
    font_size=10,
    showlegend=False,
    xaxis=axis,
    yaxis=axis,
    margin=dict(l=40, r=40, b=25, t=50),
    hovermode='closest',
    plot_bgcolor='rgb(248,248,248)',
)
fig.update_layout(**layout_options)
fig.show()