In [None]:
import argparse
import os
import sys
import math

import torch
from torch import nn
from torch import optim

import numpy as np
import pandas as pd
from tqdm import tqdm
import json


## visualization: import pyplot and switch matplotlib to the 'ggplot' style sheet
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [None]:
b = 3
def predict(x, a, mu=0):
    '''
    Soft membership score that equals 1 at the center and decays with distance.

    x  - torch tensor with shape [n_data_points, n_features]
    a  - torch tensor with shape [n_features]; only its absolute value is used
    mu - center subtracted from x before scaling (defaults to 0)

    Returns a tensor with one score per data point:
        1 / (1 + sum_k (|a_k| * |x_k - mu_k|) ** b)
    where the sum runs over dim 1 and b is the module-level exponent.
    '''
    offset = torch.abs(x - mu)
    scaled = torch.abs(a) * offset
    penalty = torch.pow(scaled, b).sum(dim=1)
    return 1 / (1 + penalty)

## test UMAP-inspired predict function
# n = 100
# x = torch.linspace(-3,3,n).view(n,1)
# a = torch.tensor(0.5)
# plt.plot(x, predict(x, a))


def predicate(x0, selected, n_iter=3000):
    '''
        Fit a per-feature interval ("bounding box") description of a selection.

        x0 - data points, numpy array with shape [n_data_points, n_features]
        selected - selection flags, one per data point; a boolean array or
                   a 0/1 array (it is coerced to a boolean mask)
        n_iter - number of SGD steps (0 skips training)

        Returns a list of dicts, one per retained feature:
            dim      - feature index
            interval - [low, high] in the units of x0, clipped to the data extent
        A feature is dropped when its fitted interval extends strictly beyond
        the data extent on both sides (i.e. it does not constrain the data).
    '''
    # prepare training data
    x = torch.from_numpy(x0.astype(np.float32))
    # Coerce to a true boolean mask. Indexing a tensor with a 0/1 *integer*
    # array performs integer indexing (it gathers rows 0 and 1) instead of
    # masking, which would silently give a wrong mu_init.
    mask = np.asarray(selected).astype(bool)
    label = torch.from_numpy(mask).float()

    # normalize
    mean = x.mean(0)
    scale = x.std(0) + 0.1  # offset keeps the division safe for constant features
    x = (x - mean) / scale

    bce = nn.BCELoss()

    ## since data is normalized,
    ## mu can be initialized around the mean of the positive examples
    ## a can be initialized around a constant across all axes
    mu_init = x[torch.from_numpy(mask)].mean(0)
    a_init = 0.4

    # small uniform jitter in [-0.1, 0.1] around the initial values
    a = (a_init + 0.1*(2*torch.rand(x.shape[1])-1)).requires_grad_(True)
    mu = mu_init + 0.1 * (2*torch.rand(x.shape[1]) - 1)
    mu.requires_grad_(True)
    optimizer = optim.SGD([
        {'params': mu, 'weight_decay': 0},
        {'params': a, 'weight_decay': 0.01} ## smaller a encourages larger reach of the bounding box
    ], lr=1e-2, momentum=0.9)

    # report the loss about five times; max(1, ...) avoids a modulo-by-zero
    # when n_iter < 5
    log_every = max(1, n_iter // 5)
    for e in range(n_iter):
        pred = predict(x, a, mu)
        l = bce(pred, label)
        optimizer.zero_grad()
        l.backward()
        optimizer.step()
        if e % log_every == 0:
            print('loss', l.item())
    a.detach_()
    mu.detach_()

    # Re-evaluate with the final parameters: the `pred` left over from the loop
    # predates the last optimizer step and does not exist when n_iter == 0.
    pred = predict(x, a, mu)

    ## range of the bounding box, defined by the level set of prediction=0.5
    r = 1 / a.abs()

    print(
        'accuracy',
        ((pred > 0.5).float() == label).float().sum().item(),
        '/', mask.shape[0])

    ## original data extent
    vmin = x0.min(0)
    vmax = x0.max(0)

    ## predicate clause selection
    predicates = []
    for k in range(mu.shape[0]):

        ## denormalize
        r_k = (r[k] * scale[k]).item()
        mu_k = (mu[k] * scale[k] + mean[k]).item()
        ci = ((mu_k - r_k), (mu_k + r_k))

        ## feature selection based on extent range:
        ## skip features whose interval covers the whole data extent
        should_include = not (ci[0] < vmin[k] and ci[1] > vmax[k])
        if should_include:
            # plain Python floats keep the result uniform (no numpy scalars)
            predicates.append(dict(
                dim=k,
                interval=[
                    float(max(ci[0], vmin[k])),
                    float(min(ci[1], vmax[k])),
                ],
            ))
    return predicates




In [None]:
# ## test, 2D dataset
# data = np.random.rand(150,2)

# target = (
#     (0.3<data[:,0])*(data[:,0]<0.6)
#     *(0<data[:,1])*(data[:,1]<0.5)
# ).astype(np.int64)
# target = ((0.3<data[:,0])*(data[:,0]<0.6)).astype(np.int64)

# for i in range(10):
#     p = predicate(data, target)
#     display(p)