In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load Dataset

In [2]:
# Dataset identifier handed to the loader; later cells also use it for the
# plot title and for the output file names.
dataset = 'twitter'

from data.loader import DataLoader

dl = DataLoader()

# load_data returns six values, unpacked in this order: the three
# *_primitive_matrix objects (train, val, test) followed by the three
# *_ground objects (train, val, test).
(train_primitive_matrix,
 val_primitive_matrix,
 test_primitive_matrix,
 train_ground,
 val_ground,
 test_ground) = dl.load_data(dataset=dataset)

## Synthesis + Verification
Generate functions based on feedback from the verifier (vary cardinality)

In [3]:
import ray

# Start Ray for this notebook session. The call stays the last expression of
# the cell so that the notebook displays the value it returns.
ray.init(
    num_cpus=100,
    redirect_output=True,
)

Process STDOUT and STDERR is being redirected to /tmp/raylogs/.
Waiting for redis server at 127.0.0.1:13476 to respond...
Waiting for redis server at 127.0.0.1:64226 to respond...
Starting local scheduler with the following resources: {'GPU': 0, 'CPU': 100}.

View the web UI at http://localhost:8888/notebooks/ray_ui81934.ipynb?token=a3ec10195c91f5ae32132108c80ecc5e88f495dd220cbf47



{'local_scheduler_socket_names': ['/tmp/scheduler17109852'],
 'node_ip_address': '127.0.0.1',
 'object_store_addresses': [ObjectStoreAddress(name='/tmp/plasma_store94918002', manager_name='/tmp/plasma_manager43403031', manager_port=46347)],
 'redis_address': '127.0.0.1:13476',
 'webui_url': 'http://localhost:8888/notebooks/ray_ui81934.ipynb?token=a3ec10195c91f5ae32132108c80ecc5e88f495dd220cbf47'}

In [4]:
import time
start = time.time()  # reference point for the elapsed time printed at the end

from program_synthesis.heuristic_generator import HeuristicGenerator

# One value is appended to each of these lists per loop iteration.
validation_accuracy, training_accuracy = [], []
validation_coverage, training_coverage = [], []
training_marginals = []

# Feedback indices handed to the synthesizer; None for the first iteration.
idx = None

hg = HeuristicGenerator(
    train_primitive_matrix, val_primitive_matrix,
    val_ground, train_ground,
    b=0.35,
)

for i in range(3, 20):
    # The first iteration keeps three heuristics, every later one keeps one.
    hg.run_synthesizer(max_cardinality=1, idx=idx,
                       keep=(3 if i == 3 else 1), model='dt')
    hg.run_verifier()

    # Record this iteration's metrics and the verifier's training marginals.
    va, ta, vc, tc = hg.evaluate()
    validation_accuracy.append(va)
    training_accuracy.append(ta)
    validation_coverage.append(vc)
    training_coverage.append(tc)
    training_marginals.append(hg.vf.train_marginals)

    # Refresh the feedback indices used by the next synthesizer call.
    hg.find_feedback()
    idx = hg.feedback_idx

    # Stop early once the feedback set is an empty list.
    # NOTE(review): this comparison assumes feedback_idx is a plain list;
    # confirm it is not a numpy array, for which `== []` behaves differently.
    if idx == []:
        break

print('Time Elapsed:  ' + str(time.time() - start))

After get:  79.0114560127
After get:  0.290719032288
After get:  0.328837156296
After get:  0.324117898941
After get:  0.28581905365
After get:  0.28139090538
After get:  0.305010080338
After get:  0.300611972809
After get:  0.326390981674
After get:  0.295460939407
After get:  0.276116132736
After get:  0.275840997696
After get:  0.319666862488
After get:  0.324122190475
After get:  0.325244903564
After get:  0.309617042542
After get:  0.279580116272
Time Elapsed:  150.142956018


### Visualize Heuristic Behavior

In [None]:
# Visualize how accuracy and coverage evolve as heuristics are added.
#
# The x axis is derived from the number of recorded iterations instead of the
# hard-coded range(3, 20): the synthesis loop may break early, in which case
# the metric lists hold fewer than 17 entries and plotting them against a
# fixed 17-point range raises a length-mismatch error.
# The first recorded iteration corresponds to 3 heuristics (loop starts at 3).
heuristic_counts = list(range(3, 3 + len(validation_accuracy)))

plt.figure(figsize=(12,4))

# Left panel: solid lines are accuracy, dashed lines are coverage.
# The legend labels apply to the first two (solid) lines.
plt.subplot(1,2,1)
plt.plot(heuristic_counts, validation_accuracy, color='C0')
plt.plot(heuristic_counts, training_accuracy, color='C1')
plt.plot(heuristic_counts, validation_coverage, color='C0', linestyle='--')
plt.plot(heuristic_counts, training_coverage, color='C1', linestyle='--')
plt.legend(['Validation Set', 'Training Set'])
plt.xlabel('Number of Heuristics')
plt.ylabel('Accuracy and Coverage')
#plt.ylim([0.55,1.0])

# Right panel: blended score accuracy*coverage + 0.5*(1 - coverage), i.e. the
# uncovered fraction is credited with 0.5.
# NOTE(review): the y label below is the same as on the left panel although a
# single blended score is plotted here -- confirm whether that is intended.
plt.subplot(1,2,2)
val_blend = [(a*b) + (0.5*(1-b)) for a,b in zip(validation_accuracy,validation_coverage)]
plt.plot(heuristic_counts, val_blend, color='C0')
train_blend = [(a*b) + (0.5*(1-b)) for a,b in zip(training_accuracy,training_coverage)]
plt.plot(heuristic_counts, train_blend, color='C1')
plt.legend(['Validation Set', 'Training Set'])
plt.xlabel('Number of Heuristics')
plt.ylabel('Accuracy and Coverage')
#plt.ylim([0.55,1.0])
plt.suptitle('Dataset: ' + dataset)

In [None]:
# The first recorded iteration is skipped when picking the best one.
later_train_accuracy = training_accuracy[1:]
best_position = np.argmax(later_train_accuracy)

print("Program Synthesis Train Accuracy:  " + str(np.max(later_train_accuracy)))
print("Program Synthesis Train Coverage:  " + str(training_coverage[1:][best_position]))
# Position 0 of the sliced list is the second loop iteration (i == 4), hence +4.
print("Number of Heuristics:  " + str(best_position + 4))

# Best validation accuracy over the same iterations, taken independently of
# the position selected above.
print("Program Synthesis Validation Accuracy:  " + str(np.max(validation_accuracy[1:])))

### Save Reef Marginals

In [29]:
# Write the training marginals recorded on the last loop iteration to disk.
filepath = '/dfs/scratch0/paroma/reef/' + dataset
np.save('%s_reef_nn.npy' % filepath, training_marginals[-1])

In [None]:
# Disabled experiment, kept for reference. When enabled it would: look up an
# optimal beta on the validation data, apply the heuristics to
# `primitive_matrix`, train a Verifier on the resulting labels and save its
# training marginals under the '_reef_nn1.npy' suffix.
# NOTE(review): `primitive_matrix` is only assigned in another commented-out
# cell, so it has to be defined before this is re-enabled.
# beta_opt = hg.syn.find_optimal_beta(hg.hf, hg.val_primitive_matrix, hg.feat_combos, hg.val_ground)
# L_train_all = hg.apply_heuristics(hg.hf, primitive_matrix, hg.feat_combos, beta_opt)

# from program_synthesis.verifier import Verifier
# vf_temp = Verifier(L_train_all,L_train_all,train_ground)
# vf_temp.train_gen_model()
# vf_temp.assign_marginals()
# training_all = vf_temp.train_marginals

# filepath = '/dfs/scratch0/paroma/reef/' + dataset
# np.save(filepath+'_reef_nn1.npy', training_all)

## Calculate Baselines

In [3]:
# Disabled: would load the mammogram primitive matrix from disk and transpose it.
# primitive_matrix = np.load('./data/mammogram/primitive_matrix.npy')
# primitive_matrix = primitive_matrix.T

[**Decision Tree**](http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier)
The tree depth is unbounded (default settings are used). A finer comparison could be done if needed.

In [16]:
from baselines.models import DecisionTree

# Decision-tree baseline, built from the train/validation primitive matrices
# and the validation/train ground truth (in that argument order).
dt = DecisionTree(
    train_primitive_matrix, val_primitive_matrix,
    val_ground, train_ground,
)
dt.fit()

# Only the two accuracies are reported; vc and tc are unpacked but not used
# in this cell.
va, ta, vc, tc = dt.evaluate()
print("Decision Tree Train Accuracy:  " + str(ta))
print("Decision Tree Validation Accuracy:  " + str(va))

Decision Tree Train Accuracy:  0.671941570298235
Decision Tree Validation Accuracy:  0.9986928104575163


[**Boosting (AdaBoost)**](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#sklearn.ensemble.AdaBoostClassifier) No bound on the number of estimators yet; could bound it by the number of primitives for a "fair" comparison?

In [17]:
from baselines.models import BoostClassifier

# Boosting baseline, built from the train/validation primitive matrices and
# the validation/train ground truth (in that argument order).
bc = BoostClassifier(
    train_primitive_matrix, val_primitive_matrix,
    val_ground, train_ground,
)
bc.fit()

# Only the two accuracies are reported; vc and tc are unpacked but not used
# in this cell.
va, ta, vc, tc = bc.evaluate()
print("Boosting Train Accuracy:  " + str(ta))
print("Boosting Validation Accuracy:  " + str(va))

Boosting Train Accuracy:  0.7460626968651567
Boosting Validation Accuracy:  0.8589909443725744


[**LabelPropagation (Normal Semi-Supervised)**](http://scikit-learn.org/stable/modules/generated/sklearn.semi_supervised.LabelSpreading.html#sklearn.semi_supervised.LabelSpreading)
Semi-supervised method that is robust to noise. It is fit on both the labeled and the unlabeled examples.

In [18]:
from baselines.models import SemiSupervised

# Semi-supervised baseline, built from the train/validation primitive
# matrices and the validation/train ground truth (in that argument order).
ss = SemiSupervised(
    train_primitive_matrix, val_primitive_matrix,
    val_ground, train_ground,
)
ss.fit()

# Only the two accuracies are reported; vc and tc are unpacked but not used
# in this cell.
va, ta, vc, tc = ss.evaluate()
print("SemiSupervised Train Accuracy:  " + str(ta))
print("SemiSupervised Validation Accuracy:  " + str(va))

SemiSupervised Train Accuracy:  0.704514774261287
SemiSupervised Validation Accuracy:  0.8615782664941786


[**CPLE (Pessimistic Likelihood Estimation for Semi-Supervised)**](https://github.com/tmadl/semisup-learn) Based on the [model](https://arxiv.org/abs/1503.00269) that guarantees that adding the unlabeled data points will not do worse than training with only the labeled examples.

In [19]:
from baselines.models import ContrastiveSemiSupervised

# CPLE baseline, built from the train/validation primitive matrices and the
# validation/train ground truth (in that argument order).
# NOTE(review): the recorded run of this cell ended with
# "NameError: global name 'CPLELearningModel' is not defined"; the name is not
# used in this cell, so the missing definition presumably lives in
# baselines.models -- verify before relying on this baseline.
css = ContrastiveSemiSupervised(
    train_primitive_matrix, val_primitive_matrix,
    val_ground, train_ground,
)
css.fit()

# Only the two accuracies are reported; vc and tc are unpacked but not used
# in this cell.
va, ta, vc, tc = css.evaluate()
print("ContrastiveSemiSupervised Train Accuracy:  " + str(ta))
print("ContrastiveSemiSupervised Validation Accuracy:  " + str(va))

NameError: global name 'CPLELearningModel' is not defined

### Save Baseline Marginals

In [21]:
filepath = '/dfs/scratch0/paroma/reef/' + dataset

# Save the training marginals of each baseline under its own suffix.
for _suffix, _baseline in (('_dt.npy', dt), ('_bc.npy', bc), ('_ss.npy', ss)):
    np.save(filepath + _suffix, _baseline.train_marginals)
# The CPLE baseline is left out (its save call is disabled):
#np.save(filepath+'_css.npy', css.train_marginals)

# train_ground is stored alongside the marginals under the '_gt' suffix.
np.save(filepath + '_gt.npy', train_ground)