In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load Dataset

In [2]:
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data 
y = (iris.target+1).astype(float)

val_idx = np.array(range(150))[::6] #every 6th to get 25
train_idx = np.array(list(set(range(150)) - set(val_idx))) #rest of it

train_primitive_matrix = X[train_idx,:]
val_primitive_matrix = X[val_idx,:]
train_ground = y[train_idx]
val_ground = y[val_idx]

## Synthesis + Verification
Generate functions based on feedback from the verifier (vary cardinality)

In [3]:
from program_synthesis.heuristic_generator import HeuristicGenerator
validation_accuracy = []
training_accuracy = []
validation_coverage = []
training_coverage = []

training_marginals = []
idx = None
weights = None

hg = HeuristicGenerator(train_primitive_matrix, val_primitive_matrix, 
                            val_ground, train_ground, 
                            b=0.33)
for i in range(3,20):
    if i == 3:
        hg.run_synthesizer(max_cardinality=4, idx=None, weights=weights, keep=3, model='dt')
    else:
        hg.run_synthesizer(max_cardinality=4, idx=None, weights=weights, keep=1, model='dt')
    hg.run_verifier()
    
    va,ta, vc, tc = hg.evaluate()
    validation_accuracy.append(va)
    training_accuracy.append(ta)
    training_marginals.append(hg.vf.train_marginals)
    validation_coverage.append(vc)
    training_coverage.append(tc)
        
    hg.find_feedback()
    idx = hg.feedback_idx
    
    #For Rebuttal
    #weights = hg.vf.find_weighted_vague_points()
    
    if idx == []:
        break

In [4]:
from sklearn.metrics import f1_score

y_pred = np.argmax(training_marginals[-1],axis=-1)+1.
print "F1 score for Train Set: ", f1_score(train_ground, y_pred, average='micro')

F1 score for Train Set:  0.952


## Calculate Baselines

In [5]:
from sklearn.metrics import f1_score

[**Decision Tree**](http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier)
Depth bound is unbounded, using default settings. Could do a finer comparison if needed.

In [6]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(random_state=0, min_samples_leaf=5, min_samples_split=10)
model.fit(val_primitive_matrix, val_ground)
y_pred = model.predict(train_primitive_matrix)
print "F1 score for Train Set: ", f1_score(train_ground, y_pred, average='micro')

F1 score for Train Set:  0.952


[**Boosting (AdaBoost)**](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#sklearn.ensemble.AdaBoostClassifier) No bound of the number of estimators yet, could bound by number of primitives for "fair" comparison?

In [7]:
from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier(random_state=0)
model.fit(val_primitive_matrix, val_ground)
y_pred = model.predict(train_primitive_matrix)
print "F1 score for Train Set: ", f1_score(train_ground, y_pred, average='micro')

F1 score for Train Set:  0.952


[**LabelPropagation (Normal Semi-Supervised)**](http://scikit-learn.org/stable/modules/generated/sklearn.semi_supervised.LabelSpreading.html#sklearn.semi_supervised.LabelSpreading)
Semi-supervised method that is robust to noise. Fits with both the labeled and unlabeled examples

In [8]:
from sklearn.semi_supervised import LabelSpreading

X_ss = np.concatenate((val_primitive_matrix, train_primitive_matrix))
val_labels = val_ground
train_labels = -1.*np.ones(np.shape(train_primitive_matrix)[0])
y_ss = np.concatenate((val_labels, train_labels))

model = LabelSpreading(kernel='knn', max_iter=100)
model.fit(X_ss, y_ss)
y_pred = model.predict(train_primitive_matrix)
print "F1 score for Train Set: ", f1_score(train_ground, y_pred, average='micro')

F1 score for Train Set:  0.968
