Merge pull request #542 from hal3/master
Various bug fixes, modularization of the active-learning reduction, code cleanup, and a stronger Python interface.
JohnLangford committed Mar 2, 2015
2 parents: 5b1b5a8 + 1272f8b, commit 9760ab3
Showing 17 changed files with 379 additions and 191 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -70,7 +70,7 @@ FLAGS = -std=c++0x $(CFLAGS) $(LDFLAGS) $(ARCH) $(WARN_FLAGS) $(OPTIM_FLAGS) -D_
#CXX = g++

# for valgrind / gdb debugging
#FLAGS = -std=c++0x $(CFLAGS) $(LDFLAGS) $(ARCH) $(WARN_FLAGS) -ffast-math -D_FILE_OFFSET_BITS=64 $(BOOST_INCLUDE) -g -O0 -fPIC
#FLAGS = -std=c++0x $(CFLAGS) $(LDFLAGS) $(ARCH) $(WARN_FLAGS) -D_FILE_OFFSET_BITS=64 $(BOOST_INCLUDE) -g -O0 -fPIC

# for valgrind profiling: run 'valgrind --tool=callgrind PROGRAM' then 'callgrind_annotate --tree=both --inclusive=yes'
#FLAGS = -std=c++0x $(CFLAGS) $(LDFLAGS) -Wall $(ARCH) -ffast-math -D_FILE_OFFSET_BITS=64 $(BOOST_INCLUDE) -g -O2 -fomit-frame-pointer -ffast-math -fno-strict-aliasing -fPIC
130 changes: 107 additions & 23 deletions python/pylibvw.cc
@@ -178,37 +178,30 @@ void ex_push_feature_list(example_ptr ec, vw_ptr vw, unsigned char ns, py::list&
else { cerr << "warning: malformed feature in list" << endl; continue; }
ai = fv[0];
}

bool got = false;
py::extract<uint32_t> get_int(ai);
if (get_int.check()) { f.weight_index = get_int(); got = true; }
else {

if (f.x != 0.) {
bool got = false;
py::extract<string> get_str(ai);
if (get_str.check()) {
f.weight_index = VW::hash_feature(*vw, get_str(), ns_hash);
got = true;
} else { cerr << "warning: malformed feature in list" << endl; continue; }
}
if (got && (f.x != 0.)) {
ec->atomics[ns].push_back(f);
count++;
sum_sq += f.x * f.x;
} else {
py::extract<uint32_t> get_int(ai);
if (get_int.check()) { f.weight_index = get_int(); got = true; }
else { cerr << "warning: malformed feature in list" << endl; continue; }
}
if (got) {
ec->atomics[ns].push_back(f);
count++;
sum_sq += f.x * f.x;
}
}
}
ec->num_features += count;
ec->sum_feat_sq[ns] += sum_sq;
ec->total_sum_feat_sq += sum_sq;
}

bool ex_pop_feature(example_ptr ec, unsigned char ns) {
if (ec->atomics[ns].size() == 0) return false;
feature f = ec->atomics[ns].pop();
ec->num_features--;
ec->sum_feat_sq[ns] -= f.x * f.x;
ec->total_sum_feat_sq -= f.x * f.x;
return true;
}

void ex_push_namespace(example_ptr ec, unsigned char ns) {
ec->indices.push_back(ns);
}
@@ -219,20 +212,108 @@ void ex_ensure_namespace_exists(example_ptr ec, unsigned char ns) {
ex_push_namespace(ec, ns);
}

bool ex_pop_namespace(example_ptr ec) {
if (ec->indices.size() == 0) return false;
unsigned char ns = ec->indices.pop();
void ex_push_dictionary(example_ptr ec, vw_ptr vw, py::dict& dict) {
py::object objectKey, objectVal;
const py::object objectKeys = dict.iterkeys();
const py::object objectVals = dict.itervalues();
unsigned long ulCount = boost::python::extract<unsigned long>(dict.attr("__len__")());
for (size_t u=0; u<ulCount; u++) {
objectKey = objectKeys.attr( "next" )();
objectVal = objectVals.attr( "next" )();

char chCheckKey = objectKey.ptr()->ob_type->tp_name[0];
if (chCheckKey != 's') continue;
chCheckKey = objectVal.ptr()->ob_type->tp_name[0];
if (chCheckKey != 'l') continue;

py::extract<string> ns_e(objectKey);
if (ns_e().length() < 1) continue;
py::extract<py::list> list_e(objectVal);
py::list list = list_e();
char ns = ns_e()[0];
ex_ensure_namespace_exists(ec, ns);
ex_push_feature_list(ec, vw, ns, list);
}
}

bool ex_pop_feature(example_ptr ec, unsigned char ns) {
if (ec->atomics[ns].size() == 0) return false;
feature f = ec->atomics[ns].pop();
ec->num_features--;
ec->sum_feat_sq[ns] -= f.x * f.x;
ec->total_sum_feat_sq -= f.x * f.x;
return true;
}

void ex_erase_namespace(example_ptr ec, unsigned char ns) {
ec->num_features -= ec->atomics[ns].size();
ec->total_sum_feat_sq -= ec->sum_feat_sq[ns];
ec->sum_feat_sq[ns] = 0.;
ec->atomics[ns].erase();
ec->audit_features[ns].erase();
}

bool ex_pop_namespace(example_ptr ec) {
if (ec->indices.size() == 0) return false;
unsigned char ns = ec->indices.pop();
ex_erase_namespace(ec, ns);
return true;
}

void my_setup_example(vw_ptr vw, example_ptr ec) {
VW::setup_example(*vw, ec.get());
}

void unsetup_example(vw_ptr vwP, example_ptr ae) {
vw&all = *vwP;
ae->partial_prediction = 0.;
ae->num_features = 0;
ae->total_sum_feat_sq = 0;
ae->loss = 0.;

if (all.ignore_some) {
cerr << "error: cannot unsetup example when some namespaces are ignored!" << endl;
throw exception();
}

if(all.ngram_strings.size() > 0) {
cerr << "error: cannot unsetup example when ngrams are in use!" << endl;
throw exception();
}

if (all.add_constant) {
ae->atomics[constant_namespace].erase();
ae->audit_features[constant_namespace].erase();
int hit_constant = -1;
size_t N = ae->indices.size();
for (size_t i=0; i<N; i++) {
size_t j = N - 1 - i;
if (ae->indices[j] == constant_namespace) {
if (hit_constant >= 0) { cerr << "error: hit constant namespace twice!" << endl; throw exception(); }
hit_constant = j;
break;
}
}
if (hit_constant >= 0) {
for (size_t i=hit_constant; i<N-1; i++)
ae->indices[i] = ae->indices[i+1];
ae->indices.pop();
}
}

uint32_t multiplier = all.wpp << all.reg.stride_shift;
if(multiplier != 1) { //make room for per-feature information.
for (unsigned char* i = ae->indices.begin; i != ae->indices.end; i++)
for(feature* j = ae->atomics[*i].begin; j != ae->atomics[*i].end; j++)
j->weight_index /= multiplier;
if (all.audit || all.hash_inv)
for (unsigned char* i = ae->indices.begin; i != ae->indices.end; i++)
for(audit_data* j = ae->audit_features[*i].begin; j != ae->audit_features[*i].end; j++)
j->weight_index /= multiplier;
}
}


void ex_set_label_string(example_ptr ec, vw_ptr vw, string label, size_t labelType) {
// SPEEDUP: if it's already set properly, don't modify
label_parser& old_lp = vw->p->lp;
@@ -466,6 +547,7 @@ BOOST_PYTHON_MODULE(pylibvw) {
.def("hash_feature", &VW::hash_feature, "given a feature string (arg2) and a hashed namespace (arg3), hash that feature")
.def("finish_example", &my_finish_example, "tell VW that you're done with a given example")
.def("setup_example", &my_setup_example, "given an example that you've created by hand, prepare it for learning (eg, compute quadratic feature)")
.def("unsetup_example", &unsetup_example, "reverse the process of setup, so that you can go back and modify this example")

.def("num_weights", &VW::num_weights, "how many weights are we learning?")
.def("get_weight", &VW::get_weight, "get the weight for a particular index")
@@ -513,10 +595,12 @@ BOOST_PYTHON_MODULE(pylibvw) {

.def("push_hashed_feature", &ex_push_feature, "Add a hashed feature to a given namespace (id=character-ord)")
.def("push_feature_list", &ex_push_feature_list, "Add a (Python) list of features to a given namespace")
.def("push_feature_dict", &ex_push_dictionary, "Add a (Python) dictionary of namespace/feature-list pairs")
.def("pop_feature", &ex_pop_feature, "Remove the top feature from a given namespace; returns True iff the list was non-empty")
.def("push_namespace", &ex_push_namespace, "Add a new namespace")
.def("ensure_namespace_exists", &ex_ensure_namespace_exists, "Add a new namespace if it doesn't already exist")
.def("pop_namespace", &ex_pop_namespace, "Remove the top namespace off; returns True iff the list was non-empty")
.def("erase_namespace", &ex_erase_namespace, "Remove all the features from a given namespace")

.def("set_label_string", &ex_set_label_string, "(Re)assign the label of this example to this string")

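For orientation (not part of the commit): a minimal sketch of how the new push_feature_dict binding is expected to be driven through the pyvw wrapper shown below. The '--quiet' flag and the namespace/feature names are illustrative assumptions. Note that ex_push_dictionary above accepts only string keys mapped to lists, and only the first character of each key is used as the namespace id.

    import pyvw

    vw = pyvw.vw('--quiet')

    # Keys must be strings and values must be lists; only the first character
    # of each key becomes the namespace id (see ex_push_dictionary above).
    ex = vw.example({'a': ['b', 'c'], 'x': ['y']})
    ex.learn()   # learn() calls setup_example() automatically when needed
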
44 changes: 24 additions & 20 deletions python/pyvw.py
@@ -145,6 +145,8 @@ def predict(examples, my_tag, oracle, condition=None, allowed=None, learner_id=0
ec = examples[n]
while hasattr(ec, '__call__'): ec = ec() # unfold the lambdas
if not isinstance(ec, example) and not isinstance(ec, pylibvw.example): raise TypeError('non-example in LDF example list in SearchTask.predict()')
if hasattr(ec, 'setup_done') and not ec.setup_done:
ec.setup_example()
P.set_input_at(n, ec)
else:
pass # TODO: do we need to set the examples even though they're not used?
@@ -265,7 +267,7 @@ def push_feature(self, feature, v=1.):
def pop_feature(self):
"""Remove the top feature from the current namespace; returns True
if a feature was removed, returns False if there were no
features to pop. Fails if setup has run."""
features to pop."""
return self.ex.pop_feature(self.ns)

def push_features(self, ns, featureList):
@@ -399,16 +401,14 @@ def __init__(self, vw, initStringOrDict=None, labelType=pylibvw.vw.lDefault):
self.setup_done = False
elif isinstance(initStringOrDict, str):
pylibvw.example.__init__(self, vw, labelType, initStringOrDict)
self.setup_done = True
self.setup_done = False
elif isinstance(initStringOrDict, dict):
pylibvw.example.__init__(self, vw, labelType)
self.vw = vw
self.stride = vw.get_stride()
self.finished = False
self.push_feature_dict(vw, initStringOrDict)
self.setup_done = False
for ns_char,feats in initStringOrDict.iteritems():
self.push_features(ns_char, feats)
self.setup_example()
else:
raise TypeError('expecting string or dict as argument for example construction')

@@ -468,6 +468,13 @@ def setup_example(self):
self.vw.setup_example(self)
self.setup_done = True

def unsetup_example(self):
"""If this example has been setup, reverse that process so you can continue editing the examples."""
if not self.setup_done:
raise Exception('trying to unsetup_example that has not yet been setup')
self.vw.unsetup_example(self)
self.setup_done = False

def learn(self):
"""Learn on this example (and before learning, automatically
call setup_example if the example hasn't yet been setup)."""
@@ -501,42 +508,40 @@ def get_feature_id(self, ns, feature, ns_hash=None):


def push_hashed_feature(self, ns, f, v=1.):
"""Add a hashed feature to a given namespace (fails if setup
has already run on this example). Fails if setup has run."""
if self.setup_done: raise Exception("error: modification to example after setup")
"""Add a hashed feature to a given namespace."""
if self.setup_done: self.unsetup_example();
pylibvw.example.push_hashed_feature(self, self.get_ns(ns).ord_ns, f, v)

def push_feature(self, ns, feature, v=1., ns_hash=None):
"""Add an unhashed feature to a given namespace (fails if
setup has already run on this example)."""
"""Add an unhashed feature to a given namespace."""
f = self.get_feature_id(ns, feature, ns_hash)
self.push_hashed_feature(ns, f, v)

def pop_feature(self, ns):
"""Remove the top feature from a given namespace; returns True
if a feature was removed, returns False if there were no
features to pop. Fails if setup has run."""
if self.setup_done: raise Exception("error: modification to example after setup")
features to pop."""
if self.setup_done: self.unsetup_example();
return pylibvw.example.pop_feature(self, self.get_ns(ns).ord_ns)

def push_namespace(self, ns):
"""Push a new namespace onto this example. You should only do
this if you're sure that this example doesn't already have the
given namespace. Fails if setup has run."""
if self.setup_done: raise Exception("error: modification to example after setup")
given namespace."""
if self.setup_done: self.unsetup_example();
pylibvw.example.push_namespace(self, self.get_ns(ns).ord_ns)

def pop_namespace(self):
"""Remove the top namespace from an example; returns True if a
namespace was removed, or False if there were no namespaces
left. Fails if setup has run."""
if self.setup_done: raise Exception("error: modification to example after setup")
left."""
if self.setup_done: self.unsetup_example();
return pylibvw.example.pop_namespace(self)

def ensure_namespace_exists(self, ns):
"""Check to see if a namespace already exists. If it does, do
nothing. If it doesn't, add it. Fails if setup has run."""
if self.setup_done: raise Exception("error: modification to example after setup")
nothing. If it doesn't, add it."""
if self.setup_done: self.unsetup_example();
return pylibvw.example.ensure_namespace_exists(self, self.get_ns(ns).ord_ns)

def push_features(self, ns, featureList):
@@ -552,8 +557,7 @@ def push_features(self, ns, featureList):
space_hash = vw.hash_space( 'x' )
feat_hash = vw.hash_feature( 'a', space_hash )
ex.push_features('x', [feat_hash]) # note: 'x' should match the space_hash!
Fails if setup has run."""
"""
ns = self.get_ns(ns)
self.ensure_namespace_exists(ns)
self.push_feature_list(self.vw, ns.ord_ns, featureList) # much faster just to do it in C++
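To make the behavior change concrete, here is a hedged sketch of the edit-after-setup workflow this file now allows; '--quiet' and the feature names are illustrative, and the new python/test_partial_example.py below exercises the same code path. Mutators such as push_features() now call unsetup_example() instead of raising, and the setup/unsetup round trip can also be driven explicitly.

    import pyvw

    vw = pyvw.vw('--quiet')
    ex = vw.example({'a': ['b']})
    ex.learn()                    # learn() runs setup_example() first

    # Previously this raised "modification to example after setup";
    # push_features() now unsetups the example automatically.
    ex.push_features('x', ['y'])
    ex.learn()

    # The same round trip, driven explicitly:
    ex.unsetup_example()
    ex.push_feature('x', 'z')     # unhashed feature, hashed into namespace 'x'
    ex.setup_example()
    ex.learn()
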
15 changes: 15 additions & 0 deletions python/test_partial_example.py
@@ -0,0 +1,15 @@
import pyvw

vw = pyvw.vw('--audit')
full = vw.example( { 'a': ['b'], 'x': ['y'] } )
full.learn()

part = vw.example( {'a': ['b'] } )
part.learn()

part.push_features('x', ['y'])
part.learn()

part.erase_namespace(ord('x'))
part.push_features('x', ['z'])
part.learn()
20 changes: 10 additions & 10 deletions test/train-sets/ref/dictionary_test.stderr
@@ -10,21 +10,21 @@ Reading datafile = train-sets/dictionary_test.dat
num sources = 1
average since example example current current current
loss last counter weight label predict features
1.000000 1.000000 1 1.0 1.0000 -1.0000 2
1.000000 1.000000 2 2.0 -1.0000 1.0000 3
1.000000 1.000000 4 4.0 -1.0000 1.0000 5
0.750000 0.500000 8 8.0 -1.0000 1.0000 5
0.500000 0.250000 16 16.0 -1.0000 -1.0000 5
0.375000 0.250000 32 32.0 -1.0000 -1.0000 5
0.312500 0.250000 64 64.0 -1.0000 -1.0000 5
0.164062 0.015625 128 128.0 -1.0000 -1.0000 5
1.000000 1.000000 1 1.0 1.0000 -1.0000 2
1.000000 1.000000 2 2.0 -1.0000 1.0000 2
0.500000 0.000000 4 4.0 -1.0000 -1.0000 2
0.250000 0.000000 8 8.0 -1.0000 -1.0000 2
0.125000 0.000000 16 16.0 -1.0000 -1.0000 2
0.062500 0.000000 32 32.0 -1.0000 -1.0000 2
0.031250 0.000000 64 64.0 -1.0000 -1.0000 2
0.015625 0.000000 128 128.0 -1.0000 -1.0000 2

finished run
number of examples per pass = 4
passes used = 32
weighted example sum = 128
weighted label sum = 0
average loss = 0.164062
average loss = 0.015625
best constant = 0
best constant's loss = 1
total feature number = 448
total feature number = 256
10 changes: 5 additions & 5 deletions test/train-sets/ref/search_er.stderr
@@ -9,14 +9,14 @@ num sources = 1
average since instance current true current predicted cur cur predic cache examples
loss last counter output prefix output prefix pass pol made hits gener beta
1.000000 1.000000 1 [4 ] [1 ] 0 0 1 0 1 0.000000
2.000000 3.000000 2 [2 4 2 5 10 10 ] [4 4 4 10 10 10 ] 0 0 7 0 7 0.000000
2.875000 3.750000 4 [1 4 4 1 10 10 10 1..] [4 4 4 4 10 10 10 1..] 0 0 32 0 32 0.000000
1.437500 0.000000 8 [1 4 4 1 10 10 10 1..] [1 4 4 1 10 10 10 1..] 1 0 42 0 64 0.000001
2.500000 4.000000 2 [2 4 2 5 10 10 ] [4 4 4 7 7 7 ] 0 0 7 0 7 0.000000
3.250000 4.000000 4 [1 4 4 1 10 10 10 1..] [4 4 4 4 10 10 10 1..] 0 0 32 0 32 0.000000
1.625000 0.000000 8 [1 4 4 1 10 10 10 1..] [1 4 4 1 10 10 10 1..] 1 0 64 0 64 0.000001

finished run
number of examples per pass = 4
passes used = 3
weighted example sum = 12
weighted label sum = 0
average loss = 0.958333
total feature number = 522
average loss = 1.08333
total feature number = 1185
3 changes: 3 additions & 0 deletions vowpalwabbit/Makefile
@@ -52,4 +52,7 @@ install: $(BINARIES)
clean:
rm -f *.o *.d $(BINARIES) *~ $(MANPAGES) libvw.a

python: vw
cd .. ; $(MAKE) python

.PHONY: all clean install test things
