Merge pull request #542 from hal3/master
Various bug fixes, modularization of the active-learning reduction, code cleanup, and a stronger Python interface.
JohnLangford committed Mar 2, 2015
2 parents: 5b1b5a8 + 1272f8b, commit 9760ab3
Showing 17 changed files with 379 additions and 191 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -70,7 +70,7 @@ FLAGS = -std=c++0x $(CFLAGS) $(LDFLAGS) $(ARCH) $(WARN_FLAGS) $(OPTIM_FLAGS) -D_
#CXX = g++

# for valgrind / gdb debugging
#FLAGS = -std=c++0x $(CFLAGS) $(LDFLAGS) $(ARCH) $(WARN_FLAGS) -ffast-math -D_FILE_OFFSET_BITS=64 $(BOOST_INCLUDE) -g -O0 -fPIC
#FLAGS = -std=c++0x $(CFLAGS) $(LDFLAGS) $(ARCH) $(WARN_FLAGS) -D_FILE_OFFSET_BITS=64 $(BOOST_INCLUDE) -g -O0 -fPIC

# for valgrind profiling: run 'valgrind --tool=callgrind PROGRAM' then 'callgrind_annotate --tree=both --inclusive=yes'
#FLAGS = -std=c++0x $(CFLAGS) $(LDFLAGS) -Wall $(ARCH) -ffast-math -D_FILE_OFFSET_BITS=64 $(BOOST_INCLUDE) -g -O2 -fomit-frame-pointer -ffast-math -fno-strict-aliasing -fPIC
130 changes: 107 additions & 23 deletions python/pylibvw.cc
@@ -178,37 +178,30 @@ void ex_push_feature_list(example_ptr ec, vw_ptr vw, unsigned char ns, py::list&
else { cerr << "warning: malformed feature in list" << endl; continue; }
ai = fv[0];
}

bool got = false;
py::extract<uint32_t> get_int(ai);
if (get_int.check()) { f.weight_index = get_int(); got = true; }
else {

if (f.x != 0.) {
bool got = false;
py::extract<string> get_str(ai);
if (get_str.check()) {
f.weight_index = VW::hash_feature(*vw, get_str(), ns_hash);
got = true;
} else { cerr << "warning: malformed feature in list" << endl; continue; }
}
if (got && (f.x != 0.)) {
ec->atomics[ns].push_back(f);
count++;
sum_sq += f.x * f.x;
} else {
py::extract<uint32_t> get_int(ai);
if (get_int.check()) { f.weight_index = get_int(); got = true; }
else { cerr << "warning: malformed feature in list" << endl; continue; }
}
if (got) {
ec->atomics[ns].push_back(f);
count++;
sum_sq += f.x * f.x;
}
}
}
ec->num_features += count;
ec->sum_feat_sq[ns] += sum_sq;
ec->total_sum_feat_sq += sum_sq;
}

bool ex_pop_feature(example_ptr ec, unsigned char ns) {
if (ec->atomics[ns].size() == 0) return false;
feature f = ec->atomics[ns].pop();
ec->num_features--;
ec->sum_feat_sq[ns] -= f.x * f.x;
ec->total_sum_feat_sq -= f.x * f.x;
return true;
}

void ex_push_namespace(example_ptr ec, unsigned char ns) {
ec->indices.push_back(ns);
}
@@ -219,20 +212,108 @@ void ex_ensure_namespace_exists(example_ptr ec, unsigned char ns) {
ex_push_namespace(ec, ns);
}

bool ex_pop_namespace(example_ptr ec) {
if (ec->indices.size() == 0) return false;
unsigned char ns = ec->indices.pop();
void ex_push_dictionary(example_ptr ec, vw_ptr vw, py::dict& dict) {
py::object objectKey, objectVal;
const py::object objectKeys = dict.iterkeys();
const py::object objectVals = dict.itervalues();
unsigned long ulCount = boost::python::extract<unsigned long>(dict.attr("__len__")());
for (size_t u=0; u<ulCount; u++) {
objectKey = objectKeys.attr( "next" )();
objectVal = objectVals.attr( "next" )();

char chCheckKey = objectKey.ptr()->ob_type->tp_name[0];
if (chCheckKey != 's') continue;
chCheckKey = objectVal.ptr()->ob_type->tp_name[0];
if (chCheckKey != 'l') continue;

py::extract<string> ns_e(objectKey);
if (ns_e().length() < 1) continue;
py::extract<py::list> list_e(objectVal);
py::list list = list_e();
char ns = ns_e()[0];
ex_ensure_namespace_exists(ec, ns);
ex_push_feature_list(ec, vw, ns, list);
}
}

bool ex_pop_feature(example_ptr ec, unsigned char ns) {
if (ec->atomics[ns].size() == 0) return false;
feature f = ec->atomics[ns].pop();
ec->num_features--;
ec->sum_feat_sq[ns] -= f.x * f.x;
ec->total_sum_feat_sq -= f.x * f.x;
return true;
}

void ex_erase_namespace(example_ptr ec, unsigned char ns) {
ec->num_features -= ec->atomics[ns].size();
ec->total_sum_feat_sq -= ec->sum_feat_sq[ns];
ec->sum_feat_sq[ns] = 0.;
ec->atomics[ns].erase();
ec->audit_features[ns].erase();
}

bool ex_pop_namespace(example_ptr ec) {
if (ec->indices.size() == 0) return false;
unsigned char ns = ec->indices.pop();
ex_erase_namespace(ec, ns);
return true;
}

void my_setup_example(vw_ptr vw, example_ptr ec) {
VW::setup_example(*vw, ec.get());
}

void unsetup_example(vw_ptr vwP, example_ptr ae) {
vw&all = *vwP;
ae->partial_prediction = 0.;
ae->num_features = 0;
ae->total_sum_feat_sq = 0;
ae->loss = 0.;

if (all.ignore_some) {
cerr << "error: cannot unsetup example when some namespaces are ignored!" << endl;
throw exception();
}

if(all.ngram_strings.size() > 0) {
cerr << "error: cannot unsetup example when ngrams are in use!" << endl;
throw exception();
}

if (all.add_constant) {
ae->atomics[constant_namespace].erase();
ae->audit_features[constant_namespace].erase();
int hit_constant = -1;
size_t N = ae->indices.size();
for (size_t i=0; i<N; i++) {
size_t j = N - 1 - i;
if (ae->indices[j] == constant_namespace) {
if (hit_constant >= 0) { cerr << "error: hit constant namespace twice!" << endl; throw exception(); }
hit_constant = j;
break;
}
}
if (hit_constant >= 0) {
for (size_t i=hit_constant; i<N-1; i++)
ae->indices[i] = ae->indices[i+1];
ae->indices.pop();
}
}

uint32_t multiplier = all.wpp << all.reg.stride_shift;
if(multiplier != 1) { //make room for per-feature information.
for (unsigned char* i = ae->indices.begin; i != ae->indices.end; i++)
for(feature* j = ae->atomics[*i].begin; j != ae->atomics[*i].end; j++)
j->weight_index /= multiplier;
if (all.audit || all.hash_inv)
for (unsigned char* i = ae->indices.begin; i != ae->indices.end; i++)
for(audit_data* j = ae->audit_features[*i].begin; j != ae->audit_features[*i].end; j++)
j->weight_index /= multiplier;
}
}


void ex_set_label_string(example_ptr ec, vw_ptr vw, string label, size_t labelType) {
// SPEEDUP: if it's already set properly, don't modify
label_parser& old_lp = vw->p->lp;
@@ -466,6 +547,7 @@ BOOST_PYTHON_MODULE(pylibvw) {
.def("hash_feature", &VW::hash_feature, "given a feature string (arg2) and a hashed namespace (arg3), hash that feature")
.def("finish_example", &my_finish_example, "tell VW that you're done with a given example")
.def("setup_example", &my_setup_example, "given an example that you've created by hand, prepare it for learning (eg, compute quadratic feature)")
.def("unsetup_example", &unsetup_example, "reverse the process of setup, so that you can go back and modify this example")

.def("num_weights", &VW::num_weights, "how many weights are we learning?")
.def("get_weight", &VW::get_weight, "get the weight for a particular index")
@@ -513,10 +595,12 @@ BOOST_PYTHON_MODULE(pylibvw) {

.def("push_hashed_feature", &ex_push_feature, "Add a hashed feature to a given namespace (id=character-ord)")
.def("push_feature_list", &ex_push_feature_list, "Add a (Python) list of features to a given namespace")
.def("push_feature_dict", &ex_push_dictionary, "Add a (Python) dictionary of namespace/feature-list pairs")
.def("pop_feature", &ex_pop_feature, "Remove the top feature from a given namespace; returns True iff the list was non-empty")
.def("push_namespace", &ex_push_namespace, "Add a new namespace")
.def("ensure_namespace_exists", &ex_ensure_namespace_exists, "Add a new namespace if it doesn't already exist")
.def("pop_namespace", &ex_pop_namespace, "Remove the top namespace off; returns True iff the list was non-empty")
.def("erase_namespace", &ex_erase_namespace, "Remove all the features from a given namespace")

.def("set_label_string", &ex_set_label_string, "(Re)assign the label of this example to this string")

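For orientation (not part of the commit): a minimal sketch of how the new push_feature_dict binding is expected to be driven through the pyvw wrapper shown below. The '--quiet' flag and the namespace/feature names are illustrative assumptions. Note that ex_push_dictionary above accepts only string keys mapped to lists, and only the first character of each key is used as the namespace id.

    import pyvw

    vw = pyvw.vw('--quiet')

    # Keys must be strings and values must be lists; only the first character
    # of each key becomes the namespace id (see ex_push_dictionary above).
    ex = vw.example({'a': ['b', 'c'], 'x': ['y']})
    ex.learn()   # learn() calls setup_example() automatically when needed
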
44 changes: 24 additions & 20 deletions python/pyvw.py
@@ -145,6 +145,8 @@ def predict(examples, my_tag, oracle, condition=None, allowed=None, learner_id=0
ec = examples[n]
while hasattr(ec, '__call__'): ec = ec() # unfold the lambdas
if not isinstance(ec, example) and not isinstance(ec, pylibvw.example): raise TypeError('non-example in LDF example list in SearchTask.predict()')
if hasattr(ec, 'setup_done') and not ec.setup_done:
ec.setup_example()
P.set_input_at(n, ec)
else:
pass # TODO: do we need to set the examples even though they're not used?
@@ -265,7 +267,7 @@ def push_feature(self, feature, v=1.):
def pop_feature(self):
"""Remove the top feature from the current namespace; returns True
if a feature was removed, returns False if there were no
features to pop. Fails if setup has run."""
features to pop."""
return self.ex.pop_feature(self.ns)

def push_features(self, ns, featureList):
@@ -399,16 +401,14 @@ def __init__(self, vw, initStringOrDict=None, labelType=pylibvw.vw.lDefault):
self.setup_done = False
elif isinstance(initStringOrDict, str):
pylibvw.example.__init__(self, vw, labelType, initStringOrDict)
self.setup_done = True
self.setup_done = False
elif isinstance(initStringOrDict, dict):
pylibvw.example.__init__(self, vw, labelType)
self.vw = vw
self.stride = vw.get_stride()
self.finished = False
self.push_feature_dict(vw, initStringOrDict)
self.setup_done = False
for ns_char,feats in initStringOrDict.iteritems():
self.push_features(ns_char, feats)
self.setup_example()
else:
raise TypeError('expecting string or dict as argument for example construction')

@@ -468,6 +468,13 @@ def setup_example(self):
self.vw.setup_example(self)
self.setup_done = True

def unsetup_example(self):
"""If this example has been setup, reverse that process so you can continue editing the examples."""
if not self.setup_done:
raise Exception('trying to unsetup_example that has not yet been setup')
self.vw.unsetup_example(self)
self.setup_done = False

def learn(self):
"""Learn on this example (and before learning, automatically
call setup_example if the example hasn't yet been setup)."""
@@ -501,42 +508,40 @@ def get_feature_id(self, ns, feature, ns_hash=None):


def push_hashed_feature(self, ns, f, v=1.):
"""Add a hashed feature to a given namespace (fails if setup
has already run on this example). Fails if setup has run."""
if self.setup_done: raise Exception("error: modification to example after setup")
"""Add a hashed feature to a given namespace."""
if self.setup_done: self.unsetup_example();
pylibvw.example.push_hashed_feature(self, self.get_ns(ns).ord_ns, f, v)

def push_feature(self, ns, feature, v=1., ns_hash=None):
"""Add an unhashed feature to a given namespace (fails if
setup has already run on this example)."""
"""Add an unhashed feature to a given namespace."""
f = self.get_feature_id(ns, feature, ns_hash)
self.push_hashed_feature(ns, f, v)

def pop_feature(self, ns):
"""Remove the top feature from a given namespace; returns True
if a feature was removed, returns False if there were no
features to pop. Fails if setup has run."""
if self.setup_done: raise Exception("error: modification to example after setup")
features to pop."""
if self.setup_done: self.unsetup_example();
return pylibvw.example.pop_feature(self, self.get_ns(ns).ord_ns)

def push_namespace(self, ns):
"""Push a new namespace onto this example. You should only do
this if you're sure that this example doesn't already have the
given namespace. Fails if setup has run."""
if self.setup_done: raise Exception("error: modification to example after setup")
given namespace."""
if self.setup_done: self.unsetup_example();
pylibvw.example.push_namespace(self, self.get_ns(ns).ord_ns)

def pop_namespace(self):
"""Remove the top namespace from an example; returns True if a
namespace was removed, or False if there were no namespaces
left. Fails if setup has run."""
if self.setup_done: raise Exception("error: modification to example after setup")
left."""
if self.setup_done: self.unsetup_example();
return pylibvw.example.pop_namespace(self)

def ensure_namespace_exists(self, ns):
"""Check to see if a namespace already exists. If it does, do
nothing. If it doesn't, add it. Fails if setup has run."""
if self.setup_done: raise Exception("error: modification to example after setup")
nothing. If it doesn't, add it."""
if self.setup_done: self.unsetup_example();
return pylibvw.example.ensure_namespace_exists(self, self.get_ns(ns).ord_ns)

def push_features(self, ns, featureList):
@@ -552,8 +557,7 @@ def push_features(self, ns, featureList):
space_hash = vw.hash_space( 'x' )
feat_hash = vw.hash_feature( 'a', space_hash )
ex.push_features('x', [feat_hash]) # note: 'x' should match the space_hash!
Fails if setup has run."""
"""
ns = self.get_ns(ns)
self.ensure_namespace_exists(ns)
self.push_feature_list(self.vw, ns.ord_ns, featureList) # much faster just to do it in C++
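To make the behavior change concrete, here is a hedged sketch of the edit-after-setup workflow this file now allows; '--quiet' and the feature names are illustrative, and the new python/test_partial_example.py below exercises the same code path. Mutators such as push_features() now call unsetup_example() instead of raising, and the setup/unsetup round trip can also be driven explicitly.

    import pyvw

    vw = pyvw.vw('--quiet')
    ex = vw.example({'a': ['b']})
    ex.learn()                    # learn() runs setup_example() first

    # Previously this raised "modification to example after setup";
    # push_features() now unsetups the example automatically.
    ex.push_features('x', ['y'])
    ex.learn()

    # The same round trip, driven explicitly:
    ex.unsetup_example()
    ex.push_feature('x', 'z')     # unhashed feature, hashed into namespace 'x'
    ex.setup_example()
    ex.learn()
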
15 changes: 15 additions & 0 deletions python/test_partial_example.py
@@ -0,0 +1,15 @@
import pyvw

vw = pyvw.vw('--audit')
full = vw.example( { 'a': ['b'], 'x': ['y'] } )
full.learn()

part = vw.example( {'a': ['b'] } )
part.learn()

part.push_features('x', ['y'])
part.learn()

part.erase_namespace(ord('x'))
part.push_features('x', ['z'])
part.learn()
20 changes: 10 additions & 10 deletions test/train-sets/ref/dictionary_test.stderr
@@ -10,21 +10,21 @@ Reading datafile = train-sets/dictionary_test.dat
num sources = 1
average since example example current current current
loss last counter weight label predict features
1.000000 1.000000 1 1.0 1.0000 -1.0000 2
1.000000 1.000000 2 2.0 -1.0000 1.0000 3
1.000000 1.000000 4 4.0 -1.0000 1.0000 5
0.750000 0.500000 8 8.0 -1.0000 1.0000 5
0.500000 0.250000 16 16.0 -1.0000 -1.0000 5
0.375000 0.250000 32 32.0 -1.0000 -1.0000 5
0.312500 0.250000 64 64.0 -1.0000 -1.0000 5
0.164062 0.015625 128 128.0 -1.0000 -1.0000 5
1.000000 1.000000 1 1.0 1.0000 -1.0000 2
1.000000 1.000000 2 2.0 -1.0000 1.0000 2
0.500000 0.000000 4 4.0 -1.0000 -1.0000 2
0.250000 0.000000 8 8.0 -1.0000 -1.0000 2
0.125000 0.000000 16 16.0 -1.0000 -1.0000 2
0.062500 0.000000 32 32.0 -1.0000 -1.0000 2
0.031250 0.000000 64 64.0 -1.0000 -1.0000 2
0.015625 0.000000 128 128.0 -1.0000 -1.0000 2

finished run
number of examples per pass = 4
passes used = 32
weighted example sum = 128
weighted label sum = 0
average loss = 0.164062
average loss = 0.015625
best constant = 0
best constant's loss = 1
total feature number = 448
total feature number = 256
10 changes: 5 additions & 5 deletions test/train-sets/ref/search_er.stderr
@@ -9,14 +9,14 @@ num sources = 1
average since instance current true current predicted cur cur predic cache examples
loss last counter output prefix output prefix pass pol made hits gener beta
1.000000 1.000000 1 [4 ] [1 ] 0 0 1 0 1 0.000000
2.000000 3.000000 2 [2 4 2 5 10 10 ] [4 4 4 10 10 10 ] 0 0 7 0 7 0.000000
2.875000 3.750000 4 [1 4 4 1 10 10 10 1..] [4 4 4 4 10 10 10 1..] 0 0 32 0 32 0.000000
1.437500 0.000000 8 [1 4 4 1 10 10 10 1..] [1 4 4 1 10 10 10 1..] 1 0 42 0 64 0.000001
2.500000 4.000000 2 [2 4 2 5 10 10 ] [4 4 4 7 7 7 ] 0 0 7 0 7 0.000000
3.250000 4.000000 4 [1 4 4 1 10 10 10 1..] [4 4 4 4 10 10 10 1..] 0 0 32 0 32 0.000000
1.625000 0.000000 8 [1 4 4 1 10 10 10 1..] [1 4 4 1 10 10 10 1..] 1 0 64 0 64 0.000001

finished run
number of examples per pass = 4
passes used = 3
weighted example sum = 12
weighted label sum = 0
average loss = 0.958333
total feature number = 522
average loss = 1.08333
total feature number = 1185
3 changes: 3 additions & 0 deletions vowpalwabbit/Makefile
@@ -52,4 +52,7 @@ install: $(BINARIES)
clean:
rm -f *.o *.d $(BINARIES) *~ $(MANPAGES) libvw.a

python: vw
cd .. ; $(MAKE) python

.PHONY: all clean install test things
