Skip to content

Commit

Permalink
simplification of sparse weights
Browse files Browse the repository at this point in the history
  • Loading branch information
JohnLangford committed Dec 30, 2016
1 parent b9a00e8 commit 912a803
Show file tree
Hide file tree
Showing 35 changed files with 429 additions and 496 deletions.
2 changes: 1 addition & 1 deletion library/gd_mf_weights.cc
Expand Up @@ -58,7 +58,7 @@ int main(int argc, char *argv[])
// global model params
unsigned char left_ns = model->pairs[0][0];
unsigned char right_ns = model->pairs[0][1];
weight_parameters& weights = model->weights;
dense_parameters& weights = model->weights.dense_weights;

// const char *filename = argv[0];
FILE* file = fopen(infile.c_str(), "r");
Expand Down
18 changes: 9 additions & 9 deletions vowpalwabbit/OjaNewton.cc
Expand Up @@ -101,10 +101,10 @@ struct OjaNewton {

void initialize_Z() //TODO: use weight_parameters::set_default for initialization
{
if (all->sparse)
initialize_Z<sparse_weight_parameters>(all->sparse_weights);
if (all->weights.sparse)
initialize_Z(all->weights.sparse_weights);
else
initialize_Z<weight_parameters>(all->weights);
initialize_Z(all->weights.dense_weights);
}
void compute_AZx()
{
Expand Down Expand Up @@ -310,10 +310,10 @@ struct OjaNewton {

void check()
{
if (all->sparse)
check<sparse_weight_parameters>(all->sparse_weights);
if (all->weights.sparse)
check(all->weights.sparse_weights);
else
check<weight_parameters>(all->weights);
check(all->weights.dense_weights);
}
};

Expand Down Expand Up @@ -575,8 +575,8 @@ base_learner* OjaNewton_setup(vw& all, T& weights) {
}
base_learner* OjaNewton_setup(vw& all)
{
if (all.sparse)
return OjaNewton_setup<sparse_weight_parameters>(all, all.sparse_weights);
if (all.weights.sparse)
return OjaNewton_setup(all, all.weights.sparse_weights);
else
return OjaNewton_setup<weight_parameters>(all, all.weights);
return OjaNewton_setup(all, all.weights.dense_weights);
}
97 changes: 65 additions & 32 deletions vowpalwabbit/accumulate.cc
Expand Up @@ -19,18 +19,26 @@ using namespace std;

// Reduction op for all_reduce: accumulate c2 into c1 in place.
void add_float(float& c1, const float& c2) { c1 = c1 + c2; }

void accumulate(vw& all, weight_parameters& weights, size_t offset)
{ uint32_t length = 1 << all.num_bits; //This is size of gradient
void accumulate(vw& all, parameters& weights, size_t offset)
{ uint64_t length = 1 << all.num_bits; //This is size of gradient
float* local_grad = new float[length];

for (weight_parameters::iterator iter = weights.begin(); iter != weights.end(); ++iter)
local_grad[iter.index() >> weights.stride_shift()] = (&(*iter))[offset];
if (weights.sparse)
for (uint64_t i = 0; i < length; i++)
local_grad[i] = (&(weights.sparse_weights[i << weights.sparse_weights.stride_shift()]))[offset];
else
for (uint64_t i = 0; i < length; i++)
local_grad[i] = (&(weights.dense_weights[i << weights.dense_weights.stride_shift()]))[offset];

all_reduce<float, add_float>(all, local_grad, length); //TODO: modify to not use first()

for (weight_parameters::iterator iter = weights.begin(); iter != weights.end(); ++iter)
(&(*iter))[offset] = local_grad[iter.index() >> weights.stride_shift()];

if (weights.sparse)
for (uint64_t i = 0; i < length; i++)
(&(weights.sparse_weights[i << weights.sparse_weights.stride_shift()]))[offset] = local_grad[i];
else
for (uint64_t i = 0; i < length; i++)
(&(weights.dense_weights[i << weights.dense_weights.stride_shift()]))[offset] = local_grad[i];

delete[] local_grad;
}

Expand All @@ -40,18 +48,26 @@ float accumulate_scalar(vw& all, float local_sum)
return temp;
}

void accumulate_avg(vw& all, weight_parameters& weights, size_t offset)
void accumulate_avg(vw& all, parameters& weights, size_t offset)
{ uint32_t length = 1 << all.num_bits; //This is size of gradient
float numnodes = (float)all.all_reduce->total;
float* local_grad = new float[length];

for (weight_parameters::iterator iter = weights.begin(); iter != weights.end(); ++iter)
local_grad[iter.index() >> weights.stride_shift()] = (&(*iter))[offset];
if (weights.sparse)
for (uint64_t i = 0; i < length; i++)
local_grad[i] = (&(weights.sparse_weights[i << weights.sparse_weights.stride_shift()]))[offset];
else
for (uint64_t i = 0; i < length; i++)
local_grad[i] = (&(weights.dense_weights[i << weights.dense_weights.stride_shift()]))[offset];

all_reduce<float, add_float>(all, local_grad, length); //TODO: modify to not use first()

for (weight_parameters::iterator iter = weights.begin(); iter != weights.end(); ++iter)
(&(*iter))[offset] = local_grad[iter.index() >> weights.stride_shift()]/numnodes;
if (weights.sparse)
for (uint64_t i = 0; i < length; i++)
(&(weights.sparse_weights[i << weights.sparse_weights.stride_shift()]))[offset] = local_grad[i] / numnodes;
else
for (uint64_t i = 0; i < length; i++)
(&(weights.dense_weights[i << weights.dense_weights.stride_shift()]))[offset] = local_grad[i] / numnodes;

delete[] local_grad;
}
Expand All @@ -70,37 +86,54 @@ float min_elem(float* arr, int length)
return min;
}

void accumulate_weighted_avg(vw& all, weight_parameters& weights)
// Rescale each parameter's weight vector so that nodes' adaptive sums agree,
// storing the node's contribution to the weighted average in local_weights.
// T is a weight container (dense_parameters or sparse_parameters).
template<class T>
void do_weighting(vw& all, uint64_t length, float* local_weights, T& weights)
{
  for (uint64_t idx = 0; idx < length; idx++)
  {
    float* w = &weights[idx << weights.stride_shift()];
    if (!(local_weights[idx] > 0))
    {
      // No mass anywhere for this feature: zero it out everywhere.
      local_weights[idx] = 0;
      *w = 0;
      continue;
    }
    // Scale by this node's share of the global adaptive sum (slot 1).
    // NOTE: local_weights[idx] must read w[0] BEFORE w[0] is rescaled.
    const float scale = w[1] / local_weights[idx];
    local_weights[idx] = w[0] * scale;
    w[0] *= scale;
    w[1] *= scale; // A crude max
    if (all.normalized_updates)
      w[all.normalized_idx] *= scale; // A crude max
  }
}

void accumulate_weighted_avg(vw& all, parameters& weights)
{ if(!all.adaptive)
{ cerr<<"Weighted averaging is implemented only for adaptive gradient, use accumulate_avg instead\n";
return;
}
uint32_t length = 1 << all.num_bits; //This is the number of parameters
float* local_weights = new float[length];

for (weight_parameters::iterator iter = weights.begin(); iter != weights.end(); ++iter)
local_weights[iter.index() >> weights.stride_shift()] = (&(*iter))[1];
if (weights.sparse)
for (uint64_t i = 0; i < length; i++)
local_weights[i] = (&(weights.sparse_weights[i << weights.sparse_weights.stride_shift()]))[1];
else
for (uint64_t i = 0; i < length; i++)
local_weights[i] = (&(weights.dense_weights[i << weights.dense_weights.stride_shift()]))[1];

//First compute weights for averaging
all_reduce<float, add_float>(all, local_weights, length);

for (weight_parameters::iterator iter = weights.begin(); iter != weights.end(); ++iter)
{
uint64_t i = iter.index() >> weights.stride_shift();
if (local_weights[i] > 0)
{ float ratio = (&(*iter))[1] / local_weights[i];
local_weights[i] = *iter * ratio;
*iter *= ratio;
(&(*iter))[1] *= ratio; //A crude max
if (all.normalized_updates)
(&(*iter))[all.normalized_idx] *= ratio; //A crude max
}
else
{ local_weights[i] = 0;
*iter = 0;
}
}
all_reduce<float, add_float>(all, weights.first(), length*weights.stride_shift());
if (weights.sparse)
do_weighting(all, length, local_weights, weights.sparse_weights);
else
do_weighting(all, length, local_weights, weights.dense_weights);

if (weights.sparse)
cout << "sparse parameters not supported with parallel computation!" << endl;
else
all_reduce<float, add_float>(all, weights.dense_weights.first(), length*weights.stride_shift());
delete[] local_weights;
}

6 changes: 3 additions & 3 deletions vowpalwabbit/accumulate.h
Expand Up @@ -7,7 +7,7 @@ license as described in the file LICENSE.
#pragma once
#include "global_data.h"

void accumulate(vw& all, weight_parameters& weights, size_t o);
void accumulate(vw& all, parameters& weights, size_t o);
float accumulate_scalar(vw& all, float local_sum);
void accumulate_weighted_avg(vw& all, weight_parameters& weights);
void accumulate_avg(vw& all, weight_parameters& weights, size_t o);
void accumulate_weighted_avg(vw& all, parameters& weights);
void accumulate_avg(vw& all, parameters& weights, size_t o);

0 comments on commit 912a803

Please sign in to comment.