Skip to content

Commit

Permalink
simplification of sparse weights
Browse files Browse the repository at this point in the history
  • Loading branch information
JohnLangford committed Dec 30, 2016
1 parent b9a00e8 commit 912a803
Show file tree
Hide file tree
Showing 35 changed files with 429 additions and 496 deletions.
2 changes: 1 addition & 1 deletion library/gd_mf_weights.cc
Expand Up @@ -58,7 +58,7 @@ int main(int argc, char *argv[])
// global model params
unsigned char left_ns = model->pairs[0][0];
unsigned char right_ns = model->pairs[0][1];
weight_parameters& weights = model->weights;
dense_parameters& weights = model->weights.dense_weights;

// const char *filename = argv[0];
FILE* file = fopen(infile.c_str(), "r");
Expand Down
18 changes: 9 additions & 9 deletions vowpalwabbit/OjaNewton.cc
Expand Up @@ -101,10 +101,10 @@ struct OjaNewton {

void initialize_Z() //TODO: use weight_parameters::set_default for initialization
{
if (all->sparse)
initialize_Z<sparse_weight_parameters>(all->sparse_weights);
if (all->weights.sparse)
initialize_Z(all->weights.sparse_weights);
else
initialize_Z<weight_parameters>(all->weights);
initialize_Z(all->weights.dense_weights);
}
void compute_AZx()
{
Expand Down Expand Up @@ -310,10 +310,10 @@ struct OjaNewton {

void check()
{
if (all->sparse)
check<sparse_weight_parameters>(all->sparse_weights);
if (all->weights.sparse)
check(all->weights.sparse_weights);
else
check<weight_parameters>(all->weights);
check(all->weights.dense_weights);
}
};

Expand Down Expand Up @@ -575,8 +575,8 @@ base_learner* OjaNewton_setup(vw& all, T& weights) {
}
base_learner* OjaNewton_setup(vw& all)
{
if (all.sparse)
return OjaNewton_setup<sparse_weight_parameters>(all, all.sparse_weights);
if (all.weights.sparse)
return OjaNewton_setup(all, all.weights.sparse_weights);
else
return OjaNewton_setup<weight_parameters>(all, all.weights);
return OjaNewton_setup(all, all.weights.dense_weights);
}
97 changes: 65 additions & 32 deletions vowpalwabbit/accumulate.cc
Expand Up @@ -19,18 +19,26 @@ using namespace std;

// Reduction op for all_reduce: accumulate c2 into c1 in place.
void add_float(float& c1, const float& c2) { c1 = c1 + c2; }

void accumulate(vw& all, weight_parameters& weights, size_t offset)
{ uint32_t length = 1 << all.num_bits; //This is size of gradient
void accumulate(vw& all, parameters& weights, size_t offset)
{ uint64_t length = 1 << all.num_bits; //This is size of gradient
float* local_grad = new float[length];

for (weight_parameters::iterator iter = weights.begin(); iter != weights.end(); ++iter)
local_grad[iter.index() >> weights.stride_shift()] = (&(*iter))[offset];
if (weights.sparse)
for (uint64_t i = 0; i < length; i++)
local_grad[i] = (&(weights.sparse_weights[i << weights.sparse_weights.stride_shift()]))[offset];
else
for (uint64_t i = 0; i < length; i++)
local_grad[i] = (&(weights.dense_weights[i << weights.dense_weights.stride_shift()]))[offset];

all_reduce<float, add_float>(all, local_grad, length); //TODO: modify to not use first()

for (weight_parameters::iterator iter = weights.begin(); iter != weights.end(); ++iter)
(&(*iter))[offset] = local_grad[iter.index() >> weights.stride_shift()];

if (weights.sparse)
for (uint64_t i = 0; i < length; i++)
(&(weights.sparse_weights[i << weights.sparse_weights.stride_shift()]))[offset] = local_grad[i];
else
for (uint64_t i = 0; i < length; i++)
(&(weights.dense_weights[i << weights.dense_weights.stride_shift()]))[offset] = local_grad[i];

delete[] local_grad;
}

Expand All @@ -40,18 +48,26 @@ float accumulate_scalar(vw& all, float local_sum)
return temp;
}

void accumulate_avg(vw& all, weight_parameters& weights, size_t offset)
void accumulate_avg(vw& all, parameters& weights, size_t offset)
{ uint32_t length = 1 << all.num_bits; //This is size of gradient
float numnodes = (float)all.all_reduce->total;
float* local_grad = new float[length];

for (weight_parameters::iterator iter = weights.begin(); iter != weights.end(); ++iter)
local_grad[iter.index() >> weights.stride_shift()] = (&(*iter))[offset];
if (weights.sparse)
for (uint64_t i = 0; i < length; i++)
local_grad[i] = (&(weights.sparse_weights[i << weights.sparse_weights.stride_shift()]))[offset];
else
for (uint64_t i = 0; i < length; i++)
local_grad[i] = (&(weights.dense_weights[i << weights.dense_weights.stride_shift()]))[offset];

all_reduce<float, add_float>(all, local_grad, length); //TODO: modify to not use first()

for (weight_parameters::iterator iter = weights.begin(); iter != weights.end(); ++iter)
(&(*iter))[offset] = local_grad[iter.index() >> weights.stride_shift()]/numnodes;
if (weights.sparse)
for (uint64_t i = 0; i < length; i++)
(&(weights.sparse_weights[i << weights.sparse_weights.stride_shift()]))[offset] = local_grad[i] / numnodes;
else
for (uint64_t i = 0; i < length; i++)
(&(weights.dense_weights[i << weights.dense_weights.stride_shift()]))[offset] = local_grad[i] / numnodes;

delete[] local_grad;
}
Expand All @@ -70,37 +86,54 @@ float min_elem(float* arr, int length)
return min;
}

void accumulate_weighted_avg(vw& all, weight_parameters& weights)
// Rescale each parameter's weight vector so that nodes' adaptive sums agree,
// storing the node's contribution to the weighted average in local_weights.
// T is a weight container (dense_parameters or sparse_parameters).
template<class T>
void do_weighting(vw& all, uint64_t length, float* local_weights, T& weights)
{
  for (uint64_t idx = 0; idx < length; idx++)
  {
    float* w = &weights[idx << weights.stride_shift()];
    if (!(local_weights[idx] > 0))
    {
      // No mass anywhere for this feature: zero it out everywhere.
      local_weights[idx] = 0;
      *w = 0;
      continue;
    }
    // Scale by this node's share of the global adaptive sum (slot 1).
    // NOTE: local_weights[idx] must read w[0] BEFORE w[0] is rescaled.
    const float scale = w[1] / local_weights[idx];
    local_weights[idx] = w[0] * scale;
    w[0] *= scale;
    w[1] *= scale; // A crude max
    if (all.normalized_updates)
      w[all.normalized_idx] *= scale; // A crude max
  }
}

void accumulate_weighted_avg(vw& all, parameters& weights)
{ if(!all.adaptive)
{ cerr<<"Weighted averaging is implemented only for adaptive gradient, use accumulate_avg instead\n";
return;
}
uint32_t length = 1 << all.num_bits; //This is the number of parameters
float* local_weights = new float[length];

for (weight_parameters::iterator iter = weights.begin(); iter != weights.end(); ++iter)
local_weights[iter.index() >> weights.stride_shift()] = (&(*iter))[1];
if (weights.sparse)
for (uint64_t i = 0; i < length; i++)
local_weights[i] = (&(weights.sparse_weights[i << weights.sparse_weights.stride_shift()]))[1];
else
for (uint64_t i = 0; i < length; i++)
local_weights[i] = (&(weights.dense_weights[i << weights.dense_weights.stride_shift()]))[1];

//First compute weights for averaging
all_reduce<float, add_float>(all, local_weights, length);

for (weight_parameters::iterator iter = weights.begin(); iter != weights.end(); ++iter)
{
uint64_t i = iter.index() >> weights.stride_shift();
if (local_weights[i] > 0)
{ float ratio = (&(*iter))[1] / local_weights[i];
local_weights[i] = *iter * ratio;
*iter *= ratio;
(&(*iter))[1] *= ratio; //A crude max
if (all.normalized_updates)
(&(*iter))[all.normalized_idx] *= ratio; //A crude max
}
else
{ local_weights[i] = 0;
*iter = 0;
}
}
all_reduce<float, add_float>(all, weights.first(), length*weights.stride_shift());
if (weights.sparse)
do_weighting(all, length, local_weights, weights.sparse_weights);
else
do_weighting(all, length, local_weights, weights.dense_weights);

if (weights.sparse)
cout << "sparse parameters not supported with parallel computation!" << endl;
else
all_reduce<float, add_float>(all, weights.dense_weights.first(), length*weights.stride_shift());
delete[] local_weights;
}

6 changes: 3 additions & 3 deletions vowpalwabbit/accumulate.h
Expand Up @@ -7,7 +7,7 @@ license as described in the file LICENSE.
#pragma once
#include "global_data.h"

void accumulate(vw& all, weight_parameters& weights, size_t o);
void accumulate(vw& all, parameters& weights, size_t o);
float accumulate_scalar(vw& all, float local_sum);
void accumulate_weighted_avg(vw& all, weight_parameters& weights);
void accumulate_avg(vw& all, weight_parameters& weights, size_t o);
void accumulate_weighted_avg(vw& all, parameters& weights);
void accumulate_avg(vw& all, parameters& weights, size_t o);

0 comments on commit 912a803

Please sign in to comment.