LLNL · nychiang · Mar 8, 2023 · Mar 8, 2023 · Mar 8, 2023 · Mar 9, 2023
diff --git a/src/LinAlg/VectorCudaKernels.cu b/src/LinAlg/VectorCudaKernels.cu
@@ -640,6 +640,89 @@ __global__ void copyToStartingAt_w_pattern_cu(int n_src,
   }
 }
 
+/** @brief process variable bounds */
+__global__ void process_bounds_cu(int n,
+                                  const double* xl,
+                                  const double* xu,
+                                  double* ixl,
+                                  double* ixu,
+                                  int* bnds_low,
+                                  int* bnds_upp,
+                                  int* bnds_lu,
+                                  int* fixed_vars,
+                                  double fixed_var_tol)
+{
+  const int num_threads = blockDim.x * gridDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+  for (int i = tid; i < n; i += num_threads) {
+    // preemptive loop to reduce number of iterations?
+    bnds_low[i] = 0;
+    bnds_upp[i] = 0;
+    bnds_lu[i] = 0;
+    fixed_vars[i] = 0;
+
+    if(xl[i] > -1e20) {
+      ixl[i] = 1.;
+      bnds_low[i] = 1;
+      if(xu[i] < 1e20) {
+        bnds_lu[i] = 1;
+      }
+    } else {
+      ixl[i] = 0.;
+    }
+
+    if(xu[i] < 1e20) {
+      ixu[i] = 1.;
+      bnds_upp[i] = 1;
+    } else {
+      ixu[i] = 0.;
+    }
+
+    if(xu[i] < 1e20 &&
+       fabs(xl[i]-xu[i]) <= fixed_var_tol*fmax(1.,std::fabs(xu[i]))) {
+      fixed_vars[i] = 1;
+    }
+  }
+}
+
+/** @brief relax variable bounds */
+__global__ void relax_bounds_cu(int n,
+                                double* xla,
+                                double* xua,
+                                double fixed_var_tol,
+                                double fixed_var_perturb)
+{
+  const int num_threads = blockDim.x * gridDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+  for (int i = tid; i < n; i += num_threads) {
+    double xuabs = fabs(xua[i]);
+    if(fabs(xua[i]-xla[i]) <= fixed_var_tol*fmax(1.,xuabs)) {
+      xua[i] += fixed_var_perturb * std::fmax(1.,xuabs);
+      xla[i] -= fixed_var_perturb * std::fmax(1.,xuabs);
+    }
+  }
+}
+
+/** @brief set d_ptr[i] = 1 if d1[i] == d2[i], otherwirse 0 */
+__global__ void set_if_match_cu(int n,
+                                int* d_ptr,
+                                const double* d1,
+                                const double* d2)
+{
+  const int num_threads = blockDim.x * gridDim.x;
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+  for (int i = tid; i < n; i += num_threads) {
+    if(d1[i]==d2[i]) {
+      d_ptr[i] = 1;
+    } else {
+      d_ptr[i] = 0;
+    }
+  }
+}
+
 namespace hiop
 {
 namespace cuda
@@ -1112,7 +1195,6 @@ double min_w_pattern_kernel(int n, const double* d1, const double* id, double ma
 
   thrust::device_ptr<double> ret_dev_ptr = thrust::min_element(thrust::device, dv_ptr, dv_ptr+n);
 
-  // TODO: how to return double from device to host?
   double *ret_host = new double[1];
   double *ret_ptr = thrust::raw_pointer_cast(ret_dev_ptr);
   cudaError_t cuerr = cudaMemcpy(ret_host, ret_ptr, (1)*sizeof(double), cudaMemcpyDeviceToHost);
@@ -1276,7 +1358,88 @@ void copyToStartingAt_w_pattern_kernel(int n_src,
                                                            dd);
 }
 
+int num_match_local_kernel(int n, const double* d1, const double* d2)
+{
+  int num_blocks = (n+block_size-1)/block_size;
+
+  // TODO: how to avoid this temp vec?
+  thrust::device_ptr<int> dv_ptr = thrust::device_malloc(n*sizeof(int));
+  int* d_ptr = thrust::raw_pointer_cast(dv_ptr);
+
+  // set d_ptr[i] = 1 if d1[i] == d2[i], otherwirse 0
+  set_if_match_cu<<<num_blocks,block_size>>>(n, d_ptr, d1, d2);
+
+  int rval = thrust::reduce(thrust::device, d_ptr, d_ptr+n, 0, thrust::plus<int>());
+
+  thrust::device_free(dv_ptr);
+
+  return rval;
+}
 
+/** @brief process variable bounds */
+void process_bounds_local_kernel(int n_local,
+                                 const double* xl,
+                                 const double* xu,
+                                 double* ixl,
+                                 double* ixu,
+                                 int& n_bnds_low,
+                                 int& n_bnds_upp,
+                                 int& n_bnds_lu,
+                                 int& n_fixed_vars,
+                                 double fixed_var_tol)
+{
+  int num_blocks = (n_local+block_size-1)/block_size;
+
+  thrust::device_ptr<int> bnds_low_d_ptr = thrust::device_malloc(n_local*sizeof(int));
+  int* bnds_low_r_ptr = thrust::raw_pointer_cast(bnds_low_d_ptr);
+
+  thrust::device_ptr<int> bnds_upp_d_ptr = thrust::device_malloc(n_local*sizeof(int));
+  int* bnds_upp_r_ptr = thrust::raw_pointer_cast(bnds_upp_d_ptr);
+
+  thrust::device_ptr<int> bnds_lu_d_ptr = thrust::device_malloc(n_local*sizeof(int));
+  int* bnds_lu_r_ptr = thrust::raw_pointer_cast(bnds_lu_d_ptr);
+
+  thrust::device_ptr<int> n_fixed_vars_d_ptr = thrust::device_malloc(n_local*sizeof(int));
+  int* n_fixed_vars_r_ptr = thrust::raw_pointer_cast(n_fixed_vars_d_ptr);
+
+  // set values
+  process_bounds_cu<<<num_blocks,block_size>>>(n_local,
+                                               xl,
+                                               xu,
+                                               ixl,
+                                               ixu,
+                                               bnds_low_r_ptr,
+                                               bnds_upp_r_ptr,
+                                               bnds_lu_r_ptr,
+                                               n_fixed_vars_r_ptr,
+                                               fixed_var_tol);
+
+  // compute sum
+  n_bnds_low = thrust::reduce(thrust::device, bnds_low_d_ptr, bnds_low_d_ptr+n_local, 0.0, thrust::plus<int>());
+  n_bnds_upp = thrust::reduce(thrust::device, bnds_upp_d_ptr, bnds_upp_d_ptr+n_local, 0.0, thrust::plus<int>());
+  n_bnds_lu = thrust::reduce(thrust::device, bnds_lu_d_ptr, bnds_lu_d_ptr+n_local, 0.0, thrust::plus<int>());
+  n_fixed_vars = thrust::reduce(thrust::device, n_fixed_vars_d_ptr, n_fixed_vars_d_ptr+n_local, 0.0, thrust::plus<int>());
+
+  thrust::device_free(bnds_low_d_ptr);
+  thrust::device_free(bnds_upp_d_ptr);
+  thrust::device_free(bnds_lu_d_ptr);
+  thrust::device_free(n_fixed_vars_d_ptr);
+}
+
+/** @brief relax variable bounds */
+void relax_bounds_kernel(int n_local,
+                         double* xl,
+                         double* xu,
+                         double fixed_var_tol,
+                         double fixed_var_perturb)
+{
+  int num_blocks = (n_local+block_size-1)/block_size;
+  relax_bounds_cu<<<num_blocks,block_size>>>(n_local,
+                                             xl,
+                                             xu,
+                                             fixed_var_tol,
+                                             fixed_var_perturb);
+}
 
 /// for hiopVectorIntCuda
 /**

diff --git a/src/LinAlg/VectorCudaKernels.hpp b/src/LinAlg/VectorCudaKernels.hpp
@@ -296,6 +296,28 @@ void copyToStartingAt_w_pattern_kernel(int n_src,
                                        double *vd,
                                        const double* dd);
 
+/// @brief return the numbers of identical elements between two vectors
+int num_match_local_kernel(int n, const double* d1, const double* d2);
+
+/** @brief process variable bounds */
+void process_bounds_local_kernel(int n_local,
+                                 const double* xl,
+                                 const double* xu,
+                                 double* ixl,
+                                 double* ixu,
+                                 int& n_bnds_low,
+                                 int& n_bnds_upp,
+                                 int& n_bnds_lu,
+                                 int& n_fixed_vars,
+                                 double fixed_var_tol);
+
+/** @brief relax variable bounds */
+void relax_bounds_kernel(int n_local,
+                         double* xl,
+                         double* xu,
+                         double fixed_var_tol,
+                         double fixed_var_perturb);
+
 /// for hiopVectorIntCuda
 /**
  * @brief Set the vector entries to be a linear space of starting at i0 containing evenly