Merge branch 'master' of github.com:espressomd/espresso into shanchen

Marcello-Sega · Apr 29, 2014 · afe6599 · afe6599
2 parents 69eec8f + 5a3eb5c
commit afe6599
Show file tree

Hide file tree

Showing 52 changed files with 1,248 additions and 392 deletions.
diff --git a/NEWS b/NEWS
@@ -29,6 +29,8 @@ New user-visible features
 * New command time_integration to get the runtime of the integration
   loop.
 
+* New harmoic force that runs on the GPU.
+
 User-visible changes
 --------------------
 * Comfixed now works with periodic boundary conditions.

diff --git a/configure.ac b/configure.ac
@@ -300,10 +300,10 @@ AS_IF([test x$with_cuda != xno],[
     # NVCC compile check
     AC_MSG_CHECKING([whether CUDA compiles])
     
-    # if no other compute capability is defined by the user, we require at least 1.1
+    # if no other compute capability is defined by the user, we default to 2.0
     case "$NVCCFLAGS" in
        *-arch=*) ;;
-       *) NVCCFLAGS="$NVCCFLAGS --ptxas-options=-v -gencode arch=compute_11,code=compute_11 -gencode arch=compute_20,code=compute_20"
+       *) NVCCFLAGS="$NVCCFLAGS --ptxas-options=-v -gencode arch=compute_20,code=compute_20"
     esac
     
     # use nvcc

diff --git a/doc/ug/electrokinetics.tex b/doc/ug/electrokinetics.tex
@@ -473,6 +473,21 @@ \subsection{\label{ssec:ek-reac-init}Initialization and Geometry Definition}
 a reaction, as described above. To successfully specify a region all the 
 relevant arguments that go with the shape constraints must be provided.
 
+\subsubsection{\label{sssec:ek-pdb-parse}Parsing PDB Files}
+
+\begin{essyntax}
+  \require{1 or 2 or 3}{electrokinetics}
+  \require{2}{pdb-parse}
+  \var{pdb\_filename}
+  \var{itp\_filename}
+  \begin{features}
+  \required[1]{ELECTROKINETICS}
+  \required[2]{EK_BOUNDARIES}
+  \required[3]{EK_REACTIONS}
+  \end{features}
+\end{essyntax}
+The \lit{electrokinetics pdb-parse} feature allows the user to parse simple PDB files, a file format introduced by the protein database to encode molecular structures. Together with a topology file (here \var{itp\_filename}) the structure gets interpolated to the \lit{electrokinetics} grid. For the input you will need to prepare a PDB file with a \lit{gromacs} force field to generate the topology file. Normally the PDB file extension is \lit{.pdb}, the topology file extension is \lit{.itp}. Obviously the PDB file is placed instead of \var{pdb\_filename} and the topology file instead of \var{itp\_filename}. \todo{At the moment this fails badly, if you try to parse incorrectly formatted files. This will be fixed in the future.}
+
 \subsection{\label{ssec:ek-reac-output}Reaction-Specific Output}
 
 \begin{essyntax}

diff --git a/doc/ug/installation.tex b/doc/ug/installation.tex
@@ -155,9 +155,10 @@ \subsection{Options}
   be used to define compiler flags for the NVIDIA CUDA-compiler
   \texttt{nvcc}. For example, \texttt{NVCCFLAGS = "{}-gencode
     arch=compute_20,code=sm_20"{}} will compile code only for Fermi
-  cards.  Default is to compile for compute models 1.1 and 2.0,
-  i.e. everything with a G90 chip or newer.  Note that we require at
-  least compute model 1.1.
+  cards.  Default is to compile for compute model 2.0,
+  i.e. everything with a Fermi chip or newer.  Note that we require at
+  least compute model 1.1, that is G90. However, to use G90 (e.\,g.~Tesla
+  C1060), you need to manually specificy compute model 1.1.
 \end{description}
 
 \section{\texttt{make}: Compiling,  testing and installing \es}

diff --git a/doc/ug/part.tex b/doc/ug/part.tex
@@ -865,6 +865,20 @@ \subsection{Getting the currently defined constraints}
 Prints out all constraint information. If \var{num} is specified only this
 constraint is displayed, otherwise all constraints will be printed.
 
+\subsection{\texttt{harmonic_force}: Creating a harmonic trap}
+ \newescommand{harmonic-force}
+ \begin{essyntax}
+   harmonic_force \{ \var{x} \var{y} \var{z} \} \var{k}
+  \begin{features}
+    \required{HARMONICFORCE}
+    \required{CUDA}
+  \end{features}
+ \end{essyntax}
+
+ Calculates a spring force for all particles, where the equilibrium position
+ of the spring is at \var{x y z} and it's force constant is \var{k}. A more
+ flexible trap can be constructed with constraints, but this one runs on the GPU.
+
 \section{Virtual sites}
 \label{sec:virtual}
 \index{virtual sites|mainindex}

diff --git a/doc/ug/run.tex b/doc/ug/run.tex
@@ -24,7 +24,7 @@ \section{\texttt{integrate}: Running the simulation}
 \newescommand{integrate}
 
 \begin{essyntax}
-  \variant{1} integrate \var{steps}
+  \variant{1} integrate \opt{reuse_forces} \var{steps}
   \variant{2} integrate set \opt{nvt}
   \variant{3} integrate set npt_isotropic \var{p_{ext}} \var{piston} \opt{\var{x\: y\: z}} \opt{-cubic_box}
 \end{essyntax}
@@ -34,6 +34,27 @@ \section{\texttt{integrate}: Running the simulation}
 \texttt{steps} as parameter integrates the system for \texttt{steps}
 time steps.
 
+Note that this implementation of the Velocity Verlet algorithm reuses forces,
+that is, they are computed once in the middle of the time step, but used twice,
+at the beginning and end. However, in the first time step after setting up,
+there are no forces present yet. Therefore, \es{} has to compute them before the
+first time step. That has two consequences: first, random forces are redrawn,
+resulting in a narrower distribution of the random forces, which we compensate
+by stretching. Second, coupling forces of e.\,g. the Lattice Boltzmann fluid cannot
+be computed and are therefore lacking in the first half time step. In order to
+minimize these effects, \es{} has a quite conservative heuristics to decide whether
+a change makes it necessary to recompute forces before the first time step. Therefore,
+calling hundred times \texttt{integrate 1} does the same as \texttt{integrate 100},
+apart from some small calling overhead.
+
+However, for checkpointing, there is no way for \es{} to tell that the forces that you
+read back in actually match the parameters that are set. Therefore, \es{} would recompute
+the forces before the first time step, which makes it essentially impossible to checkpoint
+LB simulations, where it is vital to keep the coupling forces. Therefore, \texttt{integrate}
+has an additional parameter \opt{reuse_forces}, which tells integrate to not recalculate
+the forces for the first time step, but use that the values still stored with the particles.
+Use this only if you are absolutely sure that the forces stored match your current setup!
+
 Two methods for the integration can be set: For an NVT ensemble
 (thermostat) and for an NPT isotropic ensemble (barostat). The current
 method can be detected with the command \texttt{integrate set} without

diff --git a/maintainer/jenkins/configs/maxset.hpp b/maintainer/jenkins/configs/maxset.hpp
@@ -36,6 +36,7 @@
 #define ELECTROKINETICS
 #define EK_BOUNDARIES
 #define EK_REACTION
+#define HARMONICFORCE
 #endif
 
 #define AREA_FORCE_GLOBAL   

diff --git a/scripts/vtf.tcl b/scripts/vtf.tcl
@@ -195,8 +195,20 @@ proc writevsf { file args } {
             lappend aids "$from:$to"
         }
         unset from
-
-        puts $file "atom [join $aids ,] $desc"
+        # let's group atom ranges, so that there are no more than 8 per line
+        # This fixes a problem with the vmd plugin, and makes the vtf more 
+        # readable anyway.
+        set start 0
+        set maxlen 8
+        set ll  [llength [lrange $aids $start end ]]
+        while { $ll >= $maxlen } {
+              puts $file "atom [join [lrange $aids $start [expr $start + $maxlen -1]] ,] $desc"
+              incr start $maxlen
+              set ll  [llength [lrange $aids $start end ]]
+        }
+        if { $start < [llength $aids ] } { 
+          puts $file "atom [join [lrange $aids  $start  end] ,] $desc"
+        }
     }
 
     # Print bond data

diff --git a/src/EspressoSystemInterface.cpp b/src/EspressoSystemInterface.cpp
@@ -6,10 +6,16 @@
 
 #include <iostream>
 
+/* Need explicite specialization, otherwise some compilers do not produce the objects. */
+
+template class EspressoSystemInterface::const_iterator<SystemInterface::Real>;
+template class EspressoSystemInterface::const_iterator<SystemInterface::Vector3>;
+template class EspressoSystemInterface::const_iterator<int>;
+
 /********************************************************************************************/
 
 template<class value_type>
-const value_type EspressoSystemInterface::const_iterator<value_type>::operator*() const {
+value_type EspressoSystemInterface::const_iterator<value_type>::operator*() const {
   return (*m_const_iterator);
 }
 
@@ -53,7 +59,9 @@ void EspressoSystemInterface::gatherParticles() {
   if (m_gpu)
   {
     if(gpu_get_global_particle_vars_pointer_host()->communication_enabled) {
+      ESIF_TRACE(puts("Calling copy_part_data_to_gpu()"));
       copy_part_data_to_gpu();
+      reallocDeviceMemory(gpu_get_global_particle_vars_pointer_host()->number_of_particles);
       if(m_splitParticleStructGpu && (this_node == 0)) 
 	split_particle_struct();
     }

diff --git a/src/EspressoSystemInterface.hpp b/src/EspressoSystemInterface.hpp
@@ -1,8 +1,9 @@
 #ifndef ESPRESSOSYSTEMINTERFACE_H
 #define ESPRESSOSYSTEMINTERFACE_H
 
+#define ESIF_TRACE(A)
+
 #include "SystemInterface.hpp"
-#include "particle_data.hpp"
 #include "cuda_interface.hpp"
 
 class EspressoSystemInterface : public SystemInterface {
@@ -20,7 +21,7 @@ class EspressoSystemInterface : public SystemInterface {
   template<class value_type>
   class const_iterator : public SystemInterface::const_iterator<value_type> {
   public:
-    const value_type operator*() const;
+    value_type operator*() const;
     SystemInterface::const_iterator<value_type> &operator=(const SystemInterface::const_iterator<value_type> &rhs);
     EspressoSystemInterface::const_iterator<value_type> &operator=(typename std::vector<value_type>::const_iterator rhs);
     bool operator==(SystemInterface::const_iterator<value_type> const &rhs) const;
@@ -45,13 +46,16 @@ class EspressoSystemInterface : public SystemInterface {
   bool hasQ() { return true; };
 #endif
 
+#ifdef CUDA
   float *rGpuBegin() { return m_r_gpu_begin; };
   float *rGpuEnd() { return m_r_gpu_end; };
   bool hasRGpu() { return true; };
   bool requestRGpu() { 
     m_needsRGpu = hasRGpu();
     m_splitParticleStructGpu |= m_needsRGpu;
     m_gpu |= m_needsRGpu;
+    if(m_gpu)
+      enableParticleCommunication();
     return m_needsRGpu; 
   };
 
@@ -62,6 +66,8 @@ class EspressoSystemInterface : public SystemInterface {
     m_needsVGpu = hasVGpu();
     m_splitParticleStructGpu |= m_needsVGpu;
     m_gpu |= m_needsVGpu;
+    if(m_gpu)
+      enableParticleCommunication();
     return m_needsVGpu; 
   };
 
@@ -72,18 +78,55 @@ class EspressoSystemInterface : public SystemInterface {
     m_needsQGpu = hasQGpu(); 
     m_splitParticleStructGpu |= m_needsQGpu;
     m_gpu |= m_needsQGpu;
+    if(m_gpu)
+      enableParticleCommunication();
     return m_needsQGpu; 
   };
 
   bool requestParticleStructGpu() {
     m_needsParticleStructGpu = true;
     m_gpu |= m_needsParticleStructGpu;
+    if(m_gpu)
+      enableParticleCommunication();
     return true;
   }
 
+  float *fGpuBegin() { return (float *)gpu_get_particle_force_pointer(); };
+  float *fGpuEnd() { return (float *)(gpu_get_particle_force_pointer()) + 3*m_gpu_npart; };
+  bool hasFGpu() { return true; };
+  bool requestFGpu() {
+    m_needsFGpu = hasFGpu();
+    m_gpu |= m_needsFGpu;
+    if(m_gpu)
+      enableParticleCommunication();
+    return m_needsFGpu;
+  };
+#endif
+
+  unsigned int npart_gpu() {
+#ifdef CUDA
+    return m_gpu_npart;
+#else
+    return 0;
+#endif
+  };
+
+
 protected:
   void gatherParticles();
   void split_particle_struct();
+#ifdef CUDA
+  void enableParticleCommunication() {
+    if(!gpu_get_global_particle_vars_pointer_host()->communication_enabled) {
+      ESIF_TRACE(puts("gpu communication not enabled;"));
+      ESIF_TRACE(puts("enableParticleCommunication"));
+      gpu_init_particle_comm();
+      cuda_bcast_global_part_params();
+      reallocDeviceMemory(gpu_get_global_particle_vars_pointer_host()->number_of_particles);
+    }
+  };
+  void reallocDeviceMemory(int n);
+#endif
 
   Vector3Container R;
   #ifdef ELECTROSTATICS
@@ -115,12 +158,6 @@ class EspressoSystemInterface : public SystemInterface {
   bool m_splitParticleStructGpu;
 };
 
-/* Need explicite specialization, otherwise some compilers do not produce the objects. */
-
-template class EspressoSystemInterface::const_iterator<SystemInterface::Real>;
-template class EspressoSystemInterface::const_iterator<SystemInterface::Vector3>;
-template class EspressoSystemInterface::const_iterator<int>;
-
 extern EspressoSystemInterface espressoSystemInterface;
 
 #endif
diff --git a/src/EspressoSystemInterface_cuda.cu b/src/EspressoSystemInterface_cuda.cu
@@ -1,3 +1,4 @@
+
 #include "EspressoSystemInterface.hpp"
 #include "cuda_interface.hpp"
 #include "cuda_utils.hpp"
@@ -57,34 +58,39 @@ __global__ void split_kernel_v(CUDA_particle_data *particles, float *v, int n) {
   v[idx + 2] = p.v[2];
 }
 
+void EspressoSystemInterface::reallocDeviceMemory(int n) {
+
+  if(m_needsRGpu && ((n != m_gpu_npart) || (m_r_gpu_begin == 0))) {
+    if(m_r_gpu_begin != 0)
+      cuda_safe_mem(cudaFree(m_r_gpu_begin));
+    cuda_safe_mem(cudaMalloc(&m_r_gpu_begin, 3*n*sizeof(float)));
+    m_r_gpu_end = m_r_gpu_begin + 3*n;
+  }
+
+  if(m_needsVGpu && ((n != m_gpu_npart) || (m_v_gpu_begin == 0))) {
+    if(m_v_gpu_begin != 0)
+      cuda_safe_mem(cudaFree(m_v_gpu_begin));
+    cuda_safe_mem(cudaMalloc(&m_v_gpu_begin, 3*n*sizeof(float)));
+    m_v_gpu_end = m_v_gpu_begin + 3*n;
+  }
+
+  if(m_needsQGpu && ((n != m_gpu_npart) || (m_q_gpu_begin == 0))) {
+    if(m_q_gpu_begin != 0)
+      cuda_safe_mem(cudaFree(m_q_gpu_begin));
+    cuda_safe_mem(cudaMalloc(&m_q_gpu_begin, 3*n*sizeof(float)));
+    m_q_gpu_end = m_q_gpu_begin + 3*n;
+  }
+
+  m_gpu_npart = n;
+}
+
 void EspressoSystemInterface::split_particle_struct() {
   int n = gpu_get_global_particle_vars_pointer_host()->number_of_particles;
   if(n == 0) 
     return;
-
-  if( (n != m_gpu_npart) ) {
-    if(m_needsRGpu) {
-      if(m_r_gpu_begin != 0)
-	cuda_safe_mem(cudaFree(m_r_gpu_begin));
-      cuda_safe_mem(cudaMalloc(&m_r_gpu_begin, 3*n*sizeof(float)));
-      m_r_gpu_end = m_r_gpu_begin + 3*n;
-    }
-    if(m_needsVGpu) {
-      if(m_v_gpu_begin != 0)
-	cuda_safe_mem(cudaFree(m_v_gpu_begin));
-      cuda_safe_mem(cudaMalloc(&m_v_gpu_begin, 3*n*sizeof(float)));
-      m_v_gpu_end = m_v_gpu_begin + 3*n;
-    }
-    if(m_needsQGpu) {
-      if(m_q_gpu_begin != 0)
-	cuda_safe_mem(cudaFree(m_q_gpu_begin));
-      cuda_safe_mem(cudaMalloc(&m_q_gpu_begin, n*sizeof(float)));
-      m_q_gpu_end = m_q_gpu_begin + n;
-    }
-  }
 
-  m_gpu_npart = n;
-
+  ESIF_TRACE(printf("n = %d, m_gpu_npart = %d\n", n, m_gpu_npart));
+    
   dim3 grid(n/512+1,1,1);
   dim3 block(512,1,1);
 

diff --git a/src/HarmonicForce.cu b/src/HarmonicForce.cu
@@ -0,0 +1,41 @@
+#include "HarmonicForce.hpp"
+
+#include "cuda_utils.hpp"
+#include <stdio.h>
+
+#ifdef HARMONICFORCE
+
+HarmonicForce *harmonicForce = 0;
+
+__global__ void HarmonicForce_kernel(float x, float y, float z, float k,
+				     int n, float *pos, float *f) {
+
+  int id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if(id >= n)
+    return;
+
+  f[3*id + 0] += k*(x - pos[3*id + 0]);
+  f[3*id + 1] += k*(y - pos[3*id + 1]);
+  f[3*id + 2] += k*(z - pos[3*id + 2]);
+}
+
+
+void HarmonicForce_kernel_wrapper(float x, float y, float z, float k, int n, float *pos, float *f) {
+  dim3 grid(1,1,1);
+  dim3 block(1,1,1);
+
+  if(n == 0)
+    return;
+
+  if(n <= 512) {
+    grid.x = 1;
+    block.x = n;
+  } else {
+    grid.x = n/512 + 1;
+    block.x = 512;
+  }
+
+  KERNELCALL(HarmonicForce_kernel,grid,block,(x, y, z, k, n, pos, f))
+}
+#endif