Merge pull request deepmodeling#467 from uptonwu/master

Separate the GPU and CPU code in the .H header files under the dfLowMachFoam solver.
JX278 · Apr 7, 2024 · 7594b8d · 7594b8d
2 parents 67369ba + 44217db
commit 7594b8d
Show file tree

Hide file tree

Showing 12 changed files with 787 additions and 549 deletions.
diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H
@@ -1,106 +1,5 @@
 {
     volScalarField& he = thermo.he();
-#if defined GPUSolverNew_
-    double *h_he = dfDataBase.getFieldPointer("he", location::cpu, position::internal);
-    double *h_boundary_he = dfDataBase.getFieldPointer("he", location::cpu, position::boundary);
-
-    EEqn_GPU.process();
-    EEqn_GPU.sync();
-    // EEqn_GPU.postProcess(h_he, h_boundary_he);
-
-    // copy h_he to he(cpu)
-    // memcpy(&he[0], h_he, dfDataBase.cell_value_bytes);
-
-    //DEBUG_TRACE;
-    //he.correctBoundaryConditions();
-    //DEBUG_TRACE;
-
-#if defined DEBUG_
-    fvScalarMatrix EEqn
-    (
-
-        fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he)
-    +   fvc::ddt(rho, K) + fvc::div(phi, K)
-    -   dpdt
-    ==
-        (
-            turbName == "laminar"
-            ?
-            (
-                fvm::laplacian(turbulence->alpha(), he)
-            -   diffAlphaD
-            +   fvc::div(hDiffCorrFlux)
-            )
-            :
-            (
-                fvm::laplacian(turbulence->alphaEff(), he)
-            )
-        )
-    );
-    // EEqn.relax();
-    EEqn.solve("ha");
-    // checkResult
-    // TODO: for temp, now we compare ldu, finally we compare csr
-    std::vector<double> h_internal_coeffs(dfDataBase.num_boundary_surfaces);
-    std::vector<double> h_boundary_coeffs(dfDataBase.num_boundary_surfaces);
-
-    offset = 0;
-    forAll(he.boundaryField(), patchi)
-    {
-        const fvPatchScalarField& patchHe = he.boundaryField()[patchi];
-        int patchSize = patchHe.size();
-        const double* internal_coeff_ptr = &EEqn.internalCoeffs()[patchi][0];
-        const double* boundary_coeff_ptr = &EEqn.boundaryCoeffs()[patchi][0];
-        if (patchHe.type() == "processor"
-            || patchHe.type() == "processorCyclic") {
-            memcpy(h_internal_coeffs.data() + offset, internal_coeff_ptr, patchSize * sizeof(double));
-            memset(h_internal_coeffs.data() + offset + patchSize, 0, patchSize * sizeof(double));
-            memcpy(h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchSize * sizeof(double));
-            memset(h_boundary_coeffs.data() + offset + patchSize, 0, patchSize * sizeof(double));
-            offset += patchSize * 2;
-        } else {
-            memcpy(h_internal_coeffs.data() + offset, internal_coeff_ptr, patchSize * sizeof(double));
-            memcpy(h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchSize * sizeof(double));
-            offset += patchSize;
-        }
-    }
-
-    double *h_boundary_he_tmp = new double[dfDataBase.num_boundary_surfaces];
-    offset = 0;
-    forAll(he.boundaryField(), patchi)
-    {
-        const fvPatchScalarField& patchHe = he.boundaryField()[patchi];
-        int patchSize = patchHe.size();
-        if (patchHe.type() == "processor"
-            || patchHe.type() == "processorCyclic") {
-            const scalarField& patchHeInternal = dynamic_cast<const processorFvPatchField<scalar>&>(patchHe).patchInternalField()();
-            memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchSize * sizeof(double));
-            memcpy(h_boundary_he_tmp + offset + patchSize, &patchHeInternal[0], patchSize * sizeof(double));
-            offset += patchSize * 2;
-        } else {
-            memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchSize * sizeof(double));
-            offset += patchSize;
-        }
-    }
-
-    bool printFlag = false;
-    int rank = -1;
-    if (mpi_init_flag) {
-        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    }
-    if (!mpi_init_flag || rank == 0) {
-		// DEBUG_TRACE;
-		// EEqn_GPU.compareResult(&EEqn.lower()[0], &EEqn.upper()[0], &EEqn.diag()[0], &EEqn.source()[0],
-		// 		h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag);
-		// DEBUG_TRACE;
-        // EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag);
-    }
-
-    delete h_boundary_he_tmp;
-
-#endif
-
-#else
     start1 = std::clock();
     fvScalarMatrix EEqn
     (
@@ -133,5 +32,4 @@
     end1 = std::clock();
     time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
-#endif
 }
diff --git a/applications/solvers/dfLowMachFoam/EEqn_GPU.H b/applications/solvers/dfLowMachFoam/EEqn_GPU.H
@@ -0,0 +1,101 @@
+{
+    volScalarField& he = thermo.he();
+    double *h_he = dfDataBase.getFieldPointer("he", location::cpu, position::internal);
+    double *h_boundary_he = dfDataBase.getFieldPointer("he", location::cpu, position::boundary);
+
+    EEqn_GPU.process();
+    EEqn_GPU.sync();
+    // EEqn_GPU.postProcess(h_he, h_boundary_he);
+
+    // copy h_he to he(cpu)
+    // memcpy(&he[0], h_he, dfDataBase.cell_value_bytes);
+
+    //DEBUG_TRACE;
+    //he.correctBoundaryConditions();
+    //DEBUG_TRACE;
+
+#if defined DEBUG_
+    fvScalarMatrix EEqn
+    (
+
+        fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he)
+    +   fvc::ddt(rho, K) + fvc::div(phi, K)
+    -   dpdt
+    ==
+        (
+            turbName == "laminar"
+            ?
+            (
+                fvm::laplacian(turbulence->alpha(), he)
+            -   diffAlphaD
+            +   fvc::div(hDiffCorrFlux)
+            )
+            :
+            (
+                fvm::laplacian(turbulence->alphaEff(), he)
+            )
+        )
+    );
+    // EEqn.relax();
+    EEqn.solve("ha");
+    // checkResult
+    // TODO: for temp, now we compare ldu, finally we compare csr
+    std::vector<double> h_internal_coeffs(dfDataBase.num_boundary_surfaces);
+    std::vector<double> h_boundary_coeffs(dfDataBase.num_boundary_surfaces);
+
+    offset = 0;
+    forAll(he.boundaryField(), patchi)
+    {
+        const fvPatchScalarField& patchHe = he.boundaryField()[patchi];
+        int patchSize = patchHe.size();
+        const double* internal_coeff_ptr = &EEqn.internalCoeffs()[patchi][0];
+        const double* boundary_coeff_ptr = &EEqn.boundaryCoeffs()[patchi][0];
+        if (patchHe.type() == "processor"
+            || patchHe.type() == "processorCyclic") {
+            memcpy(h_internal_coeffs.data() + offset, internal_coeff_ptr, patchSize * sizeof(double));
+            memset(h_internal_coeffs.data() + offset + patchSize, 0, patchSize * sizeof(double));
+            memcpy(h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchSize * sizeof(double));
+            memset(h_boundary_coeffs.data() + offset + patchSize, 0, patchSize * sizeof(double));
+            offset += patchSize * 2;
+        } else {
+            memcpy(h_internal_coeffs.data() + offset, internal_coeff_ptr, patchSize * sizeof(double));
+            memcpy(h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchSize * sizeof(double));
+            offset += patchSize;
+        }
+    }
+
+    double *h_boundary_he_tmp = new double[dfDataBase.num_boundary_surfaces];
+    offset = 0;
+    forAll(he.boundaryField(), patchi)
+    {
+        const fvPatchScalarField& patchHe = he.boundaryField()[patchi];
+        int patchSize = patchHe.size();
+        if (patchHe.type() == "processor"
+            || patchHe.type() == "processorCyclic") {
+            const scalarField& patchHeInternal = dynamic_cast<const processorFvPatchField<scalar>&>(patchHe).patchInternalField()();
+            memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchSize * sizeof(double));
+            memcpy(h_boundary_he_tmp + offset + patchSize, &patchHeInternal[0], patchSize * sizeof(double));
+            offset += patchSize * 2;
+        } else {
+            memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchSize * sizeof(double));
+            offset += patchSize;
+        }
+    }
+
+    bool printFlag = false;
+    int rank = -1;
+    if (mpi_init_flag) {
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    }
+    if (!mpi_init_flag || rank == 0) {
+		// DEBUG_TRACE;
+		// EEqn_GPU.compareResult(&EEqn.lower()[0], &EEqn.upper()[0], &EEqn.diag()[0], &EEqn.source()[0],
+		// 		h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag);
+		// DEBUG_TRACE;
+        // EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag);
+    }
+
+    delete h_boundary_he_tmp;
+
+#endif
+}
diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H
@@ -1,128 +1,4 @@
 // Solve the Momentum equation
-#ifdef GPUSolverNew_
-
-#if defined DEBUG_
-    // run CPU, for temp
-    TICK_START;
-    tmp<fvVectorMatrix> tUEqn
-    (
-        fvm::ddt(rho, U) 
-        + 
-        fvm::div(phi, U)
-        +  
-        turbulence->divDevRhoReff(U)
-    );
-    fvVectorMatrix& UEqn = tUEqn.ref();
-    TICK_STOP(CPU assembly time);
-
-    volTensorField gradU = fvc::grad(U);
-
-    double *h_boundary_gradU = new double[dfDataBase.num_boundary_surfaces * 9];
-    offset = 0;
-    forAll(U.boundaryField(), patchi)
-    {
-        const fvPatchTensorField& patchGradU = gradU.boundaryField()[patchi];
-        int patchsize = patchGradU.size();
-        if (patchGradU.type() == "processor"
-                || patchGradU.type() == "processorCyclic") {
-            // print info
-            if (dynamic_cast<const processorFvPatchField<tensor>&>(patchGradU).doTransform()) {
-                Info << "gradU transform = true" << endl;
-            } else {
-                Info << "gradU transform = false" << endl;
-            }
-            Info << "rank = " << dynamic_cast<const processorFvPatchField<tensor>&>(patchGradU).rank() << endl;
-
-            memcpy(h_boundary_gradU + 9*offset, &patchGradU[0][0], patchsize * 9 * sizeof(double));
-            tensorField patchGradUInternal = 
-                    dynamic_cast<const processorFvPatchField<tensor>&>(patchGradU).patchInternalField()();
-            memcpy(h_boundary_gradU + 9*offset + patchsize * 9, &patchGradUInternal[0][0], patchsize * 9 * sizeof(double));
-            offset += patchsize * 2;
-        } else {
-            memcpy(h_boundary_gradU + 9*offset, &patchGradU[0][0], patchsize * 9 * sizeof(double));
-            offset += patchsize;
-        }
-    }
-#endif
-
-    // process
-    TICK_START;
-    UEqn_GPU.process();
-    UEqn_GPU.sync();
-    TICK_STOP(GPU process time);
-
-    // postProcess
-    // TICK_START;
-    // UEqn_GPU.postProcess(h_u);
-    // memcpy(&U[0][0], h_u, dfDataBase.cell_value_vec_bytes);
-    // U.correctBoundaryConditions();
-    // K = 0.5*magSqr(U);
-    // DEBUG_TRACE;
-    // TICK_STOP(post process time);
-
-#if defined DEBUG_
-    // UEqn.relax();
-    TICK_START;
-    solve(UEqn == -fvc::grad(p));
-    K.oldTime();
-    K = 0.5*magSqr(U);
-    TICK_STOP(CPU solve time);
-    // checkResult
-    // TODO: for temp, now we compare ldu, finally we compare csr
-    std::vector<double> h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3);
-    std::vector<double> h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3);
-
-    offset = 0;
-    for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++)
-    {
-        const fvPatchVectorField& patchU = U.boundaryField()[patchi];
-        int patchsize = dfDataBase.patch_size[patchi];
-        const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0];
-        const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0];
-        memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double));
-        memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double));
-        if (patchU.type() == "processor" || patchU.type() == "processorCyclic") offset += 2 * patchsize;
-        else offset += patchsize;
-    }
-
-    double *h_boundary_u_tmp = new double[dfDataBase.num_boundary_surfaces * 3];
-    offset = 0;
-    forAll(U.boundaryField(), patchi)
-    {
-        const fvPatchVectorField& patchU = U.boundaryField()[patchi];
-        int patchsize = dfDataBase.patch_size[patchi];
-
-        if (patchU.type() == "processor"
-            || patchU.type() == "processorCyclic") {
-            memcpy(h_boundary_u_tmp + 3*offset, &patchU[0][0], 3*patchsize * sizeof(double));
-            vectorField patchUInternal = 
-                    dynamic_cast<const processorFvPatchField<vector>&>(patchU).patchInternalField()();
-            memcpy(h_boundary_u_tmp + 3*offset + 3*patchsize, &patchUInternal[0][0], 3*patchsize * sizeof(double));
-            offset += 2 * patchsize;
-        } else {
-            memcpy(h_boundary_u_tmp + 3*offset, &patchU[0][0], 3*patchsize * sizeof(double));
-            offset += patchsize;
-        }
-    }
-
-    bool printFlag = false;
-
-    int rank = -1;
-    if (mpi_init_flag) {
-        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    }
-
-    if (!mpi_init_flag || rank == 0) {
-        // UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0],
-        //     h_internal_coeffs.data(), h_boundary_coeffs.data(), 
-        //     // &gradU[0][0], h_boundary_gradU,
-        //     printFlag);
-        // UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag);
-    }
-    DEBUG_TRACE;
-#endif
-
-#else
     start1 = std::clock();
     tmp<fvVectorMatrix> tUEqn
     (
@@ -145,4 +21,4 @@
     end1 = std::clock();
     time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
-#endif
+