Skip to content

Commit

Permalink
Merge pull request deepmodeling#467 from uptonwu/master
Browse files Browse the repository at this point in the history
Separate the GPU and CPU code in the .H header files under the dfLowMachFoam solver.
  • Loading branch information
maorz1998 committed Apr 7, 2024
2 parents 67369ba + 44217db commit 7594b8d
Show file tree
Hide file tree
Showing 12 changed files with 787 additions and 549 deletions.
102 changes: 0 additions & 102 deletions applications/solvers/dfLowMachFoam/EEqn.H
Original file line number Diff line number Diff line change
@@ -1,106 +1,5 @@
{
volScalarField& he = thermo.he();
#if defined GPUSolverNew_
double *h_he = dfDataBase.getFieldPointer("he", location::cpu, position::internal);
double *h_boundary_he = dfDataBase.getFieldPointer("he", location::cpu, position::boundary);

EEqn_GPU.process();
EEqn_GPU.sync();
// EEqn_GPU.postProcess(h_he, h_boundary_he);

// copy h_he to he(cpu)
// memcpy(&he[0], h_he, dfDataBase.cell_value_bytes);

//DEBUG_TRACE;
//he.correctBoundaryConditions();
//DEBUG_TRACE;

#if defined DEBUG_
fvScalarMatrix EEqn
(

fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he)
+ fvc::ddt(rho, K) + fvc::div(phi, K)
- dpdt
==
(
turbName == "laminar"
?
(
fvm::laplacian(turbulence->alpha(), he)
- diffAlphaD
+ fvc::div(hDiffCorrFlux)
)
:
(
fvm::laplacian(turbulence->alphaEff(), he)
)
)
);
// EEqn.relax();
EEqn.solve("ha");
// checkResult
// TODO: for temp, now we compare ldu, finally we compare csr
std::vector<double> h_internal_coeffs(dfDataBase.num_boundary_surfaces);
std::vector<double> h_boundary_coeffs(dfDataBase.num_boundary_surfaces);

offset = 0;
forAll(he.boundaryField(), patchi)
{
const fvPatchScalarField& patchHe = he.boundaryField()[patchi];
int patchSize = patchHe.size();
const double* internal_coeff_ptr = &EEqn.internalCoeffs()[patchi][0];
const double* boundary_coeff_ptr = &EEqn.boundaryCoeffs()[patchi][0];
if (patchHe.type() == "processor"
|| patchHe.type() == "processorCyclic") {
memcpy(h_internal_coeffs.data() + offset, internal_coeff_ptr, patchSize * sizeof(double));
memset(h_internal_coeffs.data() + offset + patchSize, 0, patchSize * sizeof(double));
memcpy(h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchSize * sizeof(double));
memset(h_boundary_coeffs.data() + offset + patchSize, 0, patchSize * sizeof(double));
offset += patchSize * 2;
} else {
memcpy(h_internal_coeffs.data() + offset, internal_coeff_ptr, patchSize * sizeof(double));
memcpy(h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchSize * sizeof(double));
offset += patchSize;
}
}

double *h_boundary_he_tmp = new double[dfDataBase.num_boundary_surfaces];
offset = 0;
forAll(he.boundaryField(), patchi)
{
const fvPatchScalarField& patchHe = he.boundaryField()[patchi];
int patchSize = patchHe.size();
if (patchHe.type() == "processor"
|| patchHe.type() == "processorCyclic") {
const scalarField& patchHeInternal = dynamic_cast<const processorFvPatchField<scalar>&>(patchHe).patchInternalField()();
memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchSize * sizeof(double));
memcpy(h_boundary_he_tmp + offset + patchSize, &patchHeInternal[0], patchSize * sizeof(double));
offset += patchSize * 2;
} else {
memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchSize * sizeof(double));
offset += patchSize;
}
}

bool printFlag = false;
int rank = -1;
if (mpi_init_flag) {
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
}
if (!mpi_init_flag || rank == 0) {
// DEBUG_TRACE;
// EEqn_GPU.compareResult(&EEqn.lower()[0], &EEqn.upper()[0], &EEqn.diag()[0], &EEqn.source()[0],
// h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag);
// DEBUG_TRACE;
// EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag);
}

delete h_boundary_he_tmp;

#endif

#else
start1 = std::clock();
fvScalarMatrix EEqn
(
Expand Down Expand Up @@ -133,5 +32,4 @@
end1 = std::clock();
time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
#endif
}
101 changes: 101 additions & 0 deletions applications/solvers/dfLowMachFoam/EEqn_GPU.H
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
{
volScalarField& he = thermo.he();
double *h_he = dfDataBase.getFieldPointer("he", location::cpu, position::internal);
double *h_boundary_he = dfDataBase.getFieldPointer("he", location::cpu, position::boundary);

EEqn_GPU.process();
EEqn_GPU.sync();
// EEqn_GPU.postProcess(h_he, h_boundary_he);

// copy h_he to he(cpu)
// memcpy(&he[0], h_he, dfDataBase.cell_value_bytes);

//DEBUG_TRACE;
//he.correctBoundaryConditions();
//DEBUG_TRACE;

#if defined DEBUG_
fvScalarMatrix EEqn
(

fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he)
+ fvc::ddt(rho, K) + fvc::div(phi, K)
- dpdt
==
(
turbName == "laminar"
?
(
fvm::laplacian(turbulence->alpha(), he)
- diffAlphaD
+ fvc::div(hDiffCorrFlux)
)
:
(
fvm::laplacian(turbulence->alphaEff(), he)
)
)
);
// EEqn.relax();
EEqn.solve("ha");
// checkResult
// TODO: for temp, now we compare ldu, finally we compare csr
std::vector<double> h_internal_coeffs(dfDataBase.num_boundary_surfaces);
std::vector<double> h_boundary_coeffs(dfDataBase.num_boundary_surfaces);

offset = 0;
forAll(he.boundaryField(), patchi)
{
const fvPatchScalarField& patchHe = he.boundaryField()[patchi];
int patchSize = patchHe.size();
const double* internal_coeff_ptr = &EEqn.internalCoeffs()[patchi][0];
const double* boundary_coeff_ptr = &EEqn.boundaryCoeffs()[patchi][0];
if (patchHe.type() == "processor"
|| patchHe.type() == "processorCyclic") {
memcpy(h_internal_coeffs.data() + offset, internal_coeff_ptr, patchSize * sizeof(double));
memset(h_internal_coeffs.data() + offset + patchSize, 0, patchSize * sizeof(double));
memcpy(h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchSize * sizeof(double));
memset(h_boundary_coeffs.data() + offset + patchSize, 0, patchSize * sizeof(double));
offset += patchSize * 2;
} else {
memcpy(h_internal_coeffs.data() + offset, internal_coeff_ptr, patchSize * sizeof(double));
memcpy(h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchSize * sizeof(double));
offset += patchSize;
}
}

double *h_boundary_he_tmp = new double[dfDataBase.num_boundary_surfaces];
offset = 0;
forAll(he.boundaryField(), patchi)
{
const fvPatchScalarField& patchHe = he.boundaryField()[patchi];
int patchSize = patchHe.size();
if (patchHe.type() == "processor"
|| patchHe.type() == "processorCyclic") {
const scalarField& patchHeInternal = dynamic_cast<const processorFvPatchField<scalar>&>(patchHe).patchInternalField()();
memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchSize * sizeof(double));
memcpy(h_boundary_he_tmp + offset + patchSize, &patchHeInternal[0], patchSize * sizeof(double));
offset += patchSize * 2;
} else {
memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchSize * sizeof(double));
offset += patchSize;
}
}

bool printFlag = false;
int rank = -1;
if (mpi_init_flag) {
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
}
if (!mpi_init_flag || rank == 0) {
// DEBUG_TRACE;
// EEqn_GPU.compareResult(&EEqn.lower()[0], &EEqn.upper()[0], &EEqn.diag()[0], &EEqn.source()[0],
// h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag);
// DEBUG_TRACE;
// EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag);
}

delete h_boundary_he_tmp;

#endif
}
126 changes: 1 addition & 125 deletions applications/solvers/dfLowMachFoam/UEqn.H
Original file line number Diff line number Diff line change
@@ -1,128 +1,4 @@
// Solve the Momentum equation
#ifdef GPUSolverNew_

#if defined DEBUG_
// run CPU, for temp
TICK_START;
tmp<fvVectorMatrix> tUEqn
(
fvm::ddt(rho, U)
+
fvm::div(phi, U)
+
turbulence->divDevRhoReff(U)
);
fvVectorMatrix& UEqn = tUEqn.ref();
TICK_STOP(CPU assembly time);

volTensorField gradU = fvc::grad(U);

double *h_boundary_gradU = new double[dfDataBase.num_boundary_surfaces * 9];
offset = 0;
forAll(U.boundaryField(), patchi)
{
const fvPatchTensorField& patchGradU = gradU.boundaryField()[patchi];
int patchsize = patchGradU.size();
if (patchGradU.type() == "processor"
|| patchGradU.type() == "processorCyclic") {
// print info
if (dynamic_cast<const processorFvPatchField<tensor>&>(patchGradU).doTransform()) {
Info << "gradU transform = true" << endl;
} else {
Info << "gradU transform = false" << endl;
}
Info << "rank = " << dynamic_cast<const processorFvPatchField<tensor>&>(patchGradU).rank() << endl;

memcpy(h_boundary_gradU + 9*offset, &patchGradU[0][0], patchsize * 9 * sizeof(double));
tensorField patchGradUInternal =
dynamic_cast<const processorFvPatchField<tensor>&>(patchGradU).patchInternalField()();
memcpy(h_boundary_gradU + 9*offset + patchsize * 9, &patchGradUInternal[0][0], patchsize * 9 * sizeof(double));
offset += patchsize * 2;
} else {
memcpy(h_boundary_gradU + 9*offset, &patchGradU[0][0], patchsize * 9 * sizeof(double));
offset += patchsize;
}
}
#endif

// process
TICK_START;
UEqn_GPU.process();
UEqn_GPU.sync();
TICK_STOP(GPU process time);

// postProcess
// TICK_START;
// UEqn_GPU.postProcess(h_u);
// memcpy(&U[0][0], h_u, dfDataBase.cell_value_vec_bytes);
// U.correctBoundaryConditions();
// K = 0.5*magSqr(U);
// DEBUG_TRACE;
// TICK_STOP(post process time);

#if defined DEBUG_
// UEqn.relax();
TICK_START;
solve(UEqn == -fvc::grad(p));
K.oldTime();
K = 0.5*magSqr(U);
TICK_STOP(CPU solve time);
// checkResult
// TODO: for temp, now we compare ldu, finally we compare csr
std::vector<double> h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3);
std::vector<double> h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3);

offset = 0;
for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++)
{
const fvPatchVectorField& patchU = U.boundaryField()[patchi];
int patchsize = dfDataBase.patch_size[patchi];
const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0];
const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0];
memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double));
memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double));
if (patchU.type() == "processor" || patchU.type() == "processorCyclic") offset += 2 * patchsize;
else offset += patchsize;
}

double *h_boundary_u_tmp = new double[dfDataBase.num_boundary_surfaces * 3];
offset = 0;
forAll(U.boundaryField(), patchi)
{
const fvPatchVectorField& patchU = U.boundaryField()[patchi];
int patchsize = dfDataBase.patch_size[patchi];

if (patchU.type() == "processor"
|| patchU.type() == "processorCyclic") {
memcpy(h_boundary_u_tmp + 3*offset, &patchU[0][0], 3*patchsize * sizeof(double));
vectorField patchUInternal =
dynamic_cast<const processorFvPatchField<vector>&>(patchU).patchInternalField()();
memcpy(h_boundary_u_tmp + 3*offset + 3*patchsize, &patchUInternal[0][0], 3*patchsize * sizeof(double));
offset += 2 * patchsize;
} else {
memcpy(h_boundary_u_tmp + 3*offset, &patchU[0][0], 3*patchsize * sizeof(double));
offset += patchsize;
}
}

bool printFlag = false;

int rank = -1;
if (mpi_init_flag) {
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
}

if (!mpi_init_flag || rank == 0) {
// UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0],
// h_internal_coeffs.data(), h_boundary_coeffs.data(),
// // &gradU[0][0], h_boundary_gradU,
// printFlag);
// UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag);
}
DEBUG_TRACE;
#endif

#else
start1 = std::clock();
tmp<fvVectorMatrix> tUEqn
(
Expand All @@ -145,4 +21,4 @@
end1 = std::clock();
time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
#endif

0 comments on commit 7594b8d

Please sign in to comment.