In [None]:
%%writefile file.cpp


// Parallel Monte Carlo estimate of pi
// quarter circle inside the unit square
// Each thread generates roughly N/NUM_THREADS random points and adds the ones that fall inside the quarter circle to sum.

#include <iostream>
#include <random>
#include <cmath>
#include <omp.h>

using namespace std;

// Estimates pi with a parallel Monte Carlo simulation: n random points are
// drawn in the unit square and the fraction that lands inside the quarter
// circle of radius 1 approximates pi/4. Prints the elapsed time and the estimate.
int main()
{
    // start the timer
    double start = omp_get_wtime();
    const int n = 1000000; // total number of random points
    int sum = 0;           // points inside the quarter circle (shared by all threads)
    // The loop iterations are divided among the threads; each thread counts its
    // hits in a private local_sum and adds that to sum once, after its share of the loop.
    #pragma omp parallel
    {
        // One generator per thread, seeded independently, so no generator state is shared.
        random_device rd;
        mt19937 gen(rd());
        uniform_real_distribution<> dis(0, 1);
        int local_sum = 0;
        #pragma omp for
        for (int i = 0; i < n; i++)
        {
            // x and y are declared inside the loop so that every thread works on
            // its own copy; declaring them outside the parallel region made them
            // shared, which was a data race between threads.
            const double x = dis(gen);
            const double y = dis(gen);
            // Comparing the squared distance avoids a sqrt per point.
            if (x * x + y * y <= 1.0)
            {
                local_sum++;
            }
        }
        // A single atomic update per thread merges the private counts.
        #pragma omp atomic
        sum += local_sum;
    }
    // end the timer
    double end = omp_get_wtime();
    cout << "Time: " << end - start << endl;
    cout << "Pi: " << 4.0 * sum / n << endl;
    return 0;
}

/*The #pragma omp atomic line used above is an OpenMP directive. OpenMP is an API
for shared-memory parallel programming in C, C++ and Fortran. The #pragma omp atomic directive is used to
ensure that a specific memory location is updated atomically to prevent race conditions.

A race condition occurs when two or more threads can access shared data and they try to change it
at the same time. As a result, the values of variables may be unpredictable and vary depending
on the timings of context switches of the processes.

The #pragma omp atomic directive applies only to the single update statement that follows it
(here, sum += local_sum), not to an arbitrary block of code; protecting a whole block
requires #pragma omp critical instead. The read-modify-write of the shared variable is
performed as one indivisible operation, so another thread can never observe or overwrite
an intermediate value of that variable.

This directive is commonly used when performing increment, decrement, addition, and subtraction
operations on shared variables in a multi-threaded environment.
It's a simple and effective way to prevent race conditions and
ensure the correct execution of your program when working with shared memory
in a parallel computing context.*/

Overwriting file.cpp


In [None]:
!g++ -g -Wall -fopenmp -o file file.cpp

In [None]:
!./file

Time: 0.105606
Pi: 3.13884


In [None]:
%%file file2.cpp

#include<iostream>
#include<omp.h>
#include <unistd.h>
using namespace std;


// Runs three OpenMP sections concurrently: two workers that each loop imax
// times accumulating into their own sum, and a monitor that reports the
// workers' progress once per second until one of them has finished.
int main() {
    const long int imax = 1000000000L;
    // Progress counters shared between the workers and the monitor. They are
    // initialised here because the monitor may read them before the workers
    // have started; they are only accessed through atomic reads/writes.
    long int i1 = 0, i2 = 0;
    double sum1 = 0, sum2 = 0;

    #pragma omp parallel sections num_threads(3)
    {
        #pragma omp section
        {
            for (long int k = 0; k < imax; k++) {
                sum1 += k;
                // Publish how many iterations have completed.
                #pragma omp atomic write
                i1 = k + 1;
            }
            cout << "Task 1 - Done." << endl;
        }

        #pragma omp section
        {
            for (long int k = 0; k < imax; k++) {
                sum2 -= k;
                // Publish how many iterations have completed.
                #pragma omp atomic write
                i2 = k + 1;
            }
            cout << "Task 2 - Done." << endl;

        }

        #pragma omp section
        {
            long int seen1 = 0, seen2 = 0;
            for (;;) {
                #pragma omp atomic read
                seen1 = i1;
                #pragma omp atomic read
                seen2 = i2;
                // Stop as soon as either worker has reached imax.
                if (seen1 >= imax || seen2 >= imax)
                    break;
                sleep(1);
                // Re-read after sleeping so the report shows current progress.
                #pragma omp atomic read
                seen1 = i1;
                #pragma omp atomic read
                seen2 = i2;
                cout << "Task 3 - i1 = " << seen1 << " i2 = " << seen2 << endl;
            }
        }

    }


    return 0;
}

Writing file2.cpp


In [None]:
!g++ -g -Wall -fopenmp -o file2 file2.cpp
!./file2

Task 3 - i1 = 86286590 i2 = 83991204
Task 3 - i1 = 167442524 i2 = 167259672
Task 3 - i1 = 255380593 i2 = 247936334
Task 3 - i1 = 356946804 i2 = 364683798
Task 3 - i1 = 482419582 i2 = 493566167
Task 3 - i1 = 581967960 i2 = 610640562
Task 3 - i1 = 667634240 i2 = 692687267
Task 3 - i1 = 755950102 i2 = 775764815
Task 3 - i1 = 841071299 i2 = 858213831
Task 3 - i1 = 926680251 i2 = 937811584
Task 2 - Done.
Task 1 - Done.
Task 3 - i1 = 1000000000 i2 = 1000000000


In [None]:
%%file file3.cpp

// sequential leibniz pi
#include <iostream>
#include <omp.h>
#include <cmath>
#include <chrono>
using namespace std;

// Approximates pi with the first n terms of the Leibniz series
//   pi = 4 * (1 - 1/3 + 1/5 - 1/7 + ...)
// using a single thread. Returns 0 when n <= 0.
double sequentialLeibnizPi(int n) {
    double sum = 0.0;
    double sign = 1.0; // +1 for even terms, -1 for odd terms
    for (int i = 0; i < n; i++) {
        // The denominator is computed in double so that 2*i+1 cannot
        // overflow int when n is large.
        sum += sign / (2.0 * i + 1.0);
        sign = -sign;
    }
    return 4 * sum;
}

// Approximates pi with the first n terms of the Leibniz series, splitting the
// loop across the default OpenMP team and combining the per-thread partial
// sums with a reduction. Returns 0 when n <= 0.
double parallelLeibnizPi(int n) {
    double sum = 0.0;
    #pragma omp parallel for reduction(+:sum)
    for (int i = 0; i < n; i++) {
        // Sign derived from the parity of i (no carried state between
        // iterations); double denominator so 2*i+1 cannot overflow int.
        const double sign = (i % 2 == 0) ? 1.0 : -1.0;
        sum += sign / (2.0 * i + 1.0);
    }
    return 4 * sum;
}

// Approximates pi with the first n terms of the Leibniz series using two
// OpenMP sections: one sums terms [0, n/2), the other terms [n/2, n).
// Returns 0 when n <= 0.
double TwoThreadsLeinbizPi(int n) {
    double sum1 = 0.0;
    double sum2 = 0.0;
    const int half = n / 2;
    #pragma omp parallel sections
    {
        #pragma omp section
        {
            // Accumulate in a section-local variable and store once at the
            // end: sum1 and sum2 are adjacent shared variables, so updating
            // them on every iteration from two threads likely causes false sharing.
            double acc = 0.0;
            for (int i = 0; i < half; i++) {
                // double denominator so 2*i+1 cannot overflow int
                acc += ((i % 2 == 0) ? 1.0 : -1.0) / (2.0 * i + 1.0);
            }
            sum1 = acc;
        }
        #pragma omp section
        {
            double acc = 0.0;
            for (int i = half; i < n; i++) {
                acc += ((i % 2 == 0) ? 1.0 : -1.0) / (2.0 * i + 1.0);
            }
            sum2 = acc;
        }
    }
    return 4 * (sum1 + sum2);
}

double XThreadsLeibnizPi(int n, int numThreads) {
    double sum = 0.0;
    #pragma omp parallel for num_threads(numThreads) reduction(+:sum)
    for (int i = 0; i < n; i++) {
        sum += pow(-1, i) / (2 * i + 1);
    }
    return 4 * sum;
}

double MasterThreadLeibnizPi(int n) {
    double sum = 0.0;
    #pragma omp parallel
    {
        #pragma omp master
        {
            cout << "Number of threads: " << omp_get_num_threads() << endl;
        }
        #pragma omp for reduction(+:sum)
        for (int i = 0; i < n; i++) {
            sum += pow(-1, i) / (2 * i + 1);
        }
    }
    return 4 * sum;
}

int main() {
    int n = 1000000;
    auto start = chrono::high_resolution_clock::now();
    cout << "Sequential: " << sequentialLeibnizPi(n) << endl;
    auto end = chrono::high_resolution_clock::now();
    cout << "Sequential time: " << chrono::duration_cast<chrono::milliseconds>(end - start).count() << "ms" << endl;
    start = chrono::high_resolution_clock::now();
    cout << "Parallel: " << parallelLeibnizPi(n) << endl;
    end = chrono::high_resolution_clock::now();
    cout << "Parallel time: " << chrono::duration_cast<chrono::milliseconds>(end - start).count() << "ms" << endl;
    start = chrono::high_resolution_clock::now();
    cout << "Two threads: " << TwoThreadsLeinbizPi(n) << endl;
    end = chrono::high_resolution_clock::now();
    cout << "Two threads time: " << chrono::duration_cast<chrono::milliseconds>(end - start).count() << "ms" << endl;
    start = chrono::high_resolution_clock::now();
    cout << "X threads: " << XThreadsLeibnizPi(n, 12) << endl;
    end = chrono::high_resolution_clock::now();
    cout << "X threads time: " << chrono::duration_cast<chrono::milliseconds>(end - start).count() << "ms" << endl;
    start = chrono::high_resolution_clock::now();
    cout << "Master thread: " << MasterThreadLeibnizPi(n) << endl;
    end = chrono::high_resolution_clock::now();
    cout << "Master thread time: " << chrono::duration_cast<chrono::milliseconds>(end - start).count() << "ms" << endl;
    return 0;
}


Overwriting file3.cpp


In [None]:
!g++ -g -Wall -fopenmp -o file3 file3.cpp
!./file3

Sequential: 3.14159
Sequential time: 21ms
Parallel: 3.14159
Parallel time: 18ms
Two threads: 3.14159
Two threads time: 21ms
X threads: 3.14159
X threads time: 18ms
Master thread: Number of threads: 2
3.14159
Master thread time: 24ms


In [None]:
%%file pi_rec.cpp

#include <omp.h>
#include <iostream>
#include <iomanip>
static long num_steps = 1024*1024*1024;

#define MIN_BLK  1024*1024*256

double sum = 0;

void pi_r(long Nstart, long Nfinish, double step) {
    long i,iblk;
    if (Nfinish-Nstart < MIN_BLK){
        #pragma omp parallel for reduction(+:sum)
        for (i = Nstart; i < Nfinish; i++){
            double x = (i+0.5)*step;
            sum += 4.0/(1.0+x*x);
        }
    } else {
        iblk = Nfinish-Nstart;
        #pragma omp task
        pi_r(Nstart,         Nfinish-iblk/2,step);
        #pragma omp task
        pi_r(Nfinish-iblk/2, Nfinish,       step);
        #pragma omp taskwait
    }
}

int main () {
    double step, pi;
    double init_time, final_time;
    step = 1.0/(double) num_steps;
    init_time = omp_get_wtime();
    pi_r(0, num_steps, step);
    pi = step * sum;
    final_time = omp_get_wtime() - init_time;

    std::cout << "for " << num_steps << " steps pi = " << std::setprecision(15) << pi << " in " << final_time << " secs\n";
}

Overwriting pi_rec.cpp


In [None]:
!g++ -g -Wall -fopenmp -o pi_rec pi_rec.cpp
!./pi_rec

for 1073741824 steps pi = 3.14159265358981 in 2.68497309599979 secs


Antes de usar a parte em CUDA Thrust, lembre-se de mudar o ambiente de execução para a GPU (basta clicar na setinha ao lado de RAM e disco e selecionar "Alterar tipo de ambiente de execução").

In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [None]:
%%file version.cu
#include <thrust/version.h>
#include <iostream>

// Prints the major and minor version of the Thrust headers used for this build.
int main(void)
{
  // Both version macros are provided by <thrust/version.h>.
  const int thrust_major = THRUST_MAJOR_VERSION;
  const int thrust_minor = THRUST_MINOR_VERSION;

  std::cout << "Thrust v" << thrust_major << "." << thrust_minor << std::endl;
  return 0;
}

Overwriting version.cu


In [None]:
!nvcc version.cu -o version
!./version

Thrust v2.2


In [None]:
%%file thrust_example.cu
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <algorithm>
#include <cstdlib>

// Fills a host vector with 32M values from rand(), sorts them on the device
// and copies the sorted values back into the host vector.
int main(void)
{
  // 32M values produced one by one on the host
  thrust::host_vector<int> host_values(32 << 20);
  std::generate(host_values.begin(), host_values.end(), rand);

  // host -> device copy
  thrust::device_vector<int> device_values = host_values;

  // the sort itself runs on the device
  thrust::sort(device_values.begin(), device_values.end());

  // device -> host copy of the sorted values
  thrust::copy(device_values.begin(), device_values.end(), host_values.begin());

  return 0;
}

Overwriting thrust_example.cu


In [None]:
!nvcc thrust_example.cu -o thrust_example
!./thrust_example

In [None]:
!nvprof ./thrust_example

==3080== NVPROF is profiling process 3080, command: ./thrust_example
==3080== Profiling application: ./thrust_example
==3080== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   39.59%  32.056ms         1  32.056ms  32.056ms  32.056ms  [CUDA memcpy HtoD]
                   34.04%  27.562ms         1  27.562ms  27.562ms  27.562ms  [CUDA memcpy DtoH]
                   13.97%  11.308ms         3  3.7695ms  3.7646ms  3.7788ms  void cub::CUB_200200_520_NS::DeviceRadixSortDownsweepKernel<cub::CUB_200200_520_NS::DeviceRadixSortPolicy<int, cub::CUB_200200_520_NS::NullType, unsigned int>::Policy900, bool=1, bool=0, int, cub::CUB_200200_520_NS::NullType, unsigned int, cub::CUB_200200_520_NS::detail::identity_decomposer_t>(cub::CUB_200200_520_NS::DeviceRadixSortPolicy<int, cub::CUB_200200_520_NS::NullType, unsigned int>::Policy900 const *, cub::CUB_200200_520_NS::DeviceRadixSortDownsweepKernel<cub::CUB_200200_520_NS::DeviceRadix

In [None]:
!nvidia-smi

Thu May 23 12:41:25 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
!lscpu

Architecture:             x86_64
  CPU op-mode(s):         32-bit, 64-bit
  Address sizes:          46 bits physical, 48 bits virtual
  Byte Order:             Little Endian
CPU(s):                   2
  On-line CPU(s) list:    0,1
Vendor ID:                GenuineIntel
  Model name:             Intel(R) Xeon(R) CPU @ 2.20GHz
    CPU family:           6
    Model:                79
    Thread(s) per core:   2
    Core(s) per socket:   1
    Socket(s):            1
    Stepping:             0
    BogoMIPS:             4399.99
    Flags:                fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 cl
                          flush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc re
                          p_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3
                           fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand
                           hypervisor lahf_lm abm 3dnowprefetch i

In [None]:
%%file t_ex2.cu

// This example demonstrates computing the sum of some random numbers in parallel:

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/random.h>

// Generates 32 random doubles in [-50, 50) on the host, copies them to the
// device, sums them there with thrust::reduce and prints the sum.
int main() {
  // Generate random data serially.
  thrust::default_random_engine rng(1337);
  thrust::uniform_real_distribution<double> dist(-50.0, 50.0);
  thrust::host_vector<double> h_vec(32);
  thrust::generate(h_vec.begin(), h_vec.end(), [&] { return dist(rng); });

  // Transfer to device and compute the sum.
  thrust::device_vector<double> d_vec = h_vec;
  // The initial value and the functor must both be double: with an int init
  // and thrust::plus<int> every element was truncated to an integer before
  // being added, so the printed sum lost all fractional parts.
  double x = thrust::reduce(d_vec.begin(), d_vec.end(), 0.0, thrust::plus<double>());

  // Print the result.
  std::cout << "Sum: " << x << std::endl;
  return 0;
}

Writing t_ex2.cu


In [None]:
!nvcc t_ex2.cu -o t_ex2
!./t_ex2

Sum: -448


In [None]:
%%file t_ex3.cu

#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/functional.h>
#include <thrust/transform.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>

// Demonstrates device_vector iterators: prints every element of a
// four-element device vector, then the element an iterator at offset 3 refers to.
int main() {
    typedef thrust::device_vector<int>::iterator DeviceIter;

    // allocate the device vector
    thrust::device_vector<int> values(4);

    DeviceIter first = values.begin();
    DeviceIter last = values.end();

    // number of elements in [first, last), taken before last is moved
    const int count = last - first;

    // move last so that it refers to the element at offset 3
    last = values.begin() + 3;

    // print the elements
    for (int idx = 0; idx < count; ++idx) {
        std::cout << values[idx] << " ";
    }
    std::cout << std::endl;

    // print the element last refers to
    std::cout << *last << std::endl;
}

Overwriting t_ex3.cu


In [None]:
!ls

cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64-deb  t_ex2	 thrust_example
file						       t_ex2.cu  thrust_example.cu
file.cpp					       t_ex3	 version
sample_data					       t_ex3.cu  version.cu


In [None]:
!nvcc t_ex3.cu -o t_ex3
!./t_ex3

0 0 0 0 
0


In [None]:
%%file t_ex4.cu

#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/functional.h>
#include <thrust/transform.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>

// Demonstrates thrust::constant_iterator by summing a range of three 10s.
int main() {
    // An iterator that yields 10 at every position, and one three positions later.
    thrust::constant_iterator<int> first(10);
    thrust::constant_iterator<int> last = first + 3;

    // [first, last) holds 10, 10, 10, so the reduction is 30.
    const int result = thrust::reduce(first, last);

    std::cout << "Sum: " << result << std::endl;
    return 0;
}

Overwriting t_ex4.cu


In [None]:
!nvcc t_ex4.cu -o t_ex4
!./t_ex4

Sum: 30


In [None]:
%%file t_ex5.cu

#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/functional.h>
#include <thrust/transform.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>

// Demonstrates thrust::zip_iterator: zips an int vector with a char vector,
// reduces the zipped range with thrust::maximum and prints the largest tuple.
int main(){
    typedef thrust::tuple<int, char> Pair;
    typedef thrust::tuple<thrust::device_vector<int>::iterator,
                          thrust::device_vector<char>::iterator> IteratorTuple;
    typedef thrust::zip_iterator<IteratorTuple> ZipIterator;

    thrust::device_vector<int> A(4);
    thrust::device_vector<char> B(4);

    A[0] = 1; A[1] = 2; A[2] = 3; A[3] = 4;
    B[0] = 'a'; B[1] = 'b'; B[2] = 'c'; B[3] = 'd';

    // begin[0] = (1,'a'), begin[1] = (2,'b'), begin[2] = (3,'c'), begin[3] = (4,'d')
    // end is the past-the-end position and must not be dereferenced.
    ZipIterator begin = thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin()));
    ZipIterator end = thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end()));

    // thrust::maximum compares tuples with operator<, i.e. by the first
    // element and then by the second, so the result here is (4,'d').
    thrust::maximum<Pair> binary_op;

    // The initial value has the same type as the elements being reduced, and
    // the reduction is run exactly once.
    const Pair init(0, '\0');
    const Pair result = thrust::reduce(begin, end, init, binary_op);

    // print the result
    std::cout << "Result: " << thrust::get<0>(result) << " " << thrust::get<1>(result) << std::endl;
    return 0;
}

Overwriting t_ex5.cu


In [None]:
!nvcc t_ex5.cu -o t_ex5
!./t_ex5

Result: 4 d


In [None]:
%%file integration.cu

#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/functional.h>
#include <thrust/transform.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/sequence.h>
#include <nvfunctional>
#include <cstdlib>
#include <algorithm>
#include <iostream>
#include <iomanip>

// define the square functor
// Unary functor returning its argument multiplied by itself; callable from
// both host and device code.
struct square
{
  __host__ __device__
  float operator()(const float& x) const
  {
    const float squared = x * x;
    return squared;
  }
};

// Integrates f(x) = x^2 over [0, 1] with the trapezoidal rule on 11 equally
// spaced sample points (10 sub-intervals), using Thrust on the device.
// Prints the sample points, the function values, the area of each trapezoid,
// the total area and the running (accumulated) area.
int main() {
  const int n_points = 11;               // sample points x_0 .. x_10
  const int n_intervals = n_points - 1;  // one trapezoid per pair of neighbours
  const float dx = 0.1f;                 // width of every sub-interval

  // declare the variables
  thrust::device_vector<float> width(n_intervals, dx);
  thrust::device_vector<float> height(n_points);
  thrust::device_vector<float> area(n_intervals);
  thrust::device_vector<float> accumulated_area(n_intervals);
  float total_area;

  // declare the x-coordinate vector
  thrust::device_vector<float> x(n_points);

  thrust::sequence(x.begin(), x.end(), 0.0f, dx);
  std::cout << "x: ";
  for (size_t i = 0; i < x.size(); i++) {
    std::cout << x[i] << " ";
  }
  std::cout << std::endl;

  // evaluate f at every sample point; these are the parallel sides of the trapezoids
  thrust::transform(x.begin(), x.end(), height.begin(), square());

  std::cout << "Height: ";
  for (size_t i = 0; i < height.size(); i++) {
    std::cout << height[i] << " ";
  }
  std::cout << std::endl;

  // area of trapezoid i = width_i * (f(x_i) + f(x_{i+1})) / 2, built in three steps:
  // 1) sum of the two parallel sides (each height paired with its right neighbour)
  thrust::transform(height.begin(), height.end() - 1, height.begin() + 1, area.begin(), thrust::plus<float>());
  // 2) multiply by the width of the sub-interval
  thrust::transform(area.begin(), area.end(), width.begin(), area.begin(), thrust::multiplies<float>());
  // 3) halve
  thrust::transform(area.begin(), area.end(), thrust::make_constant_iterator(0.5f), area.begin(), thrust::multiplies<float>());

  // print the area of the trapezoids
  std::cout << "Area: ";
  for (size_t i = 0; i < area.size(); i++) {
    std::cout << area[i] << " ";
  }
  std::cout << std::endl;

  // calculate the total area under the curve
  total_area = thrust::reduce(area.begin(), area.end());

  // calculate the accumulated area under the curve after each sub-interval
  thrust::inclusive_scan(area.begin(), area.end(), accumulated_area.begin());

  // print the results
  std::cout << "Total area: " << total_area << std::endl;
  std::cout << "Accumulated area: ";
  for (size_t i = 0; i < accumulated_area.size(); i++) {
    std::cout << accumulated_area[i] << " ";
  }
  std::cout << std::endl;

  return 0;
}

Overwriting integration.cu


In [None]:
!nvcc integration.cu -o integration
!./integration

x: 0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1 
Height: 0 0.01 0.04 0.09 0.16 0.25 0.36 0.49 0.64 0.81 1 
Area: 0 0.001 0.004 0.009 0.016 0.025 0.036 0.049 0.064 0.081 0.1 
Total area: 0.385
Accumulated area: 0 0.001 0.005 0.014 0.03 0.055 0.091 0.14 0.204 0.285 0.385 


In [None]:
%%file teste_mpi.cpp

#include <unistd.h>
#include <iostream>
#include <mpi.h>

using namespace std;

// Two-process MPI ping-pong: rank 0 sends 42 to rank 1 and waits for a reply;
// rank 1 receives it and answers with 43. Aborts unless run with exactly 2 processes.
int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (size != 2) {
        // Report the problem once instead of once per process.
        if (rank == 0) {
            cout << "This program requires exactly 2 processes" << endl;
        }
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    int msg;

    if (rank == 0){
        msg = 42;
        // send command format: MPI_Send(&data, count, datatype, destination, tag, communicator)
        MPI_Send(&msg, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
        cout << "Process " << rank << " sent message " << msg << " to process 1" << endl;

        // Blocks until the reply from rank 1 arrives.
        MPI_Recv(&msg, 1, MPI_INT, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        cout << "Process " << rank << " received message " << msg << " from process 1" << endl;

    }
    else {
        // Receive first, then reply: this ordering mirrors rank 0's send/receive.
        MPI_Recv(&msg, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        cout << "Process " << rank << " received message " << msg << " from process 0" << endl;

        msg = 43;
        MPI_Send(&msg, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
        cout << "Process " << rank << " sent message " << msg << " to process 0" << endl;
    }

    MPI_Finalize();
    return 0;
}

Writing teste_mpi.cpp


In [None]:
!mpic++ teste_mpi.cpp -o teste_mpi

In [None]:
!mpirun --allow-run-as-root --oversubscribe -n 2 ./teste_mpi

Process0 sent message 42 to process 1
Process 1 received message 42 from process 0
Process 1 sent message 43 to process 0
Process0 received message 43 from process 1


In [None]:
%%file scatter_test.cpp

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

// Returns the arithmetic mean of the first `size` elements of `arr`.
// Returns 0 when size <= 0 instead of dividing by zero.
float compute_avg(float *arr, int size) {
    if (size <= 0) {
        return 0.0f;
    }
    float sum = 0.0;
    for (int i = 0; i < size; i++) {
        sum += arr[i];
    }
    return sum / size;
}

// Allocates an array of num_elements floats with new[] and fills it with
// values rand() / RAND_MAX, i.e. in [0, 1]. The caller owns the array.
float *create_rand_nums(int num_elements) {
    float *values = new float [num_elements];
    float *const stop = values + num_elements;
    for (float *slot = values; slot < stop; ++slot) {
        *slot = rand() / static_cast<float>(RAND_MAX);
    }
    return values;
}

// Computes the average of random numbers with MPI: rank 0 generates
// elements_per_proc values per process, scatters them, every rank averages
// its share, and rank 0 gathers the partial averages and prints their mean.
int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    int elements_per_proc = 100; // Example number of elements per process
    float *rand_nums = NULL; // Random numbers on root
    float *sub_rand_nums = new float [elements_per_proc]; // Buffer for each process

    if (world_rank == 0) {
        // Only the root needs the full input; the send buffer is ignored on other ranks.
        rand_nums = create_rand_nums(elements_per_proc * world_size);
    }

    // Scatter the random numbers to all processes
    MPI_Scatter(rand_nums, elements_per_proc, MPI_FLOAT, sub_rand_nums,
                elements_per_proc, MPI_FLOAT, 0, MPI_COMM_WORLD);

    // Compute the average of your subset
    float sub_avg = compute_avg(sub_rand_nums, elements_per_proc);

    // Receive buffer for the partial averages (needed on the root only)
    float *sub_avgs = NULL;
    if (world_rank == 0) {
        sub_avgs = new float [world_size];
    }

    // Gather all partial averages down to the root process
    MPI_Gather(&sub_avg, 1, MPI_FLOAT, sub_avgs, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);

    if (world_rank == 0) {
        // Every rank averaged the same number of elements, so the mean of
        // the partial averages equals the global average.
        float avg = compute_avg(sub_avgs, world_size);
        printf("Global average: %f\n", avg);
    }

    // All three buffers were allocated with new[], so they must be released
    // with delete[]; a plain delete on them is undefined behaviour.
    delete[] sub_rand_nums;
    if (world_rank == 0) {
        delete[] rand_nums;
        delete[] sub_avgs;
    }

    MPI_Finalize();
    return 0;
}

Writing scatter_test.cpp


In [None]:
!mpic++ scatter_test.cpp -o scatter_test

In [None]:
!mpirun --allow-run-as-root --oversubscribe -n 5 ./scatter_test

Global average: 0.506206


In [None]:
%%file mpi_omp.cpp

#include <stdio.h>
#include <mpi.h>
#include <omp.h>

int main(int argc, char *argv[])
{
    int numprocs, rank, namelen;
    char processor_name[20];
    int iam = 0, np = 1;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Get_processor_name(processor_name, &namelen);

    #pragma omp parallel default(shared) private(iam)
    {
        np = omp_get_num_threads();
        iam = omp_get_thread_num();
        printf("Hybrid: Hello from thread %d out of %d from process %d out of %d on %s\n",
                iam, np, rank, numprocs, processor_name);
    }

    MPI_Finalize();

    return 0;
}

Writing mpi_omp.cpp


In [None]:
!mpic++ -fopenmp mpi_omp.cpp -o mpi_omp
!mpirun --allow-run-as-root --oversubscribe -n 2 ./mpi_omp

Hybrid: Hello from thread 1 out of 2 from process 1 out of 2 on 76a253b8e3ea
Hybrid: Hello from thread 0 out of 2 from process 1 out of 2 on 76a253b8e3ea
Hybrid: Hello from thread 1 out of 2 from process 0 out of 2 on 76a253b8e3ea
Hybrid: Hello from thread 0 out of 2 from process 0 out of 2 on 76a253b8e3ea


In [None]:
%%file quadrado.cpp

#include <iostream>
#include <mpi.h>
#include <omp.h>

int main(int argc, char *argv[]) {
	MPI_Init(&argc, &argv);

	int rank, size;
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);

	const int N=10;
	int data[N][N];


	// Inicialização do array pelo processo 0
	if (rank == 0) {
		for(int i=0; i<N; i++) {
			for(int j=0; j<N; j++) {
				data[i][j] = i+j;
			}
		}

        for(int i=0; i<N; i++) {
			for(int j=0; j<N; j++) {
				std::cout << data[i][j] << " ";
			}
			std::cout << std::endl;
		}
        std::cout << std::endl << std::endl;
	}

	// Dividir o array entre os processos
	int chunk_size = N / size;
	int local_data[chunk_size][N];
	MPI_Scatter(&data, chunk_size*N, MPI_INT, &local_data, chunk_size*N, MPI_INT, 0, MPI_COMM_WORLD);

	// Paralelização com OpenMP
	#pragma omp parallel for collapse(2)
	for(int i=0; i<chunk_size; i++) {
		for(int j=0; j<N; j++) {
			local_data[i][j] *= local_data[i][j];
		}
	}

	// Reunir os resultados no processo 0
	MPI_Gather(&local_data, chunk_size*N, MPI_INT, &data, chunk_size*N, MPI_INT, 0, MPI_COMM_WORLD);

	// Processo 0 imprime os resultados
	if (rank == 0) {
		for(int i=0; i<N; i++) {
			for(int j=0; j<N; j++) {
				std::cout << data[i][j] << " ";
			}
			std::cout << std::endl;
		}
	}

	MPI_Finalize();
	return 0;
}

Writing quadrado.cpp


In [None]:
!mpic++ -fopenmp quadrado.cpp -o quadrado
!mpirun --allow-run-as-root --oversubscribe -n 4 ./quadrado

0 1 2 3 4 5 6 7 8 9 
1 2 3 4 5 6 7 8 9 10 
2 3 4 5 6 7 8 9 10 11 
3 4 5 6 7 8 9 10 11 12 
4 5 6 7 8 9 10 11 12 13 
5 6 7 8 9 10 11 12 13 14 
6 7 8 9 10 11 12 13 14 15 
7 8 9 10 11 12 13 14 15 16 
8 9 10 11 12 13 14 15 16 17 
9 10 11 12 13 14 15 16 17 18 


0 1 4 9 16 25 36 49 64 81 
1 4 9 16 25 36 49 64 81 100 
4 9 16 25 36 49 64 81 100 121 
9 16 25 36 49 64 81 100 121 144 
16 25 36 49 64 81 100 121 144 169 
25 36 49 64 81 100 121 144 169 196 
36 49 64 81 100 121 144 169 196 225 
49 64 81 100 121 144 169 196 225 256 
8 9 10 11 12 13 14 15 16 17 
9 10 11 12 13 14 15 16 17 18 


In [None]:
%%file media.cpp

#include <iostream>
#include <vector>
#include <mpi.h>
#include <omp.h>
#include <chrono>

// Computes the average of the numbers 0 .. N-1 with MPI + OpenMP: every
// process fills and sums its own slice of the range with OpenMP, the partial
// sums are reduced onto rank 0, and rank 0 prints the average and elapsed time.
int main(int argc, char* argv[]) {

    auto start = std::chrono::high_resolution_clock::now();

    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const long long N = 1000000000LL; // Total amount of numbers

    // Elements per process. When N is not a multiple of the number of
    // processes, the first N % size ranks take one extra element so that all
    // N numbers are covered (the average below divides by N).
    const long long base = N / size;
    const long long remainder = N % size;
    const long long local_N = base + (rank < remainder ? 1 : 0);
    // Global index of this rank's first element; 64-bit so it cannot overflow.
    const long long first_index = rank * base + (rank < remainder ? rank : remainder);

    // Allocate memory for the local vector
    std::vector<double> local_data(local_N);

    // Initialise the local vector with the global index of each element.
    // Each iteration writes a distinct element, so the loop is safely parallel.
    #pragma omp parallel for
    for (long long i = 0; i < local_N; i++) {
        local_data[i] = static_cast<double>(first_index + i);
    }

    double local_sum = 0.0;

    // Use OpenMP to compute the local sum
    #pragma omp parallel for reduction(+:local_sum)
    for (long long i = 0; i < local_N; i++) {
        local_sum += local_data[i];
    }

    // Combine all local sums into the global sum on rank 0
    double global_sum = 0.0;
    MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    // Compute the average on the root process
    if (rank == 0) {
        double average = global_sum / N;
        std::cout << "The average is " << average << std::endl;

        auto end = std::chrono::high_resolution_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
        std::cout << "Time taken: " << duration.count() << " milliseconds" << std::endl;
    }

    MPI_Finalize();
    return 0;
}


Writing media.cpp


In [None]:
!mpic++ -fopenmp media.cpp -o media
!mpirun --allow-run-as-root --oversubscribe -n 4 ./media

The average is 5e+08
Time taken: 11986 milliseconds
