# Travail Pratique
Nom: Gustavo BARRETO

## Exercice BucketSort


In [2]:
import subprocess
import pandas as pd

### Function Declarations

In [3]:
def get_avg_times(logfile: str) -> pd.DataFrame:
    """Load a semicolon-separated timing log into a DataFrame.

    The log is read without a header row; its three fields are labelled
    ``PROC``, ``OP`` and ``TIME_MS``, and ``TIME_MS`` is cast to float.

    Note: despite the name, no averaging is done here. Every row of the log
    is returned as-is and callers aggregate the result themselves.

    Args:
        logfile: Path of the log file to parse.

    Returns:
        A DataFrame with one row per log line and the columns
        ``PROC``, ``OP`` and ``TIME_MS``.
    """
    column_names = ["PROC", "OP", "TIME_MS"]
    timings = pd.read_csv(
        logfile,
        sep=";",
        engine="python",
        header=None,
        names=column_names,
    )
    return timings.astype({"TIME_MS": float})

def run_mpi(repeats=5, np=4, script="bucket-mpi.py", logfile="bucket.log"):
    """Run a Python script under ``mpiexec`` several times, logging its output.

    Each run appends the child's stdout and stderr to ``logfile``.

    Args:
        repeats: Number of times the script is launched.
        np: Number of processes passed to ``mpiexec -np``.
        script: Path of the Python script to launch.
        logfile: File the combined stdout/stderr of every run is appended to.
    """
    command = ["mpiexec", "-np", str(np), "python", script]
    for _ in range(repeats):
        # Open the log inside a context manager so the handle is closed after
        # every run instead of leaking one open file per repeat.
        with open(logfile, "a") as log:
            subprocess.run(command, stdout=log, stderr=subprocess.STDOUT)

def run_serial(repeats=5,args=None ,script="bucket-serial.py", logfile="bucket.log"):
    """Run a Python script several times, logging its output.

    Each run appends the child's stdout and stderr to ``logfile``.

    Args:
        repeats: Number of times the script is launched.
        args: Optional list of extra command-line arguments appended after
            the script path; ``None`` or an empty list adds nothing.
        script: Path of the Python script to launch.
        logfile: File the combined stdout/stderr of every run is appended to.
    """
    command = ["python", script] + (args if args else [])
    for _ in range(repeats):
        # Open the log inside a context manager so the handle is closed after
        # every run instead of leaking one open file per repeat.
        with open(logfile, "a") as log:
            subprocess.run(command, stdout=log, stderr=subprocess.STDOUT)
        
    


To execute the code:

In [19]:
# Benchmark the MPI version with both 4 and 8 processes: the results table and
# the discussion that follow compare these two process counts, so both must be
# produced here for the notebook to be reproducible from a fresh kernel.
for n_procs in (4, 8):
    run_mpi(repeats=10, np=n_procs, script="bucket-mpi.py", logfile="bucket.log")


In [None]:
# Serial baseline: its output is appended to the same log as the MPI runs.
# NOTE(review): "4" is forwarded to the script on its command line; presumably
# the number of buckets -- confirm against bucket-serial.py.
run_serial(
    repeats=10,
    args=["4"],
    script="bucket-serial.py",
    logfile="bucket.log",
)

In [20]:
# Load every timing record accumulated in the benchmark log.
df = get_avg_times("bucket.log")

# Mean time for each (operation, process count) pair.
avg = df.groupby(["OP", "PROC"], as_index=False).agg(TIME_MS=("TIME_MS", "mean"))
display(avg)

# Speedup of the fastest MPI configuration over the serial baseline.
is_serial_total = avg["OP"].eq("Total(serial)")
is_mpi_total = avg["OP"].eq("Total")
serial_time = avg.loc[is_serial_total, "TIME_MS"].iloc[0]
# .min() picks the best mean time across all measured process counts.
mpi_time = avg.loc[is_mpi_total, "TIME_MS"].min()
speedup = serial_time / mpi_time
print(f"Speedup (Serial / MPI): {speedup:.2f}x")



Unnamed: 0,OP,PROC,TIME_MS
0,Total,4,13.7504
1,Total,8,51.88
2,Total(serial),1,269.781
3,bucket-transfer,4,4.6276
4,bucket-transfer,8,14.449977
5,local-sort,4,0.1216
6,local-sort,8,0.8345


Speedup (Serial / MPI): 19.62x


La parallélisation réduit fortement le temps d’exécution par rapport au cas séquentiel : le speed-up maximal, $19.62x$, est obtenu avec 4 processus, contre seulement $5.20x$ avec 8 processus. L’augmentation du nombre de processus au-delà de 4 n’apporte donc plus de gain et dégrade même les performances, en raison du surcoût des communications et des synchronisations MPI.

Dans ce cas, le nombre de buckets est égal au nombre de processus, afin de faciliter l'implémentation du code.


## Utilisation d'une nouvelle stratégie : maître–esclave

In [4]:
# For each process count, benchmark the histogram variant and then the plain
# MPI variant; both append their timings to the same log.
for n_procs in (4, 8, 16):
    for script_name in ("bucket-mpi-histogram.py", "bucket-mpi.py"):
        run_mpi(repeats=5, np=n_procs, script=script_name, logfile="bucket-hist.log")
    



In [15]:
# Load every timing record accumulated in the histogram benchmark log.
df = get_avg_times("bucket-hist.log")

# Mean time per (operation, process count), ordered, keeping only the
# "Total(...)" rows.
avg = (
    df.groupby(["OP", "PROC"], as_index=False)[["TIME_MS"]]
    .mean()
    .sort_values(by=["OP", "PROC"])
    .loc[lambda frame: frame["OP"].str.startswith("Total(")]
)
display(avg)

# Ratio of the best histogram time to the best plain-MPI time
# (each minimum is taken across all measured process counts).
histogram_time = avg.loc[avg["OP"].eq("Total(histogram)"), "TIME_MS"].min()
mpi_time = avg.loc[avg["OP"].eq("Total(mpi)"), "TIME_MS"].min()
speedup = histogram_time / mpi_time
print(f"Speedup (Histogram / MPI): {speedup:.2f}x")




Unnamed: 0,OP,PROC,TIME_MS
0,Total(histogram),4,17.8676
1,Total(histogram),8,75.8787
2,Total(histogram),16,1125.8714
3,Total(mpi),4,13.3297
4,Total(mpi),8,26.7739
5,Total(mpi),16,1017.2674


Speedup (Histogram / MPI): 1.34x


Cette stratégie maître–esclave n'était pas la plus efficace, car la fonction de génération de données aléatoires produit déjà des données équilibrées. Elle permet néanmoins d'assurer un équilibrage dynamique des données entre les processus.