# A simple bottleneck

Very simple code that :
- make_random_edges : generate a random graph given a number of edge and nodes
- find_unique_edges : filter the list of edges to only keep an instance of each (i.e. remove duplicate edges)
- contains : verify if an edge is part of a list

In [1]:
import random

def make_random_edges(n_edges=100, n_nodes=10):
    random.seed(42)
    edges = [[random.randint(0, n_nodes), random.randint(0, n_nodes)] for e in range(n_edges)]
    return edges

def find_unique_edges(edges):
    edges = list(edges)
    unique_edges = []
    while edges:
        edge = edges.pop()
        if not contains(edges, edge):
            unique_edges.append(edge)
    return unique_edges

def contains(edges, edge):
    for e in edges:
        if sorted(e) == sorted(edge):
            return True
    return False


print("Generating random edges...")
edges = make_random_edges()
print(f"Total edges generated: {len(edges)}")
print(edges)
print("Finding unique edges...")
unique_edges = find_unique_edges(edges)
print(f"Total unique edges found: {len(unique_edges)}")
print(unique_edges)

Generating random edges...
Total edges generated: 100
[[10, 1], [0, 4], [3, 3], [2, 1], [10, 8], [1, 9], [6, 0], [0, 1], [3, 3], [8, 9], [0, 8], [3, 10], [8, 6], [3, 7], [9, 4], [0, 2], [6, 5], [4, 2], [3, 5], [1, 1], [6, 1], [5, 5], [9, 4], [0, 7], [8, 1], [6, 1], [8, 4], [10, 9], [5, 9], [3, 1], [0, 10], [3, 4], [1, 3], [1, 6], [4, 7], [10, 5], [2, 5], [5, 3], [10, 4], [10, 10], [1, 9], [10, 2], [8, 3], [2, 7], [6, 4], [10, 8], [3, 10], [5, 0], [3, 0], [5, 6], [4, 1], [3, 9], [5, 3], [10, 7], [6, 10], [7, 2], [4, 2], [3, 8], [8, 4], [9, 6], [9, 6], [5, 3], [2, 8], [7, 1], [0, 1], [2, 10], [2, 10], [6, 9], [1, 6], [6, 9], [7, 8], [4, 8], [0, 10], [1, 10], [8, 4], [10, 5], [1, 4], [6, 2], [7, 0], [4, 8], [2, 8], [1, 10], [4, 10], [8, 9], [3, 2], [5, 2], [8, 8], [0, 9], [5, 7], [0, 1], [5, 4], [3, 0], [3, 9], [1, 1], [7, 1], [8, 2], [2, 10], [7, 8], [2, 4], [8, 9]]
Finding unique edges...
Total unique edges found: 54
[[5, 4], [5, 7], [0, 9], [8, 8], [3, 2], [6, 2], [7, 8], [7, 1], [2, 8

Some profiling functions to ease the process

In [2]:
from functools import wraps
from cProfile import Profile
from tempfile import NamedTemporaryFile
import pstats

_time_profiles = {}

def profile_time(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        profile = Profile()
        ret = profile.runcall(func, *args, **kwargs)
        _time_profiles[(wrapper, ) + args] = profile
        
        return ret 
    return wrapper

def profile_stats(profile):
    temp_stats = NamedTemporaryFile()
    profile.dump_stats(temp_stats.name)
    return pstats.Stats(temp_stats.name)

Simple profiling with text outputs  

In [3]:
@profile_time
def remove_duplicate_edges(n_edges=2000):
    edges = make_random_edges(n_edges, 200)
    unique_edges = find_unique_edges(edges)
    return unique_edges

unique_edges = remove_duplicate_edges(2000)

profile = _time_profiles[(remove_duplicate_edges, 2000)]

stats = profile_stats(profile) # convert raw profile data to pstats readable format
stats.strip_dirs() # remove extraneous path from file names
stats.sort_stats('time') # sort by internal time spent in each function, slowest first
stats.print_stats() # print the stats to stdout

Wed Nov 12 16:33:02 2025    /tmp/tmp3q3xeymn

         3942601 function calls in 1.272 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
  3903622    0.650    0.000    0.650    0.000 {built-in method builtins.sorted}
     2000    0.591    0.000    1.242    0.001 3639459437.py:17(contains)
     4000    0.010    0.000    0.012    0.000 random.py:239(_randbelow_with_getrandbits)
     4000    0.008    0.000    0.020    0.000 random.py:292(randrange)
     4000    0.006    0.000    0.026    0.000 random.py:366(randint)
        1    0.002    0.002    1.244    1.244 3639459437.py:8(find_unique_edges)
        1    0.001    0.001    0.027    0.027 3639459437.py:5(<listcomp>)
    12000    0.001    0.000    0.001    0.000 {built-in method _operator.index}
     5051    0.001    0.000    0.001    0.000 {method 'getrandbits' of '_random.Random' objects}
     4000    0.000    0.000    0.000    0.000 {method 'bit_length' of 'int' objects}
   

<pstats.Stats at 0x7fbb14f83850>

Advance profiling with visualization

In [4]:
%reload_ext snakeviz

In [5]:
from snakeviz.ipymagic import open_snakeviz_and_display_in_notebook

def display_stats(profile):
    temp_stats = NamedTemporaryFile(delete=False)
    profile.dump_stats(temp_stats.name)
    return open_snakeviz_and_display_in_notebook(temp_stats.name)

In [6]:
@profile_time
def remove_duplicate_edges(n_edges=2000):
    edges = make_random_edges(n_edges, 200)
    unique_edges = find_unique_edges(edges)
    return unique_edges

unique_edges = remove_duplicate_edges(2000)

profile = _time_profiles[(remove_duplicate_edges, 2000)]
#display_stats(profile)

# Save profile to a file and open with snakeviz in browser
import os
profile_file = '/tmp/profile_output.prof'
profile.dump_stats(profile_file)
print(f"Profile saved to: {profile_file}")
print("\nTo view in browser, run this command in a terminal:")
print(f"snakeviz {profile_file}")
print("\nOr run this to open automatically:")
os.system(f"snakeviz {profile_file} &")

Profile saved to: /tmp/profile_output.prof

To view in browser, run this command in a terminal:
snakeviz /tmp/profile_output.prof

Or run this to open automatically:


0

Propose a new implementation and profile it

Nous allons à présent proposer une nouvelle implémentation qui réalise le même but que l'algo initial. L'algorithme doit donc créer une liste de sommets, puis enlever tous les doublons. Dans cette implementation, nous allons implementer du parallellislme massif dans le but de reduire le temps d'execution de l'algorithme. Nous allons egalement profiler le nouvel algo pour pouvoir comparer les deux implementations.

Nous allons d'abord utiliser l'algo de tri en parallele que nous avons implemente dans la partie precedente "complexity".

In [8]:
import numba as numba


@numba.njit
def insertion_sort_numba(ar):
    ar = ar.copy()
    n = len(ar)
    for i in numba.prange(1, n):
        key_item = ar[i]
        j = i - 1
        while j >= 0 and ar[j] > key_item:
            ar[j + 1] = ar[j]
            j -= 1
        ar[j + 1] = key_item
    return ar

def make_random_edges_hpc(n_edges=100, n_nodes=10):
    random.seed(42)
    edges = [[random.randint(0, n_nodes), random.randint(0, n_nodes)] for e in range(n_edges)]
    return edges

def find_unique_edges_hpc(edges):
    edges = list(edges)
    unique_edges = []
    while edges:
        edge = edges.pop()
        if not contains(edges, edge):
            unique_edges.append(edge)
    return unique_edges

def contains_hpc(edges, edge):
    for e in edges:
        if insertion_sort_numba(e) == insertion_sort_numba(edge):
            return True
    return False


print("Generating random edges...")
edges = make_random_edges_hpc()
print(f"Total edges generated: {len(edges)}")
print(edges)
print("Finding unique edges...")
unique_edges = find_unique_edges_hpc(edges)
print(f"Total unique edges found: {len(unique_edges)}")
print(unique_edges)

Generating random edges...
Total edges generated: 100
[[10, 1], [0, 4], [3, 3], [2, 1], [10, 8], [1, 9], [6, 0], [0, 1], [3, 3], [8, 9], [0, 8], [3, 10], [8, 6], [3, 7], [9, 4], [0, 2], [6, 5], [4, 2], [3, 5], [1, 1], [6, 1], [5, 5], [9, 4], [0, 7], [8, 1], [6, 1], [8, 4], [10, 9], [5, 9], [3, 1], [0, 10], [3, 4], [1, 3], [1, 6], [4, 7], [10, 5], [2, 5], [5, 3], [10, 4], [10, 10], [1, 9], [10, 2], [8, 3], [2, 7], [6, 4], [10, 8], [3, 10], [5, 0], [3, 0], [5, 6], [4, 1], [3, 9], [5, 3], [10, 7], [6, 10], [7, 2], [4, 2], [3, 8], [8, 4], [9, 6], [9, 6], [5, 3], [2, 8], [7, 1], [0, 1], [2, 10], [2, 10], [6, 9], [1, 6], [6, 9], [7, 8], [4, 8], [0, 10], [1, 10], [8, 4], [10, 5], [1, 4], [6, 2], [7, 0], [4, 8], [2, 8], [1, 10], [4, 10], [8, 9], [3, 2], [5, 2], [8, 8], [0, 9], [5, 7], [0, 1], [5, 4], [3, 0], [3, 9], [1, 1], [7, 1], [8, 2], [2, 10], [7, 8], [2, 4], [8, 9]]
Finding unique edges...
Total unique edges found: 54
[[5, 4], [5, 7], [0, 9], [8, 8], [3, 2], [6, 2], [7, 8], [7, 1], [2, 8

In [None]:
@profile_time
def remove_duplicate_edges_hpc(n_edges=2000):
    edges = make_random_edges_hpc(n_edges, 200)
    unique_edges = find_unique_edges_hpc(edges)
    return unique_edges

unique_edges = remove_duplicate_edges_hpc(2000)