# A simple bottleneck

In [1]:
import random

from functools import wraps
from cProfile import Profile
from tempfile import NamedTemporaryFile
import pstats
import subprocess

Very simple code that :
- make_random_edges : generate a random graph given a number of edge and nodes
- find_unique_edges : filter the list of edges to only keep an instance of each (i.e. remove duplicate edges)
- contains : verify if an edge is part of a list

In [2]:
def make_random_edges(n_edges=100, n_nodes=10):
    random.seed(42)
    edges = [[random.randint(0, n_nodes), random.randint(0, n_nodes)] for _ in range(n_edges)]
    return edges

def find_unique_edges(edges):
    edges = list(edges)
    unique_edges = []
    while edges:
        edge = edges.pop()
        if not contains(edges, edge):
            unique_edges.append(edge)
    return unique_edges

def contains(edges, edge):
    for e in edges:
        if sorted(e) == sorted(edge):
            return True
    return False


Some profiling functions to ease the process

In [3]:
_time_profiles = {}

def profile_time(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        profile = Profile()
        ret = profile.runcall(func, *args, **kwargs)
        _time_profiles[(wrapper, ) + args] = profile
        
        return ret 
    return wrapper

def profile_stats(profile):
    temp_stats = NamedTemporaryFile(delete_on_close=False)
    profile.dump_stats(temp_stats.name)
    return pstats.Stats(temp_stats.name)

## Simple profiling with text outputs  

In [4]:
@profile_time
def remove_duplicate_edges(n_edges=2000):
    edges = make_random_edges(n_edges, 200)
    unique_edges = find_unique_edges(edges)
    return unique_edges

unique_edges = remove_duplicate_edges(2000)

profile = _time_profiles[(remove_duplicate_edges, 2000)]

stats = profile_stats(profile)
stats.strip_dirs()
stats.sort_stats('time')
stats.print_stats()

Wed Oct 23 22:21:47 2024    C:\Users\UTILIS~1\AppData\Local\Temp\tmp12erk6zz

         3943002 function calls (3942995 primitive calls) in 2.783 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
  3903622    1.720    0.000    1.720    0.000 {built-in method builtins.sorted}
     2000    1.030    0.001    2.749    0.001 1201873293.py:15(contains)
     4000    0.008    0.000    0.020    0.000 random.py:291(randrange)
     4000    0.007    0.000    0.010    0.000 random.py:242(_randbelow_with_getrandbits)
     4000    0.003    0.000    0.023    0.000 random.py:332(randint)
        1    0.003    0.003    0.026    0.026 1201873293.py:1(make_random_edges)
    12000    0.003    0.000    0.003    0.000 {built-in method _operator.index}
        1    0.002    0.002    1.338    1.338 1201873293.py:6(find_unique_edges)
     4000    0.002    0.000    0.002    0.000 {method 'bit_length' of 'int' objects}
     5051    0.001    0.000    0.00

<pstats.Stats at 0x237bf7cb140>

## Advanced profiling with visualization

In [5]:
%reload_ext snakeviz

In [6]:
def display_stats(profile):
    with NamedTemporaryFile(delete=False, suffix='.prof') as temp_stats:
        profile.dump_stats(temp_stats.name)
        temp_stats.close()
        subprocess.run(['snakeviz', temp_stats.name])

### Profiling the first implementation

It opens a window in a web browser.

/!\ Be careful, we couldn't stop this part or the second implementation, so you have to only run one at a time. And if you want to run the second part, comment this one /!\

In [7]:
"""@profile_time
def remove_duplicate_edges(n_edges=2000):
    edges = make_random_edges(n_edges, 200)
    unique_edges = find_unique_edges(edges)
    return unique_edges

unique_edges = remove_duplicate_edges(2000)

profile = _time_profiles[(remove_duplicate_edges, 2000)]
display_stats(profile)"""

'@profile_time\ndef remove_duplicate_edges(n_edges=2000):\n    edges = make_random_edges(n_edges, 200)\n    unique_edges = find_unique_edges(edges)\n    return unique_edges\n\nunique_edges = remove_duplicate_edges(2000)\n\nprofile = _time_profiles[(remove_duplicate_edges, 2000)]\ndisplay_stats(profile)'

### Profiling the second implementation

It opens a window in a web browser.

In [None]:
def find_unique_edges_better(edges):
    """We use the property of python dictionaries that they can only have unique keys to remove duplicates.
    Simply:
    unique_edges = list(dict(list(edges)))
    """
    #print(len(edges))
    unique_edges = {}
    unique_edges.update(edges)
    unique_edges = list(unique_edges)
    #print(len(unique_edges))
    return unique_edges

@profile_time
def remove_duplicate_edges_better(n_edges=2000):
    edges = make_random_edges(n_edges, 200)
    unique_edges = find_unique_edges_better(edges)
    return unique_edges

unique_edges = remove_duplicate_edges_better(2000)

profile = _time_profiles[(remove_duplicate_edges_better, 2000)]
display_stats(profile)