In [1]:
from functools import partial
import affinegap
import pandas as pd
from itertools import combinations


# Affine-gap distance with this notebook's weights pre-bound, so callers
# only need to pass the two sequences to compare.
affineGap = partial(
    affinegap.affineGapDistance,
    matchWeight=0,
    mismatchWeight=1,
    gapWeight=0.5,
    spaceWeight=0.5,
    abbreviation_scale=1,
)

In [2]:
# Load the sequence table; the first CSV column becomes the index.
df = pd.read_csv("sequences.csv", index_col=0)

# Plain Python list of the "sequential_cat" column, reused as the benchmark input below.
s = df["sequential_cat"].tolist()

## reference (baseline: pairwise loop in Python)

In [5]:
%%timeit
# Baseline measurement: iterate over every unordered pair of sequences in `s`
# from Python and call the pre-bound `affineGap` once per pair.
# The return values are discarded; only the elapsed time matters here.
for i in combinations(s, r=2):
    affineGap(*i)

39.6 s ± 1.02 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


## move to char*

In [4]:
%%timeit
# Same Python-level pairwise loop over `s` as the reference measurement.
# NOTE(review): per the section heading this run presumably uses a build of
# `affinegap` that works on char* internally — confirm which build was installed.
for i in combinations(s, r=2):
    affineGap(*i)

34.4 s ± 396 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## loop in C

In [6]:
%%timeit
r = affinegap.affinaGapDistanceArray(s, matchWeight=0, mismatchWeight=1, gapWeight=0.5, spaceWeight=0.5, abbreviation_scale=1)

32.1 s ± 360 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## change output type to C++ vector

In [3]:
%%timeit
r = affinegap.affinaGapDistanceArray(s, matchWeight=0, mismatchWeight=1, gapWeight=0.5, spaceWeight=0.5, abbreviation_scale=1)

30.8 s ± 389 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## automatic type inference

In [3]:
%%timeit
r = affinegap.affinaGapDistanceArray(s, matchWeight=0, mismatchWeight=1, gapWeight=0.5, spaceWeight=0.5, abbreviation_scale=1)

31.5 s ± 415 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## string

In [5]:
%%timeit
r = affinegap.affinaGapDistanceArray(s, matchWeight=0, mismatchWeight=1, gapWeight=0.5, spaceWeight=0.5, abbreviation_scale=1)

30.7 s ± 411 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## change datatype from str to memoryview

In [3]:
import array

# Encode each string in `s` as an array of C ints (typecode "i") holding its
# Unicode code points; these arrays are the inputs for the later benchmark runs.
new_s = [array.array("i", map(ord, w)) for w in s]


In [8]:
%%timeit
# Time one library call over the int-array encoded sequences in `new_s`.
# No weights are passed here, so the library's own defaults apply.
# NOTE(review): confirm those defaults match the weights used in the earlier runs.
r = affinegap.affinaGapDistanceArray(new_s)

28.2 s ± 329 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## memory optimization

In [4]:
%%timeit
# Time one library call over the int-array encoded sequences in `new_s`.
# No weights are passed here, so the library's own defaults apply.
# NOTE(review): confirm those defaults match the weights used in the earlier runs.
r = affinegap.affinaGapDistanceArray(new_s)

27.8 s ± 363 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# Untimed run over the encoded sequences; keeps the result in `r` for inspection.
r = affinegap.affinaGapDistanceArray(new_s)

In [4]:
# Load the line_profiler IPython extension; the %lprun magic used below comes from it.
%load_ext line_profiler

In [5]:
from affinegap import (
    affinaGapDistanceArray,
    affineGapDistance,
    affineGapDistanceInputOrder,
    normalizedAffineGapDistance,
)


In [6]:
# Keep only the first 50 encoded sequences so the line-profiled run stays short.
ss = new_s[:50]

In [7]:
# Line-profile `affineGapDistanceInputOrder` (-f) while running the array routine on
# the 50-sequence subset `ss`; -T also writes the printout to the text file 'lprof0'.
%lprun -T lprof0 -f affineGapDistanceInputOrder affinegap.affinaGapDistanceArray(ss, matchWeight=0, mismatchWeight=1, gapWeight=0.5, spaceWeight=0.5, abbreviation_scale=1)



*** Profile printout saved to text file 'lprof0'. 


Timer unit: 1e-06 s

Total time: 0.523494 s
File: affinegap/affinegap.pyx
Function: affineGapDistanceInputOrder at line 22

Line #      Hits         Time  Per Hit   % Time  Line Contents
    22                                           cpdef float affineGapDistanceInputOrder(int[::1] int_memview_1, int[::1] int_memview_2, const int length1, const int length2,
    23                                                                         const float matchWeight,
    24                                                                         const float mismatchWeight,
    25                                                                         const float gapWeight,
    26                                                                         const float spaceWeight,
    27                                                                         const float abbreviation_scale):
    28                                               # suppose len(int_memview_1) <= len(int_memview_2)
    2