In [15]:
import httpx
import numpy
import requests
from bs4 import BeautifulSoup
import json

In [5]:
# Addgene vector-database pages to scrape; each entry is fetched and parsed
# by the cells further down (index 0 is also used for the single-page demo).
plasmid_urls = [
    "https://www.addgene.org/vector-database/1542/",
    "https://www.addgene.org/vector-database/1528/",
    "https://www.addgene.org/vector-database/8272/",
    "https://www.addgene.org/vector-database/1462/",
    "https://www.addgene.org/vector-database/1432/"
]

In [18]:
# Fetch the first vector page. `requests` applies no timeout unless one is
# given, so set one explicitly to keep the cell from hanging on a stalled
# connection, and fail loudly on an HTTP error instead of parsing an error page.
response = requests.get(plasmid_urls[0], timeout=30)
response.raise_for_status()

In [19]:
def parse_vector_html(html: str) -> dict:
    """
    Extract the plasmid name, the information table and the sequence from
    the markup of a vector page.

    Args:
        html: Page markup. It is handed unchanged to BeautifulSoup; callers in
            this notebook pass ``response.content`` (bytes).

    Returns:
        A dict with:
        - "plasmid_name": text of ``#vector-info-section h2`` with the
          "Plasmid:" prefix removed (only present when that heading exists);
        - "information": always present; maps each ``<dt>`` label found under
          ``section dl`` to the text of its ``<dd>``;
        - "sequence": text of ``#vdb-sequence-value`` (only present when that
          element exists).
    """
    soup = BeautifulSoup(html, "html.parser")

    data = {}

    # Vector name
    h2 = soup.select_one("#vector-info-section h2")
    if h2:
        data["plasmid_name"] = h2.get_text(strip=True).replace("Plasmid:", "").strip()

    # Information section (dt/dd pairs)
    info = {}
    for dt in soup.select("section dl dt"):
        # Pair each label with the element that directly follows it rather
        # than zipping two independent lists: with zip, a single <dt> lacking
        # a <dd> (or owning two) shifts every later label onto the wrong value.
        following = dt.find_next_sibling(["dt", "dd"])
        if following is None or following.name != "dd":
            # Label without a value of its own; skip it instead of borrowing
            # the value that belongs to the next label.
            continue
        info[dt.get_text(strip=True)] = following.get_text(strip=True)
    data["information"] = info

    # Sequence
    seq_area = soup.select_one("#vdb-sequence-value")
    if seq_area:
        data["sequence"] = seq_area.get_text(strip=True)

    return data

In [20]:
# Parse the page fetched above; the raw response bytes are passed straight
# through to BeautifulSoup inside parse_vector_html.
data = parse_vector_html(response.content)
data

{'plasmid_name': 'Lorist4',
 'information': {'Plasmid Type': 'Unspecified',
  'Cloning Method': 'Unknown',
  'Size': '5407',
  'Bacterial Resistance': 'Kanamycin',
  'GenBank': 'X99439',
  'Stable': 'Unspecified',
  'Constitutive': 'Unspecified',
  'Viral/Non-Viral': 'Unspecified'},
 'sequence': 'GATCCTAGTATTCTATAGTGTCACCTAAATCGTATGTGTATGATACATAAGGTTATGTATTAATTGTAGCCGCGTTCTAACGACAATATGTACAAGCCTAATTGTGTAGCATCTGGCTTACTGAAGCAGACCCTATCATCTCTCTCGTAAACTGCCGTCAGAGTCGGTTTGGTTGGACGAACCTTCTGAGTTTCTGGTAACGCCGTCCCGCACCCGGAAATGGTCAGCGAACCAATCAGCAGGGTCATCGCTAGAAATCATCCTTAGCGAAAGCTAAGGATTTTTTTTATCTGAATTCTAGCCAGATCCCCGCTGATTTATGCTGGTTACTGTTGCGCCTGTTAGCGCGGCAACGTCCGGCGCACAGAAGCTATTATGCGTCCCCAGGTAATGAATAATTGCCTCTTTGCCCGTCATACACTTGCTCCTTTCAGTCCGAACTTAGCTTTGATTTCTGCGATCTTCGCCAGAGCCTGTGCACGATTTAGAGGTCTACCGCCCATGACAGGAAGTTGTTTTACTGGTTCAGGGATCGCCTCACCACGGTTAATTCTCGCAGTCATATGGACAAGCTCATCTGCGGCCTTACGGCGTAATTCCGCATCAGTAAGCGCATTGGCCCGCATGTTCTGATACAGGTTGGTAACCAGCCAGTAGTGCGCGTTTGATTTCCACGGATAAGACTCCGCATCCGGATACAGG

In [21]:
# Download and parse every listed vector page. Each request gets an explicit
# timeout (requests has none by default) and its status is checked, so a
# failed download stops the cell instead of yielding a silently empty record.
plasmid_data = []
for plasmid_url in plasmid_urls:
    page = requests.get(plasmid_url, timeout=30)
    page.raise_for_status()
    plasmid_data.append(parse_vector_html(page.content))

In [22]:
# Base URL of the annotation service; the get_*_info coroutines in this
# notebook POST to paths underneath it.
server_url = "http://localhost:8080"

In [33]:
async def get_plannotate_info(seq: str) -> dict:
    """
    POST `seq` as a plain-text body to `{server_url}/plannotate/text` and
    return the decoded JSON response.

    The request is sent with query parameters html=false and format=json and
    a 30 second timeout. An error status is raised through
    `response.raise_for_status()`.
    """
    endpoint = f"{server_url}/plannotate/text"
    query = {"html": "false", "format": "json"}
    plain_text = {"accept": "text/plain", "Content-Type": "text/plain"}

    async with httpx.AsyncClient() as client:
        reply = await client.post(
            endpoint,
            content=seq,
            params=query,
            headers=plain_text,
            timeout=30,
        )
        reply.raise_for_status()
        return reply.json()


async def get_amrfinder_info(seq: str) -> dict:
    """
    POST `seq` as a plain-text body to `{server_url}/amrfinder/text` and
    return the decoded JSON response.

    The request is sent with query parameters is_protein=false and
    format=json. No explicit timeout is passed, so the client's default
    applies. An error status is raised through `response.raise_for_status()`.
    """
    endpoint = f"{server_url}/amrfinder/text"
    query = {"is_protein": "false", "format": "json"}
    plain_text = {"accept": "text/plain", "Content-Type": "text/plain"}

    async with httpx.AsyncClient() as client:
        reply = await client.post(
            endpoint,
            content=seq,
            params=query,
            headers=plain_text,
        )
        reply.raise_for_status()
        return reply.json()


async def get_prodigal_info(seq: str) -> dict:
    """
    POST `seq` as a plain-text body to `{server_url}/prodigal/text` and
    return the decoded JSON response.

    The request is sent with query parameters mode=auto and format=json.
    No explicit timeout is passed, so the client's default applies. An error
    status is raised through `response.raise_for_status()`.
    """
    endpoint = f"{server_url}/prodigal/text"
    query = {"mode": "auto", "format": "json"}
    plain_text = {"accept": "text/plain", "Content-Type": "text/plain"}

    async with httpx.AsyncClient() as client:
        reply = await client.post(
            endpoint,
            content=seq,
            params=query,
            headers=plain_text,
        )
        reply.raise_for_status()
        return reply.json()

async def get_plannotate_fast_info(seq: str) -> dict:
    """
    POST `seq` as a plain-text body to `{server_url}/plannotate/fast` and
    return the decoded JSON response.

    The request is sent with the single query parameter format=json. No
    explicit timeout is passed, so the client's default applies. An error
    status is raised through `response.raise_for_status()`.
    """
    endpoint = f"{server_url}/plannotate/fast"
    query = {"format": "json"}
    plain_text = {"accept": "text/plain", "Content-Type": "text/plain"}

    async with httpx.AsyncClient() as client:
        reply = await client.post(
            endpoint,
            content=seq,
            params=query,
            headers=plain_text,
        )
        reply.raise_for_status()
        return reply.json()



In [28]:
# Annotate the first scraped plasmid's sequence through the /plannotate/text
# client. This is a top-level await, so it needs a notebook / async REPL.
plannotate_info = await get_plannotate_info(plasmid_data[0]["sequence"])
plannotate_info

{'metadata': {'locus': 'plasmid',
  'length': '5407',
  'molecule_type': 'DNA',
  'topology': 'circular',
  'definition': '.',
  'comment': 'Annotated with pLannotate v1.2.2'},
 'features': [{'type': 'CDS',
   'location': '3370..4161',
   'qualifiers': {'note': 'pLannotate',
    'label': 'neo',
    'database': 'swissprot',
    'identity': 100.0,
    'match_length': 100.0,
    'fragment': 'False',
    'other': 'CDS'}},
  {'type': 'CDS',
   'location': 'complement(417..1115)',
   'qualifiers': {'note': 'pLannotate',
    'label': 'P',
    'database': 'swissprot',
    'identity': 100.0,
    'match_length': 100.0,
    'fragment': 'False',
    'other': 'CDS'}},
  {'type': 'terminator',
   'location': '2015..2109',
   'qualifiers': {'note': 'pLannotate',
    'label': 'lambda t0 terminator',
    'database': 'snapgene',
    'identity': 100.0,
    'match_length': 100.0,
    'fragment': 'False',
    'other': 'terminator'}},
  {'type': 'CDS',
   'location': 'complement(2459..2656)',
   'qualifiers

In [30]:
import asyncio
import random
import time
import statistics as stats
from typing import Callable, Dict, List, Any
import pandas as pd

def ms(x: float) -> float:
    """Convert a duration in seconds to milliseconds, rounded to 3 decimals."""
    millis = x * 1000.0
    return round(millis, 3)

async def _run_with_timing(
    func: Callable[[str], Any],
    seq: str,
    sem: asyncio.Semaphore,
) -> Dict[str, Any]:
    """
    Runs a single request with timing and error capture.
    Returns {"ok": bool, "latency": float (seconds), "exc": Optional[Exception]}.
    """
    async with sem:
        t0 = time.perf_counter()
        try:
            _ = await func(seq)
            t1 = time.perf_counter()
            return {"ok": True, "latency": t1 - t0, "exc": None}
        except Exception as e:
            t1 = time.perf_counter()
            return {"ok": False, "latency": t1 - t0, "exc": e}

async def bench_func(
    name: str,
    func: Callable[[str], Any],
    sequences: List[str],
    concurrency: int = 4,
    warmup: int = 1,
) -> Dict[str, Any]:
    """
    Benchmark a single async function over a list of sequences.

    - Runs `warmup` warm-up calls (not included in stats); warm-up failures
      are ignored on purpose. Warm-up is skipped when `sequences` is empty.
    - Then runs one timed call per sequence with up to `concurrency` in flight.

    Args:
        name: Label stored under "function" in the result.
        func: Coroutine function called as `await func(seq)`.
        sequences: Inputs; one timed request is issued per element.
        concurrency: Maximum number of simultaneous timed calls (>= 1).
        warmup: Number of untimed calls made before measuring.

    Returns:
        A metrics dict with request/success/error counts, wall time,
        throughput of successful calls, and latency statistics in
        milliseconds. Latency statistics cover successful calls only and are
        None when there were none.

    Raises:
        ValueError: if `concurrency` is less than 1. A semaphore created with
            0 permits would never admit any task and the benchmark would wait
            forever.
    """
    if concurrency < 1:
        raise ValueError(f"concurrency must be >= 1, got {concurrency}")

    # Warm-up (best-effort)
    if sequences:
        last_index = len(sequences) - 1
        for i in range(warmup):
            try:
                # Reuse the last sequence once `warmup` exceeds the input count.
                await func(sequences[min(i, last_index)])
            except Exception:
                # ignore warmup errors
                pass

    sem = asyncio.Semaphore(concurrency)
    t_start = time.perf_counter()
    tasks = [asyncio.create_task(_run_with_timing(func, seq, sem)) for seq in sequences]
    results = await asyncio.gather(*tasks)
    wall = time.perf_counter() - t_start

    latencies = [r["latency"] for r in results if r["ok"]]
    n = len(results)
    n_ok = len(latencies)
    n_err = n - n_ok

    # Sort once; every percentile lookup below indexes into this list.
    ordered = sorted(latencies)

    def percentile(p):
        """Nearest-rank percentile: the element at index round(p/100 * (n-1))."""
        if not ordered:
            return None
        k = max(0, min(len(ordered) - 1, int(round((p / 100) * (len(ordered) - 1)))))
        return ordered[k]

    metrics = {
        "function": name,
        "requests": n,
        "success": n_ok,
        "errors": n_err,
        "total_wall_s": round(wall, 3),
        "throughput_rps": round(n_ok / wall if wall > 0 else 0.0, 3),
        "latency_mean_ms": ms(stats.mean(latencies)) if latencies else None,
        # Population std-dev; a single sample reports 0.0, no samples None.
        "latency_std_ms": ms(stats.pstdev(latencies)) if len(latencies) > 1 else 0.0 if latencies else None,
        "latency_p50_ms": ms(percentile(50)) if latencies else None,
        "latency_p95_ms": ms(percentile(95)) if latencies else None,
        "latency_min_ms": ms(min(latencies)) if latencies else None,
        "latency_max_ms": ms(max(latencies)) if latencies else None,
    }
    return metrics

def _random_dna(length: int, rng: random.Random) -> str:
    """Return a string of `length` bases drawn uniformly from A, C, G, T using `rng`."""
    return "".join(rng.choices("ACGT", k=length))


async def benchmark_all(
    funcs: Dict[str, Callable[[str], Any]],
    n_requests: int = 20,
    seq_length: int = 120,
    concurrency: int = 4,
    warmup: int = 1,
    fixed_seq: str = None,
    seed: int = None,
) -> pd.DataFrame:
    """
    Benchmarks each function in `funcs` with the same set of inputs.

    - If `fixed_seq` is provided, uses the same sequence for all requests.
    - Otherwise generates `n_requests` random DNA sequences of length
      `seq_length`; pass `seed` to make those sequences reproducible.

    Functions are benchmarked one after another (not concurrently with each
    other), in the iteration order of `funcs`.

    Args:
        funcs: Mapping of display name to coroutine function taking a sequence.
        n_requests: Number of timed requests issued per function.
        seq_length: Length of each generated sequence (ignored with `fixed_seq`).
        concurrency: Forwarded to `bench_func`.
        warmup: Forwarded to `bench_func`.
        fixed_seq: Optional sequence reused for every request.
        seed: Optional seed for the random sequence generator; None leaves it
            seeded from system entropy.

    Returns:
        A DataFrame with one row of metrics per function, sorted by mean
        latency ascending with missing values last.
    """
    if fixed_seq is not None:
        sequences = [fixed_seq] * n_requests
    else:
        # Use a private generator so seeding here does not disturb the
        # module-level `random` state used elsewhere.
        rng = random.Random(seed)
        sequences = [_random_dna(seq_length, rng) for _ in range(n_requests)]

    results = []
    for name, fn in funcs.items():
        metrics = await bench_func(
            name=name,
            func=fn,
            sequences=sequences,
            concurrency=concurrency,
            warmup=warmup,
        )
        results.append(metrics)
    df = pd.DataFrame(results)
    # Sort by mean latency if available (the column is absent when `funcs` is empty)
    if "latency_mean_ms" in df.columns:
        df = df.sort_values(by=["latency_mean_ms"], na_position="last").reset_index(drop=True)
    return df


In [32]:
# Display name -> annotation client coroutine. benchmark_all walks this dict
# in insertion order and benchmarks the clients one after another.
funcs = {
    "plannotate": get_plannotate_info,
    "plannotate_fast":  get_plannotate_fast_info,
    "amrfinder":  get_amrfinder_info,
    "prodigal":   get_prodigal_info,
}

# Reuse the first scraped plasmid's sequence for every request so all clients
# are measured on identical input.
fixed = plasmid_data[0]['sequence']

# 30 timed requests per client, at most 5 in flight, after 2 untimed warm-ups.
# NOTE(review): the saved output table has no plannotate_fast row - presumably
# it predates that dict entry; re-run this cell to refresh it.
df_fixed = await benchmark_all(
    funcs,
    n_requests=30,
    fixed_seq=fixed,
    concurrency=5,
    warmup=2
)
df_fixed


Unnamed: 0,function,requests,success,errors,total_wall_s,throughput_rps,latency_mean_ms,latency_std_ms,latency_p50_ms,latency_p95_ms,latency_min_ms,latency_max_ms
0,prodigal,30,30,0,0.534,56.161,81.759,41.608,63.407,184.612,43.471,191.514
1,amrfinder,30,30,0,2.226,13.479,350.787,42.679,336.623,447.644,285.994,451.533
2,plannotate,30,30,0,51.56,0.582,8557.703,748.876,8234.933,10230.408,7794.266,10237.499
