In [147]:
import re
import requests

import pandas as pd
import os

from bs4 import BeautifulSoup
bacteria_regex = r'(?P<bacterium>[A-Z]\. [a-z]+)(?P<strain>\s?[A-Z]+\s?[0-9]+)?'

In [150]:
SAVE_FILES = False

In [2]:
def get_sequence(sequence_string):
    return sequence_string

def get_references(reference_string, reference_link):
    reference = reference_string
    if reference_link:
        reference += ' | ' + reference_link
    return [reference]

def get_data(data_string):

    all_bacteria = {}
    unit_strings = ['uM', 'ug/ml']
    unit_delimiters = [u + ')' for u in unit_strings]
    unit_regex_pattern = '(' + '|'.join(map(re.escape, unit_delimiters)) + ')'
    fields = re.split(unit_regex_pattern, data_string)
    fields_and_units = []
    for i, f in enumerate(fields[:-1]):  # Parentheses around regex mean match expression is also returned
        # Up to -1 to skip the last one, which has no corresponding unit match
        if i % 2 == 0:
            fields_and_units.append({'bacteria_string': f, 'unit': fields[i+1]})

    for field_and_unit in fields_and_units:

        bacteria_string = field_and_unit['bacteria_string']
        unit = field_and_unit['unit']
        
        bacteria_matches = re.finditer(bacteria_regex, bacteria_string)
        
        def _extract_value(range_expr):
            if '-' not in range_expr:
                return range_expr
            bounds = range_expr.split('-')

            def _geometric_mean(b0, b1):
                return (b0 * b1) ** (0.5)

            try:
                b0, b1 = float(bounds[0]), float(bounds[1])
                return str(_geometric_mean(b0, b1))  # Take geometric mean (log scale)
            except:
                print("?????")
                return bounds[0]
                
        
        numeric_range_regex = r'MIC \d+\.?\s?\-?\s?\d*'
        numeric_match = re.search(numeric_range_regex, bacteria_string)
        if not numeric_match:
            continue
        mic_match_string = numeric_match.group(0)
        numeric_part = mic_match_string[4:]
        value = _extract_value(numeric_part)
        for bacteria_match in bacteria_matches:
            bacterium = bacteria_match.groupdict()['bacterium']
            strain = bacteria_match.groupdict()['strain']
            if strain:
                strain = strain.strip()
            all_bacteria[(bacterium, strain)] = {
                'unit': unit,
                'value': value,
            }
            
    return all_bacteria

def get_modifications(modifications_string):
    modifications = []
    if re.match(r'XX[B-Z]', modifications_string):
        modifications.append('unknown_modification: XX[B-Z]')
    for bad_string in ['UCBB', 'UCSB', 'UCSS1b']:
        if bad_string in modifications_string:
            modifications.append('unknown_modification: ' + bad_string)
    if 'UCSS1a' in modifications_string or 'S=S' in modifications_string or 'S-S' in modifications_string:
        modifications.append('disulfide')
    if 'XXA' in modifications_string:
        modifications.append('C-Terminal')
    return modifications

In [None]:
def get_data_from_rows(rows):
    row_names = ["Hemolytik ID", 
                 "PMID", 
                 "YEAR", 
                 "SEQUENCE", 
                 "LENGTH", 
                 "NAME", 
                 "C-ter Modification", 
                 "N-ter Modification", 
                 "Linear/Cyclic",
                 "Stereochemistry",
                 "Non-Natural",
                 "NATURE",
                 "ACTIVITY",
                 "RBCs SOURCE"]
    output = {}
    for row in rows:
        for row_name in row_names:
            if row_name in row.text:
                output[row_name]=row.text.split(row_name)[1]
    
    return output

In [2]:
hemolytik_dir = "../../../../"
hemolytik_sequences_with_Lstereo = pd.read_csv(hemolytik_dir+"hemolytik_sequences_Lstereo.txt",sep="\t")

In [3]:
hemolytik_sequences_with_Lstereo.head(2)

Unnamed: 0,ID,SEQ
0,1002,ALWMTLLKKVLKAAAKAALDAVLVGANA
1,1003,ALWDTLLKKVLKAAAKAALNAVLVGANA


In [6]:
data_ids = hemolytik_sequences_with_Lstereo.ID.values
print(data_ids[:5])

[1002 1003 1004 1005 1006]


In [9]:
DATABASE_NAME = 'Hemolytik'
url_base = 'http://crdd.osdd.net/raghava/hemolytik/display.php?details='

NUM_AMPS = len(data_ids)
amps = {}

In [140]:
NUM_AMPS

2388

In [141]:
results = {}
for i in range(max(len(amps), 1), NUM_AMPS):
    if i%100==0:
        print(f"processed {i}/{NUM_AMPS}")
    id_ = data_ids[i]
    url = url_base + str(id_)
    content = requests.get(url).content

    soup = BeautifulSoup(content, 'html.parser')

    table = soup.findAll(lambda tag: tag.name=='table' and tag.has_attr("bordercolor"))# and tag.has_attr('class'))# and tag['name']=="tab") 
    rows = table[1].findAll(lambda tag: tag.name=='tr')
    table_data = get_data_from_rows(rows)
    results[id_]=table_data


processed 100/2388
processed 200/2388
processed 300/2388
processed 400/2388
processed 500/2388
processed 600/2388
processed 700/2388
processed 800/2388
processed 900/2388
processed 1000/2388
processed 1100/2388
processed 1200/2388
processed 1300/2388
processed 1400/2388
processed 1500/2388
processed 1600/2388
processed 1700/2388
processed 1800/2388
processed 1900/2388
processed 2000/2388
processed 2100/2388
processed 2200/2388
processed 2300/2388


In [149]:
dataframe_data = {}
for row_name in row_names:
    dataframe_data[row_name]=[]
    
for _, vd in results.items():
    for k, v in vd.items():
        dataframe_data[k].append(v)
dataframe_data = pd.DataFrame(dataframe_data)
print(f"{len(dataframe_data)=}")
print(dataframe_data.head(2))

if SAVE_FILES:
    dataframe_data.to_csv("hemolytik_seqsLstereo_unprocessed.data")

len(dataframe_data)=2387
  Hemolytik ID      PMID  YEAR                      SEQUENCE LENGTH      NAME  \
0         1003  10660589  2000  ALWDTLLKKVLKAAAKAALNAVLVGANA     28     D4-S4   
1         1004  10660589  2000  ALWDTLLKKVLKAAAKAALDAVLVGANA     28  D4D20-S4   

  C-ter Modification N-ter Modification Linear/Cyclic Stereochemistry  \
0               Free               Free        Linear               L   
1               Free               Free        Linear               L   

  Non-Natural         NATURE         ACTIVITY RBCs SOURCE  
0        None  Antimicrobial  LC50 =2.3±0.3μM       Human  
1        None  Antimicrobial      LC50 =5±1μM       Human  


In [151]:
os.getcwd()

'/media/jmenard/Data/projects/compare-latent-spaces-amps/oracles/Antimicrobial-Peptides/data'

the ACTIVITY column has variance in how it is reported, somewhat inconveniently

In [155]:
set(dataframe_data.ACTIVITY)

{'No hemolysis upto 500μM',
 '3.5% hemolytic at 40μM',
 '5% hemolysis at 30μM',
 '5.07% hemolysis at 25μg/ml  ',
 '0.08% hemolysis at 100µg/ml',
 'HD50=77μg/ml',
 '14% hemolysis at 200μg/ml ',
 '9.1% hemolytic at 15.7μM',
 '0% hemolysis at 3.13-25 μM',
 '40% hemolysis at 400g/l',
 '95.7±4.12% hemolysis at 100μg/ml',
 'LD50 > 600µM',
 'MHC <81μM',
 '37.9% hemolysis at 50µM',
 '0% hemolysis upto 100μg/ml ',
 '0-5% hemolysis at 40μg/ml',
 'LC50  >100μM',
 '~70% hemolysis at 50μM',
 '0% hemolysis upto 500μg/ml (non hemolytic)',
 'EC50 = 45000 nM',
 'HC10  = 3840 μg/ml',
 'HD50  =8.9x103μM',
 'HC10  = 1.78 μg/ml',
 '0.8% hemolysis at 0.125mg/ml ',
 'MHC =214.81μM ',
 ' LD50 = 100µM',
 '100%  hemolysis at 100µM',
 '100% hemolysis at 30 µg/mL',
 'HC10  = 203 μg/ml',
 '0.19% hemolysis at 50μg/ml',
 'LC50 =50μM',
 'EC50 =80μM',
 'EC50 = 590000 nM',
 '>40% hemolysis at 50μM',
 '0% hemolysis at 300μM',
 '18% hemolysis at 10µM',
 '100% hemolysis at 50μM',
 'HC10  = 9350  μg/ml',
 '40% hemolytic at

In [172]:
dataframe_data.head()

Unnamed: 0,Hemolytik ID,PMID,YEAR,SEQUENCE,LENGTH,NAME,C-ter Modification,N-ter Modification,Linear/Cyclic,Stereochemistry,Non-Natural,NATURE,ACTIVITY,RBCs SOURCE
0,1003,10660589,2000,ALWDTLLKKVLKAAAKAALNAVLVGANA,28,D4-S4,Free,Free,Linear,L,,Antimicrobial,LC50 =2.3±0.3μM,Human
1,1004,10660589,2000,ALWDTLLKKVLKAAAKAALDAVLVGANA,28,D4D20-S4,Free,Free,Linear,L,,Antimicrobial,LC50 =5±1μM,Human
2,1005,10660589,2000,ALWMTLLKKVLKAAAKAALKAVLVGANA,28,K20-S4,Free,Free,Linear,L,,Antimicrobial,LC50 =1.2±0.4μM,Human
3,1006,10660589,2000,ALWKTLLKKVLKAAAKAALNAVLVGANA,28,K4-S4,Free,Free,Linear,L,,Antimicrobial,LC50 =2±0.1μM,Human
4,1007,10660589,2000,ALWKTLLKKVLKAAAKAALKAVLVGANA,28,K4K20-S4,Free,Free,Linear,L,,Antimicrobial,LC50 =0.5±0.1μM,Human


In [245]:
def check_if_range(value:str)->bool:
    if "-" in value:
        return True
    elif ">" in value:
        return True
    elif "<" in value:
        return True
    elif ">=" in value:
        return True
    elif "<=" in value: 
        return True
    else:
        try:
            tst_ = float(value)
            succeeded = True
        except ValueError:
            succeeded = False
    
    if succeeded:
        return False
    else:
        return True

def get_bounds_from_range(value_range):
    if "-" in value_range:
        n1 = value_range.split("-")[0]
        n2 = value_range.split("-")[1]
    elif ">" in value_range:
        n1 = value_range.split(">")[1]
        n2 = None
    elif "~" in value_range:
        n1 = value_range.split("~")[1]
        n2 = None
    return n1,n2
    
def trim_percentage_values(percentage):
    """
    removes useful information, but information that makes data processing 
    difficult. E.g. ">70%" becomes "70%", or "~80%" becomes "80"
    """
    if not isinstance(percentage, str):
        raise ValueError(f"{percentage=} is not of type string")

    _symbols_to_remove = [">","<","~", "±"]
    new_percentage = percentage
    for _symbol in _symbols_to_remove:
        if (_symbol in percentage) and (_symbol != "±"):
            new_percentage = percentage.split(_symbol)[1]
        elif (_symbol in percentage) and (_symbol == "±"):
            new_percentage = percentage.split(_symbol)[0]
    
    return new_percentage
            

def parse_unique_activity_values(set_of_values):
    # some values are of the form:
    # formA: X% hemolysis at Y*U
    # Units can be
    # -g/l
    # - \mu g/ml
    # - \mu M
    # - nM

    units = ["μM", "μg/ml", "g/l", "nM", "mg/ml"]
    final_units = ["μM","μg/ml"]
    unit_conversions = {
        "μM":(1,"μM"),
        "nM":(10**(-3),"μM"),
        "μg/ml":(1,"μg/ml"),
        "g/l":(10**3, "μg/ml"),
        "mg/ml":(10**3, "μg/ml")
    }
    unit_conversion_coef = {
        "μM_per_nM":10**(-3),
        "μg/ml_per_mg/ml":10**3,
        "g/l_per_mg/ml":1
    }
    output = {"hemolytic at":[],
             "concentration_lower_bound":[],
             "concentration_upper_bound":[],
              "unit":[]
    }
    counter = 0
    for value_ in set_of_values:
        value = value_.strip()
        
        formA = (("hemolysis at" in value) or ("hemolytic at" in value) )
        formB = ('hemolysis upto' in value)

        possible_percents_  = list(range(10,110,10))
        possible_50reports_ = ["EC", "LC", "HC", "LD", "HD"]
        possible_reports_ = []
        formC = False
        for percent_ in possible_percents_:
            for report_ in possible_50reports_:
                if report_+str(percent_) in value:
                    formC = True
                    formC_percent_ = percent_
                    formC_report_  = report_
                    break
            if formC:
                break
            
        if formA: 
            counter += 1
            # first grab the percentage
            percent_value_ = value.split("%")[0]
            percent_value_ = trim_percentage_values(percent_value_)

            # concentration
            concentration_ = value.split("at ")[1]
            new_concentration_ = 1.0
            new_unit_ = ""
            for unit_ in units:
                if unit_ in concentration_:
                    new_concentration_ = concentration_.split(unit_)[0]
                    if check_if_range(new_concentration_):
                        bounds = get_bounds_from_range(new_concentration_)
                        lower_bound_, upper_bound_ = bounds
                    else:
                        # if a bound is not reported
                        lower_bound_ = new_concentration_
                        upper_bound_ = None
                    
                    
                    new_concentration_lower_bound_ = float(lower_bound_) * unit_conversions[unit_][0]
                    if upper_bound_ is not None:
                        new_concentration_upper_bound_ = float(upper_bound_) * unit_conversions[unit_][0]
                    else:
                        new_concentration_upper_bound_ = None
                    new_unit_ = unit_conversions[unit_][1]
                    break
            # print(f"Form A", percent_value_, new_concentration_lower_bound_, new_concentration_upper_bound_, new_unit_)
            print(counter)
        elif formB:
            counter += 1
            # first grab the percentage
            percent_value_ = value.split("%")[0]
            percent_value_ = trim_percentage_values(percent_value_)

            # concentration
            concentration_ = value.split("upto ")[1]
            new_concentration_ = 1.0
            new_unit_ = ""
            for unit_ in units:
                if unit_ in concentration_:
                    new_concentration_ = concentration_.split(unit_)[0]
                    if check_if_range(new_concentration_):
                        bounds = get_bounds_from_range(new_concentration_)
                        lower_bound_, upper_bound_ = bounds
                    else:
                        # if a bound is not reported
                        lower_bound_ = None
                        upper_bound_ = new_concentration_

                    new_concentration_upper_bound_ = float(upper_bound_) * unit_conversions[unit_][0]
                    if lower_bound_ is not None:
                        new_concentration_lower_bound_ = float(lower_bound_) * unit_conversions[unit_][0]
                    else:
                        new_concentration_lower_bound_ = None
                    
                    new_unit_ = unit_conversions[unit_][1]
                        
            # print(f"Form B", percent_value_, new_concentration_lower_bound_, new_concentration_upper_bound_, new_unit_)
        elif formC:
            percent_value_ = formC_percent_
            formC_report_  = report_
            concentration_ = value.split("=")[-1]
            concentration_ = concentration_.strip()
            new_unit_ = ""
            if "2.1x102" in concentration_:
                concentration_ = str(2.1* 10**2)
                new_unit_ = "μM"
            
            print(f"Form C", percent_value_,concentration_, new_unit_, "value=",value)
        else:
            print("===== NO FORM: ",value)

HCX := minimal concentration inducing X% hemolysis in human erythrocytes. \
EC50 := concentration required to obtain a 50% effect \
IC50 := concentration required to inhibit a process by 50% \
HD50 := concentration causing 50% hemolysis of red blood cells \
LC50 or LD50 := The value of LD50 for a substance is the dose required to kill half the members of a tested population after a specified test duration. (concentration causing 50% hemolysis in this case)

In [246]:
parse_unique_activity_values(set(dataframe_data.ACTIVITY))

2
3
4
5
Form C 50 77μg/ml  value= HD50=77μg/ml
6
7
8
9
10
Form C 50 LD50 > 600µM  value= LD50 > 600µM
===== NO FORM:  MHC <81μM
11
13
Form C 50 LC50  >100μM  value= LC50  >100μM
14
Form C 50 45000 nM  value= EC50 = 45000 nM
Form C 10 3840 μg/ml  value= HC10  = 3840 μg/ml
Form C 50 8.9x103μM  value= HD50  =8.9x103μM
Form C 10 1.78 μg/ml  value= HC10  = 1.78 μg/ml
16
===== NO FORM:  MHC =214.81μM
Form C 50 100µM  value= LD50 = 100µM
17
18
Form C 10 203 μg/ml  value= HC10  = 203 μg/ml
19
Form C 50 50μM  value= LC50 =50μM
Form C 50 80μM  value= EC50 =80μM
Form C 50 590000 nM  value= EC50 = 590000 nM
20
21
22
23
Form C 10 9350  μg/ml  value= HC10  = 9350  μg/ml
24
25
Form C 50 8.7μM  value= HD50 = 8.7μM
26
===== NO FORM:  Hemolytic activity expresssed as the initial rate (∆A/∆t, min- 1) i.e. at above 100 µM activity is  ~0.002 min-1
27
===== NO FORM:  MHC =1000 μg/ml
===== NO FORM:  0% hemolysis  at 50 µM (non-hemolytic)
28
29
Form C 50 27.3µM  value= LC50 = 27.3µM
30
31
32
33
===== NO FORM

IndexError: list index out of range

In [135]:
print(len(rows))
row_names = ["Hemolytik ID", 
             "PMID", 
             "YEAR", 
             "SEQUENCE", 
             "LENGTH", 
             "NAME", 
             "C-ter Modification", 
             "N-ter Modification", 
             "Linear/Cyclic",
             "Stereochemistry",
             "Non-Natural",
             "NATURE",
             "ACTIVITY",
             "RBCs SOURCE"]
for row in rows:
    for row_name in row_names:
        if row_name in row.text:
            print(row_name, row.text.split(row_name)[1])

19
Hemolytik ID 1003
PMID 10660589
YEAR 2000
SEQUENCE ALWDTLLKKVLKAAAKAALNAVLVGANA
LENGTH 28
NAME D4-S4
C-ter Modification Free
N-ter Modification Free
Linear/Cyclic Linear
Stereochemistry L
Non-Natural None
NATURE Antimicrobial
ACTIVITY LC50 =2.3±0.3μM
RBCs SOURCE Human


In [None]:
with open(DATABASE_NAME + ".data", 'w') as f:
    f.write(str(amps))