# Import and define functions

Installs packages

Imports libraries and OpenAI key

In [1]:
"""
Configuration and Imports
=========================
This cell sets up all necessary imports, configuration, and constants for the notebook.
"""

# Standard library imports
import os
import re
import string
import time
import logging
from typing import Dict, List, Optional, Tuple, Any
from pathlib import Path
from datetime import datetime

# Third-party imports
import numpy as np
import pandas as pd
import requests
import xml.etree.ElementTree as ET
from Bio import Entrez
from dotenv import load_dotenv
from openai import OpenAI

# ============================================================================
# CONFIGURATION
# ============================================================================

# Load environment variables from .env file
load_dotenv()

# PubMed API Configuration
PUBMED_EMAIL = os.environ.get('PUBMED_EMAIL', 'kuhfeldrf@oregonstate.edu')
Entrez.email = PUBMED_EMAIL

# OpenAI API Configuration
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
DEFAULT_OPENAI_MODEL = os.environ.get('OPENAI_MODEL', 'gpt-4o-mini')

# Initialize OpenAI client
if OPENAI_API_KEY:
    openai_client = OpenAI(api_key=OPENAI_API_KEY)
else:
    openai_client = None
    print("⚠ Warning: OPENAI_API_KEY not found in environment variables.")

# ============================================================================
# CONSTANTS
# ============================================================================

# File paths
DATA_DIR = Path('data')
PROTEIN_LISTS_DIR = Path('protein_lists')
XML_EXAMPLES_DIR = Path('xml_examples')

# UniProt XML namespace
UNIPROT_NAMESPACE = "{http://uniprot.org/uniprot}"

# API endpoints
UNIPROT_BASE_URL = "https://www.uniprot.org/uniprot"

# Processing settings
DEFAULT_REQUEST_TIMEOUT = 30  # seconds
DEFAULT_RETRY_ATTEMPTS = 3

# ============================================================================
# LOGGING CONFIGURATION
# ============================================================================

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# ============================================================================
# VERIFICATION
# ============================================================================

def verify_setup() -> None:
    """Verify that all required configuration is set up correctly."""
    issues = []
    
    if not OPENAI_API_KEY:
        issues.append("OpenAI API key not found")
    else:
        logger.info(f"✓ OpenAI API key loaded successfully")
        logger.info(f"✓ Default model: {DEFAULT_OPENAI_MODEL}")
    
    if not PUBMED_EMAIL:
        issues.append("PubMed email not configured")
    else:
        logger.info(f"✓ PubMed email configured: {PUBMED_EMAIL}")
    
    # Verify directories exist
    for dir_path, dir_name in [(DATA_DIR, 'data'), (PROTEIN_LISTS_DIR, 'protein_lists')]:
        if not dir_path.exists():
            logger.warning(f"⚠ Directory '{dir_name}' does not exist. Creating it.")
            dir_path.mkdir(parents=True, exist_ok=True)
    
    if issues:
        logger.warning("⚠ Configuration issues found:")
        for issue in issues:
            logger.warning(f"  - {issue}")
        logger.info("  Please check your .env file or configuration settings.")
    else:
        logger.info("✓ All configuration verified successfully")

# Run verification
verify_setup()

2025-11-21 14:58:12 - __main__ - INFO - ✓ OpenAI API key loaded successfully
2025-11-21 14:58:12 - __main__ - INFO - ✓ Default model: gpt-4o-mini
2025-11-21 14:58:12 - __main__ - INFO - ✓ PubMed email configured: kuhfeldrf@oregonstate.edu
2025-11-21 14:58:12 - __main__ - INFO - ✓ All configuration verified successfully


<b>Uniport Function: </b>Extracts the protein sequence information from the Uniprot protein page

In [2]:
def fetch_sequence(lines: List[str]) -> Tuple[str, str, str]:
    """
    Extract protein sequence, name, and description from UniProt XML data.
    
    Parses XML lines to extract protein sequence information, protein name,
    and protein description using regex patterns.
    
    Args:
        lines: List of strings representing XML lines from UniProt data.
    
    Returns:
        Tuple containing:
            - protein_sequence (str): The extracted protein sequence.
            - protein_name (str): The first protein name found, empty string if none.
            - protein_desc (str): The first protein description found, empty string if none.
    
    Example:
        >>> xml_lines = ['<name>Protein Name</name>', '<sequence>MKTAY...</sequence>']
        >>> seq, name, desc = fetch_sequence(xml_lines)
        >>> print(name)
        'Protein Name'
    """
    if not lines:
        logger.warning("Empty lines list provided to fetch_sequence")
        return '', '', ''
    
    # Initialize variables
    protein_sequence = ''
    protein_name = ''
    protein_desc = ''
    protein_desc_list = []
    protein_name_list = []
    
    # Compile regex patterns once for efficiency
    sequence_pattern = re.compile(r'<sequence[^>]*>(.*?)</sequence>')
    protein_desc_pattern = re.compile(r'<fullName.*?>(.*?)</fullName>')
    protein_name_pattern = re.compile(r'<name>(.*?)</name>')
    
    try:
        for line in lines:
            if not isinstance(line, str):
                continue
                
            # Extract protein name
            if '<name' in line:
                match = protein_name_pattern.search(line)
                if match:
                    protein_name_list.append(match.group(1))
            
            # Extract protein description (only first occurrence)
            if not protein_desc and '<fullName' in line:
                match = protein_desc_pattern.search(line)
                if match:
                    protein_desc_list.append(match.group(1))
            
            # Extract protein sequence
            if '<sequence' in line:
                match = sequence_pattern.search(line)
                if match:
                    protein_sequence = match.group(1)
        
        # Get first occurrence of name and description
        if protein_name_list:
            protein_name = protein_name_list[0]
        if protein_desc_list:
            protein_desc = protein_desc_list[0]
        
        logger.debug(f"Extracted sequence (length: {len(protein_sequence)}), "
                    f"name: {protein_name[:50] if protein_name else 'N/A'}")
        
    except Exception as e:
        logger.error(f"Error extracting sequence information: {e}")
        raise
    
    return protein_sequence, protein_name, protein_desc

<b>Uniport Function: </b>Extract a mapping of evidence keys to their associated PubMed IDs from the XML root node.

In [3]:
def extract_evidence_to_pubmed_mapping(root: ET.Element) -> Dict[str, str]:
    """
    Extract a mapping of evidence keys to their associated PubMed IDs from XML.
    
    Parses UniProt XML to create a dictionary mapping evidence keys to PubMed IDs.
    This mapping is used to associate peptide features with their supporting references.
    
    Args:
        root: XML root element from parsed UniProt XML data.
    
    Returns:
        Dictionary mapping evidence keys (str) to PubMed IDs (str).
        Empty dictionary if no mappings found.
    
    Example:
        >>> import xml.etree.ElementTree as ET
        >>> root = ET.fromstring(xml_data)
        >>> mapping = extract_evidence_to_pubmed_mapping(root)
        >>> print(mapping)
        {'1': '12345678', '2': '87654321'}
    """
    # Use namespace constant if available, otherwise use hardcoded value
    try:
        namespace = UNIPROT_NAMESPACE
    except NameError:
        namespace = "{http://uniprot.org/uniprot}"
    
    if root is None:
        try:
            logger.warning("None root provided to extract_evidence_to_pubmed_mapping")
        except NameError:
            print("Warning: None root provided to extract_evidence_to_pubmed_mapping")
        return {}
    
    evidence_to_pubmed = {}
    
    try:
        # Use namespace constant for consistency
        evidence_xpath = f".//{namespace}evidence"
        db_ref_xpath = f".//{namespace}dbReference[@type='PubMed']"
        
        for evidence in root.findall(evidence_xpath):
            evidence_key = evidence.attrib.get('key')
            source = evidence.find(db_ref_xpath)
            
            if source is not None:
                pubmed_id = source.attrib.get('id')
                if evidence_key and pubmed_id:
                    evidence_to_pubmed[evidence_key] = pubmed_id
        
        try:
            logger.debug(f"Extracted {len(evidence_to_pubmed)} evidence-to-PubMed mappings")
        except NameError:
            pass  # Logger not available, skip logging
        
    except Exception as e:
        try:
            logger.error(f"Error extracting evidence to PubMed mapping: {e}")
        except NameError:
            print(f"Error extracting evidence to PubMed mapping: {e}")
        raise
    
    return evidence_to_pubmed

<b>Uniport Function: </b>Extracts peptide features and function comments from a given XML data.

In [4]:
def extract_peptide_and_function(data: str, protein_id: str) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], Dict[str, str]]:
    """
    Extract peptide features and function comments from UniProt XML data.
    
    Parses UniProt XML to extract:
    1. Peptide features (sequences, intervals, descriptions, evidence)
    2. Function comments (molecular functions, evidence)
    3. Evidence-to-PubMed mapping
    
    Args:
        data: XML string from UniProt API response.
        protein_id: UniProt protein accession ID (e.g., 'P02666').
    
    Returns:
        Tuple containing:
            - peptide_features: List of dictionaries with peptide information.
            - function_comments: List of dictionaries with function information.
            - evidence_to_pubmed: Dictionary mapping evidence keys to PubMed IDs.
    
    Raises:
        ET.ParseError: If XML data is malformed.
        ValueError: If protein_id is invalid or data is empty.
    
    Example:
        >>> xml_data = "<uniprot>...</uniprot>"
        >>> peptides, functions, mapping = extract_peptide_and_function(xml_data, "P02666")
        >>> print(f"Found {len(peptides)} peptides")
    """
    if not data or not isinstance(data, str):
        raise ValueError("Invalid XML data provided")
    
    if not protein_id or not isinstance(protein_id, str):
        raise ValueError("Invalid protein_id provided")
    
    # Use namespace constant if available, otherwise use hardcoded value
    try:
        namespace = UNIPROT_NAMESPACE
    except NameError:
        namespace = "{http://uniprot.org/uniprot}"
    
    try:
        # Parse XML content
        root = ET.fromstring(data)
        lines = data.split('\n')
        
        # Extract protein sequence and metadata
        protein_sequence, protein_name, protein_desc = fetch_sequence(lines)
        
        # Extract evidence to PubMed mapping
        evidence_to_pubmed = extract_evidence_to_pubmed_mapping(root)
        
        # Extract peptide features
        peptide_features = []
        feature_xpath = f".//{namespace}feature[@type='peptide']"
        location_begin_xpath = f"{namespace}location/{namespace}begin"
        location_end_xpath = f"{namespace}location/{namespace}end"
        
        for feature in root.findall(feature_xpath):
            try:
                begin_elem = feature.find(location_begin_xpath)
                end_elem = feature.find(location_end_xpath)
                
                if begin_elem is None or end_elem is None:
                    logger.warning(f"Skipping peptide feature with missing location for {protein_id}")
                    continue
                
                begin = int(begin_elem.attrib.get('position', 0))
                end = int(end_elem.attrib.get('position', 0))
                
                if begin < 1 or end < begin:
                    logger.warning(f"Invalid peptide interval [{begin}-{end}] for {protein_id}")
                    continue
                
                # Extract peptide sequence (convert to 0-based indexing)
                peptide_seq = protein_sequence[begin-1:end] if protein_sequence else ''
                interval = f'{begin}-{end}'
                
                peptide_info = {
                    'proteinID': protein_id,
                    'protein_name': protein_name,
                    'protein_desc': protein_desc,
                    'interval': interval,
                    'peptide': peptide_seq,
                    'description': feature.attrib.get('description', None),
                    'feature_evidence': feature.attrib.get('evidence', None),
                    'evidence_to_pubmed': evidence_to_pubmed,
                }
                peptide_features.append(peptide_info)
                
            except (ValueError, AttributeError) as e:
                logger.warning(f"Error processing peptide feature for {protein_id}: {e}")
                continue
        
        # Extract function comments
        function_comments = []
        comment_xpath = f".//{namespace}comment[@type='function']"
        
        for comment in root.findall(comment_xpath):
            try:
                molecule_elem = comment.find(f"{namespace}molecule")
                text_elem = comment.find(f"{namespace}text")
                
                function_info = {
                    'molecule': molecule_elem.text if molecule_elem is not None else None,
                    'text': text_elem.text if text_elem is not None else None,
                    'comment_evidence': text_elem.attrib.get('evidence', None) if text_elem is not None else None
                }
                function_comments.append(function_info)
                
            except AttributeError as e:
                logger.warning(f"Error processing function comment for {protein_id}: {e}")
                continue
        
        logger.info(f"Extracted {len(peptide_features)} peptides and {len(function_comments)} "
                   f"function comments for {protein_id}")
        
        return peptide_features, function_comments, evidence_to_pubmed
        
    except ET.ParseError as e:
        logger.error(f"XML parsing error for {protein_id}: {e}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error extracting peptides/functions for {protein_id}: {e}")
        raise

<b>Uniport Function: </b>To associate peptide features with related function comments based on shared evidence or matching descriptions.

In [5]:
def associate_peptide_with_function(
    peptide_features: List[Dict[str, Any]],
    function_comments: List[Dict[str, Any]],
    evidence_to_pubmed: Dict[str, str]
) -> List[Dict[str, Any]]:

    """
    Associate peptide features with function comments based on shared evidence.
    
    Links peptide features to their biological functions by matching:
    1. Shared evidence keys between peptides and function comments
    2. Matching peptide descriptions with comment molecules
    
    Args:
        peptide_features: List of dictionaries containing peptide information.
        function_comments: List of dictionaries containing function comment information.
        evidence_to_pubmed: Dictionary mapping evidence keys to PubMed IDs.
    
    Returns:
        List of peptide dictionaries with added 'associated_function', 
        'non_associated_function', and 'evidence_pubmed' fields.
    
    Example:
        >>> peptides = [{'feature_evidence': '1 2', 'description': 'Casoxin'}]
        >>> functions = [{'comment_evidence': '1', 'text': 'Opioid antagonist'}]
        >>> mapping = {'1': '12345678'}
        >>> result = associate_peptide_with_function(peptides, functions, mapping)
    """
    if not peptide_features:
        logger.warning("Empty peptide_features list provided")
        return []
    
    if not isinstance(peptide_features, list):
        raise ValueError("peptide_features must be a list")
    
    associated_data = []
    
    try:
        for peptide in peptide_features:
            # Create a copy to avoid modifying the original
            peptide_copy = peptide.copy()
            
            associated_comment_texts = []
            all_functions = []
            
            # Parse evidence keys
            peptide_evidence_str = peptide_copy.get('feature_evidence', '')
            peptide_evidence = set(peptide_evidence_str.split()) if peptide_evidence_str else set()
            
            # Fetch associated PubMed IDs
            peptide_pubmed = {
                evidence_to_pubmed[eid] 
                for eid in peptide_evidence 
                if eid in evidence_to_pubmed
            }
            
            direct_link = False
            
            # Match peptides with function comments
            for comment in function_comments:
                comment_evidence_str = comment.get('comment_evidence', '')
                comment_evidence = set(comment_evidence_str.split()) if comment_evidence_str else set()
                
                # Check for shared evidence or matching description
                has_shared_evidence = bool(peptide_evidence & comment_evidence)
                description_matches = (
                    peptide_copy.get('description') == comment.get('molecule')
                )
                
                if has_shared_evidence or description_matches:
                    peptide_copy['comment_evidence'] = comment_evidence
                    comment_text = comment.get('text', '')
                    if comment_text:
                        associated_comment_texts.append(comment_text)
                    direct_link = True
                    
                    # Add PubMed IDs from comment evidence
                    comment_pubmed = {
                        evidence_to_pubmed[eid] 
                        for eid in comment_evidence 
                        if eid in evidence_to_pubmed
                    }
                    peptide_pubmed.update(comment_pubmed)
                else:
                    comment_text = comment.get('text', '')
                    if comment_text:
                        all_functions.append(comment_text)
            
            # Set associated and non-associated functions
            peptide_copy['associated_function'] = '; '.join(associated_comment_texts) if associated_comment_texts else ''
            
            if not direct_link and all_functions:
                peptide_copy['non_associated_function'] = all_functions
            
            # Convert PubMed set to list for JSON serialization
            peptide_copy['evidence_pubmed'] = list(peptide_pubmed)
            
            associated_data.append(peptide_copy)
        
        logger.debug(f"Associated {len(associated_data)} peptides with functions")
        
    except Exception as e:
        logger.error(f"Error associating peptides with functions: {e}")
        raise
    
    return associated_data

<b>Uniport Function: </b>To extract references from the provided XML data and return them as a DataFrame.

In [6]:
def extract_references(data, protein_id):
    """
    To extract references from the provided XML data and return them as a DataFrame.
    """
    lines = data.split('\n')

    # Continue extracting reference data
    reference_data = []
    inside_reference = False
    title, pubmed_id, doi, reference_key, formatted_authors, scope_range = None, None, None, None, None, None
    author_list = []

    # Regular expressions for extracting information
    reference_key_pattern = re.compile(r'<reference key="(\d+)">')
    title_pattern = re.compile(r'<title>(.*?)</title>')
    person_name_pattern = re.compile(r'<person name="(.*?)"/>')
    dbReference_type_pattern = re.compile(r'<dbReference type="(.*?)"')
    dbReference_id_pattern = re.compile(r'id="(.*?)"/>')
    scope_pattern = re.compile(r'PROTEIN SEQUENCE OF (\d+-\d+)')

    for line in lines:
        try:
            # Extract reference information
            if '<reference key="' in line:
                inside_reference = True
                reference_key = reference_key_pattern.search(line).group(1)
                # Ensure author_list is always initialized as an empty list
                author_list = []
            elif inside_reference:
                if '<title>' in line:
                    title = title_pattern.search(line).group(1)
                elif '<person name="' in line:
                    author = person_name_pattern.search(line).group(1)
                    author_list.append(author)
                elif '<dbReference type="' in line:
                    ref_type = dbReference_type_pattern.search(line).group(1)
                    ref_id = dbReference_id_pattern.search(line).group(1)
                    if ref_type == "PubMed":
                        pubmed_id = ref_id
                    elif ref_type == "DOI":
                        doi = ref_id
                elif '<scope>' in line:
                    scope_match = scope_pattern.search(line)
                    if scope_match:
                        scope_range = scope_match.group(1)
                elif '</reference>' in line:
                    inside_reference = False
                    # Format the authors list
                    if author_list:
                        if len(author_list) >= 1:
                            last_name, first_name_initial = author_list[0].split(' ')[0], author_list[0].split(' ')[1][0]
                            formatted_authors = f"{last_name}, {first_name_initial}. et al."
                    else:
                        formatted_authors = None

                    reference_data.append({
                        'proteinID': protein_id,
                        'reference key': reference_key,
                        'title': title,
                        'authors': formatted_authors,
                        'pubmed': pubmed_id,
                        'doi': doi,
                        'scope_range': scope_range
                    })

                    # Reset for the next reference
                    author_list, pubmed_id, doi, title, reference_key, formatted_authors, scope_range = [], None, None, None, None, None, None 
        except Exception as e:
            print(f"Error processing line: {line}. Error: {e}")

    # Convert reference_data list to a DataFrame
    
    return pd.DataFrame(reference_data)  # Note that this should return reference_df, not reference_data

<b>Uniport Function: </b>Fetch protein information from UniProt for a given protein ID.

In [7]:
def fetch_protein_info(protein_id: str, retry_attempts: int = DEFAULT_RETRY_ATTEMPTS) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Fetch protein information from UniProt API for a given protein ID.
    
    Retrieves XML data from UniProt, extracts peptide features, function comments,
    and references, then associates peptides with their functions.
    
    Args:
        protein_id: UniProt protein accession ID (e.g., 'P02666').
        retry_attempts: Number of retry attempts for failed requests (default: 3).
    
    Returns:
        Tuple containing:
            - peptide_df: DataFrame with peptide features and associated functions.
            - reference_df: DataFrame with reference information (PubMed IDs, titles, etc.).
        
        Returns empty DataFrames if request fails or no data found.
    
    Raises:
        ValueError: If protein_id is invalid.
        requests.RequestException: If API request fails after retries.
    
    Example:
        >>> peptide_df, ref_df = fetch_protein_info("P02666")
        >>> print(f"Found {len(peptide_df)} peptides")
    """
    if not protein_id or not isinstance(protein_id, str):
        raise ValueError(f"Invalid protein_id: {protein_id}")
    
    url = f'{UNIPROT_BASE_URL}/{protein_id}.xml'
    
    # Retry logic for API requests
    for attempt in range(retry_attempts):
        try:
            logger.debug(f"Fetching {protein_id} (attempt {attempt + 1}/{retry_attempts})")
            response = requests.get(url, timeout=DEFAULT_REQUEST_TIMEOUT)
            response.raise_for_status()  # Raises exception for bad status codes
            break
            
        except requests.exceptions.Timeout:
            logger.warning(f"Timeout fetching {protein_id} (attempt {attempt + 1})")
            if attempt == retry_attempts - 1:
                logger.error(f"Failed to fetch {protein_id} after {retry_attempts} attempts")
                return pd.DataFrame(), pd.DataFrame()
            time.sleep(1)  # Brief delay before retry
            
        except requests.exceptions.HTTPError as e:
            if response.status_code == 404:
                logger.warning(f"Protein {protein_id} not found in UniProt")
                return pd.DataFrame(), pd.DataFrame()
            logger.warning(f"HTTP error fetching {protein_id}: {e} (attempt {attempt + 1})")
            if attempt == retry_attempts - 1:
                return pd.DataFrame(), pd.DataFrame()
            time.sleep(1)
            
        except requests.exceptions.RequestException as e:
            logger.error(f"Request error fetching {protein_id}: {e}")
            if attempt == retry_attempts - 1:
                return pd.DataFrame(), pd.DataFrame()
            time.sleep(1)
    
    try:
        data = response.text
        
        if not data:
            logger.warning(f"Empty response for {protein_id}")
            return pd.DataFrame(), pd.DataFrame()
        
        # Extract peptide features and function comments
        peptide_features, function_comments, evidence_to_pubmed = extract_peptide_and_function(
            data, protein_id
        )
        
        # Extract reference data
        reference_df = extract_references(data, protein_id)
        
        # Associate peptide with function
        associated_data = associate_peptide_with_function(
            peptide_features, function_comments, evidence_to_pubmed
        )
        
        # Convert to DataFrame
        peptide_df = pd.DataFrame(associated_data) if associated_data else pd.DataFrame()
        
        logger.info(f"Successfully processed {protein_id}: {len(peptide_df)} peptides, "
                   f"{len(reference_df)} references")
        
        return peptide_df, reference_df
        
    except Exception as e:
        logger.error(f"Error processing data for {protein_id}: {e}")
        return pd.DataFrame(), pd.DataFrame()

<b>Uniport Function: </b>Update associated function updates the asscoaitated function column from the non_Asscoiated_function matching pubmed ID or description column

In [8]:
def update_associated_function(row: pd.Series) -> str:
    """
    Update associated_function from non_associated_function based on matching criteria.
    
    If associated_function is empty, searches non_associated_function for entries
    that match the peptide description or PubMed ID, and promotes them to
    associated_function.
    
    Args:
        row: Pandas Series representing a single row from a DataFrame.
             Must contain 'associated_function', 'non_associated_function',
             'description', and optionally 'pubmed_id' columns.
    
    Returns:
        Updated associated_function string. Returns original value if already set.
    
    Example:
        >>> row = pd.Series({
        ...     'associated_function': '',
        ...     'non_associated_function': ['Antimicrobial activity'],
        ...     'description': 'Antimicrobial',
        ...     'pubmed_id': '12345678'
        ... })
        >>> result = update_associated_function(row)
    """
    import ast  # Safe alternative to eval()
    
    # Check if associated_function already has a value
    associated_func = row.get('associated_function', '')
    if pd.notna(associated_func) and associated_func != '':
        return associated_func
    
    # Parse non_associated_function
    non_associated_func = row.get('non_associated_function', [])
    non_associated_entries = []
    
    try:
        if isinstance(non_associated_func, str):
            # Try to parse string representation of list
            if non_associated_func.strip().startswith('[') and non_associated_func.strip().endswith(']'):
                # Use ast.literal_eval for safe evaluation (safer than eval)
                non_associated_entries = ast.literal_eval(non_associated_func)
            else:
                # Treat as single string entry
                non_associated_entries = [non_associated_func] if non_associated_func else []
        elif isinstance(non_associated_func, list):
            non_associated_entries = non_associated_func
        else:
            logger.warning(f"Unexpected type for non_associated_function: {type(non_associated_func)}")
            non_associated_entries = []
    except (ValueError, SyntaxError) as e:
        logger.warning(f"Error parsing non_associated_function: {e}")
        non_associated_entries = []
    
    if not non_associated_entries:
        return ''
    
    # Normalize description for matching
    description = ''
    if pd.notna(row.get('description')):
        description = str(row['description']).lower().replace("-", " ").strip()
    
    # Get PubMed ID
    pubmed_id = "-1"  # Sentinel value that won't match
    if 'pubmed_id' in row.index and pd.notna(row.get('pubmed_id')):
        pubmed_id = str(row['pubmed_id'])
    
    # Find matching entries
    associated_info_parts = []
    
    for entry in non_associated_entries:
        if not isinstance(entry, str):
            entry = str(entry)
        
        entry_normalized = entry.lower().replace("-", " ")
        
        # Check if description or PubMed ID matches
        description_match = description and description in entry_normalized
        pubmed_match = pubmed_id != "-1" and pubmed_id in entry
        
        if description_match or pubmed_match:
            associated_info_parts.append(entry)
    
    result = ' '.join(associated_info_parts) if associated_info_parts else ''
    
    if result:
        logger.debug(f"Updated associated_function: {result[:50]}...")
    
    return result

<b>PubMed Function: </b>Function to fetch abstract, title, authors, and doi for a PubMed ID

In [9]:
def search_pubmed_by_title(title: str) -> List[str]:
    """
    Search PubMed for articles matching a given title.
    
    Args:
        title: Article title to search for.
    
    Returns:
        List of PubMed IDs (strings) matching the title.
        Returns empty list if no matches found or on error.
    
    Raises:
        ValueError: If title is empty or invalid.
    """
    if not title or not isinstance(title, str):
        raise ValueError("title must be a non-empty string")
    
    try:
        logger.debug(f"Searching PubMed for title: {title[:50]}...")
        handle = Entrez.esearch(db="pubmed", term=title, retmax=10)
        record = Entrez.read(handle)
        handle.close()
        
        pubmed_ids = record.get("IdList", [])
        logger.info(f"Found {len(pubmed_ids)} PubMed IDs for title search")
        return pubmed_ids
        
    except Exception as e:
        logger.error(f"Error searching PubMed by title '{title[:50]}...': {e}")
        return []

def fetch_details(row: pd.Series, row_num: int, total_rows: int) -> Optional[Dict[str, Any]]:
    """
    Fetch article details from PubMed API.
    
    Retrieves abstract, title, authors, and DOI from PubMed using either:
    1. Direct PubMed ID (if available)
    2. Title search (if PubMed ID is missing)
    
    Args:
        row: Pandas Series containing 'pubmed_id' and optionally 'title'.
        row_num: Current row number (for progress tracking).
        total_rows: Total number of rows being processed.
    
    Returns:
        Dictionary containing:
            - 'pubmed_id': PubMed ID
            - 'abstract': Article abstract
            - 'title': Article title
            - 'authors': Formatted author list
            - 'doi': DOI if available
        
        Returns None if fetch fails or no data found.
    
    Example:
        >>> row = pd.Series({'pubmed_id': '12345678', 'title': 'Test Article'})
        >>> details = fetch_details(row, 1, 10)
        >>> print(details['abstract'])
    """
    if not isinstance(row, pd.Series):
        raise ValueError("row must be a pandas Series")
    
    details: Dict[str, Any] = {}
    pubmed_id = row.get('pubmed_id')
    
    # Try to get PubMed ID or search by title
    if pd.isnull(pubmed_id) or pubmed_id == '':
        title = row.get('title', '')
        if pd.notna(title) and title:
            logger.info(f"Row {row_num}/{total_rows}: Searching PubMed by title")
            matching_ids = search_pubmed_by_title(str(title))
            if matching_ids:
                pubmed_id = matching_ids[0]
                logger.info(f"Row {row_num}/{total_rows}: Found PubMed ID {pubmed_id} for title")
            else:
                logger.warning(f"Row {row_num}/{total_rows}: No PubMed ID found for title '{title[:50]}...'")
                return None
        else:
            logger.warning(f"Row {row_num}/{total_rows}: No PubMed ID or title available")
            return None
    
    # Convert to string if needed
    pubmed_id = str(pubmed_id)
    
    # Fetch details by PubMed ID
    try:
        logger.debug(f"Row {row_num}/{total_rows}: Fetching PubMed ID {pubmed_id}")
        handle = Entrez.efetch(db="pubmed", id=pubmed_id, retmode="xml")
        record = Entrez.read(handle)
        handle.close()
        
        if not record.get('PubmedArticle'):
            logger.warning(f"Row {row_num}/{total_rows}: No article data for PubMed ID {pubmed_id}")
            return None
        
        article = record['PubmedArticle'][0]
        citation = article.get('MedlineCitation', {})
        article_data = citation.get('Article', {})
        
        # Extract abstract
        try:
            abstract_elem = article_data.get('Abstract', {})
            abstract_text = abstract_elem.get('AbstractText', [])
            details['abstract'] = abstract_text[0] if abstract_text else None
        except (KeyError, IndexError, TypeError):
            details['abstract'] = None
        
        # Extract title
        try:
            details['title'] = article_data.get('ArticleTitle', None)
        except (KeyError, TypeError):
            details['title'] = None
        
        # Extract and format authors
        try:
            author_list_elem = article_data.get('AuthorList', [])
            author_list = [
                f"{author.get('LastName', '')}, {author.get('Initials', '')}."
                for author in author_list_elem
                if author.get('LastName') and author.get('Initials')
            ]
            
            if len(author_list) > 1:
                details['authors'] = f"{author_list[0]} et al."
            elif len(author_list) == 1:
                details['authors'] = author_list[0]
            else:
                details['authors'] = None
        except (KeyError, TypeError, AttributeError):
            details['authors'] = None
        
        # Extract DOI
        try:
            pubmed_data = article.get('PubmedData', {})
            article_id_list = pubmed_data.get('ArticleIdList', [])
            
            details['doi'] = None
            for article_id in article_id_list:
                if hasattr(article_id, 'attributes') and article_id.attributes.get('IdType') == "doi":
                    details['doi'] = str(article_id)
                    break
        except (KeyError, TypeError, AttributeError):
            details['doi'] = None
        
        details['pubmed_id'] = pubmed_id
        
        logger.info(f"Row {row_num}/{total_rows}: Successfully fetched details for PubMed ID {pubmed_id}")
        return details
        
    except Exception as e:
        logger.error(f"Row {row_num}/{total_rows}: Error fetching details for PubMed ID {pubmed_id}: {e}")
        return None

<b>Function: </b>Loops through a speciese specific list of proteins, fetching info from Uniprot, then returns two data frames with and without fetchable reference data named after a the provided species

In [10]:
def process_species_data(protein_ids_list: List[str]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Process multiple protein IDs and aggregate results into DataFrames.
    
    Iterates through a list of protein IDs, fetches data from UniProt for each,
    and combines all results into two DataFrames (peptides and references).
    
    Args:
        protein_ids_list: List of UniProt protein accession IDs to process.
    
    Returns:
        Tuple containing:
            - peptides_raw: Combined DataFrame of all peptide data.
            - references_raw: Combined DataFrame of all reference data.
        
        Returns empty DataFrames if input list is empty or all requests fail.
    
    Raises:
        ValueError: If protein_ids_list is empty or invalid.
    
    Example:
        >>> protein_ids = ["P02666", "P02668", "P02788"]
        >>> peptides_df, refs_df = process_species_data(protein_ids)
        >>> print(f"Processed {len(peptides_df)} total peptides")
    """
    if not protein_ids_list:
        logger.warning("Empty protein_ids_list provided")
        return pd.DataFrame(), pd.DataFrame()
    
    if not isinstance(protein_ids_list, list):
        raise ValueError("protein_ids_list must be a list")
    
    reference_data_dfs = []
    peptide_data_dfs = []
    failed_proteins = []
    
    total_proteins = len(protein_ids_list)
    logger.info(f"Starting processing of {total_proteins} proteins")
    
    try:
        for index, protein_id in enumerate(protein_ids_list, start=1):
            if not protein_id or not isinstance(protein_id, str):
                logger.warning(f"Skipping invalid protein_id at index {index}: {protein_id}")
                failed_proteins.append(protein_id)
                continue
            
            start_time = time.time()
            
            try:
                peptide_data_df, reference_data_df = fetch_protein_info(protein_id)
                elapsed_time = time.time() - start_time
                
                # Only append non-empty DataFrames
                if not peptide_data_df.empty:
                    peptide_data_dfs.append(peptide_data_df)
                if not reference_data_df.empty:
                    reference_data_dfs.append(reference_data_df)
                
                logger.info(f"Processed {protein_id} ({index}/{total_proteins}) "
                           f"in {elapsed_time:.2f}s - "
                           f"{len(peptide_data_df)} peptides, {len(reference_data_df)} references")
                
            except Exception as e:
                elapsed_time = time.time() - start_time
                logger.error(f"Error processing {protein_id} ({index}/{total_proteins}): {e}")
                failed_proteins.append(protein_id)
        
        # Combine all DataFrames
        if peptide_data_dfs:
            peptides_raw = pd.concat(peptide_data_dfs, ignore_index=True)
        else:
            logger.warning("No peptide data collected")
            peptides_raw = pd.DataFrame()
        
        if reference_data_dfs:
            references_raw = pd.concat(reference_data_dfs, ignore_index=True)
        else:
            logger.warning("No reference data collected")
            references_raw = pd.DataFrame()
        
        # Summary logging
        success_count = total_proteins - len(failed_proteins)
        logger.info(f"Processing complete: {success_count}/{total_proteins} proteins successful")
        logger.info(f"Total peptides: {len(peptides_raw)}, "
                   f"Total references: {len(references_raw)}")
        
        if failed_proteins:
            logger.warning(f"Failed proteins: {failed_proteins}")
        
        return peptides_raw, references_raw
        
    except Exception as e:
        logger.error(f"Unexpected error in process_species_data: {e}")
        raise   

<b>Function: </b>Prints inportant info on the dataframes

In [11]:
def print_critical_info(df: pd.DataFrame, df_name: str) -> None:
    """
    Print comprehensive summary information about a DataFrame.
    
    Displays shape, columns, missing values, and basic statistics for
    data quality assessment and debugging.
    
    Args:
        df: Pandas DataFrame to analyze.
        df_name: Descriptive name for the DataFrame (used in output).
    
    Example:
        >>> df = pd.DataFrame({'col1': [1, 2, None], 'col2': ['a', 'b', 'c']})
        >>> print_critical_info(df, "test_dataframe")
    """
    if not isinstance(df, pd.DataFrame):
        logger.error(f"Invalid input: expected DataFrame, got {type(df)}")
        return
    
    if df.empty:
        logger.warning(f"DataFrame '{df_name}' is empty")
        print(f"Information for DataFrame: {df_name}")
        print("-" * 40)
        print("DataFrame is empty (0 rows)")
        print("=" * 40 + "\n")
        return
    
    print(f"Information for DataFrame: {df_name}")
    print("-" * 40)
    
    # Shape and size information
    print(f"Number of rows (peptides): {df.shape[0]:,}")
    print(f"Number of columns: {df.shape[1]}")
    print(f"Shape: {df.shape}")
    
    # Column information
    print(f"\nColumns ({len(df.columns)}): {df.columns.tolist()}")
    
    # Missing values analysis
    missing_counts = df.isnull().sum()
    missing_pct = (missing_counts / len(df) * 100).round(2)
    
    print("\nMissing Values:")
    if missing_counts.sum() == 0:
        print("  No missing values")
    else:
        missing_df = pd.DataFrame({
            'Count': missing_counts,
            'Percentage': missing_pct
        })
        missing_df = missing_df[missing_df['Count'] > 0].sort_values('Count', ascending=False)
        print(missing_df.to_string())
    
    # Memory usage
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2
    print(f"\nMemory usage: {memory_mb:.2f} MB")
    
    print("\n" + "=" * 40 + "\n")
    
    # Log summary
    logger.info(f"DataFrame '{df_name}': {df.shape[0]} rows, {df.shape[1]} columns, "
               f"{missing_counts.sum()} missing values")

<b>Function:</b> Combines peptide and reference data, expanding the peptide list to create one entry per reference when multiple references are associated with a peptide.

In [12]:
def merge_peptides_with_references(
    peptides_raw: pd.DataFrame,
    references_raw: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Merge peptide data with reference data based on PubMed IDs and intervals.
    
    This function performs a complex merge operation that:
    1. Explodes peptide data with multiple PubMed IDs (one row per PubMed ID)
    2. Merges peptides with references using PubMed IDs when available
    3. Falls back to interval-based matching when PubMed IDs are missing
    4. Creates a unified 'pubmed_id' column resolving conflicts between sources
    5. Separates peptides with references from those without references
    
    Args:
        peptides_raw: DataFrame containing peptide data with columns:
            - 'proteinID': UniProt protein accession ID
            - 'evidence_pubmed': List of PubMed IDs (can be list or single value)
            - 'interval': Peptide interval string (e.g., '1-10')
            - Other peptide-related columns
        references_raw: DataFrame containing reference data with columns:
            - 'proteinID': UniProt protein accession ID
            - 'pubmed': PubMed ID
            - 'scope_range': Peptide interval range (e.g., '1-10')
            - Other reference-related columns
    
    Returns:
        Tuple containing:
            - peptides_with_references: DataFrame of peptides successfully matched with references.
              Contains peptides that have PubMed IDs or matched intervals.
            - no_peptides_with_references: DataFrame of peptides without matching references.
              Contains peptides that couldn't be matched to any reference.
    
    Raises:
        ValueError: If required columns are missing from input DataFrames.
        TypeError: If inputs are not pandas DataFrames.
    
    Example:
        >>> peptide_df = pd.DataFrame({
        ...     'proteinID': ['P02666', 'P02666'],
        ...     'evidence_pubmed': [['12345678'], None],
        ...     'interval': ['1-10', '20-30']
        ... })
        >>> ref_df = pd.DataFrame({
        ...     'proteinID': ['P02666'],
        ...     'pubmed': ['12345678'],
        ...     'scope_range': ['1-10']
        ... })
        >>> with_refs, without_refs = merge_peptides_with_references(peptide_df, ref_df)
        >>> print(f"Peptides with refs: {len(with_refs)}, without refs: {len(without_refs)}")
    """
    # Input validation
    if not isinstance(peptides_raw, pd.DataFrame):
        raise TypeError("peptides_raw must be a pandas DataFrame")
    
    if not isinstance(references_raw, pd.DataFrame):
        raise TypeError("references_raw must be a pandas DataFrame")
    
    # Check required columns
    required_peptide_cols = ['proteinID', 'evidence_pubmed', 'interval']
    required_ref_cols = ['proteinID', 'pubmed', 'scope_range']
    
    missing_peptide_cols = [col for col in required_peptide_cols if col not in peptides_raw.columns]
    missing_ref_cols = [col for col in required_ref_cols if col not in references_raw.columns]
    
    if missing_peptide_cols:
        raise ValueError(f"Missing required columns in peptides_raw: {missing_peptide_cols}")
    
    if missing_ref_cols:
        raise ValueError(f"Missing required columns in references_raw: {missing_ref_cols}")
    
    # Handle empty DataFrames
    if peptides_raw.empty:
        logger.warning("peptides_raw is empty, returning empty DataFrames")
        return pd.DataFrame(), pd.DataFrame()
    
    if references_raw.empty:
        logger.warning("references_raw is empty, returning peptides without references")
        return pd.DataFrame(), peptides_raw.copy()
    
    try:
        logger.info(f"Starting merge: {len(peptides_raw)} peptides, {len(references_raw)} references")
        
        # Step 1: Explode the 'evidence_pubmed' column (list to rows)
        # This creates one row for each PubMed ID in the list
        expanded_df = peptides_raw.explode('evidence_pubmed')
        logger.debug(f"After explode: {len(expanded_df)} rows")
        
        # Step 2: Split into rows with and without PubMed IDs
        filtered_df = expanded_df.dropna(subset=['evidence_pubmed'])
        no_pubmed_df = expanded_df[expanded_df['evidence_pubmed'].isna()].copy()
        
        logger.debug(f"Peptides with PubMed IDs: {len(filtered_df)}, without: {len(no_pubmed_df)}")
        
        # Step 3: Merge peptides with PubMed IDs to references
        merged_df = pd.merge(
            filtered_df,
            references_raw,
            left_on=['proteinID', 'evidence_pubmed'],
            right_on=['proteinID', 'pubmed'],
            how='left'
        )
        
        # Step 4: Merge peptides without PubMed IDs using interval matching
        merged_df_nopubmed_df = pd.merge(
            no_pubmed_df,
            references_raw,
            left_on=['proteinID', 'interval'],
            right_on=['proteinID', 'scope_range'],
            how='left'
        )
        
        # Step 5: Extract peptides with matching intervals but no PubMed ID in reference
        matching_interval_df_no_pubmed = merged_df_nopubmed_df[
            (merged_df_nopubmed_df['interval'] == merged_df_nopubmed_df['scope_range']) &
            merged_df_nopubmed_df['pubmed'].isna()
        ].copy()
        
        # Step 6: Combine all merged data
        full_merged_df = pd.concat([merged_df, merged_df_nopubmed_df], ignore_index=True)
        
        # Step 7: Create unified 'pubmed_id' column with conflict resolution logic
        # Priority: evidence_pubmed > pubmed > match if equal > None if conflict
        def resolve_pubmed_id(row):
            """Resolve PubMed ID from evidence_pubmed and pubmed columns."""
            evidence_pubmed = row.get('evidence_pubmed')
            pubmed = row.get('pubmed')
            
            # If pubmed is missing, use evidence_pubmed
            if pd.isna(pubmed):
                return evidence_pubmed
            
            # If evidence_pubmed is missing, use pubmed
            if pd.isna(evidence_pubmed):
                return pubmed
            
            # If both exist and match, use either (prefer evidence_pubmed)
            if evidence_pubmed == pubmed:
                return evidence_pubmed
            
            # If both exist but don't match, return None (conflict)
            return None
        
        full_merged_df['pubmed_id'] = full_merged_df.apply(resolve_pubmed_id, axis=1)
        
        # Step 8: Separate peptides with and without references
        peptides_with_references = full_merged_df[full_merged_df['pubmed_id'].notna()].copy()
        
        # Add peptides with matching intervals but no PubMed ID
        peptides_with_references = pd.concat([peptides_with_references, matching_interval_df_no_pubmed], ignore_index=True)
        
        # Step 9: Identify peptides without references
        # Create unique identifier for matching
        full_merged_df['uid'] = (
            full_merged_df['proteinID'] + '_' + 
            full_merged_df['interval'].astype(str)
        )
        matching_interval_df_no_pubmed['uid'] = (
            matching_interval_df_no_pubmed['proteinID'] + '_' + 
            matching_interval_df_no_pubmed['interval'].astype(str)
        )
        
        # Check which peptides are in the matching set
        full_merged_df['in_matching'] = full_merged_df['uid'].isin(matching_interval_df_no_pubmed['uid'])
        
        # Extract peptides without references (no pubmed_id and not in matching set)
        no_peptides_with_references = full_merged_df[
            (full_merged_df['pubmed_id'].isna()) & 
            (~full_merged_df['in_matching'])
        ].copy()
        
        # Step 10: Clean up temporary columns
        cleanup_cols = ['uid', 'in_matching']
        peptides_with_references.drop(columns=[col for col in cleanup_cols if col in peptides_with_references.columns], inplace=True, errors='ignore')
        no_peptides_with_references.drop(columns=cleanup_cols, inplace=True, errors='ignore')
        full_merged_df.drop(columns=cleanup_cols, inplace=True, errors='ignore')
        matching_interval_df_no_pubmed.drop(columns=['uid'], inplace=True, errors='ignore')
        
        # Log results
        logger.info(f"Merge complete: {len(peptides_with_references)} peptides with references, "
                   f"{len(no_peptides_with_references)} peptides without references")
        
        return peptides_with_references, no_peptides_with_references
        
    except Exception as e:
        logger.error(f"Error merging peptides with references: {e}")
        raise

<b>Function:</b> Fetchess abstract and reference infromation from PubMed, takes ~1 second per row

In [13]:
def fetch_pubmed_details_batch(
    peptides_with_references: pd.DataFrame,
    batch_size: Optional[int] = None,
    delay_between_requests: float = 0.1
) -> pd.DataFrame:
    """
    Fetch PubMed details for all rows in a DataFrame using the fetch_details function.
    
    Processes each row of the input DataFrame, calls fetch_details to retrieve
    article information (abstract, title, authors, DOI) from PubMed API, and
    returns a DataFrame with the extracted information.
    
    Args:
        peptides_with_references: DataFrame containing rows with 'pubmed_id' and optionally 'title'.
                       Each row will be processed to fetch PubMed details.
        batch_size: Optional batch size for processing. If None, processes all rows at once.
                   Useful for large datasets to avoid memory issues.
        delay_between_requests: Delay in seconds between API requests to avoid rate limiting.
                               Default: 0.1 seconds (NCBI recommends at least 0.1s).
    
    Returns:
        DataFrame with columns:
            - 'pubmed_id': PubMed ID
            - 'abstract': Article abstract (if available)
            - 'title': Article title (if available)
            - 'authors': Formatted author list (if available)
            - 'doi': Digital Object Identifier (if available)
        
        Rows correspond to input DataFrame rows. None values indicate missing data.
    
    Raises:
        ValueError: If peptides_with_references is empty or missing required columns.
        TypeError: If peptides_with_references is not a pandas DataFrame.
    
    Example:
        >>> df = pd.DataFrame({
        ...     'pubmed_id': ['12345678', '87654321'],
        ...     'proteinID': ['P02666', 'P02668']
        ... })
        >>> details_df = fetch_pubmed_details_batch(df)
        >>> print(f"Fetched details for {len(details_df)} articles")
    
    Note:
        This function makes API calls to PubMed/NCBI Entrez. Be mindful of:
        - Rate limiting (NCBI recommends max 3 requests/second)
        - Network latency
        - API availability
        
        The function includes automatic delays between requests to comply with
        NCBI usage guidelines.
    """
    # Input validation
    if not isinstance(peptides_with_references, pd.DataFrame):
        raise TypeError("peptides_with_references must be a pandas DataFrame")
    
    if peptides_with_references.empty:
        logger.warning("peptides_with_references is empty, returning empty DataFrame")
        return pd.DataFrame(columns=['pubmed_id', 'abstract', 'title', 'authors', 'doi'])
    
    # Check for required columns (at least one of pubmed_id or title should exist)
    has_pubmed_id = 'pubmed_id' in peptides_with_references.columns
    has_title = 'title' in peptides_with_references.columns
    
    if not has_pubmed_id and not has_title:
        raise ValueError(
            "peptides_with_references must contain at least one of: 'pubmed_id' or 'title' columns"
        )
    
    total_rows = len(peptides_with_references)
    logger.info(f"Starting batch fetch of PubMed details for {total_rows} rows")
    
    # Initialize results list for efficient appending
    results_list = []
    failed_count = 0
    
    try:
        # Process rows with progress tracking
        for idx, (_, row) in enumerate(peptides_with_references.iterrows(), start=1):
            try:
                # Fetch details for this row
                details = fetch_details(row, idx, total_rows)
                
                if details:
                    results_list.append(details)
                else:
                    # Create entry with None values if fetch failed
                    results_list.append({
                        'pubmed_id': row.get('pubmed_id') if has_pubmed_id else None,
                        'abstract': None,
                        'title': None,
                        'authors': None,
                        'doi': None
                    })
                    failed_count += 1
                
                # Add delay between requests to avoid rate limiting
                if delay_between_requests > 0 and idx < total_rows:
                    time.sleep(delay_between_requests)
                    
            except Exception as e:
                logger.error(f"Error processing row {idx}/{total_rows}: {e}")
                # Add entry with None values on error
                results_list.append({
                    'pubmed_id': row.get('pubmed_id') if has_pubmed_id else None,
                    'abstract': None,
                    'title': None,
                    'authors': None,
                    'doi': None
                })
                failed_count += 1
        
        # Convert results list to DataFrame efficiently
        # Single pass conversion is more efficient than multiple apply calls
        pubmed_metadata = pd.DataFrame(results_list)
        
        # Ensure all expected columns exist (handle missing keys gracefully)
        expected_columns = ['pubmed_id', 'abstract', 'title', 'authors', 'doi']
        for col in expected_columns:
            if col not in pubmed_metadata.columns:
                pubmed_metadata[col] = None
        
        # Reorder columns for consistency
        pubmed_metadata = pubmed_metadata[expected_columns]
        
        # Log summary
        success_count = total_rows - failed_count
        logger.info(
            f"Batch fetch complete: {success_count}/{total_rows} successful, "
            f"{failed_count} failed"
        )
        
        if failed_count > 0:
            logger.warning(f"{failed_count} rows failed to fetch details")
        
        # Log data quality metrics
        non_null_counts = pubmed_metadata.notna().sum()
        logger.debug("Data quality summary:")
        for col in expected_columns:
            logger.debug(f"  {col}: {non_null_counts[col]}/{total_rows} non-null")
        
        return pubmed_metadata
        
    except Exception as e:
        logger.error(f"Unexpected error in fetch_pubmed_details_batch: {e}")
        raise

# Bring in protein & MBPDB lists

MBPDB list imported from Summer 2023

In [14]:
mbpdb_data = pd.read_csv('data/exported_data.tsv', sep='\t')
mbpdb_data.rename(columns={'protein_pid': 'proteinID', 'intervals': 'interval'}, inplace=True)
mbpdb_function_list = list(set(mbpdb_data['function']))
mbpdb_data.head(n=3)

Unnamed: 0,peptide,proteinID,protein_desc,protein_species,interval,function,additional_details,ic50,inhibition_type,inhibited_microorganisms,ptm,title,authors,abstract,doi
0,YVPFP,P47710,Alpha-S1-casein,Homo sapiens,158-162,Anticancer,Inhibits TR7D breast cancer cell proliferation,,,,,Identification of a novel opioid peptide (Tyr-...,"Kampa, M. et al.",A new casomorphin pentapeptide (αS1-casomorphi...,10.1042/bj3190903
1,YVPFP,P47710,Alpha-S1-casein,Homo sapiens,158-162,Opioid,,,,,,Identification of a novel opioid peptide (Tyr-...,"Kampa, M. et al.",A new casomorphin pentapeptide (αS1-casomorphi...,10.1042/bj3190903
2,YLGYLE,P02662,Alpha-S1-casein,Bos taurus,106-111,ACE-inhibitory,,85.76,,,,In Silico and In Vitro Analysis of Multifuncti...,"Amigo, L. et al.","Currently, the associations between oxidative ...",10.3390/foods9080991


list of human milk proteins from OSU Proteome Discoverer Dave made from a few papers not comprehensive

In [15]:
# Read the file
with open('protein_lists/HumanMilkProteinDatabase_v2.fasta', 'r') as file:
    content = file.readlines()

# Extract unique protein IDs
hum_protein_ids = {line.split('|')[1] for line in content if line.startswith('>sp|')}
hum_protein_ids = list(hum_protein_ids)

print(f"Total protein IDs extracted: {len(hum_protein_ids)}")
# For debugging: print first 5 IDs to check format and uniqueness
print("First 5 protein IDs:", hum_protein_ids[:5])


Total protein IDs extracted: 382
First 5 protein IDs: ['P14618', 'Q99102', 'Q9UNQ0', 'P14174', 'P22897']


Test lists and list of proteins found in MBPDB peptides used for developement

In [16]:
#list of 42 proteins that currently have matches in the MBPDB
uniprot_ids = [    "P02666",    "P47710",    "P02662",    "P04653",    "P09115",    "P18626",    "O97943",    "O62823",    "P02663",    "P04654",   "P33049",    "A0A1L6KYI1",    "E9NZN2",    "P05814",    "P11839",    "P09116",    "Q9TSI0",    "Q9TVD0",    "P33048",    "P86273",    "A0A344X7B9",    "P02668",    "P07498",    "P02669",    "I6UFY2",    "P02670",    "P80195",    "P02754",    "P02755",    "P02756",    "P00711",    "P00710",    "P24627",    "P02788",    "P14632",    "Q29477",    "O77698",    "P14639",    "P02769",    "P67976",    "L8I8G5",    "P01966"]

#short lits of proteins to test
test_list=['P61278','O43612', 'P02788','P62158','P02666']

# Executes code and searches list

This is the code that runs the Uniport functions above given a protein list of interest, averages ~2 sec per protein

In [17]:
peptides_raw, references_raw = process_species_data(hum_protein_ids)

2025-11-21 14:58:13 - __main__ - INFO - Starting processing of 382 proteins
2025-11-21 14:58:15 - __main__ - INFO - Extracted 0 peptides and 3 function comments for P14618
2025-11-21 14:58:15 - __main__ - INFO - Successfully processed P14618: 0 peptides, 59 references
2025-11-21 14:58:15 - __main__ - INFO - Processed P14618 (1/382) in 1.91s - 0 peptides, 59 references
2025-11-21 14:58:17 - __main__ - INFO - Extracted 0 peptides and 1 function comments for Q99102
2025-11-21 14:58:17 - __main__ - INFO - Successfully processed Q99102: 0 peptides, 13 references
2025-11-21 14:58:17 - __main__ - INFO - Processed Q99102 (2/382) in 1.77s - 0 peptides, 13 references
2025-11-21 14:58:18 - __main__ - INFO - Extracted 0 peptides and 1 function comments for Q9UNQ0
2025-11-21 14:58:18 - __main__ - INFO - Successfully processed Q9UNQ0: 0 peptides, 46 references
2025-11-21 14:58:18 - __main__ - INFO - Processed Q9UNQ0 (3/382) in 1.89s - 0 peptides, 46 references
2025-11-21 14:58:20 - __main__ - INFO -

A series of merge steps to combine peptide and reference data, exands peptide list if multiple references are mentioned

In [18]:
# Execute the function with the current data
peptides_with_references, no_peptides_with_references = merge_peptides_with_references(
    peptides_raw,
    references_raw
)

2025-11-21 15:09:28 - __main__ - INFO - Starting merge: 129 peptides, 9086 references
2025-11-21 15:09:29 - __main__ - INFO - Merge complete: 244 peptides with references, 61 peptides without references


Fetchess abstract and reference infromation from PubMed, takes ~1 second per row

In [19]:
# Execute the function with the current data
pubmed_metadata = fetch_pubmed_details_batch(peptides_with_references)
# Display the first few rows of the pubmed_metadata DataFrame
pubmed_metadata.head(n=3)

2025-11-21 15:09:29 - __main__ - INFO - Starting batch fetch of PubMed details for 244 rows
2025-11-21 15:09:29 - __main__ - INFO - Row 1/244: Successfully fetched details for PubMed ID 12693969
2025-11-21 15:09:30 - __main__ - INFO - Row 2/244: Successfully fetched details for PubMed ID 1369293
2025-11-21 15:09:30 - __main__ - INFO - Row 3/244: Successfully fetched details for PubMed ID 1369293
2025-11-21 15:09:31 - __main__ - INFO - Row 4/244: Successfully fetched details for PubMed ID 1369293
2025-11-21 15:09:31 - __main__ - INFO - Row 5/244: Successfully fetched details for PubMed ID 11145122
2025-11-21 15:09:32 - __main__ - INFO - Row 6/244: Successfully fetched details for PubMed ID 2532366
2025-11-21 15:09:32 - __main__ - INFO - Row 7/244: Successfully fetched details for PubMed ID 8653797
2025-11-21 15:09:33 - __main__ - INFO - Row 8/244: Successfully fetched details for PubMed ID 7984506
2025-11-21 15:09:34 - __main__ - INFO - Row 9/244: Successfully fetched details for PubMed

Unnamed: 0,pubmed_id,abstract,title,authors,doi
0,12693969,A 31-amino acid synthetic peptide (NH(2)-FFSAS...,Potassium efflux induced by a new lactoferrin-...,"Viejo-Díaz, M. et al.",10.1023/a:1022657630698
1,1369293,Peptides with affinity for opioid receptors we...,Isolation and characterization of opioid antag...,"Tani, F. et al.",
2,1369293,Peptides with affinity for opioid receptors we...,Isolation and characterization of opioid antag...,"Tani, F. et al.",


Merge and Clean Peptide Data

In [20]:
def merge_and_clean_peptide_data(
    peptides_with_references: pd.DataFrame,
    pubmed_metadata: pd.DataFrame,
    no_peptides_with_references: pd.DataFrame,
    columns_to_drop_reff: Optional[List[str]] = None,
    columns_to_drop_no_reff: Optional[List[str]] = None
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Merge PubMed details with peptide data and clean up unnecessary columns.
    
    This function:
    1. Removes duplicate PubMed details based on pubmed_id
    2. Merges PubMed details with peptides that have references
    3. Fills missing values in title, doi, and authors from PubMed data
    4. Removes intermediate/helper columns from both DataFrames
    5. Returns cleaned final DataFrames
    
    Args:
        peptides_with_references: DataFrame containing peptides with references.
                       Must contain 'pubmed_id' column.
        pubmed_metadata: DataFrame containing fetched PubMed details.
                       Must contain 'pubmed_id', 'title', 'doi', 'authors', 'abstract'.
        no_peptides_with_references: DataFrame containing peptides without references.
        columns_to_drop_reff: Optional list of columns to drop from peptides_final_with_refs.
                             If None, uses default list of intermediate columns.
        columns_to_drop_no_reff: Optional list of columns to drop from peptides_final_no_refs.
                                 If None, uses default list of intermediate columns.
    
    Returns:
        Tuple containing:
            - peptides_final_with_refs: Cleaned DataFrame with peptides that have references.
            - peptides_final_no_refs: Cleaned DataFrame with peptides without references.
    
    Raises:
        ValueError: If required columns are missing from input DataFrames.
        TypeError: If inputs are not pandas DataFrames.
    
    Example:
        >>> reff_df = pd.DataFrame({'pubmed_id': ['123'], 'title': [None]})
        >>> pubmed_df = pd.DataFrame({'pubmed_id': ['123'], 'title': ['Article']})
        >>> no_reff_df = pd.DataFrame({'proteinID': ['P02666']})
        >>> reff_final, no_reff_final = merge_and_clean_peptide_data(reff_df, pubmed_df, no_reff_df)
    """
    # Input validation
    if not isinstance(peptides_with_references, pd.DataFrame):
        raise TypeError("peptides_with_references must be a pandas DataFrame")
    
    if not isinstance(pubmed_metadata, pd.DataFrame):
        raise TypeError("pubmed_metadata must be a pandas DataFrame")
    
    if not isinstance(no_peptides_with_references, pd.DataFrame):
        raise TypeError("peptides_without_references must be a pandas DataFrame")
    
    # Check required columns
    if 'pubmed_id' not in peptides_with_references.columns:
        raise ValueError("peptides_with_references must contain 'pubmed_id' column")
    
    if 'pubmed_id' not in pubmed_metadata.columns:
        raise ValueError("pubmed_metadata must contain 'pubmed_id' column")
    
    # Default columns to drop
    default_reff_drop_cols = [
        'title_pubmed', 'doi_pubmed', 'authors_pubmed', 'pubmed',
        'feature_evidence', 'evidence_to_pubmed', 'evidence_pubmed',
        'comment_evidence', 'reference key', 'scope_range'
    ]
    
    default_no_reff_drop_cols = [
        'pubmed', 'feature_evidence', 'evidence_to_pubmed', 'evidence_pubmed',
        'comment_evidence', 'reference key', 'title', 'authors', 'doi',
        'scope_range', 'pubmed_id'
    ]
    
    # Use provided columns or defaults
    reff_drop_cols = columns_to_drop_reff if columns_to_drop_reff is not None else default_reff_drop_cols
    no_reff_drop_cols = columns_to_drop_no_reff if columns_to_drop_no_reff is not None else default_no_reff_drop_cols
    
    try:
        logger.info("Starting merge and cleanup of peptide data")
        
        # Step 1: Remove duplicate PubMed details (keep first occurrence)
        if pubmed_metadata.empty:
            logger.warning("pubmed_metadata is empty, skipping merge")
            pubmed_metadata_unique = pd.DataFrame(columns=pubmed_metadata.columns)
        else:
            initial_count = len(pubmed_metadata)
            pubmed_metadata_unique = pubmed_metadata.drop_duplicates(subset='pubmed_id', keep='first')
            duplicates_removed = initial_count - len(pubmed_metadata_unique)
            if duplicates_removed > 0:
                logger.info(f"Removed {duplicates_removed} duplicate PubMed entries")
        
        # Step 2: Merge PubMed details with peptides that have references
        if peptides_with_references.empty:
            logger.warning("peptides_with_references is empty")
            peptides_final_with_refs = peptides_with_references.copy()
        else:
            peptides_final_with_refs = pd.merge(
                peptides_with_references,
                pubmed_metadata_unique,
                on='pubmed_id',
                how='left',
                suffixes=('', '_pubmed')
            )
            logger.debug(f"Merged {len(peptides_final_with_refs)} rows with PubMed details")
        
        # Step 3: Fill missing values using vectorized operations (much faster than apply)
        # Use combine_first or fillna for better performance
        fill_columns = ['title', 'doi', 'authors']
        
        for col in fill_columns:
            pubmed_col = f'{col}_pubmed'
            
            if col in peptides_final_with_refs.columns and pubmed_col in peptides_final_with_refs.columns:
                # Use vectorized fillna instead of apply for better performance
                peptides_final_with_refs[col] = peptides_final_with_refs[col].fillna(
                    peptides_final_with_refs[pubmed_col]
                )
                
                # Log how many values were filled
                filled_count = peptides_final_with_refs[col].notna().sum() - (
                    peptides_final_with_refs[col].notna().sum() if col in peptides_with_references.columns 
                    else 0
                )
                if filled_count > 0:
                    logger.debug(f"Filled {filled_count} missing values in '{col}' column")
            elif col not in peptides_final_with_refs.columns and pubmed_col in peptides_final_with_refs.columns:
                # If original column doesn't exist, use the PubMed column
                peptides_final_with_refs[col] = peptides_final_with_refs[pubmed_col]
        
        # Step 4: Drop unnecessary columns from peptides_final_with_refs
        # Only drop columns that actually exist
        existing_reff_drop_cols = [col for col in reff_drop_cols if col in peptides_final_with_refs.columns]
        if existing_reff_drop_cols:
            peptides_final_with_refs = peptides_final_with_refs.drop(columns=existing_reff_drop_cols)
            logger.debug(f"Dropped {len(existing_reff_drop_cols)} columns from peptides_final_with_refs")
        
        # Step 5: Drop unnecessary columns from no_peptides_with_references
        existing_no_reff_drop_cols = [col for col in no_reff_drop_cols if col in no_peptides_with_references.columns]
        if existing_no_reff_drop_cols:
            peptides_final_no_refs = no_peptides_with_references.drop(columns=existing_no_reff_drop_cols)
            logger.debug(f"Dropped {len(existing_no_reff_drop_cols)} columns from peptides_final_no_refs")
        else:
            peptides_final_no_refs = no_peptides_with_references.copy()
        
        # Log summary statistics
        logger.info(
            f"Cleanup complete: "
            f"peptides_final_with_refs: {len(peptides_final_with_refs)} rows, "
            f"{len(peptides_final_with_refs.columns)} columns; "
            f"peptides_final_no_refs: {len(peptides_final_no_refs)} rows, "
            f"{len(peptides_final_no_refs.columns)} columns"
        )
        
        # Log data quality information
        if not peptides_final_with_refs.empty:
            logger.info("Data quality summary for peptides_final_with_refs:")
            for col in ['title', 'doi', 'authors', 'abstract']:
                if col in peptides_final_with_refs.columns:
                    non_null_count = peptides_final_with_refs[col].notna().sum()
                    pct = (non_null_count / len(peptides_final_with_refs) * 100) if len(peptides_final_with_refs) > 0 else 0
                    logger.info(f"  {col}: {non_null_count}/{len(peptides_final_with_refs)} ({pct:.1f}%)")
        
        return peptides_final_with_refs, peptides_final_no_refs
        
    except Exception as e:
        logger.error(f"Error in merge_and_clean_peptide_data: {e}")
        raise


# Execute the function with the current data
peptides_final_with_refs, peptides_final_no_refs = merge_and_clean_peptide_data(
    peptides_with_references,
    pubmed_metadata,
    no_peptides_with_references
)

# Display summary information using the print_critical_info function
logger.info("=" * 60)
logger.info("Final DataFrames Summary")
logger.info("=" * 60)
print_critical_info(peptides_final_no_refs, "peptides_final_no_refs")
print_critical_info(peptides_final_with_refs, "peptides_final_with_refs")

# Display the first few rows of the peptides_final_no_refs DataFrame
peptides_final_no_refs.head(n=3)

2025-11-21 15:11:45 - __main__ - INFO - Starting merge and cleanup of peptide data
2025-11-21 15:11:45 - __main__ - INFO - Removed 140 duplicate PubMed entries
2025-11-21 15:11:45 - __main__ - INFO - Cleanup complete: peptides_final_with_refs: 244 rows, 13 columns; peptides_final_no_refs: 61 rows, 8 columns
2025-11-21 15:11:45 - __main__ - INFO - Data quality summary for peptides_final_with_refs:
2025-11-21 15:11:45 - __main__ - INFO -   title: 244/244 (100.0%)
2025-11-21 15:11:45 - __main__ - INFO -   doi: 241/244 (98.8%)
2025-11-21 15:11:45 - __main__ - INFO -   authors: 244/244 (100.0%)
2025-11-21 15:11:45 - __main__ - INFO -   abstract: 229/244 (93.9%)
2025-11-21 15:11:45 - __main__ - INFO - Final DataFrames Summary
2025-11-21 15:11:45 - __main__ - INFO - DataFrame 'peptides_final_no_refs': 61 rows, 8 columns, 20 missing values
2025-11-21 15:11:45 - __main__ - INFO - DataFrame 'peptides_final_with_refs': 244 rows, 13 columns, 237 missing values


Information for DataFrame: peptides_final_no_refs
----------------------------------------
Number of rows (peptides): 61
Number of columns: 8
Shape: (61, 8)

Columns (8): ['proteinID', 'protein_name', 'protein_desc', 'interval', 'peptide', 'description', 'associated_function', 'non_associated_function']

Missing Values:
                         Count  Percentage
non_associated_function     20       32.79

Memory usage: 0.04 MB


Information for DataFrame: peptides_final_with_refs
----------------------------------------
Number of rows (peptides): 244
Number of columns: 13
Shape: (244, 13)

Columns (13): ['proteinID', 'protein_name', 'protein_desc', 'interval', 'peptide', 'description', 'associated_function', 'non_associated_function', 'title', 'authors', 'doi', 'pubmed_id', 'abstract']

Missing Values:
                         Count  Percentage
non_associated_function    218       89.34
abstract                    15        6.15
doi                          3        1.23
pubmed_id     

Unnamed: 0,proteinID,protein_name,protein_desc,interval,peptide,description,associated_function,non_associated_function
228,P02788,TRFL_HUMAN,Lactotransferrin,20-67,GRRRSVQWCAVSQPEATKCFQWQRNMRKVRGPPVSCIKRDSPIQCIQA,Lactoferricin-H,,[Transferrins are iron binding transport prote...
230,P01275,GLUC_HUMAN,Pro-glucagon,21-50,RSLQDTEEKSRSFSASQADPLSDPDQMNED,Glicentin-related polypeptide,,[Plays a key role in glucose metabolism and ho...
231,P61278,SMS_HUMAN,Somatostatin,89-116,SANSNPAMAPRERKAGCKNFFWKTFTSC,Somatostatin-28,,"[Inhibits the secretion of pituitary hormones,..."


Extracteds asccociated function from non_asscoiated by referencing pubmed ID and description column

In [21]:
def update_and_clean_associated_functions(
    peptides_final_with_refs: pd.DataFrame,
    peptides_final_no_refs: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[str, int]]:
    """
    Update associated_function values and clean up non_associated_function column.
    
    This function:
    1. Applies update_associated_function to fill missing associated_function values
       by matching descriptions or PubMed IDs with non_associated_function entries
    2. Tracks how many associated_functions were successfully assigned
    3. Cleans up non_associated_function column by setting it to NaN where
       associated_function is now populated (to avoid redundancy)
    
    Args:
        peptides_final_with_refs: DataFrame containing peptides with references.
                             Must contain 'associated_function' and 'non_associated_function' columns.
        peptides_final_no_refs: DataFrame containing peptides without references.
                                 Must contain 'associated_function' and 'non_associated_function' columns.
    
    Returns:
        Tuple containing:
            - Updated peptides_final_with_refs with filled associated_function values
            - Updated peptides_final_no_refs with filled associated_function values
            - Dictionary with statistics:
                - 'reff_assigned': Number of associated_functions assigned in reff dataframe
                - 'no_reff_assigned': Number of associated_functions assigned in no_reff dataframe
                - 'reff_initial_empty': Initial count of empty values in reff dataframe
                - 'no_reff_initial_empty': Initial count of empty values in no_reff dataframe
    
    Raises:
        ValueError: If required columns are missing from input DataFrames.
        TypeError: If inputs are not pandas DataFrames.
    
    Example:
        >>> reff_df = pd.DataFrame({
        ...     'associated_function': ['', 'Function A'],
        ...     'non_associated_function': [['Function B'], None]
        ... })
        >>> no_reff_df = pd.DataFrame({
        ...     'associated_function': [''],
        ...     'non_associated_function': [['Function C']]
        ... })
        >>> reff_updated, no_reff_updated, stats = update_and_clean_associated_functions(reff_df, no_reff_df)
        >>> print(f"Assigned {stats['reff_assigned']} functions")
    """
    # Input validation
    if not isinstance(peptides_final_with_refs, pd.DataFrame):
        raise TypeError("peptides_final_with_refs must be a pandas DataFrame")
    
    if not isinstance(peptides_final_no_refs, pd.DataFrame):
        raise TypeError("peptides_final_no_refs must be a pandas DataFrame")
    
    # Check required columns
    required_cols = ['associated_function', 'non_associated_function']
    
    missing_reff_cols = [col for col in required_cols if col not in peptides_final_with_refs.columns]
    missing_no_reff_cols = [col for col in required_cols if col not in peptides_final_no_refs.columns]
    
    if missing_reff_cols:
        raise ValueError(f"Missing required columns in peptides_final_with_refs: {missing_reff_cols}")
    
    if missing_no_reff_cols:
        raise ValueError(f"Missing required columns in peptides_final_no_refs: {missing_no_reff_cols}")
    
    try:
        logger.info("Starting update of associated_function values")
        
        # Create copies to avoid modifying originals
        reff_df = peptides_final_with_refs.copy()
        no_reff_df = peptides_final_no_refs.copy()
        
        # Step 1: Calculate initial counts of empty associated_function
        # Count both empty strings and NaN values
        reff_empty_mask = (reff_df['associated_function'] == '') | reff_df['associated_function'].isna()
        no_reff_empty_mask = (no_reff_df['associated_function'] == '') | no_reff_df['associated_function'].isna()
        
        initial_empty_reff = reff_empty_mask.sum()
        initial_empty_no_reff = no_reff_empty_mask.sum()
        
        logger.debug(f"Initial empty associated_function: reff={initial_empty_reff}, no_reff={initial_empty_no_reff}")
        
        # Step 2: Apply update_associated_function to both dataframes
        logger.info("Applying update_associated_function to both dataframes...")
        reff_df['associated_function'] = reff_df.apply(update_associated_function, axis=1)
        no_reff_df['associated_function'] = no_reff_df.apply(update_associated_function, axis=1)
        
        # Step 3: Calculate how many were successfully assigned
        reff_final_empty_mask = (reff_df['associated_function'] == '') | reff_df['associated_function'].isna()
        no_reff_final_empty_mask = (no_reff_df['associated_function'] == '') | no_reff_df['associated_function'].isna()
        
        final_empty_reff = reff_final_empty_mask.sum()
        final_empty_no_reff = no_reff_final_empty_mask.sum()
        
        change_reff = initial_empty_reff - final_empty_reff
        change_no_reff = initial_empty_no_reff - final_empty_no_reff
        
        # Step 4: Clean up non_associated_function column
        # Set to NaN where associated_function is now populated (not empty and not NaN)
        reff_populated_mask = reff_df['associated_function'].notna() & (reff_df['associated_function'] != '')
        no_reff_populated_mask = no_reff_df['associated_function'].notna() & (no_reff_df['associated_function'] != '')
        
        if 'non_associated_function' in reff_df.columns and reff_populated_mask.any():
            reff_df.loc[reff_populated_mask, 'non_associated_function'] = np.nan
            cleaned_reff_count = reff_populated_mask.sum()
            logger.debug(f"Cleaned {cleaned_reff_count} non_associated_function values in reff dataframe")
        
        if 'non_associated_function' in no_reff_df.columns and no_reff_populated_mask.any():
            no_reff_df.loc[no_reff_populated_mask, 'non_associated_function'] = np.nan
            cleaned_no_reff_count = no_reff_populated_mask.sum()
            logger.debug(f"Cleaned {cleaned_no_reff_count} non_associated_function values in no_reff dataframe")
        
        # Prepare statistics
        stats = {
            'reff_assigned': change_reff,
            'no_reff_assigned': change_no_reff,
            'reff_initial_empty': initial_empty_reff,
            'no_reff_initial_empty': initial_empty_no_reff,
            'reff_final_empty': final_empty_reff,
            'no_reff_final_empty': final_empty_no_reff
        }
        
        # Log results
        logger.info(
            f"Update complete: "
            f"peptides_with_references: {change_reff} associated_functions assigned "
            f"({initial_empty_reff} → {final_empty_reff} empty), "
            f"peptides_without_references: {change_no_reff} associated_functions assigned "
            f"({initial_empty_no_reff} → {final_empty_no_reff} empty)"
        )
        
        if change_reff > 0 or change_no_reff > 0:
            logger.info(
                f"Successfully filled {change_reff + change_no_reff} total associated_function values "
                f"from non_associated_function data"
            )
        
        return reff_df, no_reff_df, stats
        
    except Exception as e:
        logger.error(f"Error in update_and_clean_associated_functions: {e}")
        raise

# Execute the function with the current data
peptides_final_with_refs, peptides_final_no_refs, update_stats = update_and_clean_associated_functions(
    peptides_final_with_refs,
    peptides_final_no_refs
)

# Log summary
logger.info("=" * 60)
logger.info("Associated Function Update Summary")
logger.info("=" * 60)
logger.info(f"In reff_merged dataframe: {update_stats['reff_assigned']} associated_functions were assigned.")
logger.info(f"In no_reff_merged dataframe: {update_stats['no_reff_assigned']} associated_functions were assigned.")

# Print critical information about both DataFrames
print_critical_info(peptides_final_no_refs, "data/peptides_final_no_refs")
print_critical_info(peptides_final_with_refs, "data/peptides_final_with_refs")

2025-11-21 15:11:45 - __main__ - INFO - Starting update of associated_function values
2025-11-21 15:11:45 - __main__ - INFO - Applying update_associated_function to both dataframes...
2025-11-21 15:11:45 - __main__ - INFO - Update complete: peptides_with_references: 6 associated_functions assigned (26 → 20 empty), peptides_without_references: 1 associated_functions assigned (42 → 41 empty)
2025-11-21 15:11:45 - __main__ - INFO - Successfully filled 7 total associated_function values from non_associated_function data
2025-11-21 15:11:45 - __main__ - INFO - Associated Function Update Summary
2025-11-21 15:11:45 - __main__ - INFO - In reff_merged dataframe: 6 associated_functions were assigned.
2025-11-21 15:11:45 - __main__ - INFO - In no_reff_merged dataframe: 1 associated_functions were assigned.
2025-11-21 15:11:45 - __main__ - INFO - DataFrame 'data/peptides_final_no_refs': 61 rows, 8 columns, 21 missing values
2025-11-21 15:11:45 - __main__ - INFO - DataFrame 'data/peptides_final_wi

Information for DataFrame: data/peptides_final_no_refs
----------------------------------------
Number of rows (peptides): 61
Number of columns: 8
Shape: (61, 8)

Columns (8): ['proteinID', 'protein_name', 'protein_desc', 'interval', 'peptide', 'description', 'associated_function', 'non_associated_function']

Missing Values:
                         Count  Percentage
non_associated_function     21       34.43

Memory usage: 0.04 MB


Information for DataFrame: data/peptides_final_with_refs
----------------------------------------
Number of rows (peptides): 244
Number of columns: 13
Shape: (244, 13)

Columns (13): ['proteinID', 'protein_name', 'protein_desc', 'interval', 'peptide', 'description', 'associated_function', 'non_associated_function', 'title', 'authors', 'doi', 'pubmed_id', 'abstract']

Missing Values:
                         Count  Percentage
non_associated_function    224       91.80
abstract                    15        6.15
doi                          3        1.23
pubm

Automated Association of Functions from Non-Associated Functions

<b>Function:</b> Uses AI to automatically match peptide descriptions with `non_associated_function` entries, moving matches to `associated_function` and setting unmatched to "indeterminable".

In [22]:
def match_description_to_non_associated_function(
    description: str,
    non_associated_function: Any,
    model: Optional[str] = None
) -> Optional[str]:
    """
    Use AI to check if description matches any entry in non_associated_function.
    
    Queries OpenAI API to determine if the peptide description matches any function
    description in the non_associated_function list. Handles variations in naming
    (e.g., "Casocidin-1" vs "Casocidin-I").
    
    Args:
        description: Peptide description (e.g., "Casocidin-1").
        non_associated_function: List or string of non-associated function descriptions.
        model: OpenAI model to use. Defaults to DEFAULT_OPENAI_MODEL if None.
    
    Returns:
        Matched function string if found, None if no match.
    
    Example:
        >>> desc = "Casocidin-1"
        >>> non_assoc = ['Casocidin-I inhibits the growth of E.coli...']
        >>> match = match_description_to_non_associated_function(desc, non_assoc)
        >>> print(match)  # 'Casocidin-I inhibits the growth of E.coli...'
    """
    import ast
    
    if not description or pd.isna(description):
        return None
    
    # Check if non_associated_function is valid (handle lists/arrays properly)
    # Check type first to avoid ambiguous truth value errors with arrays
    if isinstance(non_associated_function, (list, np.ndarray)):
        if len(non_associated_function) == 0:
            return None
    elif isinstance(non_associated_function, str):
        if not non_associated_function.strip():
            return None
    elif non_associated_function is None:
        return None
    else:
        # For other types (like pandas Series), check if it's NaN
        try:
            if pd.isna(non_associated_function):
                return None
        except (ValueError, TypeError):
            # If pd.isna() fails (e.g., with arrays), check truthiness
            if not non_associated_function:
                return None
    
    if openai_client is None:
        raise RuntimeError("OpenAI client not initialized. Check your API key configuration.")
    
    # Parse non_associated_function if it's a string representation of a list
    if isinstance(non_associated_function, str):
        try:
            if non_associated_function.strip().startswith('['):
                non_assoc_list = ast.literal_eval(non_associated_function)
            else:
                non_assoc_list = [non_associated_function]
        except (ValueError, SyntaxError):
            non_assoc_list = [non_associated_function]
    elif isinstance(non_associated_function, list):
        non_assoc_list = non_associated_function
    else:
        non_assoc_list = [str(non_associated_function)]
    
    if not non_assoc_list:
        return None
    
    # Use provided model or default
    if model is None:
        model = DEFAULT_OPENAI_MODEL
    
    try:
        # Extract the base peptide name and any identifier (number or letter)
        # e.g., "Casoxin-6" -> base="Casoxin", identifier="6"
        # e.g., "Casocidin-1" -> base="Casocidin", identifier="1"
        description_clean = str(description).strip()
        base_name = description_clean.split('-')[0].strip() if '-' in description_clean else description_clean
        identifier = description_clean.split('-')[1].strip() if '-' in description_clean and len(description_clean.split('-')) > 1 else None
        
        # Construct query for matching with logical rules
        non_assoc_str = '\n'.join([f"- {item}" for item in non_assoc_list])
        
        query = (
            f"Given a peptide description: '{description}'\n\n"
            f"And the following list of non-associated function descriptions:\n{non_assoc_str}\n\n"
            f"Determine if the peptide description matches any of the function descriptions using these logical rules:\n\n"
            f"MATCHING RULES:\n"
            f"1. The peptide name from the description MUST appear in the function description.\n"
            f"2. If the peptide description contains a number or letter identifier (e.g., '-6', '-A', '-1'), "
            f"that identifier MUST also appear in the matched function description.\n"
            f"3. Accept variations in format: numbers can match Roman numerals (1=I, 6=VI), "
            f"hyphens can be spaces, and case differences are acceptable.\n"
            f"4. Do NOT match if the function description mentions a DIFFERENT identifier or a group of peptides "
            f"that doesn't include the specific peptide from the description.\n"
            f"5. The match must be specific to the peptide described, not just a general category.\n\n"
            f"If there is a match based on these rules, respond with ONLY the exact matched function description from the list.\n"
            f"If there is NO match, respond with 'NO_MATCH'.\n\n"
            f"Your response should be ONLY the matched function description or 'NO_MATCH', nothing else."
        )
        
        logger.debug(f"Querying AI for description match: {description[:50]}...")
        
        # Send query to OpenAI API
        response = openai_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are an expert in bioactive peptides and protein functions. "
                 "You match peptide descriptions with function descriptions using strict logical rules. "
                 "You require that the specific peptide name and any identifiers (numbers/letters) appear in the function description. "
                 "You do not match similar-sounding but different peptides."},
                {"role": "user", "content": query}
            ],
            temperature=0.0  # Zero temperature for maximum determinism
        )
        
        # Extract response
        match_result = response.choices[0].message.content.strip()
        
        if not match_result or match_result == 'NO_MATCH':
            return None
        
        # Additional validation: Check if the identifier appears in the matched result
        if identifier:
            # Generate identifier variations to check
            identifier_variations = [
                identifier,  # "6"
                f"-{identifier}",  # "-6"
                f" {identifier}",  # " 6"
                f"{identifier},",  # "6,"
                f"{identifier} ",  # "6 "
                f"{identifier}.",  # "6."
            ]
            
            # Add Roman numeral conversions for common numbers
            roman_numerals = {
                "1": ["I", "i"],
                "2": ["II", "ii"],
                "3": ["III", "iii"],
                "4": ["IV", "iv"],
                "5": ["V", "v"],
                "6": ["VI", "vi"],
                "7": ["VII", "vii"],
                "8": ["VIII", "viii"],
                "9": ["IX", "ix"],
                "10": ["X", "x"]
            }
            if identifier in roman_numerals:
                identifier_variations.extend(roman_numerals[identifier])
            
            identifier_variations = [v for v in identifier_variations if v]
            
            # Check if any variation appears in match_result (case-insensitive)
            match_result_lower = match_result.lower()
            identifier_found = any(var.lower() in match_result_lower for var in identifier_variations)
            
            if not identifier_found:
                logger.debug(f"Identifier '{identifier}' not found in match result, rejecting match")
                return None
        
        # Verify the match is actually in the list and contains the base name
        for item in non_assoc_list:
            item_str = str(item)
            if match_result in item_str or item_str in match_result:
                # Additional check: ensure base name appears (case-insensitive)
                if base_name.lower() in item_str.lower():
                    return item_str
        
        # If exact match not found, return None (don't trust AI response without validation)
        logger.debug(f"Match result not found in list or base name not found, rejecting")
        return None
        
    except Exception as e:
        logger.error(f"Error matching description '{description[:50]}...': {e}")
        return None


def auto_associate_functions_from_non_associated(
    df: pd.DataFrame,
    model: Optional[str] = None,
    verbose: bool = True
) -> pd.DataFrame:
    """
    Automatically associate functions by matching descriptions with non_associated_function entries.
    
    Uses AI to match peptide descriptions with entries in non_associated_function column.
    When a match is found, moves that entry to associated_function. If no match and
    associated_function is empty, sets it to "indeterminable".
    
    Args:
        df: DataFrame with columns 'description', 'associated_function', 'non_associated_function'.
        model: OpenAI model to use. Defaults to DEFAULT_OPENAI_MODEL if None.
        verbose: If True, print progress information.
    
    Returns:
        DataFrame with updated associated_function and non_associated_function columns.
    
    Raises:
        ValueError: If required columns are missing.
        TypeError: If df is not a DataFrame.
    
    Example:
        >>> updated_df = auto_associate_functions_from_non_associated(peptides_final_with_refs)
        >>> print(f"Updated {len(updated_df)} rows")
    """
    # Validate inputs
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas DataFrame")
    
    if df.empty:
        if verbose:
            print("Warning: DataFrame is empty. Returning empty DataFrame.")
        return df.copy()
    
    # Check required columns
    required_cols = ['description', 'associated_function', 'non_associated_function']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")
    
    # Create a copy to avoid modifying the original
    result_df = df.copy()
    
    # Filter rows that need processing (have non_associated_function and empty associated_function)
    rows_to_process = result_df[
        (result_df['non_associated_function'].notna()) &
        ((result_df['associated_function'].isna()) | (result_df['associated_function'] == ''))
    ].copy()
    
    if rows_to_process.empty:
        if verbose:
            print("No rows need processing (all have associated_function or no non_associated_function).")
        return result_df
    
    total_rows = len(rows_to_process)
    if verbose:
        print(f"Processing {total_rows} rows to match descriptions with non_associated_function...")
    
    matches_found = 0
    indeterminable_count = 0
    
    # Process each row
    for idx, (index, row) in enumerate(rows_to_process.iterrows(), start=1):
        description = row['description']
        non_assoc = row['non_associated_function']
        
        if verbose:
            print(f"Processing row {idx}/{total_rows}: {description[:50] if pd.notna(description) else 'N/A'}...")
        
        # Try to match description with non_associated_function
        matched_function = match_description_to_non_associated_function(
            description,
            non_assoc,
            model=model
        )
        
        if matched_function:
            # Move matched function to associated_function
            result_df.at[index, 'associated_function'] = matched_function
            
            # Remove matched entry from non_associated_function
            import ast
            if isinstance(non_assoc, str) and non_assoc.strip().startswith('['):
                try:
                    non_assoc_list = ast.literal_eval(non_assoc)
                    non_assoc_list = [item for item in non_assoc_list if str(item) != matched_function]
                    result_df.at[index, 'non_associated_function'] = non_assoc_list if non_assoc_list else None
                except (ValueError, SyntaxError):
                    result_df.at[index, 'non_associated_function'] = None
            else:
                result_df.at[index, 'non_associated_function'] = None
            
            matches_found += 1
            if verbose:
                print(f"  ✓ Matched: '{matched_function[:80]}...'")
        else:
            # No match found, set to "indeterminable"
            result_df.at[index, 'associated_function'] = "indeterminable"
            indeterminable_count += 1
            if verbose:
                print(f"  ✗ No match found, set to 'indeterminable'")
    
    if verbose:
        print(f"\nProcessing complete:")
        print(f"  - Matches found: {matches_found}")
        print(f"  - Set to 'indeterminable': {indeterminable_count}")
        print(f"  - Total processed: {total_rows}")
    
    return result_df

Process peptide dataframe <b>with</b> references

In [23]:

# Automatically associate functions using AI matching
# Create a new DataFrame instead of overwriting the original
peptides_final_with_refs_updated = auto_associate_functions_from_non_associated(
    peptides_final_with_refs,
    model=None,  # Use default model
    verbose=True
)

# Display updated dataframe
print("\nUpdated DataFrame:")
peptides_final_with_refs_updated[['description', 'associated_function', 'non_associated_function']].head(10)

# Note: To use the updated DataFrame, assign it explicitly:
# peptides_final_with_refs = peptides_final_with_refs_updated

Processing 20 rows to match descriptions with non_associated_function...
Processing row 1/20: Glucagon-like peptide 1(7-37)...


2025-11-21 15:11:46 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 2/20: Glucagon-like peptide 1(7-36)...


2025-11-21 15:11:47 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 3/20: Big endothelin-1...


2025-11-21 15:11:48 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 4/20: Endothelin-1...


2025-11-21 15:11:52 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✓ Matched: 'Endothelins are endothelium-derived vasoconstrictor peptides (By similarity). Pr...'
Processing row 5/20: Gonadoliberin-1...


2025-11-21 15:11:52 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 6/20: Fibrinopeptide B...


2025-11-21 15:11:53 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 7/20: Met-enkephalin-Arg-Gly-Leu...


2025-11-21 15:11:54 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 8/20: Cathepsin H mini chain...


2025-11-21 15:11:54 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 9/20: Neutrophil defensin 1...


2025-11-21 15:11:55 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 10/20: Islet amyloid polypeptide...


2025-11-21 15:11:57 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 11/20: Islet amyloid polypeptide...


2025-11-21 15:12:01 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 12/20: Islet amyloid polypeptide...


2025-11-21 15:12:03 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 13/20: Beta-defensin 1...


2025-11-21 15:12:04 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 14/20: Somatoliberin...


2025-11-21 15:12:04 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 15/20: Pancreatic icosapeptide...


2025-11-21 15:12:05 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 16/20: Insulin B chain...


2025-11-21 15:12:05 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 17/20: Gastric inhibitory polypeptide...


2025-11-21 15:12:06 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 18/20: Peptide YY...


2025-11-21 15:12:07 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 19/20: Peptide YY...


2025-11-21 15:12:08 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 20/20: Fibrinopeptide A...


2025-11-21 15:12:09 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'

Processing complete:
  - Matches found: 1
  - Set to 'indeterminable': 19
  - Total processed: 20

Updated DataFrame:


Unnamed: 0,description,associated_function,non_associated_function
0,Kaliocin-1,Has antimicrobial activity and is able to perm...,
1,Lactoferroxin-A,Has opioid antagonist activity (PubMed:1369293...,
2,Lactoferroxin-B,Has opioid antagonist activity (PubMed:1369293...,
3,Lactoferroxin-C,Has opioid antagonist activity (PubMed:1369293...,
4,Long-acting natriuretic peptide,May have a role in cardio-renal homeostasis th...,
5,Long-acting natriuretic peptide,May have a role in cardio-renal homeostasis th...,
6,Long-acting natriuretic peptide,May have a role in cardio-renal homeostasis th...,
7,Long-acting natriuretic peptide,May have a role in cardio-renal homeostasis th...,
8,Long-acting natriuretic peptide,May have a role in cardio-renal homeostasis th...,
9,Long-acting natriuretic peptide,May have a role in cardio-renal homeostasis th...,


Process peptide dataframe <b>with out</b> references

In [24]:
# Automatically associate functions using AI matching
# Create a new DataFrame instead of overwriting the original
peptides_final_no_refs_updated = auto_associate_functions_from_non_associated(
    peptides_final_no_refs,
    model=None,  # Use default model
    verbose=True
)

# Display updated dataframe
print("\nUpdated DataFrame:")
peptides_final_no_refs_updated[['description', 'associated_function', 'non_associated_function']].head(10)


Processing 40 rows to match descriptions with non_associated_function...
Processing row 1/40: Lactoferricin-H...


2025-11-21 15:12:13 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✓ Matched: 'Lactoferricin binds to the bacterial surface and is crucial for the bactericidal...'
Processing row 2/40: Glicentin-related polypeptide...


2025-11-21 15:12:13 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 3/40: Somatostatin-28...


2025-11-21 15:12:14 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 4/40: Hippocampal cholinergic neurostimulating peptide...


2025-11-21 15:12:16 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 5/40: Cholecystokinin-12...


2025-11-21 15:12:16 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 6/40: P3(42)...


2025-11-21 15:12:17 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 7/40: P3(40)...


2025-11-21 15:12:17 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 8/40: GnRH-associated peptide 1...


2025-11-21 15:12:18 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 9/40: Neutrophil defensin 2...


2025-11-21 15:12:18 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 10/40: Activation peptide...


2025-11-21 15:12:19 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 11/40: Defensin beta 4A...


2025-11-21 15:12:19 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 12/40: Gastrin-releasing peptide...


2025-11-21 15:12:20 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 13/40: Paragranulin...


2025-11-21 15:12:20 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 14/40: Granulin-1...


2025-11-21 15:12:21 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 15/40: Granulin-2...


2025-11-21 15:12:21 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 16/40: Granulin-5...


2025-11-21 15:12:22 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 17/40: Granulin-6...


2025-11-21 15:12:22 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 18/40: Intestinal peptide PHM-27...


2025-11-21 15:12:24 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 19/40: Thyrotropin-releasing hormone...


2025-11-21 15:12:25 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 20/40: Thyrotropin-releasing hormone...


2025-11-21 15:12:25 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 21/40: Thyrotropin-releasing hormone...


2025-11-21 15:12:25 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 22/40: Thyrotropin-releasing hormone...


2025-11-21 15:12:26 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 23/40: Thyrotropin-releasing hormone...


2025-11-21 15:12:26 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 24/40: Thyrotropin-releasing hormone...


2025-11-21 15:12:26 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 25/40: Endothelin-2...


2025-11-21 15:12:27 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 26/40: Gastrin-71...


2025-11-21 15:12:27 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 27/40: Gastrin-52...


2025-11-21 15:12:28 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 28/40: Big gastrin...


2025-11-21 15:12:28 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 29/40: Gastrin-14...


2025-11-21 15:12:28 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 30/40: Gastrin-6...


2025-11-21 15:12:29 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 31/40: Motilin...


2025-11-21 15:12:29 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 32/40: Motilin-associated peptide...


2025-11-21 15:12:29 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 33/40: Beta-defensin 106...


2025-11-21 15:12:30 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 34/40: Neuromedin N...


2025-11-21 15:12:30 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 35/40: Tail peptide...


2025-11-21 15:12:31 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 36/40: Endothelin-3...


2025-11-21 15:12:31 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 37/40: Synenkephalin...


2025-11-21 15:12:32 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 38/40: Insulin A chain...


2025-11-21 15:12:32 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 39/40: Peptide YY(3-36)...


2025-11-21 15:12:33 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'
Processing row 40/40: Beta-defensin 105...


2025-11-21 15:12:33 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  ✗ No match found, set to 'indeterminable'

Processing complete:
  - Matches found: 1
  - Set to 'indeterminable': 39
  - Total processed: 40

Updated DataFrame:


Unnamed: 0,description,associated_function,non_associated_function
228,Lactoferricin-H,Lactoferricin binds to the bacterial surface a...,
230,Glicentin-related polypeptide,indeterminable,[Plays a key role in glucose metabolism and ho...
231,Somatostatin-28,indeterminable,"[Inhibits the secretion of pituitary hormones,..."
232,Hippocampal cholinergic neurostimulating peptide,indeterminable,"[Binds ATP, opioids and phosphatidylethanolami..."
233,Cholecystokinin-58,This peptide hormone induces gall bladder cont...,
234,Cholecystokinin-58 desnonopeptide,This peptide hormone induces gall bladder cont...,
235,Cholecystokinin-39,This peptide hormone induces gall bladder cont...,
236,Cholecystokinin-33,This peptide hormone induces gall bladder cont...,
237,Cholecystokinin-25,This peptide hormone induces gall bladder cont...,
238,Cholecystokinin-18,This peptide hormone induces gall bladder cont...,


Create a new merged dataframe with no reference and with reference

In [25]:
def merge_peptides_with_and_without_refs(
    peptides_with_refs: pd.DataFrame,
    peptides_no_refs: pd.DataFrame,
    verbose: bool = True
) -> pd.DataFrame:
    """
    Merge DataFrames with and without references into a single unified DataFrame.
    
    Combines peptides_final_with_refs and peptides_final_no_refs into one DataFrame,
    aligning columns and handling missing values appropriately.
    
    Args:
        peptides_with_refs: DataFrame containing peptides with references.
        peptides_no_refs: DataFrame containing peptides without references.
        verbose: If True, print summary information.
    
    Returns:
        Merged DataFrame containing all peptides (with and without references).
    
    Raises:
        ValueError: If inputs are not DataFrames or required DataFrames are missing.
        TypeError: If inputs are not pandas DataFrames.
    
    Example:
        >>> merged_df = merge_peptides_with_and_without_refs(
        ...     peptides_final_with_refs,
        ...     peptides_final_no_refs,
        ...     verbose=True
        ... )
        >>> print(f"Merged {len(merged_df)} total peptides")
    """
    # Validate inputs
    if not isinstance(peptides_with_refs, pd.DataFrame):
        raise TypeError("peptides_with_refs must be a pandas DataFrame")
    
    if not isinstance(peptides_no_refs, pd.DataFrame):
        raise TypeError("peptides_no_refs must be a pandas DataFrame")
    
    # Handle empty DataFrames
    if peptides_with_refs.empty and peptides_no_refs.empty:
        if verbose:
            print("Warning: Both DataFrames are empty. Returning empty DataFrame.")
        return pd.DataFrame()
    
    if peptides_with_refs.empty:
        if verbose:
            print("peptides_with_refs is empty, returning peptides_no_refs")
        return peptides_no_refs.copy()
    
    if peptides_no_refs.empty:
        if verbose:
            print("peptides_no_refs is empty, returning peptides_with_refs")
        return peptides_with_refs.copy()
    
    try:
        # Get all unique columns from both DataFrames
        all_columns = list(set(peptides_with_refs.columns) | set(peptides_no_refs.columns))
        
        # Ensure both DataFrames have the same columns (fill missing with NaN)
        peptides_with_refs_aligned = peptides_with_refs.reindex(columns=all_columns)
        peptides_no_refs_aligned = peptides_no_refs.reindex(columns=all_columns)
        
        # Concatenate the DataFrames
        merged_df = pd.concat(
            [peptides_with_refs_aligned, peptides_no_refs_aligned],
            ignore_index=True
        )
        
        if verbose:
            print(f"Merged DataFrames:")
            print(f"  - peptides_with_refs: {len(peptides_with_refs)} rows")
            print(f"  - peptides_no_refs: {len(peptides_no_refs)} rows")
            print(f"  - Total merged: {len(merged_df)} rows, {len(merged_df.columns)} columns")
        
        logger.info(f"Merged {len(peptides_with_refs)} + {len(peptides_no_refs)} = {len(merged_df)} total peptides")
        
        return merged_df
        
    except Exception as e:
        logger.error(f"Error merging DataFrames: {e}")
        raise


# Merge DataFrames with and without references into one unified DataFrame
peptides_final_merged = merge_peptides_with_and_without_refs(
    peptides_final_with_refs_updated if 'peptides_final_with_refs_updated' in globals() else peptides_final_with_refs,
    peptides_final_no_refs_updated if 'peptides_final_no_refs_updated' in globals() else peptides_final_no_refs,
    verbose=True
)

# Display summary
print_critical_info(peptides_final_merged, "peptides_final_merged")


2025-11-21 15:12:33 - __main__ - INFO - Merged 244 + 61 = 305 total peptides
2025-11-21 15:12:33 - __main__ - INFO - DataFrame 'peptides_final_merged': 305 rows, 13 columns, 571 missing values


Merged DataFrames:
  - peptides_with_refs: 244 rows
  - peptides_no_refs: 61 rows
  - Total merged: 305 rows, 13 columns
Information for DataFrame: peptides_final_merged
----------------------------------------
Number of rows (peptides): 305
Number of columns: 13
Shape: (305, 13)

Columns (13): ['non_associated_function', 'authors', 'pubmed_id', 'interval', 'doi', 'title', 'associated_function', 'description', 'abstract', 'peptide', 'protein_name', 'proteinID', 'protein_desc']

Missing Values:
                         Count  Percentage
non_associated_function    247       80.98
abstract                    76       24.92
doi                         64       20.98
pubmed_id                   62       20.33
authors                     61       20.00
title                       61       20.00

Memory usage: 0.88 MB




# Determines biological function using OpenAI/ChatGPT API

<b>Function:</b> Uses calls OpenAI API to geterine Functin and provide evidence and reasoning

In [26]:
def extract_info(response_string: str) -> List[Tuple[str, str, str]]:
    """
    Extract function names, evidence, and logic from OpenAI API response string.
    
    Parses the structured response from OpenAI to extract bioactivity function
    classifications with their supporting evidence and logic.
    
    Args:
        response_string: Raw response string from OpenAI API containing function information
                         in format: "Function: X | Evidence: Y | Logic: Z"
    
    Returns:
        List of tuples, each containing (function_name, logic, evidence).
        Returns empty list if parsing fails or no functions found.
    
    Example:
        >>> response = "Function: Antimicrobial | Evidence: ... | Logic: ..."
        >>> results = extract_info(response)
        >>> print(results[0][0])  # 'Antimicrobial'
    """
    if not response_string or not isinstance(response_string, str):
        logger.warning("Invalid response_string provided to extract_info")
        return []
    
    extracted_info = []
    
    try:
        # Split by double newline to separate different functions
        functions_list = response_string.split("\n\n")
        
        for func_str in functions_list:
            if not func_str.strip():
                continue
                
            try:
                if "Function:" in func_str and "Evidence:" in func_str and "Logic:" in func_str:
                    # Extract function name
                    func_name = func_str.split("Function:")[1].split("|")[0].strip()
                    
                    # Extract evidence
                    if "Evidence:" in func_str:
                        evidence_part = func_str.split("Evidence:")[1]
                        evidence = evidence_part.split("|")[0].strip() if "|" in evidence_part else evidence_part.strip()
                    else:
                        evidence = ""
                    
                    # Extract logic
                    if "Logic:" in func_str:
                        logic_part = func_str.split("Logic:")[1]
                        logic = logic_part.split("|")[0].strip() if "|" in logic_part else logic_part.strip()
                    else:
                        logic = ""
                    
                    if func_name:  # Only add if function name was found
                        extracted_info.append((func_name, logic, evidence))
                        
            except (IndexError, AttributeError) as e:
                logger.warning(f"Error parsing function string: {e}")
                continue
        
        logger.debug(f"Extracted {len(extracted_info)} functions from response")
        
    except Exception as e:
        logger.error(f"Error extracting info from response: {e}")
        return []
    
    return extracted_info


def classify_bioactivity(
    peptide: str,
    description: Optional[str],
    associated_function: Optional[str],
    non_associated_function: Optional[str],
    abstract: Optional[str],
    title: Optional[str],
    mbpdb_function_list: List[str],
    model: Optional[str] = None
) -> List[Tuple[str, str, str]]:
    """
    Classify peptide bioactivity using OpenAI API.
    
    Constructs a query with peptide information and sends it to OpenAI's API
    to identify bioactivity functions from a provided list.
    
    Args:
        peptide: Peptide sequence string.
        description: Description of the peptide.
        associated_function: Associated function from UniProt data.
        non_associated_function: Non-associated function (protein-level function).
        abstract: PubMed abstract text.
        title: Publication title.
        mbpdb_function_list: List of potential bioactivity functions to choose from.
        model: OpenAI model to use. Defaults to DEFAULT_OPENAI_MODEL if None.
    
    Returns:
        List of tuples containing (function_name, logic, evidence) for each
        identified bioactivity function. Returns empty list on error.
    
    Raises:
        ValueError: If required parameters are invalid.
        RuntimeError: If OpenAI client is not initialized.
    
    Example:
        >>> functions = classify_bioactivity(
        ...     peptide="MKTAY",
        ...     description="Antimicrobial peptide",
        ...     associated_function="Antimicrobial activity",
        ...     non_associated_function=None,
        ...     abstract="...",
        ...     title="...",
        ...     mbpdb_function_list=["Antimicrobial", "Antioxidant"]
        ... )
    """
    # Validate inputs
    if not peptide or not isinstance(peptide, str):
        raise ValueError("peptide must be a non-empty string")
    
    if not mbpdb_function_list or not isinstance(mbpdb_function_list, list):
        raise ValueError("mbpdb_function_list must be a non-empty list")
    
    if openai_client is None:
        raise RuntimeError("OpenAI client not initialized. Check your API key configuration.")
    
    # Use provided model or default
    if model is None:
        model = DEFAULT_OPENAI_MODEL
    
    # Handle None values for optional parameters
    description = description or ""
    associated_function = associated_function or ""
    non_associated_function = non_associated_function or ""
    abstract = abstract or ""
    title = title or ""
    
    try:
        # Construct the query
        query = (
            f"Given the peptide {peptide}, with the description: {description}, "
            f"and the associated_function: {associated_function}, "
            f"or function of the protein or all peptides from that protein called non_associated_function: {non_associated_function}, "
            f"\n\nif associated_function = indeterminable then do ignore the non_associated_function input "
            f"title: {title}, and abstract: {abstract}, "
            f"please identify any bioactivity functions of the peptide from the list provided(mbpdb_function_list). "
            f"For each identified function, please provide the response in the following format: "
            f"'Function: [Function Name] | Evidence: [Specific evidence from the sources] | Logic: [Explanation for the choice]'"
            f"\n\nList of potential functions:{mbpdb_function_list}."
        )
        
        logger.debug(f"Calling OpenAI API with model: {model}")
        
        # Send query to OpenAI API
        response = openai_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "who is an expert in bioactive peptides"},
                {"role": "user", "content": query}
            ]
        )
        
        # Extract response content
        bioactivity_response = response.choices[0].message.content
        
        if not bioactivity_response:
            logger.warning("Empty response from OpenAI API")
            return []
        
        # Extract and return parsed information
        return extract_info(bioactivity_response)
        
    except Exception as e:
        logger.error(f"Error calling OpenAI API: {e}")
        logger.error(f"Peptide: {peptide[:50]}, Model: {model}")
        return []


Querry OpenAI API with results to determine peptide function

In [None]:
def process_peptides_with_gpt(
    peptides_df: pd.DataFrame,
    mbpdb_function_list: List[str],
    verbose: bool = True
) -> pd.DataFrame:
    """
    Process peptides DataFrame by classifying bioactivity using GPT.
    
    Iterates through each row of the input DataFrame, calls classify_bioactivity
    to identify bioactivity functions, and expands the DataFrame to include one
    row per identified function. Cleans function names, logic, and evidence by
    removing asterisks and quotes.
    
    Args:
        peptides_df: DataFrame containing peptide data with columns:
                    'peptide', 'description', 'associated_function',
                    'non_associated_function', 'abstract', 'title'
        mbpdb_function_list: List of bioactivity function names to match against.
        verbose: If True, print progress information for each row processed.
    
    Returns:
        DataFrame with expanded rows (one per identified function) containing
        original columns plus 'gpt_function', 'gpt_logic', and 'gpt_evidence'.
    
    Raises:
        ValueError: If required columns are missing from peptides_df.
        TypeError: If peptides_df is not a DataFrame or mbpdb_function_list is not a list.
    
    Example:
        >>> processed_df = process_peptides_with_gpt(
        ...     peptides_final_with_refs,
        ...     mbpdb_function_list,
        ...     verbose=True
        ... )
        >>> print(f"Processed {len(processed_df)} rows")
    """
    # Validate inputs
    if not isinstance(peptides_df, pd.DataFrame):
        raise TypeError("peptides_df must be a pandas DataFrame")
    
    if not isinstance(mbpdb_function_list, list):
        raise TypeError("mbpdb_function_list must be a list")
    
    if peptides_df.empty:
        if verbose:
            print("Warning: Input DataFrame is empty. Returning empty DataFrame.")
        return pd.DataFrame()
    
    # Check required columns
    required_cols = ['peptide', 'description', 'associated_function', 
                     'non_associated_function', 'abstract', 'title']
    missing_cols = [col for col in required_cols if col not in peptides_df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")
    
    # Create a copy to avoid modifying the original
    peptides_work = peptides_df.copy()
    
    # Create an empty list to store results
    results = []
    total_rows = len(peptides_work)
    
    if verbose:
        print(f"Processing {total_rows} rows with GPT classification...")
    
    # Iterate over each row of the DataFrame
    for index, (_, row) in enumerate(peptides_work.iterrows()):
        # Record the start time for this iteration
        start_time = time.time()
        
        try:
            # Get the functions, logic, and evidence based on the row data
            functions_with_logic = classify_bioactivity(
                row['peptide'],
                row['description'],
                row['associated_function'],
                row['non_associated_function'],
                row['abstract'],
                row['title'],
                mbpdb_function_list
            )
            
            # For each function, logic, and evidence, create a new row
            for func, logic, evidence in functions_with_logic:
                new_row = row.copy()
                
                # Clean function, logic, and evidence: remove asterisks and quotes
                clean_func = str(func).replace('*', '').strip(' \"')
                clean_logic = str(logic).replace('*', '').strip(' \"')
                clean_evidence = str(evidence).replace('*', '').strip(' \"')
                
                # Print function, logic, and evidence information
                if verbose:
                    print(f"\n  Function: {clean_func}")
                    print(f"  Logic: {clean_logic}")
                    print(f"  Evidence: {clean_evidence}")
                
                # Add cleaned values to new row
                new_row['gpt_function'] = clean_func
                new_row['gpt_logic'] = clean_logic
                new_row['gpt_evidence'] = clean_evidence
                
                results.append(new_row)
            
            # Calculate and report elapsed time
            elapsed_time = time.time() - start_time
            if verbose:
                num_functions = len(functions_with_logic)
                print(f"Processed row {index + 1}/{total_rows} in {elapsed_time:.2f} seconds "
                      f"({num_functions} function(s) identified)")
        
        except Exception as e:
            elapsed_time = time.time() - start_time
            if verbose:
                print(f"Error processing row {index + 1}/{total_rows} in {elapsed_time:.2f} seconds: {e}")
            # Continue processing other rows even if one fails
            continue
    
    # Convert the results to a new DataFrame
    if results:
        processed_df = pd.DataFrame(results)
        if verbose:
            print(f"\nProcessing complete: {len(processed_df)} total rows "
                  f"({len(peptides_work)} original rows expanded)")
    else:
        if verbose:
            print("Warning: No results generated. Returning empty DataFrame.")
        processed_df = pd.DataFrame()
    
    return processed_df


# Process peptides with GPT classification
# Use the dataframe directly (for manual editing, edit peptides_final_with_refs directly)
peptides_final_merged = peptides_final_merged.copy()

peptides_final_with_gpt_processed = process_peptides_with_gpt(
    peptides_final_merged,
    mbpdb_function_list,
    verbose=True
)

# Display the updated DataFrame
peptides_final_with_gpt_processed.head(n=3)

Processing 305 rows with GPT classification...


2025-11-21 15:12:37 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antimicrobial
  Logic: The description explicitly states that Kaliocin-1 has antimicrobial activity and can affect bacterial membranes, leading to its classification as an antimicrobial peptide.
  Evidence: The peptide was shown to have a bactericidal effect and was able to induce K(+)-efflux and dissipation of the transmembrane electrical potential in Escherichia coli cells, as well as permeabilize different ions through liposomal membranes.
Processed row 1/305 in 3.30 seconds (1 function(s) identified)


2025-11-21 15:12:41 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Opioid
  Logic: The peptide Lactoferroxin-A is specifically mentioned to have opioid antagonist activity and shows a preference for the mu-opioid receptor, which classifies it under opioid-related functions. This aligns with the functions outlined in the provided list, confirming its activity within that context.
  Evidence: Has opioid antagonist activity (PubMed:1369293). Shows preference for mu-receptor (PubMed:1369293).
Processed row 2/305 in 4.24 seconds (1 function(s) identified)


2025-11-21 15:12:44 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Opioid
  Logic: The peptide has been explicitly shown to exhibit opioid antagonist activity, indicating its direct involvement with opioid receptors, thus classifying it under the opioid function. This is further supported by the preference exhibited towards kappa-receptors, affirming its bioactivity in relation to opioid mechanisms.
  Evidence: Has opioid antagonist activity (PubMed:1369293). Shows higher degrees of preference for kappa-receptors than for mu-receptors (PubMed:1369293).
Processed row 3/305 in 3.31 seconds (1 function(s) identified)


2025-11-21 15:12:48 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Opioid
  Logic: The peptide is described explicitly as having opioid antagonist activity, indicating its function in relation to the opioid system. This aligns with its classification as having a bioactivity related to opioid receptors.
  Evidence: Has opioid antagonist activity and shows higher degrees of preference for kappa-receptors than for mu-receptors (PubMed:1369293)
Processed row 4/305 in 4.13 seconds (1 function(s) identified)


2025-11-21 15:12:58 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The peptide is related to the regulation of natriuresis and vasodilation, which are key factors in blood pressure control, thus suggesting an antihypertensive role.
  Evidence: According to a report, in vivo it is not sufficient to activate cGMP and does not inhibit collecting duct transport nor effect diuresis and natriuresis.

  Function: Nitric oxide liberation
  Logic: The induction of cGMP is commonly associated with the action of nitric oxide, indicating a potential role in nitric oxide liberation, which can contribute to vasodilation.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation (PubMed:2825692).

  Function: Immunomodulatory
  Logic: The role of prostaglandins can extend to modulatory effects on immune responses, hence suggesting an immunomodulatory function.
  Evidence: May promote natriuresis, at least in part, by enhancing prostaglandin E2 synthesis resulting in the inhibition of renal Na+-K+-ATPase (Pu

2025-11-21 15:13:06 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Processed row 6/305 in 7.83 seconds (0 function(s) identified)


2025-11-21 15:13:16 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: The peptide is described to play a role in cardio-renal homeostasis through vasodilation, and has been shown to promote the production of cGMP and induce vasodilation (PubMed:2825692). Additionally, the abstract mentions a significant inverse correlation between blood pressure and ANF peptides, implying they are important for the maintenance of blood pressure.  
   Logic: The role of the long-acting natriuretic peptide in inducing vasodilation is directly correlated with its potential antihypertensive effects by reducing vascular resistance and, consequently, blood pressure.
  Logic: The role of the long-acting natriuretic peptide in inducing vasodilation is directly correlated with its potential antihypertensive effects by reducing vascular resistance and, consequently, blood pressure.
  Evidence: The peptide is described to play a role in cardio-renal homeostasis through vasodilation, and has been shown to promote the production of cGMP an

2025-11-21 15:13:24 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The peptide is described to induce vasodilation, which would likely contribute to lowering blood pressure, consistent with antihypertensive activity.
  Evidence: May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, vasodilation..." (PubMed references)

  Function: Nitric oxide liberation
  Logic: The production of cGMP is often associated with nitric oxide signaling pathways, suggesting this peptide may facilitate nitric oxide release leading to its vasodilatory effects.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation..." (PubMed:2825692)

  Function: Immunomodulatory
  Logic: Modulation of protein reabsorption can indicate an immunomodulatory effect, as alterations in kidney function and protein levels impact immune system status.
  Evidence: Possibly enhances protein excretion in urine by decreasing proximal tubular protein reabsorption..." (PubMed:11145122)
Processed row 8/305 i

2025-11-21 15:13:30 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: "Three peptides consisting of amino acids ... have blood pressure-lowering... properties in animals"  
   Logic: The peptides derived from the atrial natriuretic factor prohormone are described as having blood pressure-lowering effects, which indicates they may have antihypertensive properties.
  Logic: The peptides derived from the atrial natriuretic factor prohormone are described as having blood pressure-lowering effects, which indicates they may have antihypertensive properties.
  Evidence: Three peptides consisting of amino acids ... have blood pressure-lowering... properties in animals"  
   Logic: The peptides derived from the atrial natriuretic factor prohormone are described as having blood pressure-lowering effects, which indicates they may have antihypertensive properties.

  Function: Nitric oxide liberation  
   Evidence: "In vitro, promotes the production of cGMP and induces vasodilation"  
   Logic: The production of cGMP typi

2025-11-21 15:13:40 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Processed row 10/305 in 9.91 seconds (0 function(s) identified)


2025-11-21 15:13:50 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: The peptide is associated with promoting vasodilation and regulating blood pressure through the production of cGMP. (PubMed:2825692).  
   Logic: Since vasodilation typically leads to a decrease in blood pressure, this aligns well with an antihypertensive function.
  Logic: Since vasodilation typically leads to a decrease in blood pressure, this aligns well with an antihypertensive function.
  Evidence: The peptide is associated with promoting vasodilation and regulating blood pressure through the production of cGMP. (PubMed:2825692).  
   Logic: Since vasodilation typically leads to a decrease in blood pressure, this aligns well with an antihypertensive function.

  Function: Nitric oxide liberation  
   Evidence: The peptide promotes the production of cGMP, which is often associated with nitric oxide signaling pathways that mediate vasodilation. (PubMed:2825692).  
   Logic: The induction of vasodilation and the production of cGMP suggest 

2025-11-21 15:13:57 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
   - Evidence: "In vitro, promotes the production of cGMP and induces vasodilation... May promote natriuresis... appears to bind to specific receptors... which was found with or without endothelium present."
   - Logic: The peptide promotes vasodilation and natriuresis, both of which are mechanisms known to lower blood pressure, thus fitting the definition of an antihypertensive agent.
  Logic: The peptide promotes vasodilation and natriuresis, both of which are mechanisms known to lower blood pressure, thus fitting the definition of an antihypertensive agent.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation... May promote natriuresis... appears to bind to specific receptors... which was found with or without endothelium present."
   - Logic: The peptide promotes vasodilation and natriuresis, both of which are mechanisms known to lower blood pressure, thus fitting the definition of an antihypertensive agent.

  Function: Nitri

2025-11-21 15:14:02 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The peptide regulates blood pressure and volume by promoting diuresis and vasodilation, which are mechanisms related to antihypertensive effects.
  Evidence: May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, vasodilation...

  Function: Nitric oxide liberation
  Logic: The increase in cGMP production usually indicates a pathway involving nitric oxide signaling, which is associated with vasodilation.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation...

  Function: Immunomodulatory
  Logic: Alterations in renal handling of proteins and electrolytes suggest an involvement in regulating immune responses and inflammation related to those proteins.
  Evidence: Possibly enhances protein excretion in urine by decreasing proximal tubular protein reabsorption...
Processed row 13/305 in 5.51 seconds (3 function(s) identified)


2025-11-21 15:14:08 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The inhibition of aldosterone and promotion of vasodilation suggests the peptide may have a role in lowering blood pressure, fulfilling the antihypertensive function category.
  Evidence: The peptide is noted for its role in vasodilation and possibly inhibiting aldosterone synthesis, which is associated with blood pressure regulation (PubMed references supporting regulation of natriuresis, diuresis).

  Function: Nitric oxide liberation
  Logic: The connection between cGMP production and vasodilation generally involves the liberation of nitric oxide as part of the signaling pathway, indicating that this function may apply to the peptide.
  Evidence: The peptide promotes the production of cGMP and induces vasodilation, which is often linked to the action of nitric oxide (PubMed:2825692).

  Function: Immunomodulatory
  Logic: Changes in fluid and electrolyte balance mediated by such peptides can affect immune system parameters, suggesting potential

2025-11-21 15:14:18 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: The peptide is described as a "vessel dilator," which implies a role in vasodilation. In vitro studies indicate that it promotes the production of cGMP and induces vasodilation (PubMed:2825692).  
   Logic: As a vessel dilator that promotes vasodilation and affects blood pressure, the peptide would contribute to antihypertensive effects by lowering systemic vascular resistance.
  Logic: As a vessel dilator that promotes vasodilation and affects blood pressure, the peptide would contribute to antihypertensive effects by lowering systemic vascular resistance.
  Evidence: The peptide is described as a "vessel dilator," which implies a role in vasodilation. In vitro studies indicate that it promotes the production of cGMP and induces vasodilation (PubMed:2825692).  
   Logic: As a vessel dilator that promotes vasodilation and affects blood pressure, the peptide would contribute to antihypertensive effects by lowering systemic vascular resistance

2025-11-21 15:14:25 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The vasodilatory properties of the peptide suggest it could reduce blood pressure, making it relevant to antihypertensive activity.
  Evidence: The peptide is described as a vessel dilator and may have a role in cardio-renal homeostasis by promoting vasodilation, which is associated with blood pressure regulation (PubMed:2532366, PubMed:2825692)

  Function: Nitric oxide liberation
  Logic: The increase in cGMP production is often associated with pathways that involve nitric oxide, indicating a potential function in nitric oxide liberation.
  Evidence: In vitro, the peptide promotes the production of cGMP, which is typically a downstream signaling molecule for nitric oxide (PubMed:2825692)

  Function: Immunomodulatory
  Logic: Prostaglandin E2 has known roles in modulating immune responses, suggesting a possible immunomodulatory function for the peptide.
  Evidence: The peptide may enhance prostaglandin E2 synthesis, which can have various immuno

2025-11-21 15:14:34 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: "May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, and vasodilation" (PubMed:2532366).  
   Logic: The description indicates the peptide's involvement in vasodilation and regulation of blood pressure, suggesting antihypertensive properties.
  Logic: The description indicates the peptide's involvement in vasodilation and regulation of blood pressure, suggesting antihypertensive properties.
  Evidence: May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, and vasodilation" (PubMed:2532366).  
   Logic: The description indicates the peptide's involvement in vasodilation and regulation of blood pressure, suggesting antihypertensive properties.

  Function: Nitric oxide liberation  
   Evidence: "In vitro, promotes the production of cGMP and induces vasodilation" (PubMed:2825692).  
   Logic: The production of cGMP is often associated with nitric oxide signaling, indicating

2025-11-21 15:14:42 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: "In vitro, promotes the production of cGMP and induces vasodilation (PubMed:2825692)."  
   Logic: The peptide's ability to promote vasodilation and enhance cGMP production suggests a mechanism for lowering blood pressure, which aligns with antihypertensive properties.
  Logic: The peptide's ability to promote vasodilation and enhance cGMP production suggests a mechanism for lowering blood pressure, which aligns with antihypertensive properties.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation (PubMed:2825692)."  
   Logic: The peptide's ability to promote vasodilation and enhance cGMP production suggests a mechanism for lowering blood pressure, which aligns with antihypertensive properties.

  Function: Nitric oxide liberation  
   Evidence: "In vitro, promotes the production of cGMP and induces vasodilation (PubMed:2825692)."  
   Logic: The elevation of cGMP is often associated with the pathway involving nitr

2025-11-21 15:14:49 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Processed row 19/305 in 6.85 seconds (0 function(s) identified)


2025-11-21 15:14:56 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The mechanism of action involving vasodilation directly relates to the antihypertensive effect, as reducing vascular resistance lowers blood pressure.
  Evidence: May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, and vasodilation" (PubMed:2532366)

  Function: Nitric oxide liberation
  Logic: The production of cGMP is a signaling pathway often associated with nitric oxide-mediated vasodilation, suggesting that this peptide could enhance nitric oxide activity.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation" (PubMed:2825692)

  Function: Immunomodulatory
  Logic: The involvement in natriuresis and distinct receptor binding suggests an immunomodulatory role, especially considering the connection to renal function and fluid balance.
  Evidence: May promote natriuresis... resulting in the inhibition of renal Na+-K+-ATPase" (PubMed:7595132) and "appears to bind to specific receptors 

2025-11-21 15:15:05 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: "May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, and vasodilation (PubMed:2532366, PubMed:7595132)."  
   Logic: The peptide induces vasodilation and potentially regulates blood pressure by affecting renal and cardiovascular functions, thus contributing to antihypertensive activity.
  Logic: The peptide induces vasodilation and potentially regulates blood pressure by affecting renal and cardiovascular functions, thus contributing to antihypertensive activity.
  Evidence: May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, and vasodilation (PubMed:2532366, PubMed:7595132)."  
   Logic: The peptide induces vasodilation and potentially regulates blood pressure by affecting renal and cardiovascular functions, thus contributing to antihypertensive activity.

  Function: Nitric oxide liberation  
   Evidence: "In vitro, promotes the production of cGMP and induces vasodil

2025-11-21 15:15:12 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: The peptide is described to have a role in cardio-renal homeostasis through regulation of vasodilation and possibly promoting natriuresis and diuresis (PubMed:2532366, PubMed:7595132). Additionally, the vasodilation activity induced by the peptide in vitro is associated with increased cGMP production, which is known to contribute to lowering blood pressure.  
   Logic: The association of the peptide with vasodilation directly suggests an antihypertensive function, as vasodilation reduces vascular resistance, thereby lowering blood pressure.
  Logic: The association of the peptide with vasodilation directly suggests an antihypertensive function, as vasodilation reduces vascular resistance, thereby lowering blood pressure.
  Evidence: The peptide is described to have a role in cardio-renal homeostasis through regulation of vasodilation and possibly promoting natriuresis and diuresis (PubMed:2532366, PubMed:7595132). Additionally, the vasodilat

2025-11-21 15:15:21 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: "May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, and vasodilation (PubMed:2532366, PubMed:7595132, PubMed:7955907, PubMed:8087923, PubMed:8653797)."  
   Logic: The peptide is associated with vasodilation and regulation of cardiovascular parameters, indicating a potential role in lowering blood pressure.
  Logic: The peptide is associated with vasodilation and regulation of cardiovascular parameters, indicating a potential role in lowering blood pressure.
  Evidence: May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, and vasodilation (PubMed:2532366, PubMed:7595132, PubMed:7955907, PubMed:8087923, PubMed:8653797)."  
   Logic: The peptide is associated with vasodilation and regulation of cardiovascular parameters, indicating a potential role in lowering blood pressure.

  Function: Nitric oxide liberation  
   Evidence: "In vitro, promotes the production of cGMP a

2025-11-21 15:15:29 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: The peptide is described as a "Vessel dilator," and it promotes vasodilation and the production of cGMP, which are mechanisms associated with lowering blood pressure (PubMed:2825692).  
   Logic: Vasodilation contributes directly to the reduction of blood pressure, indicating an antihypertensive effect.
  Logic: Vasodilation contributes directly to the reduction of blood pressure, indicating an antihypertensive effect.
  Evidence: The peptide is described as a "Vessel dilator," and it promotes vasodilation and the production of cGMP, which are mechanisms associated with lowering blood pressure (PubMed:2825692).  
   Logic: Vasodilation contributes directly to the reduction of blood pressure, indicating an antihypertensive effect.

  Function: Nitric oxide liberation  
   Evidence: In vitro, the peptide promotes the production of cGMP, which is often associated with the liberation of nitric oxide as part of the vasodilatory mechanism (PubMed:

2025-11-21 15:15:35 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The peptide is described as a vessel dilator and has been associated with effects such as hypotension and vasodilation, indicating an antihypertensive effect.
  Evidence: In vivo, diuresis and natriuresis, as well as hypotension have been observed" [PubMed:2532366]; "In vitro, promotes the production of cGMP and induces vasodilation" [PubMed:2825692].

  Function: Nitric oxide liberation
  Logic: The production of cGMP is often a downstream effect of nitric oxide signaling in the vasculature, suggesting that this peptide may mediate its effects through nitric oxide liberation.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation" [PubMed:2825692].

  Function: Immunomodulatory
  Logic: The modulation of natriuresis and involvement with prostaglandins indicate an interaction with immune mediators or pathways, which commonly fall under immunomodulatory functions.
  Evidence: May promote natriuresis, at least in part, by enh

2025-11-21 15:15:46 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Nitric oxide liberation
  Logic: The ability of the peptide to promote cGMP production and induce vasodilation suggests a role in nitric oxide signaling, as cGMP is a key mediator of nitric oxide effects in vascular biology.
  Evidence: Promotes the production of cGMP and induces vasodilation (PubMed:2825692)

  Function: Antihypertensive
  Logic: By regulating diuresis and affecting blood pressure through aldosterone inhibition, the peptide may have antihypertensive properties. This is further supported by its role in promoting natriuresis and vasodilation, both of which contribute to blood pressure regulation.
  Evidence: May have a role in cardio-renal homeostasis through regulation of diuresis and inhibiting aldosterone synthesis (PubMed:2825692)

  Function: Immunomodulatory
  Logic: The modulation of renal processes and enhancement of protein excretion might indicate an immunomodulatory role, as kidney function often interacts with immune responses. Moreover, protein

2025-11-21 15:15:51 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: The peptide promotes vasodilation and enhances production of cGMP (PubMed:2825692).  
   Logic: Vasodilation is a mechanism that can contribute to lower blood pressure, which relates to antihypertensive effects.
  Logic: Vasodilation is a mechanism that can contribute to lower blood pressure, which relates to antihypertensive effects.
  Evidence: The peptide promotes vasodilation and enhances production of cGMP (PubMed:2825692).  
   Logic: Vasodilation is a mechanism that can contribute to lower blood pressure, which relates to antihypertensive effects.

  Function: Nitric oxide liberation  
   Evidence: The peptide induces vasodilation in vitro and promotes the production of cGMP (PubMed:2825692).  
   Logic: Nitric oxide is known to play a significant role in vasodilation, and the increase in cGMP is associated with nitric oxide signaling pathways.
  Logic: Nitric oxide is known to play a significant role in vasodilation, and the increase

2025-11-21 15:15:55 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The functioning of kaliuretic peptide in regulating blood pressure is tied to its ability to affect diuresis and aldosterone levels, both of which have direct implications for hypertension.
  Evidence: May have a role in cardio-renal homeostasis through regulation of diuresis and inhibiting aldosterone synthesis (PubMed:2825692).

  Function: Nitric oxide liberation
  Logic: The induction of vasodilation typically involves pathways that lead to increased levels of nitric oxide, which could mediate the effects described.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation (PubMed:2825692).

  Function: Antidiuretic
  Logic: While it primarily promotes potassium excretion, the interplay of diuresis and the relative activity on sodium indicates a role that is complex in relation to fluid regulation.
  Evidence: May have a role in potassium excretion but not sodium excretion (natriuresis) (PubMed:8087923).

  Function: Cytom

2025-11-21 15:16:00 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
   - Evidence: Promotes the production of cGMP and induces vasodilation (PubMed:2825692).
   - Logic: The peptide's ability to induce vasodilation implies it can reduce blood pressure, classifying it as antihypertensive.
  Logic: The peptide's ability to induce vasodilation implies it can reduce blood pressure, classifying it as antihypertensive.
  Evidence: Promotes the production of cGMP and induces vasodilation (PubMed:2825692).
   - Logic: The peptide's ability to induce vasodilation implies it can reduce blood pressure, classifying it as antihypertensive.

  Function: Nitric oxide liberation
   - Evidence: Induces vasodilation which is associated with a 4-5 fold increase in cyclic GMP secondary to activation of particulate guanylate cyclase (from abstract regarding atrial natriuretic peptides).
   - Logic: Vasodilation is often linked to the action of nitric oxide, suggesting that the peptide may play a role in nitric oxide liberation, especially give

2025-11-21 15:16:07 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The ability to promote vasodilation and increase cGMP levels is associated with lowering blood pressure, which indicates an antihypertensive effect.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation (PubMed:2825692).

  Function: Nitric oxide liberation
  Logic: The production of cGMP is often linked to the action of nitric oxide in inducing vasodilation, suggesting involvement in nitric oxide signaling pathways.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation (PubMed:2825692).

  Function: Improves cognition
  Logic: While direct evidence is not provided, there is a plausible indirect connection between vasodilation/cognitive function due to increased cerebral blood flow.
  Evidence: Not specifically mentioned, but the implication of vasodilation and cGMP production could relate to improved blood flow and oxygen delivery to the brain.

  Function: Immunomodulatory
  Logic: Immunomodulato

2025-11-21 15:16:14 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The vasodilatory effects of the peptide imply an ability to lower blood pressure, characteristic of antihypertensive agents.
  Evidence: The peptide promotes the production of cGMP and induces vasodilation (PubMed:2825692).

  Function: Increase sodium excretion (Natriuresis)
  Logic: Since the peptide facilitates the excretion of sodium, it directly correlates with natriuretic activity.
  Evidence: The peptide is involved in promoting natriuresis by enhancing prostaglandin E2 synthesis resulting in the inhibition of renal Na+-K+-ATPase (PubMed:7595132, PubMed:7720651).

  Function: Nitric oxide liberation
  Logic: The relationship between cGMP production and nitric oxide suggests that this peptide may be involved in nitric oxide signaling.
  Evidence: The peptide promotes the production of cGMP, which is a signaling molecule often associated with nitric oxide signaling pathways (PubMed:2825692).
Processed row 31/305 in 7.31 seconds (3 function(s)

2025-11-21 15:16:21 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: "In vitro, promotes the production of cGMP and induces vasodilation" (PubMed:2825692).  
   Logic: The ability to induce vasodilation is typically associated with an antihypertensive effect, as it can lead to reduced blood pressure.
  Logic: The ability to induce vasodilation is typically associated with an antihypertensive effect, as it can lead to reduced blood pressure.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation" (PubMed:2825692).  
   Logic: The ability to induce vasodilation is typically associated with an antihypertensive effect, as it can lead to reduced blood pressure.

  Function: Nitric oxide liberation  
   Evidence: "In vitro, promotes the production of cGMP" (PubMed:2825692).  
   Logic: The production of cGMP is often associated with nitric oxide signaling pathways, which plays a role in vasodilation and cardiovascular regulation.
  Logic: The production of cGMP is often associated with nitri

2025-11-21 15:16:27 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The function is supported by its described role in promoting vasodilation and natriuresis, directly related to controlling hypertension.
  Evidence: The peptide has been associated with the regulation of vasodilation and natriuresis, which are critical for blood pressure regulation (associated_function). Several studies suggest that it promotes vasodilation of renal arteries and possibly contributes to regulating blood pressure (PubMed:2825692).

  Function: Nitric oxide liberation
  Logic: The synthesis of cGMP is typically linked to nitric oxide activity; thus, its promotion indicates a pathway that includes nitric oxide liberation.
  Evidence: It promotes the production of cGMP in vitro, which is an indicator of nitric oxide signaling (PubMed:2825692).

  Function: Cytomodulatory
  Logic: The modulation of renal processes and sodium handling indicates a cytomodulatory role in kidney function and fluid balance.
  Evidence: May promote natriuresi

2025-11-21 15:16:32 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The peptide is suggested to induce vasodilation and promote natriuresis, which are both mechanisms that can lower blood pressure, thus supporting its identification as antihypertensive.
  Evidence: May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, vasodilation" (PubMed:2532366, PubMed:2825692)

  Function: Nitric oxide liberation
  Logic: The production of cGMP is often associated with nitric oxide signaling and vasodilation, suggesting that the peptide might be involved in mechanisms that liberate nitric oxide or stimulate its pathway.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation" (PubMed:2825692)

  Function: Immunomodulatory
  Logic: Although not directly stated, altered protein handling and enhanced excretion may relate to immunomodulatory effects related to inflammatory responses, especially in a renal context.
  Evidence: Possibly enhances protein excretion in urine by 

2025-11-21 15:16:39 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: In vitro, induces vasodilation and promotes natriuresis, suggesting a role in lowering blood pressure (PubMed:2825692) and may contribute to cardiovascular regulation (multiple PubMed sources indicate various mechanisms affecting blood pressure).  
   Logic: Vasodilation and natriuresis are key physiological processes that contribute to lower blood pressure, thereby supporting the classification of this peptide's activity as antihypertensive.
  Logic: Vasodilation and natriuresis are key physiological processes that contribute to lower blood pressure, thereby supporting the classification of this peptide's activity as antihypertensive.
  Evidence: In vitro, induces vasodilation and promotes natriuresis, suggesting a role in lowering blood pressure (PubMed:2825692) and may contribute to cardiovascular regulation (multiple PubMed sources indicate various mechanisms affecting blood pressure).  
   Logic: Vasodilation and natriuresis are key phy

2025-11-21 15:16:43 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: "In vitro, promotes the production of cGMP and induces vasodilation" and "May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, vasodilation..."  
   Logic: The peptide's ability to induce vasodilation through the generation of cGMP indicates a potential antihypertensive effect, as vasodilation lowers blood pressure.
  Logic: The peptide's ability to induce vasodilation through the generation of cGMP indicates a potential antihypertensive effect, as vasodilation lowers blood pressure.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation" and "May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, vasodilation..."  
   Logic: The peptide's ability to induce vasodilation through the generation of cGMP indicates a potential antihypertensive effect, as vasodilation lowers blood pressure.

  Function: Nitric oxide liberation  
   Evidence: "In vitro, pr

2025-11-21 15:16:48 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: The peptide is noted to play a role in cardio-renal homeostasis through vasodilation. In vitro studies indicate that it can vasodilate renal arteries (PubMed:2825692).  
   Logic: Vasodilation contributes to lowering blood pressure, which is a key characteristic of antihypertensive agents.
  Logic: Vasodilation contributes to lowering blood pressure, which is a key characteristic of antihypertensive agents.
  Evidence: The peptide is noted to play a role in cardio-renal homeostasis through vasodilation. In vitro studies indicate that it can vasodilate renal arteries (PubMed:2825692).  
   Logic: Vasodilation contributes to lowering blood pressure, which is a key characteristic of antihypertensive agents.

  Function: Nitric oxide liberation  
   Evidence: In vitro, the peptide is reported to promote the production of cGMP, which is associated with nitric oxide signaling pathways and vasodilation (PubMed:2825692).  
   Logic: The production o

2025-11-21 15:16:54 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive 
   - Evidence: The description mentions that the peptide promotes vasodilation and regulates diuresis and natriuresis, which are actions associated with lowering blood pressure. Specifically, it was indicated that in vitro, the peptide induces vasodilation (PubMed:2825692) and promotes natriuresis.
   - Logic: The vasodilatory effect contributes to lowering blood pressure, while natriuresis helps decrease blood volume, both of which are key mechanisms of antihypertensive agents.
  Logic: The vasodilatory effect contributes to lowering blood pressure, while natriuresis helps decrease blood volume, both of which are key mechanisms of antihypertensive agents.
  Evidence: The description mentions that the peptide promotes vasodilation and regulates diuresis and natriuresis, which are actions associated with lowering blood pressure. Specifically, it was indicated that in vitro, the peptide induces vasodilation (PubMed:2825692) and promotes natriuresis.
   - Lo

2025-11-21 15:16:59 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: It has been suggested that Auriculin-C may play a role in cardio-renal homeostasis through the regulation of natriuresis and vasodilation, indicating a potential effect on reducing blood pressure.  
   Logic: The association with vasodilation and inhibition of aldosterone synthesis supports its potential as an antihypertensive agent.
  Logic: The association with vasodilation and inhibition of aldosterone synthesis supports its potential as an antihypertensive agent.
  Evidence: It has been suggested that Auriculin-C may play a role in cardio-renal homeostasis through the regulation of natriuresis and vasodilation, indicating a potential effect on reducing blood pressure.  
   Logic: The association with vasodilation and inhibition of aldosterone synthesis supports its potential as an antihypertensive agent.

  Function: Nitric oxide liberation  
   Evidence: In vitro studies indicate that Auriculin-C promotes the production of cGMP and indu

2025-11-21 15:17:03 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: "In vitro, promotes the production of cGMP and induces vasodilation" and "May have a role in cardio-renal homeostasis through regulation of natriuresis and vasodilation." (PubMed citations provided)  
   Logic: The promotion of vasodilation and natriuresis is often associated with reducing blood pressure, indicating an antihypertensive effect.
  Logic: The promotion of vasodilation and natriuresis is often associated with reducing blood pressure, indicating an antihypertensive effect.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation" and "May have a role in cardio-renal homeostasis through regulation of natriuresis and vasodilation." (PubMed citations provided)  
   Logic: The promotion of vasodilation and natriuresis is often associated with reducing blood pressure, indicating an antihypertensive effect.

  Function: Nitric oxide liberation  
   Evidence: "In vitro, promotes the production of cGMP," which is of

2025-11-21 15:17:10 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: Since urodilatin promotes vasodilation and regulates electrolyte balance, it contributes to lowering blood pressure, which classifies it as having antihypertensive properties.
  Evidence: Urodilatin mediates vasodilation primarily in the renal system and is involved in the regulation of sodium and water homeostasis (PubMed:8351194, PubMed:8779891).

  Function: Nitric oxide liberation
  Logic: The stimulation of cGMP production is closely associated with the release of nitric oxide (NO), which is a potent vasodilator. Therefore, this function relates to the peptide's role in promoting vasodilation.
  Evidence: Urodilatin stimulates cGMP production by renal transmembrane receptors, likely NPR1 (PubMed:8384600, PubMed:9893117).

  Function: Immunomodulatory
  Logic: While there is no direct reference to immunomodulatory effects, anti-inflammatory responses and fluid regulation could imply an indirect role.
  Evidence: The review of function suggests

2025-11-21 15:17:15 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: Urodilatin functions to relax blood vessels, which can lower blood pressure, making it relevant as an antihypertensive agent.
  Evidence: Mediates vasodilation, natriuresis and diuresis primarily in the renal system

  Function: Nitric oxide liberation
  Logic: The stimulation of cGMP production is closely associated with nitric oxide signaling pathways, which are known to promote vasodilation. This suggests a potential role for urodilatin in nitric oxide liberation or action.
  Evidence: Specifically binds and stimulates cGMP production by renal transmembrane receptors, likely NPR1

  Function: Cytomodulatory
  Logic: The regulation of renal and cardiovascular functions indicates a broader role in modulating cellular responses and functions within those systems, which fits the category of cytomodulatory activity.
  Evidence: Urodilatin is involved in maintaining cardio-renal homeostasis
Processed row 42/305 in 5.08 seconds (3 function(s) identifi

2025-11-21 15:17:22 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: "Mediates vasodilation, natriuresis and diuresis primarily in the renal system..." (PubMed:8351194, PubMed:8779891).  
   Logic: Urodilatin’s role in mediating vasodilation directly relates to its potential antihypertensive effects, as vasodilation can lead to lowered blood pressure.
  Logic: Urodilatin’s role in mediating vasodilation directly relates to its potential antihypertensive effects, as vasodilation can lead to lowered blood pressure.
  Evidence: Mediates vasodilation, natriuresis and diuresis primarily in the renal system..." (PubMed:8351194, PubMed:8779891).  
   Logic: Urodilatin’s role in mediating vasodilation directly relates to its potential antihypertensive effects, as vasodilation can lead to lowered blood pressure.

  Function: Nitric oxide liberation  
   Evidence: "Specifically binds and stimulates cGMP production by renal transmembrane receptors, likely NPR1" (PubMed:8384600, PubMed:9893117).  
   Logic: The stimulati

2025-11-21 15:17:31 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive 
   - Evidence: Urodilatin mediates vasodilation, as noted in the associated function. One of the roles of vasodilators is to lower blood pressure, which contributes to antihypertensive effects (PubMed:8351194).
   - Logic: The ability of Urodilatin to promote vasodilation implies a role in reducing hypertension, aligning with antihypertensive properties.
  Logic: The ability of Urodilatin to promote vasodilation implies a role in reducing hypertension, aligning with antihypertensive properties.
  Evidence: Urodilatin mediates vasodilation, as noted in the associated function. One of the roles of vasodilators is to lower blood pressure, which contributes to antihypertensive effects (PubMed:8351194).
   - Logic: The ability of Urodilatin to promote vasodilation implies a role in reducing hypertension, aligning with antihypertensive properties.

  Function: Nitric oxide liberation
   - Evidence: Urodilatin stimulates cGMP production in renal transmembrane re

2025-11-21 15:17:37 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: Urodilatin's role in vasodilation and its effects on sodium and fluid balance suggest an antihypertensive function, as lowering blood volume through diuresis can contribute to reduced blood pressure.
  Evidence: The peptide mediates vasodilation and promotes natriuresis and diuresis, which are important in controlling blood pressure (PubMed:8351194, PubMed:8779891).

  Function: Nitric oxide liberation
  Logic: The stimulation of cGMP is a typical mechanism through which nitric oxide exerts its vasodilatory effects, linking urodilatin to nitric oxide activity.
  Evidence: Urodilatin is known to stimulate cGMP production, which is associated with nitric oxide pathways (PubMed:8384600, PubMed:9893117).

  Function: Immunomodulatory
  Logic: Hormones and peptides that regulate fluid balance can also influence the immune system, although this may require further evidence for a direct link.
  Evidence: While the abstract does not directly state immunom

2025-11-21 15:17:47 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive 
   - Evidence: Urodilatin mediates vasodilation, which can lower blood pressure, as it is involved in the regulation of fluid volume and electrolyte balance (PubMed:2528951, PubMed:8351194).
   - Logic: The ability of urodilatin to induce vasodilation and regulate extracellular fluid volume contributes to its role in lowering blood pressure, making it pertinent to classifying it as antihypertensive.
  Logic: The ability of urodilatin to induce vasodilation and regulate extracellular fluid volume contributes to its role in lowering blood pressure, making it pertinent to classifying it as antihypertensive.
  Evidence: Urodilatin mediates vasodilation, which can lower blood pressure, as it is involved in the regulation of fluid volume and electrolyte balance (PubMed:2528951, PubMed:8351194).
   - Logic: The ability of urodilatin to induce vasodilation and regulate extracellular fluid volume contributes to its role in lowering blood pressure, making it pertin

2025-11-21 15:17:51 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The primary function of urodilatin involves vasodilation, which is a mechanism that lowers blood pressure, thus classifying it under antihypertensive effects.
  Evidence: Urodilatin mediates vasodilation, contributing to the control of fluid-electrolyte balance and blood pressure regulation.

  Function: Nitric oxide liberation
  Logic: The production of cGMP is often linked to nitric oxide pathways, indicating a potential role for urodilatin in promoting nitric oxide liberation.
  Evidence: Urodilatin stimulates cGMP production by renal transmembrane receptors, suggesting a mechanism that could enhance nitric oxide signaling.
Processed row 47/305 in 4.62 seconds (2 function(s) identified)


2025-11-21 15:17:58 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Processed row 48/305 in 7.12 seconds (0 function(s) identified)


2025-11-21 15:18:05 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: Auriculin-D is described as promoting vasodilation, which is a known mechanism for lowering blood pressure, thereby indicating an antihypertensive function.
  Evidence: May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, and vasodilation.

  Function: Nitric oxide liberation
  Logic: The production of cGMP is often associated with the release of nitric oxide (NO), a potent vasodilator. This suggests that Auriculin-D may facilitate nitric oxide liberation as part of its vasodilatory action.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation.

  Function: Increase calcium uptake
  Logic: Although not directly stated, the interaction with distinctive receptors and the regulation of natriuresis and vasodilation may imply involvement in calcium signaling pathways that can affect smooth muscle contraction and relaxation.
  Evidence: appears to bind to specific receptors that are distinct f

2025-11-21 15:18:11 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The peptide has vasodilatory properties and seems to regulate blood pressure, potentially contributing to antihypertensive effects.
  Evidence: May have a role in cardio-renal homeostasis through regulation of natriuresis and vasodilation.

  Function: Nitric oxide liberation
  Logic: The production of cGMP is often associated with nitric oxide signaling pathways, indicating that the peptide may be involved in nitric oxide liberation that leads to vasodilation.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation.

  Function: Immunomodulatory
  Logic: The enhancement of prostaglandin synthesis can have immunomodulatory effects, suggesting that there could be an immune-related role.
  Evidence: May promote natriuresis... by enhancing prostaglandin E2 synthesis.

  Function: Increase mucin secretion
  Logic: While indirectly related, the reference to enhancing protein excretion suggests a potential role in mucin secretion,

2025-11-21 15:18:19 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: The peptide is described as potentially promoting vasodilation, which is often associated with a reduction in blood pressure (PubMed:2825692).  
   Logic: Vasodilation leads to a decrease in vascular resistance, which can lower blood pressure, aligning with the definition of antihypertensive activity.
  Logic: Vasodilation leads to a decrease in vascular resistance, which can lower blood pressure, aligning with the definition of antihypertensive activity.
  Evidence: The peptide is described as potentially promoting vasodilation, which is often associated with a reduction in blood pressure (PubMed:2825692).  
   Logic: Vasodilation leads to a decrease in vascular resistance, which can lower blood pressure, aligning with the definition of antihypertensive activity.

  Function: Nitric oxide liberation  
   Evidence: The peptide was noted to promote the production of cGMP in vitro, which is often a result of nitric oxide signaling pathways (Pu

2025-11-21 15:18:25 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The peptide is described to have vasodilatory properties, which can help lower blood pressure, thus classifying it as having an antihypertensive function.
  Evidence: May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, vasodilation" (Abstract), "In vitro, promotes the production of cGMP and induces vasodilation" (PubMed:2825692).

  Function: Nitric oxide liberation
  Logic: The production of cGMP is frequently associated with the action of nitric oxide in vasodilation, suggesting that this peptide may enhance nitric oxide function indirectly via cGMP pathways.
  Evidence: In vitro, promotes the production of cGMP" (PubMed:2825692).

  Function: Cytomodulatory
  Logic: The modulation of renal processes such as protein reabsorption indicates a cytomodulatory effect, impacting cellular behavior in the kidney's proximal tubules.
  Evidence: May enhance protein excretion in urine by decreasing proximal tubular prot

2025-11-21 15:18:37 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: Antihypertensive activity can be attributed to the peptide's ability to promote vasodilation and natriuresis, both of which can lower blood pressure. The production of cGMP is also linked to vasodilatory effects.
  Evidence: May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, vasodilation" and "In vitro, promotes the production of cGMP and induces vasodilation

  Function: Nitric oxide liberation
  Logic: The involvement of cGMP in promoting vasodilation is often associated with nitric oxide signaling pathways in vascular physiology, suggesting a possible role of nitric oxide liberation.
  Evidence: In vitro, promotes the production of cGMP

  Function: Immunomodulatory
  Logic: Although not directly linked to immune modulation, the effects on renal function and protein excretion may have indirect immunomodulatory consequences in kidney function and systemic circulation.
  Evidence: Possibly enhances protein ex

2025-11-21 15:18:42 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The regulation of natriuresis and vasodilation suggests that the peptide contributes to lowering blood pressure, which is a known mechanism for antihypertensive agents.
  Evidence: May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, vasodilation." (PubMed:2532366)

  Function: Nitric oxide liberation
  Logic: The production of cGMP is often linked to the action of nitric oxide in promoting vasodilation, indicating that Auriculin-D may facilitate nitric oxide signaling.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation." (PubMed:2825692)

  Function: Cytomodulatory
  Logic: The modulation of prostaglandin E2 synthesis and its effects on renal function suggests a regulatory role on cellular responses, common in cytomodulatory functions.
  Evidence: May promote natriuresis, at least in part, by enhancing prostaglandin E2 synthesis." (PubMed:7720651)
Processed row 54/305 in 5.33 seconds

2025-11-21 15:18:46 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The peptide is noted to induce vasodilation and regulate blood volume, which are mechanisms related to lowering blood pressure.
  Evidence: May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, vasodilation

  Function: Nitric oxide liberation
  Logic: The induction of cGMP is typically associated with nitric oxide signaling, which leads to vasodilation.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation

  Function: Increase intestinal motility
  Logic: The specific vasodilation of intestinal smooth muscle suggests a role in enhancing intestinal motility.
  Evidence: In vitro, selectively vasodilates intestinal smooth muscle but not vascular smooth muscle strips

  Function: Cytomodulatory
  Logic: The conflicting reports suggest that its effect on cell function (particularly in the kidney) may be context-dependent, indicating a modulatory role.
  Evidence: According to a report, in v

2025-11-21 15:18:51 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The promotion of cGMP and subsequent vasodilation are known mechanisms through which antihypertensive effects are mediated, leading to lowered blood pressure.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation" (PubMed:2825692). "May have a role in cardio-renal homeostasis through regulation of natriuresis and vasodilation.

  Function: Nitric oxide liberation
  Logic: The generation of cGMP is closely associated with the activity of nitric oxide in vascular smooth muscle, leading to vasodilation.
  Evidence: In vitro, promotes the production of cGMP" (PubMed:2825692).

  Function: Immunomodulatory
  Logic: The modulation of protein reabsorption can affect immune functions and inflammatory processes in the kidney, indicating an immunomodulatory role.
  Evidence: May enhance protein excretion in urine by decreasing proximal tubular protein reabsorption" (PubMed:11145122).
Processed row 56/305 in 4.85 seconds (3 function(

2025-11-21 15:18:57 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: Acts by specifically binding and stimulating NPR1 to produce cGMP, which regulates blood pressure and controls extracellular fluid volume. This is highlighted in multiple sources including PubMed:2532366, PubMed:2825692, and PubMed:7595132, which discuss its role in vasodilation and regulating blood pressure.  
   Logic: ANP is well-known for its function in lowering blood pressure through vasodilation and natriuresis, making it a key component in cardio-renal homeostasis.
  Logic: ANP is well-known for its function in lowering blood pressure through vasodilation and natriuresis, making it a key component in cardio-renal homeostasis.
  Evidence: Acts by specifically binding and stimulating NPR1 to produce cGMP, which regulates blood pressure and controls extracellular fluid volume. This is highlighted in multiple sources including PubMed:2532366, PubMed:2825692, and PubMed:7595132, which discuss its role in vasodilation and regulating blood 

2025-11-21 15:19:02 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The atrial natriuretic peptide plays a crucial role in regulating blood pressure, diuresis, and natriuresis, all of which contribute to its antihypertensive effects.
  Evidence: PubMed:2532366, PubMed:2825692, PubMed:7595132, PubMed:7720651, PubMed:8087923, PubMed:8653797

  Function: Anticancer
  Logic: ANP is involved in inhibiting cardiac remodeling and cardiac hypertrophy by inducing cardiomyocyte apoptosis, which suggests a potential role in anticancer mechanisms.
  Evidence: PubMed:16875975

  Function: Immunomodulatory
  Logic: Although not explicitly stated, since natriuretic peptides are involved in various biological processes, including response modulation in tissues, this suggests potential immunomodulatory effects.
  Evidence: PubMed:15166840 (inferred from related research)
Processed row 58/305 in 5.79 seconds (3 function(s) identified)


2025-11-21 15:19:12 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: Atrial natriuretic peptide is known to play a significant role in lowering blood pressure through its actions on natriuresis and vasodilation, directly linking its function to antihypertensive effects.
  Evidence: Essential for regulating blood pressure, controlling the extracellular fluid volume and maintaining the fluid-electrolyte balance (PubMed:2532366, PubMed:2825692, PubMed:7595132, PubMed:7720651, PubMed:8087923, PubMed:8653797)

  Function: Cytomodulatory
  Logic: The peptide’s involvement in regulating cardiomyocyte apoptosis and cardiac remodeling signifies its modulatory effects on cellular functions, which supports its classification as cytomodulatory.
  Evidence: Inhibiting cardiac remodeling and cardiac hypertrophy by inducing cardiomyocyte apoptosis and attenuating the growth of cardiomyocytes and fibroblasts (PubMed:16875975)

  Function: Ameliorates insulin resistance
  Logic: The peptide’s role in regulating lipid metabolism and

2025-11-21 15:19:21 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: ANP is involved in regulating blood pressure, controlling extracellular fluid volume, and maintaining fluid-electrolyte balance (PubMed:2532366, PubMed:2825692, PubMed:7595132, PubMed:7720651, PubMed:8087923, PubMed:8653797).  
   Logic: ANP acts as a hormone that induces vasodilation and promotes natriuresis (the excretion of sodium through urine), both of which lower blood pressure. Thus, it is classified as having an antihypertensive function.
  Logic: ANP acts as a hormone that induces vasodilation and promotes natriuresis (the excretion of sodium through urine), both of which lower blood pressure. Thus, it is classified as having an antihypertensive function.
  Evidence: ANP is involved in regulating blood pressure, controlling extracellular fluid volume, and maintaining fluid-electrolyte balance (PubMed:2532366, PubMed:2825692, PubMed:7595132, PubMed:7720651, PubMed:8087923, PubMed:8653797).  
   Logic: ANP acts as a hormone that induc

2025-11-21 15:19:27 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: The peptide is involved in regulating blood pressure and controlling extracellular fluid volume, indicating its role in vascular homeostasis (PubMed:2532366, PubMed:2825692, PubMed:7595132, PubMed:7720651, PubMed:8087923, PubMed:8653797).  
   Logic: ANP promotes vasodilation and natriuresis, which lead to a decrease in blood pressure, thus categorizing it as antihypertensive.
  Logic: ANP promotes vasodilation and natriuresis, which lead to a decrease in blood pressure, thus categorizing it as antihypertensive.
  Evidence: The peptide is involved in regulating blood pressure and controlling extracellular fluid volume, indicating its role in vascular homeostasis (PubMed:2532366, PubMed:2825692, PubMed:7595132, PubMed:7720651, PubMed:8087923, PubMed:8653797).  
   Logic: ANP promotes vasodilation and natriuresis, which lead to a decrease in blood pressure, thus categorizing it as antihypertensive.

  Function: Antimicrobial  
   Evidence: Whi

2025-11-21 15:19:38 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive 
   - Evidence: ANP regulates blood pressure by promoting vasodilation and natriuresis (PubMed:2532366, PubMed:2825692, PubMed:7595132, PubMed:7720651, PubMed:8087923, PubMed:8653797).
   - Logic: ANP's primary role includes mediating cardio-renal homeostasis and regulating blood pressure through vasodilation and fluid balance, therefore qualifying as antihypertensive.
  Logic: ANP's primary role includes mediating cardio-renal homeostasis and regulating blood pressure through vasodilation and fluid balance, therefore qualifying as antihypertensive.
  Evidence: ANP regulates blood pressure by promoting vasodilation and natriuresis (PubMed:2532366, PubMed:2825692, PubMed:7595132, PubMed:7720651, PubMed:8087923, PubMed:8653797).
   - Logic: ANP's primary role includes mediating cardio-renal homeostasis and regulating blood pressure through vasodilation and fluid balance, therefore qualifying as antihypertensive.

  Function: Ameliorates insulin resistance
  

2025-11-21 15:19:49 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The primary role of atrial natriuretic peptide (ANP) is to reduce blood pressure through various mechanisms such as inducing vasodilation and promoting natriuresis.
  Evidence: Regulates vasodilation, natriuresis, diuresis and aldosterone synthesis and is therefore essential for regulating blood pressure" (PubMed references).

  Function: Antiapoptotic effect
  Logic: The peptide is involved in apoptotic processes, but it also has mechanisms that promote apoptosis in specific contexts such as cardiomyocytes, indicating a complex role that can also align with antiapoptotic effects in certain scenarios.
  Evidence: Inhibiting cardiac remodeling and cardiac hypertrophy by inducing cardiomyocyte apoptosis and attenuating the growth of cardiomyocytes and fibroblasts" (PubMed:16875975).

  Function: Ameliorates insulin resistance
  Logic: The involvement of ANP in increasing energy expenditure and regulation of lipid metabolism suggests a potential role

2025-11-21 15:19:59 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: The description states that atrial natriuretic peptide regulates blood pressure through vasodilation, natriuresis, and diuresis. It mentions that it is essential for regulating blood pressure and controlling extracellular fluid volume.  
   Logic: The vasodilatory effect and regulation of fluid balance directly relate to antihypertensive activity, making this function appropriate.
  Logic: The vasodilatory effect and regulation of fluid balance directly relate to antihypertensive activity, making this function appropriate.
  Evidence: The description states that atrial natriuretic peptide regulates blood pressure through vasodilation, natriuresis, and diuresis. It mentions that it is essential for regulating blood pressure and controlling extracellular fluid volume.  
   Logic: The vasodilatory effect and regulation of fluid balance directly relate to antihypertensive activity, making this function appropriate.

  Function: Nitric oxide libe

2025-11-21 15:20:06 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: "Regulates vasodilation, natriuresis, diuresis and aldosterone synthesis and is therefore essential for regulating blood pressure..." (PubMed:2532366, PubMed:2825692, PubMed:7595132, PubMed:7720651, PubMed:8087923, PubMed:8653797).  
   Logic: As a natriuretic peptide, ANP induces vasodilation and promotes sodium excretion (natriuresis), both of which play a crucial role in lowering blood pressure, thus qualifying it as an antihypertensive agent.
  Logic: As a natriuretic peptide, ANP induces vasodilation and promotes sodium excretion (natriuresis), both of which play a crucial role in lowering blood pressure, thus qualifying it as an antihypertensive agent.
  Evidence: Regulates vasodilation, natriuresis, diuresis and aldosterone synthesis and is therefore essential for regulating blood pressure..." (PubMed:2532366, PubMed:2825692, PubMed:7595132, PubMed:7720651, PubMed:8087923, PubMed:8653797).  
   Logic: As a natriuretic peptide, ANP ind

2025-11-21 15:20:15 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: ANP has a well-established role in lowering blood pressure through vasodilation and natriuresis, which confirms its antihypertensive effect.
  Evidence: INDOLES advanced functions including regulating blood pressure and maintaining extracellular fluid volume

  Function: Nitric oxide liberation
  Logic: The stimulation of NPR1 and production of cGMP in vascular smooth muscle cells often leads to increased nitric oxide (NO) production which contributes to vasodilation.
  Evidence: Acts by specifically binding and stimulating NPR1 to produce cGMP

  Function: Immunomodulatory
  Logic: ANP plays a role in cardiac remodeling and the modulation of inflammatory responses, possibly impacting immune function indirectly.
  Evidence: Involved in regulating cardiac remodeling and may influence inflammation associated with cardiac conditions

  Function: Ameliorates insulin resistance
  Logic: The influence of ANP on energy metabolism suggests potential effec

2025-11-21 15:20:27 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: Regulates blood pressure, controlling the extracellular fluid volume and maintaining the fluid-electrolyte balance (PubMed:2532366, PubMed:2825692, PubMed:7595132, PubMed:7720651, PubMed:8087923, PubMed:8653797).  
   Logic: The primary role of ANP includes vasodilation and natriuresis which contribute significantly to its antihypertensive effect by lowering blood pressure.
  Logic: The primary role of ANP includes vasodilation and natriuresis which contribute significantly to its antihypertensive effect by lowering blood pressure.
  Evidence: Regulates blood pressure, controlling the extracellular fluid volume and maintaining the fluid-electrolyte balance (PubMed:2532366, PubMed:2825692, PubMed:7595132, PubMed:7720651, PubMed:8087923, PubMed:8653797).  
   Logic: The primary role of ANP includes vasodilation and natriuresis which contribute significantly to its antihypertensive effect by lowering blood pressure.

  Function: Cytomodulatory 

2025-11-21 15:20:37 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: Atrial natriuretic peptide is known to regulate blood pressure by inducing vasodilation and natriuresis, thus contributing to its antihypertensive effects.
  Evidence: Acts by specifically binding and stimulating NPR1 to produce cGMP, which in turn activates effector proteins, such as PRKG1, that drive various biological responses" (PubMed:1660465, PubMed:1672777, etc.)

  Function: Immunomodulatory
  Logic: By influencing cardiac remodeling and inhibiting hypertrophy, this peptide can modulate immune responses related to cardiac health.
  Evidence: Also involved in inhibiting cardiac remodeling and cardiac hypertrophy" (PubMed:16875975)

  Function: Ameliorates insulin resistance
  Logic: The regulation of lipid metabolism and energy homeostasis indicates a potential role in improving insulin sensitivity.
  Evidence: In adipose tissue, acts in various cGMP- and PKG-dependent pathways to regulate lipid metabolism and energy homeostasis" (PubMed:15

2025-11-21 15:20:44 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The essential role of ANP in vasodilation and natriuresis directly indicates its function in lowering blood pressure, which characterizes an antihypertensive effect.
  Evidence: ANP is involved in regulating blood pressure, controlling extracellular fluid volume, and maintaining fluid-electrolyte balance (PubMed references 2532366, 2825692, 7595132, 7720651, 8087923, 8653797).

  Function: Ameliorates insulin resistance
  Logic: The ability of ANP to restore mitochondrial gene expression and enhance energy metabolism suggests a role in improving insulin sensitivity and thus ameliorating insulin resistance.
  Evidence: ANP treatment restored down-regulation of mitochondrial genes induced by fatty acids and TNFα, improving lipid metabolism and energy dissipation (abstract from the title provided).

  Function: Cholesterol regulation
  Logic: While not directly stated, the regulation of lipid metabolism typically includes cholesterol metabolism, supp

2025-11-21 15:20:49 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: The role of ANP in regulating blood pressure is well-documented, as it stimulates NPR1 to produce cGMP, which promotes vasodilation and helps to control extracellular fluid volume, thereby having an antihypertensive effect (PubMed:2532366, PubMed:7595132, PubMed:7720651, PubMed:8087923, PubMed:8653797).  
   Logic: ANP is primarily involved in the regulation of blood pressure and fluid balance, which are critical components of hypertension management.
  Logic: ANP is primarily involved in the regulation of blood pressure and fluid balance, which are critical components of hypertension management.
  Evidence: The role of ANP in regulating blood pressure is well-documented, as it stimulates NPR1 to produce cGMP, which promotes vasodilation and helps to control extracellular fluid volume, thereby having an antihypertensive effect (PubMed:2532366, PubMed:7595132, PubMed:7720651, PubMed:8087923, PubMed:8653797).  
   Logic: ANP is primarily invol

2025-11-21 15:21:02 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: The peptide regulates blood pressure and is involved in vasodilation and controlling extracellular fluid volume, as specified in the associated functions.  
   Logic: ANP is known to have a key role in lowering blood pressure by promoting natriuresis and vasodilation, which makes it relevant to antihypertensive activity.
  Logic: ANP is known to have a key role in lowering blood pressure by promoting natriuresis and vasodilation, which makes it relevant to antihypertensive activity.
  Evidence: The peptide regulates blood pressure and is involved in vasodilation and controlling extracellular fluid volume, as specified in the associated functions.  
   Logic: ANP is known to have a key role in lowering blood pressure by promoting natriuresis and vasodilation, which makes it relevant to antihypertensive activity.

  Function: Ameliorates insulin resistance  
   Evidence: In adipose tissue, ANP is involved in regulating lipid metabolism and ene

2025-11-21 15:21:12 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: ANP is known to regulate blood pressure through vasodilation and natriuresis, as described in various PubMed sources (e.g., PubMed:2532366, PubMed:2852920).  
   Logic: The role of ANP in mediating vasodilation and promoting sodium excretion directly contributes to its antihypertensive effects, making it a crucial hormone in blood pressure regulation.
  Logic: The role of ANP in mediating vasodilation and promoting sodium excretion directly contributes to its antihypertensive effects, making it a crucial hormone in blood pressure regulation.
  Evidence: ANP is known to regulate blood pressure through vasodilation and natriuresis, as described in various PubMed sources (e.g., PubMed:2532366, PubMed:2852920).  
   Logic: The role of ANP in mediating vasodilation and promoting sodium excretion directly contributes to its antihypertensive effects, making it a crucial hormone in blood pressure regulation.

  Function: Cytomodulatory  
   Evidence

2025-11-21 15:21:21 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive 
   - Evidence: "Essential for regulating blood pressure, controlling the extracellular fluid volume and maintaining the fluid-electrolyte balance" (PubMed:2532366, PubMed:2825692, PubMed:7595132).
   - Logic: Atrial Natriuretic Peptide (ANP) is known to reduce blood pressure primarily by promoting vasodilation and natriuresis, which are critical processes in antihypertensive action.
  Logic: Atrial Natriuretic Peptide (ANP) is known to reduce blood pressure primarily by promoting vasodilation and natriuresis, which are critical processes in antihypertensive action.
  Evidence: Essential for regulating blood pressure, controlling the extracellular fluid volume and maintaining the fluid-electrolyte balance" (PubMed:2532366, PubMed:2825692, PubMed:7595132).
   - Logic: Atrial Natriuretic Peptide (ANP) is known to reduce blood pressure primarily by promoting vasodilation and natriuresis, which are critical processes in antihypertensive action.

  Function: Cy

2025-11-21 15:21:27 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The peptide functions as a hormone that regulates blood pressure through vasodilation and other regulatory mechanisms.
  Evidence: Plays a key role in mediating cardio-renal homeostasis... essential for regulating blood pressure" (PubMed:2532366, PubMed:2825692, PubMed:7595132, PubMed:7720651)

  Function: Antiapoptotic effect
  Logic: While the peptide induces apoptosis in certain cardiac cells, its net effect could be interpreted as antiapoptotic in contexts outside of pathological conditions, illustrating its role in preventing excessive growth or hypertrophy.
  Evidence: Involved in inhibiting cardiac remodeling and cardiac hypertrophy by inducing cardiomyocyte apoptosis" (PubMed:16875975)

  Function: Ameliorates insulin resistance
  Logic: AMPK is known for its role in improving insulin sensitivity, thus linking the peptide's actions to potential improvements in insulin resistance.
  Evidence: Regulates lipid metabolism and energy homeostasi

2025-11-21 15:21:40 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: "Acts by specifically binding and stimulating NPR1 to produce cGMP, which in turn activates effector proteins, such as PRKG1... Regulates vasodilation, natriuresis, diuresis and aldosterone synthesis and is therefore essential for regulating blood pressure..." (PubMed references listed).  
   Logic: ANP plays a crucial role in lowering blood pressure by promoting vasodilation and natriuresis, linking its activity directly to antihypertensive effects.
  Logic: ANP plays a crucial role in lowering blood pressure by promoting vasodilation and natriuresis, linking its activity directly to antihypertensive effects.
  Evidence: Acts by specifically binding and stimulating NPR1 to produce cGMP, which in turn activates effector proteins, such as PRKG1... Regulates vasodilation, natriuresis, diuresis and aldosterone synthesis and is therefore essential for regulating blood pressure..." (PubMed references listed).  
   Logic: ANP plays a crucial role 

2025-11-21 15:21:51 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
   - Evidence: "Acts by specifically binding and stimulating NPR1 to produce cGMP, which in turn activates effector proteins, such as PRKG1, that drive various biological responses."
   - Logic: The atrial natriuretic peptide regulates blood pressure by mediating vasodilation and natriuresis, which are known actions of antihypertensive agents.
  Logic: The atrial natriuretic peptide regulates blood pressure by mediating vasodilation and natriuresis, which are known actions of antihypertensive agents.
  Evidence: Acts by specifically binding and stimulating NPR1 to produce cGMP, which in turn activates effector proteins, such as PRKG1, that drive various biological responses."
   - Logic: The atrial natriuretic peptide regulates blood pressure by mediating vasodilation and natriuresis, which are known actions of antihypertensive agents.

  Function: Diuretic
   - Evidence: "Regulates vasodilation, natriuresis, diuresis and aldosterone synthesis and is there

2025-11-21 15:21:56 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: ANP regulates blood pressure by promoting vasodilation and natriuresis, as indicated by multiple PubMed sources such as PubMed:2532366 and PubMed:7720651.  
   Logic: The primary role of ANP is to lower blood pressure through these mechanisms, making its bioactivity in reducing hypertension evident.
  Logic: The primary role of ANP is to lower blood pressure through these mechanisms, making its bioactivity in reducing hypertension evident.
  Evidence: ANP regulates blood pressure by promoting vasodilation and natriuresis, as indicated by multiple PubMed sources such as PubMed:2532366 and PubMed:7720651.  
   Logic: The primary role of ANP is to lower blood pressure through these mechanisms, making its bioactivity in reducing hypertension evident.

  Function: Antapoptotic effect  
   Evidence: ANP was shown to inhibit cardiac remodeling and cardiac hypertrophy by inducing cardiomyocyte apoptosis, as noted in PubMed:16875975.  
   Logic: Alth

2025-11-21 15:22:06 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Processed row 78/305 in 9.30 seconds (0 function(s) identified)


2025-11-21 15:22:14 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: "May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, vasodilation..."  
   Logic: The role of Auriculin-B in promoting vasodilation directly suggests potential antihypertensive effects, as vasodilation can lower blood pressure.
  Logic: The role of Auriculin-B in promoting vasodilation directly suggests potential antihypertensive effects, as vasodilation can lower blood pressure.
  Evidence: May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, vasodilation..."  
   Logic: The role of Auriculin-B in promoting vasodilation directly suggests potential antihypertensive effects, as vasodilation can lower blood pressure.

  Function: Nitric oxide liberation  
   Evidence: "In vitro, promotes the production of cGMP and induces vasodilation..."  
   Logic: The production of cGMP is often linked to the activity of nitric oxide in vasodilation processes, suggesting a function rel

2025-11-21 15:22:19 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The description indicates that the peptide may promote vasodilation and manage blood pressure, which are linked to antihypertensive effects.
  Evidence: May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, vasodilation (PubMed:2532366, PubMed:2825692).

  Function: Nitric oxide liberation
  Logic: The production of cGMP is often associated with nitric oxide signaling, which is a common pathway for inducing vasodilation.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation (PubMed:2825692).

  Function: Cytomodulatory
  Logic: The action of modifying renal function suggests a modulatory effect on cellular processes in the kidneys.
  Evidence: May enhance protein excretion in urine by decreasing proximal tubular protein reabsorption (PubMed:11145122).
Processed row 80/305 in 5.79 seconds (3 function(s) identified)


2025-11-21 15:22:28 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The peptide is involved in promoting vasodilation, which can contribute to lowering blood pressure. Additionally, its role in natriuresis suggests it can help manage blood work, indicating antihypertensive properties.
  Evidence: May have a role in cardio-renal homeostasis through regulation of natriuresis and vasodilation." (PubMed:2532366, PubMed:7720651)

  Function: Nitric oxide liberation
  Logic: The production of cGMP is often associated with nitric oxide signaling pathways in vascular tissues, suggesting that this peptide may facilitate the release or effects of nitric oxide, contributing to vasodilation.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation." (PubMed:2825692)

  Function: Cytomodulatory
  Logic: This indicates a modulatory role over renal functions, affecting how proteins are processed and expelled, which aligns with cytomodulatory activities.
  Evidence: Possibly enhances protein excretion in uri

2025-11-21 15:22:35 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Processed row 82/305 in 7.02 seconds (0 function(s) identified)


2025-11-21 15:22:42 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: The peptide may promote natriuresis and induce vasodilation, which are mechanisms known to lower blood pressure (PubMed:2825692).  
   Logic: Since vasodilation directly contributes to reducing vascular resistance and, consequently, blood pressure, the peptide's ability to affect these processes suggests a role in antihypertensive activity.
  Logic: Since vasodilation directly contributes to reducing vascular resistance and, consequently, blood pressure, the peptide's ability to affect these processes suggests a role in antihypertensive activity.
  Evidence: The peptide may promote natriuresis and induce vasodilation, which are mechanisms known to lower blood pressure (PubMed:2825692).  
   Logic: Since vasodilation directly contributes to reducing vascular resistance and, consequently, blood pressure, the peptide's ability to affect these processes suggests a role in antihypertensive activity.

  Function: Nitric oxide liberation  
   Evide

2025-11-21 15:22:51 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: "May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, vasodilation..." and "In vivo promotes natriuresis and in vitro, vasodilates renal artery strips."  
   Logic: The references to vasodilation and natriuresis suggest that this peptide could help lower blood pressure, which constitutes an antihypertensive action.
  Logic: The references to vasodilation and natriuresis suggest that this peptide could help lower blood pressure, which constitutes an antihypertensive action.
  Evidence: May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, vasodilation..." and "In vivo promotes natriuresis and in vitro, vasodilates renal artery strips."  
   Logic: The references to vasodilation and natriuresis suggest that this peptide could help lower blood pressure, which constitutes an antihypertensive action.

  Function: Nitric oxide liberation  
   Evidence: "In vitro, promotes the p

2025-11-21 15:23:02 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: "May have a role in cardio-renal homeostasis through regulation of natriuresis and vasodilation." (Multiple PubMed sources indicate vasodilatory effects)  
   Logic: The regulation of vasodilation and the effect on blood volume suggest that the peptide may help lower blood pressure, aligning with antihypertensive activity.
  Logic: The regulation of vasodilation and the effect on blood volume suggest that the peptide may help lower blood pressure, aligning with antihypertensive activity.
  Evidence: May have a role in cardio-renal homeostasis through regulation of natriuresis and vasodilation." (Multiple PubMed sources indicate vasodilatory effects)  
   Logic: The regulation of vasodilation and the effect on blood volume suggest that the peptide may help lower blood pressure, aligning with antihypertensive activity.

  Function: Nitric oxide liberation  
   Evidence: "In vitro, promotes the production of cGMP and induces vasodilation."  
  

2025-11-21 15:23:09 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive
  Logic: The peptide is described to induce vasodilation and promote natriuresis, both of which contribute to lowering blood pressure, hence classifying it as having antihypertensive activity.
  Evidence: May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, vasodilation" (PubMed:2532366, PubMed:2825692, etc.)

  Function: Nitric oxide liberation
  Logic: The production of cGMP is often associated with the liberation of nitric oxide (NO), which is a known vasodilator. This suggests that the peptide may enhance the liberation of NO to exert its vasodilatory effects.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation" (PubMed:2825692)

  Function: Immunomodulatory
  Logic: The regulation of protein reabsorption can influence immune responses, as proteins play a role in various immune functions, thus indicating an immunomodulatory role of the peptide.
  Evidence: May enhance protein excretion in u

2025-11-21 15:23:14 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: "In vitro, promotes the production of cGMP and induces vasodilation" (PubMed:2825692).  
   Logic: The ability to induce vasodilation can lead to decreased blood pressure, thus contributing to an antihypertensive effect.
  Logic: The ability to induce vasodilation can lead to decreased blood pressure, thus contributing to an antihypertensive effect.
  Evidence: In vitro, promotes the production of cGMP and induces vasodilation" (PubMed:2825692).  
   Logic: The ability to induce vasodilation can lead to decreased blood pressure, thus contributing to an antihypertensive effect.

  Function: Nitric oxide liberation  
   Evidence: "In vitro, promotes the production of cGMP" (PubMed:2825692).  
   Logic: The production of cGMP is often a downstream effect of nitric oxide signaling, indicating that this peptide may facilitate nitric oxide liberation which can lead to vasodilation.
  Logic: The production of cGMP is often a downstream effect of ni

2025-11-21 15:23:22 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: "May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, vasodilation."  
   Logic: The peptide is described to promote vasodilation and natriuresis, which are processes that can lower blood pressure, indicating potential antihypertensive effects.
  Logic: The peptide is described to promote vasodilation and natriuresis, which are processes that can lower blood pressure, indicating potential antihypertensive effects.
  Evidence: May have a role in cardio-renal homeostasis through regulation of natriuresis, diuresis, vasodilation."  
   Logic: The peptide is described to promote vasodilation and natriuresis, which are processes that can lower blood pressure, indicating potential antihypertensive effects.

  Function: Nitric oxide liberation  
   Evidence: "In vitro, promotes the production of cGMP and induces vasodilation."  
   Logic: The production of cGMP is closely associated with nitric oxide signaling, s

2025-11-21 15:23:29 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



  Function: Antihypertensive  
   Evidence: The peptide is described to induce vasodilation and promotes the production of cGMP, which is involved in the relaxation of blood vessels. This aligns with findings from several PubMed sources that indicate it may regulate blood pressure through its effects on vascular smooth muscle.  
   Logic: Vasodilation typically leads to a decrease in blood pressure, suggesting that Auriculin-A may serve an antihypertensive role.
  Logic: Vasodilation typically leads to a decrease in blood pressure, suggesting that Auriculin-A may serve an antihypertensive role.
  Evidence: The peptide is described to induce vasodilation and promotes the production of cGMP, which is involved in the relaxation of blood vessels. This aligns with findings from several PubMed sources that indicate it may regulate blood pressure through its effects on vascular smooth muscle.  
   Logic: Vasodilation typically leads to a decrease in blood pressure, suggesting that Auriculin-

Clean the Returned functions

In [None]:
# Defining the cleaning function
def clean_gpt_function(func):
    """
    Clean GPT function string by removing newlines, quotes, and asterisks.
    
    Args:
        func: Function string to clean (may contain newlines, quotes, asterisks)
    
    Returns:
        Cleaned function string with first line only, stripped of quotes and asterisks
    """
    if pd.isna(func) or func is None:
        return func
    # Get first line, strip whitespace and quotes, then remove all asterisks
    cleaned = str(func).split("\n")[0].strip().strip("'").replace('*', '')
    return cleaned.strip()  # Final strip to remove any remaining whitespace

# Assuming peptides_final_with_refs_gpt_processed is already defined, applying the cleaning function to its 'gpt_function' column
peptides_final_with_gpt_processed['gpt_function'] = peptides_final_with_gpt_processed['gpt_function'].apply(clean_gpt_function)
print("Cleaned GPT function column:")
list(set(peptides_final_with_gpt_processed['gpt_function']))

# Display summary information
# copy the dataframe to peptides_final_updated
peptides_final_updated = peptides_final_with_gpt_processed.copy()
# print critical info before merge
print_critical_info(peptides_final_updated,"prior to merge")

Merge protein metadata (name, description) into GPT-processed peptides table

In [None]:
def merge_protein_info(
    target_df: pd.DataFrame,
    source_df: pd.DataFrame,
    merge_on: List[str] = ['proteinID', 'peptide'],
    columns_to_add: List[str] = ['protein_name', 'protein_desc'],
    drop_duplicates: bool = True,
    how: str = 'inner',
    verbose: bool = True
) -> pd.DataFrame:
    """
    Merge protein information (name and description) into target DataFrame.
    
    This function merges protein metadata from source_df into target_df based on
    specified merge keys, handles column name conflicts, and reorders columns
    to place the new columns after the first column.
    
    Args:
        target_df: DataFrame to merge into (e.g., peptides_final_updated).
        source_df: DataFrame containing protein information (e.g., peptides_raw).
        merge_on: List of column names to merge on. Default: ['proteinID', 'peptide'].
        columns_to_add: List of column names to add from source_df. 
                       Default: ['protein_name', 'protein_desc'].
        drop_duplicates: If True, drop duplicates in source_df before merging.
        how: Type of merge ('inner', 'left', 'right', 'outer'). Default: 'inner'.
        verbose: If True, print summary information.
    
    Returns:
        DataFrame with merged protein information and reordered columns.
    
    Raises:
        ValueError: If required columns are missing from source_df or merge_on columns
                   are missing from either DataFrame.
    
    Example:
        >>> peptides_final_updated = merge_protein_info(
        ...     peptides_final_updated,
        ...     peptides_raw,
        ...     merge_on=['proteinID', 'peptide'],
        ...     columns_to_add=['protein_name', 'protein_desc']
        ... )
    """
    # Validate inputs
    if target_df.empty:
        if verbose:
            print("Warning: target_df is empty. Returning empty DataFrame.")
        return pd.DataFrame()
    
    # Check merge_on columns exist in both DataFrames
    missing_in_target = [col for col in merge_on if col not in target_df.columns]
    missing_in_source = [col for col in merge_on if col not in source_df.columns]
    
    if missing_in_target:
        raise ValueError(f"Columns {missing_in_target} not found in target_df")
    if missing_in_source:
        raise ValueError(f"Columns {missing_in_source} not found in source_df")
    
    # Check columns_to_add exist in source_df
    missing_cols = [col for col in columns_to_add if col not in source_df.columns]
    if missing_cols:
        raise ValueError(f"Columns {missing_cols} not found in source_df")
    
    # Prepare source DataFrame
    source_work = source_df.copy()
    
    # Drop duplicates if requested
    if drop_duplicates:
        initial_count = len(source_work)
        source_work = source_work.drop_duplicates(subset=merge_on)
        if verbose and len(source_work) < initial_count:
            print(f"Dropped {initial_count - len(source_work)} duplicate rows from source_df")
    
    # Select only the columns we need from source
    source_cols = merge_on + columns_to_add
    source_subset = source_work[source_cols]
    
    # Determine if we need suffixes (check if columns already exist in target)
    existing_cols = [col for col in columns_to_add if col in target_df.columns]
    if existing_cols:
        # Use suffixes to handle conflicts
        suffixes = ('', '_from_source')
        if verbose:
            print(f"Columns {existing_cols} already exist in target_df. Using suffixes: {suffixes}")
    else:
        suffixes = ('', '')
    
    # Perform merge
    merged_df = target_df.merge(
        source_subset,
        on=merge_on,
        how=how,
        suffixes=suffixes
    )
    
    if merged_df.empty:
        if verbose:
            print(f"Warning: Merge resulted in empty DataFrame (how='{how}')")
        return merged_df
    
    # Handle column names after merge
    # If suffixes were used, rename columns to remove suffix
    final_columns_to_add = []
    for col in columns_to_add:
        if f"{col}_from_source" in merged_df.columns:
            # Use the version from source (drop the original if it exists)
            if col in merged_df.columns:
                merged_df = merged_df.drop(columns=[col])
            merged_df = merged_df.rename(columns={f"{col}_from_source": col})
            final_columns_to_add.append(col)
        elif col in merged_df.columns:
            final_columns_to_add.append(col)
    
    # Reorder columns: first column, then columns_to_add, then rest
    current_cols = list(merged_df.columns)
    first_col = current_cols[0]
    
    # Remove columns_to_add and first_col from the list
    remaining_cols = [c for c in current_cols 
                     if c not in final_columns_to_add and c != first_col]
    
    # Create new column order: first_col, columns_to_add, remaining_cols
    new_col_order = [first_col] + final_columns_to_add + remaining_cols
    
    # Reorder DataFrame
    result_df = merged_df[new_col_order].copy()
    
    if verbose:
        print(f"Merge completed: {len(target_df)} -> {len(result_df)} rows")
        print(f"Added columns: {final_columns_to_add}")
        print(f"Column order: {new_col_order[:5]}... (showing first 5)")
    
    return result_df


# Drop duplicates and merge protein information
peptides_raw = peptides_raw.drop_duplicates(subset=['proteinID', 'peptide'])

# Merge protein name and description into peptides_final_updated
peptides_final_updated = merge_protein_info(
    peptides_final_updated,
    peptides_raw,
    merge_on=['proteinID', 'peptide'],
    columns_to_add=['protein_name', 'protein_desc'],
    drop_duplicates=False,  # Already dropped above
    how='inner',
    verbose=True
)

print_critical_info(peptides_final_updated, "aftermerge")


# Compared Uniport list to MBPDB

Creates new df with peptides discoved in Uniport search and match with MBPDB on peptide and DOI

In [None]:
def match_peptides_with_mbpdb(
    peptides_df: pd.DataFrame,
    mbpdb_df: pd.DataFrame,
    match_column: str = 'peptide',
    verbose: bool = True
) -> Tuple[pd.DataFrame, Dict[str, int]]:
    """
    Match peptides DataFrame with MBPDB database based on peptide sequences.
    
    Filters rows from peptides_df where the peptide sequence matches those in mbpdb_df.
    Calculates statistics about matched and unmatched peptides, DOIs, and functions.
    
    Args:
        peptides_df: DataFrame containing peptide data with columns:
                    'peptide', 'doi', 'gpt_function' (at minimum)
        mbpdb_df: DataFrame containing MBPDB data with 'peptide' column.
        match_column: Column name to match on. Default: 'peptide'.
        verbose: If True, print summary statistics.
    
    Returns:
        Tuple containing:
            - matched_df: DataFrame with rows that matched MBPDB
            - stats_dict: Dictionary with matching statistics:
                - 'total_records': Total records in input DataFrame
                - 'matched_count': Number of matched rows
                - 'unique_peptides_not_matched': Unique peptides not matched
                - 'unique_doi_not_matched': Unique DOIs not matched
                - 'unique_functions_not_matched': Unique GPT functions not matched
    
    Raises:
        ValueError: If required columns are missing from DataFrames.
        TypeError: If inputs are not DataFrames.
    
    Example:
        >>> matched_df, stats = match_peptides_with_mbpdb(
        ...     peptides_final_with_refs_gpt_processed,
        ...     mbpdb_data,
        ...     match_column='peptide',
        ...     verbose=True
        ... )
        >>> print(f"Matched {stats['matched_count']} rows")
    """
    # Validate inputs
    if not isinstance(peptides_df, pd.DataFrame):
        raise TypeError("peptides_df must be a pandas DataFrame")
    
    if not isinstance(mbpdb_df, pd.DataFrame):
        raise TypeError("mbpdb_df must be a pandas DataFrame")
    
    if peptides_df.empty:
        if verbose:
            print("Warning: peptides_df is empty. Returning empty DataFrame.")
        return pd.DataFrame(), {
            'total_records': 0,
            'matched_count': 0,
            'unique_peptides_not_matched': 0,
            'unique_doi_not_matched': 0,
            'unique_functions_not_matched': 0
        }
    
    if mbpdb_df.empty:
        if verbose:
            print("Warning: mbpdb_df is empty. No matches possible.")
        return pd.DataFrame(), {
            'total_records': len(peptides_df),
            'matched_count': 0,
            'unique_peptides_not_matched': peptides_df['peptide'].nunique() if 'peptide' in peptides_df.columns else 0,
            'unique_doi_not_matched': peptides_df['doi'].nunique() if 'doi' in peptides_df.columns else 0,
            'unique_functions_not_matched': peptides_df['gpt_function'].nunique() if 'gpt_function' in peptides_df.columns else 0
        }
    
    # Check required columns
    if match_column not in peptides_df.columns:
        raise ValueError(f"Column '{match_column}' not found in peptides_df")
    
    if match_column not in mbpdb_df.columns:
        raise ValueError(f"Column '{match_column}' not found in mbpdb_df")
    
    # Filter rows where peptide matches those in mbpdb_data
    # Convert mbpdb_df column to dictionary with 'list' orientation for isin check
    mbpdb_values = mbpdb_df[[match_column]].to_dict(orient='list')
    
    # Use isin method to check if values are present in mbpdb_data
    matched_mask = peptides_df[[match_column]].isin(mbpdb_values).all(axis=1)
    matched_df = peptides_df[matched_mask].copy()
    
    # Calculate statistics for unmatched items
    unmatched_mask = ~matched_mask
    
    # Count unique unmatched peptides
    if 'peptide' in peptides_df.columns:
        unique_peptides_not_matched = peptides_df.loc[unmatched_mask, 'peptide'].nunique()
    else:
        unique_peptides_not_matched = 0
    
    # Count unique unmatched DOIs
    if 'doi' in peptides_df.columns:
        unique_doi_not_matched = peptides_df.loc[unmatched_mask, 'doi'].nunique()
    else:
        unique_doi_not_matched = 0
    
    # Count unique unmatched GPT functions
    if 'gpt_function' in peptides_df.columns:
        unique_functions_not_matched = peptides_df.loc[unmatched_mask, 'gpt_function'].nunique()
    else:
        unique_functions_not_matched = 0
    
    # Compile statistics
    stats = {
        'total_records': len(peptides_df),
        'matched_count': len(matched_df),
        'unique_peptides_not_matched': unique_peptides_not_matched,
        'unique_doi_not_matched': unique_doi_not_matched,
        'unique_functions_not_matched': unique_functions_not_matched
    }
    
    # Print summary statistics if verbose
    if verbose:
        print("=" * 60)
        print("MBPDB Matching Results")
        print("=" * 60)
        print(f"Number of records from UniProt/GPT search: {stats['total_records']}")
        print(f"Number of peptides matched with MBPDB: {stats['matched_count']}")
        print(f"Number of unique peptides not matched with MBPDB: {stats['unique_peptides_not_matched']}")
        print(f"Number of unique DOIs not matched with MBPDB: {stats['unique_doi_not_matched']}")
        print(f"Number of unique functions from GPT not matched with MBPDB: {stats['unique_functions_not_matched']}")
        print("=" * 60)
    
    return matched_df, stats


# Step 1: Create peptides_matched_mbpdb
peptides_matched_mbpdb, matching_stats = match_peptides_with_mbpdb(
    peptides_final_updated,
    mbpdb_data,
    match_column='peptide',
    verbose=True
)

# Display the resulting peptides_matched_mbpdb
peptides_matched_mbpdb.head(n=3)


Creates dataframe with peptides that did not match MBPDB

In [None]:
def get_unmatched_peptides(
    full_df: pd.DataFrame, 
    matched_df: pd.DataFrame, 
    match_on: str = 'index',
    verbose: bool = True
) -> pd.DataFrame:
    """
    Identify rows in full_df that are not present in matched_df.
    
    This function finds peptides that were not matched with MBPDB by filtering
    out rows that exist in the matched dataframe. Uses boolean indexing for
    robustness, avoiding index mismatch errors.
    
    Args:
        full_df: DataFrame containing all peptides (e.g., peptides_final_updated).
        matched_df: DataFrame containing matched peptides (e.g., peptides_matched_mbpdb).
        match_on: Method to match rows. Options:
            - 'index': Match by DataFrame index (default, fastest)
            - 'peptide': Match by 'peptide' column values
            - 'peptide_doi': Match by both 'peptide' and 'doi' column values
        verbose: If True, print summary statistics.
    
    Returns:
        DataFrame containing rows from full_df that are not in matched_df.
    
    Example:
        >>> unmatched = get_unmatched_peptides(
        ...     peptides_final_updated,
        ...     peptides_matched_mbpdb,
        ...     match_on='index'
        ... )
    """
    if full_df.empty:
        if verbose:
            print("Warning: full_df is empty. Returning empty DataFrame.")
        return pd.DataFrame()
    
    if matched_df.empty:
        if verbose:
            print("Warning: matched_df is empty. Returning full_df.")
        return full_df.copy()
    
    # Match based on specified method
    if match_on == 'index':
        # Use boolean indexing to avoid index mismatch errors
        mask = ~full_df.index.isin(matched_df.index)
        unmatched_df = full_df[mask].copy()
    elif match_on == 'peptide':
        if 'peptide' not in full_df.columns:
            raise ValueError("'peptide' column not found in full_df")
        mask = ~full_df['peptide'].isin(matched_df['peptide'])
        unmatched_df = full_df[mask].copy()
    elif match_on == 'peptide_doi':
        if 'peptide' not in full_df.columns or 'doi' not in full_df.columns:
            raise ValueError("'peptide' and/or 'doi' columns not found in full_df")
        # Create composite key for matching
        full_keys = full_df['peptide'].astype(str) + '_' + full_df['doi'].astype(str)
        matched_keys = matched_df['peptide'].astype(str) + '_' + matched_df['doi'].astype(str)
        mask = ~full_keys.isin(matched_keys)
        unmatched_df = full_df[mask].copy()
    else:
        raise ValueError(f"Invalid match_on value: {match_on}. Must be 'index', 'peptide', or 'peptide_doi'")
    
    if verbose:
        print(f"Number of unique peptides w/o MBPDB match: {unmatched_df.shape[0]}")
        print(f"Original peptides: {full_df.shape[0]}, Matched: {matched_df.shape[0]}, Unmatched: {unmatched_df.shape[0]}")
    
    return unmatched_df


# Step 2: Create peptides_not_matched_mbpdb
# Get peptides that didn't match with MBPDB based on index
peptides_not_matched_mbpdb = get_unmatched_peptides(
    peptides_final_updated,
    peptides_matched_mbpdb,
    match_on='index',
    verbose=True
)

# Display peptides_not_matched_mbpdb
peptides_not_matched_mbpdb


# Export final dataframe to csv



In [None]:
# Simplified: Export peptides_final_updated, drop index, and rename file

# Ensure output directory exists
output_dir = Path(DATA_DIR)
output_dir.mkdir(parents=True, exist_ok=True)

# Generate timestamp for filename (format: YYYYMMDD_HHMMSS)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Rename DataFrame for export (no index column) with timestamp
protein_id = "382_human_proteins_"
output_filename = f"{protein_id}novel_peptides_for_addition_to_MBPDB_{timestamp}.csv"
output_path = output_dir / output_filename

# Export to CSV, dropping index
peptides_final_updated.to_csv(output_path, index=False)

logger.info(f"Exported {len(peptides_final_updated)} novel peptides to {output_path}")