In [1]:
# Import the protein search functions
from notebook.protein_search import *


In [2]:
# Example 1: Search by protein name
print("=== Searching for insulin structures ===")
insulin_results = search_structures(
    query="insulin",
    max_resolution=2.0,  # High quality only
    limit=5
)
print(f"Found {insulin_results['total_count']} insulin structures")
print(f"Top 5 PDB IDs: {insulin_results['pdb_ids']}")
print(f"Scores: {insulin_results['scores']}")


=== Searching for insulin structures ===
Found 4435 insulin structures
Top 5 PDB IDs: ['6RLX', '5JYQ', '2WFU', '2WFV', '1ZT3']
Scores: {'6RLX': 0.9941523690322508, '5JYQ': 0.9940683158491971, '2WFU': 0.9910523909666406, '2WFV': 0.9910523909666406, '1ZT3': 0.988116276326907}


In [3]:
# Example 2: Search by organism and method
print("\n=== Human proteins solved by X-ray crystallography ===")
human_xray = search_structures(
    organism="Homo sapiens",
    method="X-RAY DIFFRACTION",
    max_resolution=1.5,  # Very high resolution
    limit=10
)
print(f"Found {human_xray['returned_count']} high-resolution human structures")
print(f"PDB IDs: {human_xray['pdb_ids'][:5]}")



=== Human proteins solved by X-ray crystallography ===
Found 10 high-resolution human structures
PDB IDs: ['1A4I', '1A7S', '1AAP', '1AIE', '1B9O']


In [10]:
# Example: Search for structures similar to a given sequence
# Small insulin A-chain sequence for demonstration
insulin_sequence = "METPAQLSFLLLLLWLPDTTGSPIPLPETTEDTYRMLLAHNLLAHATRAIVVDPNDNSTQNLGYKFSQNVQELLRELDKIQNVLSLLSRIQHLPRILAKRQRPVKLLARSQNLTGLLDFTGKFTPNVARYFYNGTSMACETFQYGGCMGNGNNFVTEKECLQTCRTVAAERPGVNLLSDWQSQRALMPFSSCHDGYTLNNQCCRPGWQRPANPTANLNRHLKECCEVEHDTPANCLTPEAAEAPLVPVGSSDARWTPIPPGVQSQVSVSVGVHTATQTAPGAVAPLGPLSTPHCSROUGQYTNKQDRGPVLPSPALMPLVGVQTLTCGGESCGPLPEAAEAPLVPVGSSDARWTPIPPGVQSQVSVSVGVHTATQTAPGAVAPLGPLSTPHCSROUGQYTNKQDRGPVLPSPALMPLVGVQTLTCGGESCGPLP"  # Human haptoglobin sequence

print("=== Searching by sequence similarity ===")
seq_results = search_by_sequence(
    sequence=insulin_sequence,
    sequence_type="protein",
    identity_cutoff=0.3,  # 30% identity minimum
    max_resolution=2.5,
    limit=10
)
print(f"Found {seq_results['total_count']} structures with similar sequences")
print(f"Best matches: {seq_results['pdb_ids'][:3]}")
print(f"Similarity scores: {[seq_results['scores'][pdb] for pdb in seq_results['pdb_ids'][:3]]}")


=== Searching by sequence similarity ===
Found 123 structures with similar sequences
Best matches: ['1BIK', '1T8N', '3BTT']
Similarity scores: [1.5, 0.7692307692307692, 0.7692307692307692]


In [11]:
# Example: Find structures similar to insulin (4INS)
print("=== Searching for structures similar to 4INS (insulin) ===")
struct_results = search_by_structure(
    reference_pdb_ids="4INS",
    match_type="relaxed",  # Allow some flexibility
    limit=10
)
print(f"Found {struct_results['total_count']} structurally similar entries")
print(f"Similar structures: {struct_results['pdb_ids'][:5]}")
print(f"Shape similarity scores: {[struct_results['scores'][pdb] for pdb in struct_results['pdb_ids'][:3]]}")


=== Searching for structures similar to 4INS (insulin) ===
Found 1712 structurally similar entries
Similar structures: ['4INS', '3INS', '4E7T', '4M4M', '1IZB']
Shape similarity scores: [1.0, 0.8914623662209674, 0.8243856490859911]


In [12]:
# Example: Search by ligand name
print("=== Structures containing ATP ===")
atp_results = search_by_chemical(
    ligand_name="ATP",
    max_resolution=2.0,
    limit=8
)
print(f"Found {atp_results['total_count']} structures with ATP")
print(f"ATP-bound structures: {atp_results['pdb_ids'][:5]}")


=== Structures containing ATP ===
Found 3 structures with ATP
ATP-bound structures: ['6FCW', '5UBG', '6R5E']


In [13]:
# Example: Get high-quality structures
print("=== High-quality structures (strict criteria) ===")
hq_results = get_high_quality_structures(
    max_resolution=1.5,  # Very high resolution
    max_r_work=0.20,     # Good refinement
    max_r_free=0.25,     # Good validation
    min_year=2015,       # Recent structures
    limit=10
)
print(f"Found {hq_results['total_count']} high-quality structures")
print(f"Best quality PDBs: {hq_results['pdb_ids'][:5]}")
print("These structures meet strict quality criteria for detailed analysis")


=== High-quality structures (strict criteria) ===
Found 14944 high-quality structures
Best quality PDBs: ['5D8V', '5NW3', '7ATG', '3X2M', '6ANM']
These structures meet strict quality criteria for detailed analysis


In [14]:
# Example: Get detailed information for specific structures
print("=== Detailed information for insulin structure (4INS) ===")
details = get_structure_details(["4INS"], include_assembly=True)

pdb_info = details["4INS"]
print(f"Title: {pdb_info['title']}")
print(f"Method: {pdb_info['method']}")
print(f"Resolution: {pdb_info['resolution_A']} Å")
print(f"Space group: {pdb_info['space_group']}")
print(f"Number of entities: {len(pdb_info['entities'])}")
print(f"Ligands: {pdb_info['ligands']}")

print("\nEntity details:")
for entity in pdb_info['entities']:
    print(f"  - {entity['description']} ({entity['sequence_length']} residues, {entity['organism']})")


=== Detailed information for insulin structure (4INS) ===
Title: THE STRUCTURE OF 2ZN PIG INSULIN CRYSTALS AT 1.5 ANGSTROMS RESOLUTION
Method: X-RAY DIFFRACTION
Resolution: 1.5 Å
Space group: H 3
Number of entities: 2
Ligands: ['ZN']

Entity details:
  - INSULIN (CHAIN A) (21 residues, Sus scrofa)
  - INSULIN (CHAIN B) (30 residues, Sus scrofa)


In [15]:
# Example: Extract sequences from structures
print("=== Extracting sequences ===")
sequences = get_sequences(["4INS", "1ZNI"])  # Insulin structures

for seq_key, seq_data in sequences.items():
    if 'error' not in seq_data:
        print(f"{seq_key}:")
        print(f"  Type: {seq_data['type']}")
        print(f"  Length: {seq_data['length']} residues")
        print(f"  Sequence: {seq_data['sequence'][:50]}..." if seq_data['sequence'] else "  No sequence available")
        print()


=== Extracting sequences ===
1ZNI_1:
  Type: Protein
  Length: 21 residues
  Sequence: GIVEQCCTSICSLYQLENYCN...

4INS_1:
  Type: Protein
  Length: 21 residues
  Sequence: GIVEQCCTSICSLYQLENYCN...



In [16]:
# Example: Compare insulin structures
print("=== Comparing insulin structures ===")
comparison = compare_structures(
    pdb_ids=["4INS", "1ZNI", "3W11"],  # Different insulin structures
    comparison_type="sequence"
)

print(f"Comparing: {comparison['pdb_ids']}")
print("\nPairwise sequence identities:")
for pair, data in comparison['comparisons'].items():
    print(f"  {pair}: {data['sequence_identity']:.1%} identity, length diff: {data['length_difference']}")


=== Comparing insulin structures ===
Comparing: ['4INS', '1ZNI', '3W11']

Pairwise sequence identities:
  4INS_1ZNI: 100.0% identity, length diff: 0
  4INS_3W11: 100.0% identity, length diff: 0
  1ZNI_3W11: 100.0% identity, length diff: 0


In [17]:
# Example: Analyze interactions in structures
print("=== Analyzing molecular interactions ===")
interactions = analyze_interactions(
    pdb_ids=["4INS", "1HVH"],  # Insulin and HIV protease
    interaction_type="all"
)

for pdb_id, data in interactions.items():
    print(f"\n{pdb_id}:")
    print(f"  Protein chains: {data['protein_chains']}")
    print(f"  Ligands: {len(data['ligands'])} present")
    print(f"  Interactions found:")
    for interaction in data['interactions']:
        print(f"    - {interaction['type']}: {interaction.get('description', 'Present')}")
    
    if 'quaternary_structure' in data:
        qs = data['quaternary_structure']
        print(f"  Quaternary structure: {qs.get('oligomeric_state', 'Unknown')}")


=== Analyzing molecular interactions ===

4INS:
  Protein chains: []
  Ligands: 1 present
  Interactions found:
    - protein-ligand: Present
  Quaternary structure: dimeric

1HVH:
  Protein chains: []
  Ligands: 0 present
  Interactions found:
  Quaternary structure: dimeric


In [18]:
# Example: Get comprehensive structural summaries
print("=== Comprehensive structural summaries ===")
summaries = get_structural_summary(
    pdb_ids=["4INS", "1HVH"],
    include_quality_metrics=True
)

for pdb_id, summary in summaries.items():
    print(f"\n=== {pdb_id} SUMMARY ===")
    print(f"Title: {summary['title']}")
    
    exp = summary['experimental']
    print(f"Method: {exp['method']} at {exp['resolution_A']} Å resolution")
    print(f"Deposited: {exp['deposition_date']}")
    
    comp = summary['composition']
    print(f"Composition: {comp['protein_entities']} proteins, {comp['ligands']} ligands")
    print(f"Organisms: {summary.get('organisms', ['Unknown'])}")
    
    rel = summary['research_relevance']
    relevance = []
    if rel['has_ligands']: relevance.append("drug target")
    if rel['is_complex']: relevance.append("protein complex")
    if rel['high_resolution']: relevance.append("high resolution")
    print(f"Research relevance: {', '.join(relevance) if relevance else 'basic structure'}")
    
    if 'quality' in summary:
        print(f"Quality assessment: {summary['quality']['quality_score']}")


=== Comprehensive structural summaries ===

=== 4INS SUMMARY ===
Title: THE STRUCTURE OF 2ZN PIG INSULIN CRYSTALS AT 1.5 ANGSTROMS RESOLUTION
Method: X-RAY DIFFRACTION at 1.5 Å resolution
Deposited: None
Composition: 0 proteins, 1 ligands
Organisms: ['Sus scrofa']
Research relevance: drug target, protein complex, high resolution
Quality assessment: Good (very good resolution)

=== 1HVH SUMMARY ===
Title: NONPEPTIDE CYCLIC CYANOGUANIDINES AS HIV PROTEASE INHIBITORS
Method: X-RAY DIFFRACTION at 1.8 Å resolution
Deposited: None
Composition: 0 proteins, 0 ligands
Organisms: ['Human immunodeficiency virus 1']
Research relevance: high resolution
Quality assessment: Good (very good resolution, good R-work)


In [19]:
# Example: Complete workflow for drug discovery research
print("=== COMPLETE WORKFLOW EXAMPLE: Drug Discovery ===")

# Step 1: Find structures with specific ligand
print("\n1. Finding ATP-binding proteins...")
atp_structures = search_by_chemical(ligand_name="ATP", max_resolution=2.0, limit=3)
print(f"Found {len(atp_structures['pdb_ids'])} ATP-binding structures")

# Step 2: Get detailed information
print("\n2. Getting detailed structure information...")
structure_details = get_structure_details(atp_structures['pdb_ids'][:2])

# Step 3: Analyze interactions
print("\n3. Analyzing molecular interactions...")
interaction_data = analyze_interactions(atp_structures['pdb_ids'][:2])

# Step 4: Get comprehensive summary
print("\n4. Generating research summary...")
research_summary = get_structural_summary(atp_structures['pdb_ids'][:2])

print("\n=== WORKFLOW RESULTS ===")
for pdb_id in atp_structures['pdb_ids'][:2]:
    print(f"\n{pdb_id}:")
    summary = research_summary.get(pdb_id, {})
    print(f"  Title: {summary.get('title', 'Unknown')}")
    print(f"  Resolution: {summary.get('experimental', {}).get('resolution_A', 'Unknown')} Å")
    
    interactions = interaction_data.get(pdb_id, {})
    print(f"  Protein chains: {len(interactions.get('protein_chains', []))}")
    print(f"  Ligands present: {len(interactions.get('ligands', []))}")
    
    print("  Research potential:", end=" ")
    rel = summary.get('research_relevance', {})
    if rel.get('has_ligands'): print("Drug target", end=" ")
    if rel.get('high_resolution'): print("High-res structure", end=" ")
    print()

print("\n=== This workflow helps identify promising drug targets! ===")


=== COMPLETE WORKFLOW EXAMPLE: Drug Discovery ===

1. Finding ATP-binding proteins...
Found 3 ATP-binding structures

2. Getting detailed structure information...

3. Analyzing molecular interactions...

4. Generating research summary...

=== WORKFLOW RESULTS ===

6FCW:
  Title: Catalytic subunit HisG from Psychrobacter arcticus ATP phosphoribosyltransferase (HisZG ATPPRT) in complex with PRATP
  Resolution: 2.0 Å
  Protein chains: 0
  Ligands present: 2
  Research potential: Drug target 

5UBG:
  Title: Catalytic core domain of Adenosine triphosphate phosphoribosyltransferase from Campylobacter jejuni with bound Phosphoribosyl-ATP
  Resolution: 1.9 Å
  Protein chains: 0
  Ligands present: 2
  Research potential: Drug target High-res structure 

=== This workflow helps identify promising drug targets! ===
