# AlphaFold Model

In [None]:
import requests
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Optional, Tuple
import tempfile
import os
import json

ALPHAFOLD_API_BASE = "https://alphafold.ebi.ac.uk/api"
ALPHAFOLD_FILES_BASE = "https://alphafold.ebi.ac.uk/files"

HIGH_CONFIDENCE_PROTEIN = {
    'name': 'Human Ubiquitin',
    'uniprot_id': 'P0CG48',
    'expected_mean_plddt': '>90',
}

LOW_CONFIDENCE_PROTEIN = {
    'name': 'Human Alpha-Synuclein',
    'uniprot_id': 'P37840',
    'expected_mean_plddt': '<70',
}

def get_alphafold_metadata(uniprot_id: str):
    url = f"{ALPHAFOLD_API_BASE}/prediction/{uniprot_id}"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()
    return data[0] if isinstance(data, list) else data

def download_alphafold_structure(uniprot_id: str, version: int = 4):
    pdb_filename = f"AF-{uniprot_id}-F1-model_v{version}.pdb"
    pdb_url = f"{ALPHAFOLD_FILES_BASE}/{pdb_filename}"
    response = requests.get(pdb_url, timeout=60)
    response.raise_for_status()
    temp_dir = tempfile.gettempdir()
    local_pdb_path = os.path.join(temp_dir, pdb_filename)
    with open(local_pdb_path, 'w') as f:
        f.write(response.text)
    return local_pdb_path

def extract_plddt_from_pdb(pdb_file: str):
    plddt_scores = []
    residue_numbers = []
    with open(pdb_file, 'r') as f:
        for line in f:
            if line.startswith('ATOM') and ' CA ' in line:
                res_num = int(line[22:26].strip())
                bfactor = float(line[60:66].strip())
                plddt_scores.append(bfactor)
                residue_numbers.append(res_num)
    return np.array(plddt_scores), residue_numbers

def calculate_plddt_statistics(plddt: np.ndarray):
    """Calculate statistics - all values as Python native types for JSON"""
    stats = {
        'mean': float(np.mean(plddt)),
        'median': float(np.median(plddt)),
        'std': float(np.std(plddt)),
        'min': float(np.min(plddt)),
        'max': float(np.max(plddt)),
        'q25': float(np.percentile(plddt, 25)),
        'q75': float(np.percentile(plddt, 75)),
        'very_high_conf': float(np.sum(plddt > 90) / len(plddt) * 100),
        'high_conf': float(np.sum((plddt > 70) & (plddt <= 90)) / len(plddt) * 100),
        'medium_conf': float(np.sum((plddt > 50) & (plddt <= 70)) / len(plddt) * 100),
        'low_conf': float(np.sum(plddt <= 50) / len(plddt) * 100),
    }
    return stats

def analyze_protein(protein_info):
    print(f"\nüî¨ Analyzing: {protein_info['name']}")
    
    metadata = get_alphafold_metadata(protein_info['uniprot_id'])
    version = metadata.get('latestVersion', 4)
    pdb_file = download_alphafold_structure(protein_info['uniprot_id'], version)
    plddt, residue_numbers = extract_plddt_from_pdb(pdb_file)
    stats = calculate_plddt_statistics(plddt)
    
    confidence = 'VERY HIGH' if stats['mean'] > 90 else 'HIGH' if stats['mean'] > 70 else 'MEDIUM' if stats['mean'] > 50 else 'LOW'
    
    print(f"   Mean pLDDT: {stats['mean']:.1f}")
    print(f"   Confidence: {confidence}")
    
    return {
        'protein_info': protein_info,
        'plddt': plddt,
        'residue_numbers': residue_numbers,
        'statistics': stats,
        'confidence': confidence,
    }

# Run analysis
print("üöÄ Starting AlphaFold Analysis")
high_conf = analyze_protein(HIGH_CONFIDENCE_PROTEIN)
low_conf = analyze_protein(LOW_CONFIDENCE_PROTEIN)

# Results
delta = float(high_conf['statistics']['mean'] - low_conf['statistics']['mean'])
print(f"\nüìä Results:")
print(f"   Œî pLDDT: {delta:.1f} points")

# Save JSON
results = {
    'high_confidence': {
        'name': high_conf['protein_info']['name'],
        'uniprot_id': high_conf['protein_info']['uniprot_id'],
        'statistics': high_conf['statistics'],
        'confidence': high_conf['confidence'],
    },
    'low_confidence': {
        'name': low_conf['protein_info']['name'],
        'uniprot_id': low_conf['protein_info']['uniprot_id'],
        'statistics': low_conf['statistics'],
        'confidence': low_conf['confidence'],
    },
    'comparison': {'delta_mean_plddt': delta}
}

with open('alphafold_results.json', 'w') as f:
    json.dump(results, f, indent=2)

print(f"\n‚úÖ Done! Results saved to alphafold_results.json")

üöÄ Starting AlphaFold Analysis

üî¨ Analyzing: Human Ubiquitin
   Mean pLDDT: 88.6
   Confidence: HIGH

üî¨ Analyzing: Human Alpha-Synuclein


In [10]:
#!/usr/bin/env python3
"""
AlphaFold Database API - Protein Structure Confidence Comparison
Comparing a well-folded protein vs an intrinsically disordered protein (IDP)
"""

import requests
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Optional, Tuple
import tempfile
import os
import json

# ============================================================================
# AlphaFold Database Configuration
# ============================================================================

ALPHAFOLD_API_BASE = "https://alphafold.ebi.ac.uk/api"
ALPHAFOLD_FILES_BASE = "https://alphafold.ebi.ac.uk/files"

# ============================================================================
# Protein Targets - HIGH CONFIDENCE vs LOW CONFIDENCE
# ============================================================================

HIGH_CONFIDENCE_PROTEIN = {
    'name': 'Human Ubiquitin',
    'uniprot_id': 'P0CG48',
    'length': 76,
    'description': 'Small, highly structured protein - gold standard for protein folding',
    'expected_mean_plddt': '>90',
    'characteristics': [
        '‚úÖ Compact globular structure',
        '‚úÖ Very stable Œ≤-grasp fold',
        '‚úÖ Well-characterized by X-ray/NMR',
        '‚úÖ High sequence conservation',
        '‚úÖ No disordered regions',
        '‚úÖ Essential cellular protein',
        '‚úÖ Gold standard for folding studies'
    ]
}

LOW_CONFIDENCE_PROTEIN = {
    'name': 'Human Alpha-Synuclein',
    'uniprot_id': 'P37840',
    'length': 140,
    'description': 'Intrinsically Disordered Protein (IDP) - Parkinson\'s disease associated',
    'expected_mean_plddt': '<70',
    'characteristics': [
        '‚ö†Ô∏è Intrinsically disordered protein (IDP)',
        '‚ö†Ô∏è Lacks stable 3D structure in solution',
        '‚ö†Ô∏è Associated with Parkinson\'s disease',
        '‚ö†Ô∏è Forms amyloid fibrils',
        '‚ö†Ô∏è Multiple conformational states',
        '‚ö†Ô∏è Low sequence complexity in regions',
        '‚ö†Ô∏è Difficult for structure prediction',
        '‚ö†Ô∏è Biologically active in disordered state'
    ]
}

print("="*80)
print("üß¨ ALPHAFOLD DATABASE - PROTEIN CONFIDENCE COMPARISON")
print("="*80)
print(f"\nüìä Comparing Two Extreme Cases:")
print(f"\n1Ô∏è‚É£  HIGH CONFIDENCE PROTEIN:")
print(f"    Protein: {HIGH_CONFIDENCE_PROTEIN['name']}")
print(f"    UniProt: {HIGH_CONFIDENCE_PROTEIN['uniprot_id']}")
print(f"    Length: {HIGH_CONFIDENCE_PROTEIN['length']} residues")
print(f"    Type: Well-folded globular protein")
print(f"\n2Ô∏è‚É£  LOW CONFIDENCE PROTEIN:")
print(f"    Protein: {LOW_CONFIDENCE_PROTEIN['name']}")
print(f"    UniProt: {LOW_CONFIDENCE_PROTEIN['uniprot_id']}")
print(f"    Length: {LOW_CONFIDENCE_PROTEIN['length']} residues")
print(f"    Type: Intrinsically Disordered Protein (IDP)")
print("="*80)

# ============================================================================
# AlphaFold API Functions
# ============================================================================

def get_alphafold_metadata(uniprot_id: str) -> Optional[Dict]:
    """
    Get prediction metadata from AlphaFold Database API
    
    Args:
        uniprot_id: UniProt accession ID
        
    Returns:
        Metadata dictionary or None if error
    """
    url = f"{ALPHAFOLD_API_BASE}/prediction/{uniprot_id}"
    
    print(f"\nüîç Querying AlphaFold Database API...")
    print(f"   URL: {url}")
    
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        
        data = response.json()
        
        # API returns a list
        if isinstance(data, list) and len(data) > 0:
            metadata = data[0]
        else:
            metadata = data
        
        print(f"‚úÖ AlphaFold metadata retrieved!")
        print(f"\nüìä Prediction Information:")
        print(f"   UniProt ID: {metadata.get('uniprotAccession', 'N/A')}")
        print(f"   Gene: {metadata.get('gene', 'N/A')}")
        print(f"   Organism: {metadata.get('organismScientificName', 'N/A')}")
        print(f"   Length: {metadata.get('uniprotEnd', 'N/A')} residues")
        print(f"   Model version: {metadata.get('latestVersion', 'N/A')}")
        print(f"   Model created: {metadata.get('modelCreatedDate', 'N/A')}")
        
        return metadata
        
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 404:
            print(f"‚ùå Protein not found in AlphaFold Database")
            print(f"   UniProt ID {uniprot_id} may not have a prediction")
        else:
            print(f"‚ùå API error: {e}")
        return None
    except Exception as e:
        print(f"‚ùå Error: {e}")
        return None


def download_alphafold_structure(uniprot_id: str, version: int = 4) -> Optional[str]:
    """
    Download AlphaFold predicted structure (PDB format)
    
    Args:
        uniprot_id: UniProt accession ID
        version: AlphaFold model version
        
    Returns:
        Path to downloaded PDB file or None if error
    """
    pdb_filename = f"AF-{uniprot_id}-F1-model_v{version}.pdb"
    pdb_url = f"{ALPHAFOLD_FILES_BASE}/{pdb_filename}"
    
    print(f"\nüì• Downloading AlphaFold structure...")
    print(f"   URL: {pdb_url}")
    
    try:
        response = requests.get(pdb_url, timeout=60)
        response.raise_for_status()
        
        # Save to temp file
        temp_dir = tempfile.gettempdir()
        local_pdb_path = os.path.join(temp_dir, pdb_filename)
        
        with open(local_pdb_path, 'w') as f:
            f.write(response.text)
        
        print(f"‚úÖ Structure downloaded!")
        print(f"   Saved to: {local_pdb_path}")
        print(f"   Size: {len(response.text)} bytes")
        
        return local_pdb_path
        
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 404:
            print(f"‚ùå Structure file not found")
            print(f"   Try different version or check UniProt ID")
        else:
            print(f"‚ùå Download error: {e}")
        return None
    except Exception as e:
        print(f"‚ùå Error: {e}")
        return None


def extract_plddt_from_pdb(pdb_file: str) -> Optional[Tuple[np.ndarray, List[int]]]:
    """
    Extract all pLDDT scores from AlphaFold PDB file
    AlphaFold stores pLDDT in the B-factor column
    
    Args:
        pdb_file: Path to PDB file
        
    Returns:
        Tuple of (plddt_scores array, residue_numbers list) or None if error
    """
    plddt_scores = []
    residue_numbers = []
    
    try:
        with open(pdb_file, 'r') as f:
            for line in f:
                if line.startswith('ATOM'):
                    # Only process CA atoms
                    if ' CA ' in line:
                        # Extract residue number (columns 23-26)
                        res_num = int(line[22:26].strip())
                        # B-factor is in columns 60-66 (contains pLDDT)
                        bfactor = float(line[60:66].strip())
                        
                        plddt_scores.append(bfactor)
                        residue_numbers.append(res_num)
        
        return np.array(plddt_scores), residue_numbers
    
    except Exception as e:
        print(f"‚ùå Error parsing PDB file: {e}")
        return None


def calculate_plddt_statistics(plddt: np.ndarray) -> Dict:
    """
    Calculate comprehensive statistics for pLDDT scores
    
    Args:
        plddt: Array of pLDDT scores
        
    Returns:
        Dictionary of statistics
    """
    stats = {
        'mean': float(np.mean(plddt)),  # Python float
        'very_high_conf': float(np.sum(...) / len(...) * 100)  # Python float
    }
    
    # Confidence level distribution (AlphaFold categories)
    stats['very_high_conf'] = np.sum(plddt > 90) / len(plddt) * 100  # > 90
    stats['high_conf'] = np.sum((plddt > 70) & (plddt <= 90)) / len(plddt) * 100  # 70-90
    stats['medium_conf'] = np.sum((plddt > 50) & (plddt <= 70)) / len(plddt) * 100  # 50-70
    stats['low_conf'] = np.sum(plddt <= 50) / len(plddt) * 100  # < 50
    
    return stats


def determine_confidence_level(mean_plddt: float) -> str:
    """Determine overall confidence level from mean pLDDT"""
    if mean_plddt > 90:
        return 'VERY HIGH'
    elif mean_plddt > 70:
        return 'HIGH'
    elif mean_plddt > 50:
        return 'MEDIUM'
    else:
        return 'LOW'


def analyze_protein(protein_info: Dict) -> Optional[Dict]:
    """
    Complete analysis pipeline for a single protein
    
    Args:
        protein_info: Dictionary with protein information
        
    Returns:
        Analysis results dictionary or None if error
    """
    uniprot_id = protein_info['uniprot_id']
    
    print("\n" + "="*80)
    print(f"üî¨ ANALYZING: {protein_info['name']}")
    print("="*80)
    print(f"UniProt ID: {uniprot_id}")
    print(f"Description: {protein_info['description']}")
    print(f"Expected mean pLDDT: {protein_info['expected_mean_plddt']}")
    
    # Step 1: Get metadata
    metadata = get_alphafold_metadata(uniprot_id)
    if metadata is None:
        return None
    
    # Step 2: Download structure
    model_version = metadata.get('latestVersion', 4)
    pdb_file = download_alphafold_structure(uniprot_id, version=model_version)
    
    # Try alternative versions if needed
    if pdb_file is None:
        print("\nüîÑ Trying alternative versions...")
        for version in [4, 3, 2, 1]:
            pdb_file = download_alphafold_structure(uniprot_id, version=version)
            if pdb_file is not None:
                break
    
    if pdb_file is None:
        return None
    
    # Step 3: Extract pLDDT scores
    print("\nüìä Extracting pLDDT scores...")
    result = extract_plddt_from_pdb(pdb_file)
    if result is None:
        return None
    
    plddt, residue_numbers = result
    
    # Step 4: Calculate statistics
    print("üìà Calculating statistics...")
    stats = calculate_plddt_statistics(plddt)
    
    # Step 5: Determine confidence
    confidence = determine_confidence_level(stats['mean'])
    
    # Check if matches expectation
    expected = protein_info['expected_mean_plddt']
    if '>' in expected:
        threshold = float(expected.replace('>', ''))
        matches = bool(stats['mean'] > threshold)  # Python bool
    else:
        threshold = float(expected.replace('<', ''))
        matches = stats['mean'] < threshold
    
    # Print results
    print(f"\n‚úÖ Analysis Complete!")
    print(f"\nüìä PLDDT STATISTICS:")
    print(f"   Mean:     {stats['mean']:.1f}")
    print(f"   Median:   {stats['median']:.1f}")
    print(f"   Std Dev:  {stats['std']:.1f}")
    print(f"   Range:    {stats['min']:.1f} - {stats['max']:.1f}")
    print(f"   Q25-Q75:  {stats['q25']:.1f} - {stats['q75']:.1f}")
    print(f"\nüìà CONFIDENCE DISTRIBUTION:")
    print(f"   Very High (>90):  {stats['very_high_conf']:.1f}%")
    print(f"   High (70-90):     {stats['high_conf']:.1f}%")
    print(f"   Medium (50-70):   {stats['medium_conf']:.1f}%")
    print(f"   Low (<50):        {stats['low_conf']:.1f}%")
    print(f"\nüéØ OVERALL CONFIDENCE: {confidence}")
    print(f"   Matches Expectation: {'‚úÖ YES' if matches else '‚ö†Ô∏è  NO'}")
    
    return {
        'protein_info': protein_info,
        'metadata': metadata,
        'plddt': plddt,
        'residue_numbers': residue_numbers,
        'statistics': stats,
        'confidence': confidence,
        'matches_expectation': matches,
        'pdb_file': pdb_file
    }


def compare_proteins(high_conf_result: Dict, low_conf_result: Dict):
    """
    Compare two protein analysis results and generate visualizations
    
    Args:
        high_conf_result: Results from high confidence protein
        low_conf_result: Results from low confidence protein
    """
    print("\n" + "="*80)
    print("üìä PROTEIN COMPARISON ANALYSIS")
    print("="*80)
    
    hc_stats = high_conf_result['statistics']
    lc_stats = low_conf_result['statistics']
    
    delta_mean = hc_stats['mean'] - lc_stats['mean']
    delta_std = hc_stats['std'] - lc_stats['std']
    
    print(f"\n1Ô∏è‚É£  HIGH CONFIDENCE: {high_conf_result['protein_info']['name']}")
    print(f"    Mean pLDDT: {hc_stats['mean']:.1f} ¬± {hc_stats['std']:.1f}")
    print(f"    Confidence: {high_conf_result['confidence']}")
    print(f"    Very High Regions: {hc_stats['very_high_conf']:.1f}%")
    
    print(f"\n2Ô∏è‚É£  LOW CONFIDENCE: {low_conf_result['protein_info']['name']}")
    print(f"    Mean pLDDT: {lc_stats['mean']:.1f} ¬± {lc_stats['std']:.1f}")
    print(f"    Confidence: {low_conf_result['confidence']}")
    print(f"    Very High Regions: {lc_stats['very_high_conf']:.1f}%")
    
    print(f"\nüìà KEY DIFFERENCES:")
    print(f"    Œî Mean pLDDT: {delta_mean:.1f} points")
    print(f"    Œî Std Dev: {delta_std:.1f} points")
    print(f"    Mean Ratio: {hc_stats['mean']/lc_stats['mean']:.2f}x")
    
    print(f"\nüí° INTERPRETATION:")
    if delta_mean > 20:
        print(f"    ‚úÖ AlphaFold shows STRONG discrimination between proteins")
        print(f"    ‚úÖ High confidence protein is well-structured")
        print(f"    ‚úÖ Low confidence protein has significant disorder")
    elif delta_mean > 10:
        print(f"    ‚ö†Ô∏è  AlphaFold shows MODERATE discrimination")
    else:
        print(f"    ‚ö†Ô∏è  AlphaFold shows LIMITED discrimination")
    
    print(f"\nüéØ QUANTUM COMPUTING IMPLICATIONS:")
    print(f"    ‚Ä¢ Low confidence regions (pLDDT < 70) are ideal for quantum sampling")
    print(f"    ‚Ä¢ {low_conf_result['protein_info']['name']}: {lc_stats['medium_conf'] + lc_stats['low_conf']:.1f}% amenable to quantum methods")
    print(f"    ‚Ä¢ {high_conf_result['protein_info']['name']}: {hc_stats['medium_conf'] + hc_stats['low_conf']:.1f}% amenable to quantum methods")
    
    # Generate visualizations
    print("\nüìä Generating comparison plots...")
    create_comparison_plots(high_conf_result, low_conf_result)


def create_comparison_plots(high_conf_result: Dict, low_conf_result: Dict):
    """
    Create comprehensive comparison visualizations
    
    Args:
        high_conf_result: Results from high confidence protein
        low_conf_result: Results from low confidence protein
    """
    fig = plt.figure(figsize=(16, 10))
    
    hc_plddt = high_conf_result['plddt']
    hc_res = high_conf_result['residue_numbers']
    hc_name = high_conf_result['protein_info']['name']
    
    lc_plddt = low_conf_result['plddt']
    lc_res = low_conf_result['residue_numbers']
    lc_name = low_conf_result['protein_info']['name']
    
    # Plot 1: pLDDT along sequence - High Confidence
    ax1 = plt.subplot(3, 2, 1)
    ax1.plot(hc_res, hc_plddt, linewidth=2, color='#2E86AB')
    ax1.axhline(y=90, color='green', linestyle='--', alpha=0.5, label='Very High (90)')
    ax1.axhline(y=70, color='orange', linestyle='--', alpha=0.5, label='High (70)')
    ax1.axhline(y=50, color='red', linestyle='--', alpha=0.5, label='Medium (50)')
    ax1.fill_between(hc_res, hc_plddt, alpha=0.3, color='#2E86AB')
    ax1.set_xlabel('Residue Number', fontsize=11)
    ax1.set_ylabel('pLDDT Score', fontsize=11)
    ax1.set_title(f'{hc_name}\nHigh Confidence Protein', fontsize=12, fontweight='bold')
    ax1.legend(fontsize=9)
    ax1.grid(alpha=0.3)
    ax1.set_ylim([0, 105])
    
    # Plot 2: pLDDT along sequence - Low Confidence
    ax2 = plt.subplot(3, 2, 2)
    ax2.plot(lc_res, lc_plddt, linewidth=2, color='#A23B72')
    ax2.axhline(y=90, color='green', linestyle='--', alpha=0.5, label='Very High (90)')
    ax2.axhline(y=70, color='orange', linestyle='--', alpha=0.5, label='High (70)')
    ax2.axhline(y=50, color='red', linestyle='--', alpha=0.5, label='Medium (50)')
    ax2.fill_between(lc_res, lc_plddt, alpha=0.3, color='#A23B72')
    ax2.set_xlabel('Residue Number', fontsize=11)
    ax2.set_ylabel('pLDDT Score', fontsize=11)
    ax2.set_title(f'{lc_name}\nLow Confidence Protein (IDP)', fontsize=12, fontweight='bold')
    ax2.legend(fontsize=9)
    ax2.grid(alpha=0.3)
    ax2.set_ylim([0, 105])
    
    # Plot 3: Distribution comparison - Histogram
    ax3 = plt.subplot(3, 2, 3)
    ax3.hist(hc_plddt, bins=30, alpha=0.6, label=hc_name, color='#2E86AB', edgecolor='black')
    ax3.hist(lc_plddt, bins=30, alpha=0.6, label=lc_name, color='#A23B72', edgecolor='black')
    ax3.axvline(x=90, color='green', linestyle='--', alpha=0.5)
    ax3.axvline(x=70, color='orange', linestyle='--', alpha=0.5)
    ax3.axvline(x=50, color='red', linestyle='--', alpha=0.5)
    ax3.set_xlabel('pLDDT Score', fontsize=11)
    ax3.set_ylabel('Frequency', fontsize=11)
    ax3.set_title('pLDDT Distribution Comparison', fontsize=12, fontweight='bold')
    ax3.legend(fontsize=10)
    ax3.grid(alpha=0.3)
    
    # Plot 4: Box plot comparison
    ax4 = plt.subplot(3, 2, 4)
    data_to_plot = [hc_plddt, lc_plddt]
    bp = ax4.boxplot(data_to_plot, labels=[hc_name[:20], lc_name[:20]], 
                     patch_artist=True, showmeans=True)
    bp['boxes'][0].set_facecolor('#2E86AB')
    bp['boxes'][1].set_facecolor('#A23B72')
    ax4.axhline(y=90, color='green', linestyle='--', alpha=0.5)
    ax4.axhline(y=70, color='orange', linestyle='--', alpha=0.5)
    ax4.axhline(y=50, color='red', linestyle='--', alpha=0.5)
    ax4.set_ylabel('pLDDT Score', fontsize=11)
    ax4.set_title('Distribution Statistics', fontsize=12, fontweight='bold')
    ax4.grid(alpha=0.3)
    
    # Plot 5: Confidence category comparison
    ax5 = plt.subplot(3, 2, 5)
    hc_stats = high_conf_result['statistics']
    lc_stats = low_conf_result['statistics']
    
    categories = ['Very High\n(>90)', 'High\n(70-90)', 'Medium\n(50-70)', 'Low\n(<50)']
    hc_values = [hc_stats['very_high_conf'], hc_stats['high_conf'], 
                 hc_stats['medium_conf'], hc_stats['low_conf']]
    lc_values = [lc_stats['very_high_conf'], lc_stats['high_conf'], 
                 lc_stats['medium_conf'], lc_stats['low_conf']]
    
    x = np.arange(len(categories))
    width = 0.35
    
    ax5.bar(x - width/2, hc_values, width, label=hc_name[:20], color='#2E86AB', alpha=0.8)
    ax5.bar(x + width/2, lc_values, width, label=lc_name[:20], color='#A23B72', alpha=0.8)
    ax5.set_xlabel('Confidence Category', fontsize=11)
    ax5.set_ylabel('Percentage of Residues (%)', fontsize=11)
    ax5.set_title('Confidence Category Distribution', fontsize=12, fontweight='bold')
    ax5.set_xticks(x)
    ax5.set_xticklabels(categories, fontsize=9)
    ax5.legend(fontsize=10)
    ax5.grid(alpha=0.3, axis='y')
    
    # Plot 6: Summary statistics table
    ax6 = plt.subplot(3, 2, 6)
    ax6.axis('off')
    
    # Create summary table
    summary_data = [
        ['Metric', hc_name[:25], lc_name[:25], 'Difference'],
        ['Mean pLDDT', f"{hc_stats['mean']:.1f}", f"{lc_stats['mean']:.1f}", 
         f"{hc_stats['mean']-lc_stats['mean']:.1f}"],
        ['Std Dev', f"{hc_stats['std']:.1f}", f"{lc_stats['std']:.1f}", 
         f"{hc_stats['std']-lc_stats['std']:.1f}"],
        ['Min pLDDT', f"{hc_stats['min']:.1f}", f"{lc_stats['min']:.1f}", 
         f"{hc_stats['min']-lc_stats['min']:.1f}"],
        ['Max pLDDT', f"{hc_stats['max']:.1f}", f"{lc_stats['max']:.1f}", 
         f"{hc_stats['max']-lc_stats['max']:.1f}"],
        ['Very High %', f"{hc_stats['very_high_conf']:.1f}", 
         f"{lc_stats['very_high_conf']:.1f}", 
         f"{hc_stats['very_high_conf']-lc_stats['very_high_conf']:.1f}"],
        ['Confidence', high_conf_result['confidence'], 
         low_conf_result['confidence'], '-'],
    ]
    
    table = ax6.table(cellText=summary_data, cellLoc='center', loc='center',
                     colWidths=[0.25, 0.25, 0.25, 0.25])
    table.auto_set_font_size(False)
    table.set_fontsize(9)
    table.scale(1, 2)
    
    # Style header row
    for i in range(4):
        table[(0, i)].set_facecolor('#4A4A4A')
        table[(0, i)].set_text_props(weight='bold', color='white')
    
    ax6.set_title('Summary Statistics Comparison', fontsize=12, fontweight='bold', pad=20)
    
    plt.tight_layout()
    
    # Save figure to current directory
    output_file = 'alphafold_protein_comparison.png'
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    print(f"‚úÖ Comparison plots saved to: {output_file}")
    
    plt.close()


# ============================================================================
# Main Analysis Pipeline
# ============================================================================

def run_comparison_analysis():
    """
    Complete comparison analysis pipeline
    """
    print("\n" + "üöÄ STARTING ALPHAFOLD PROTEIN COMPARISON" + "\n")
    
    # Analyze high confidence protein
    high_conf_result = analyze_protein(HIGH_CONFIDENCE_PROTEIN)
    
    if high_conf_result is None:
        print("‚ùå High confidence protein analysis failed")
        return None
    
    # Analyze low confidence protein
    low_conf_result = analyze_protein(LOW_CONFIDENCE_PROTEIN)
    
    if low_conf_result is None:
        print("‚ùå Low confidence protein analysis failed")
        return None
    
    # Compare results
    compare_proteins(high_conf_result, low_conf_result)
    
    # Prepare final results
    results = {
        'high_confidence': high_conf_result,
        'low_confidence': low_conf_result,
        'comparison': {
            'delta_mean_plddt': high_conf_result['statistics']['mean'] - low_conf_result['statistics']['mean'],
            'delta_std_plddt': high_conf_result['statistics']['std'] - low_conf_result['statistics']['std'],
            'mean_ratio': high_conf_result['statistics']['mean'] / low_conf_result['statistics']['mean']
        }
    }
    
    return results


# ============================================================================
# Execute Analysis
# ============================================================================

if __name__ == "__main__":
    results = run_comparison_analysis()
    
    if results:
        # Save results as JSON (more portable than pickle)
        output_json = 'alphafold_comparison_results.json'
        os.makedirs(os.path.dirname(output_json), exist_ok=True)
        
        # Convert numpy arrays to lists for JSON serialization
        json_results = {
            'high_confidence': {
                'name': results['high_confidence']['protein_info']['name'],
                'uniprot_id': results['high_confidence']['protein_info']['uniprot_id'],
                'statistics': results['high_confidence']['statistics'],
                'confidence': results['high_confidence']['confidence'],
                'matches_expectation': results['high_confidence']['matches_expectation']
            },
            'low_confidence': {
                'name': results['low_confidence']['protein_info']['name'],
                'uniprot_id': results['low_confidence']['protein_info']['uniprot_id'],
                'statistics': results['low_confidence']['statistics'],
                'confidence': results['low_confidence']['confidence'],
                'matches_expectation': results['low_confidence']['matches_expectation']
            },
            'comparison': results['comparison']
        }
        
        with open(output_json, 'w') as f:
            json.dump(json_results, f, indent=2)
        
        print(f"\nüíæ Results saved to: {output_json}")
        print("\n" + "="*80)
        print("‚úÖ ALPHAFOLD COMPARISON ANALYSIS COMPLETE!")
        print("="*80)
        print("\nüìä Key Findings:")
        print(f"   ‚Ä¢ {HIGH_CONFIDENCE_PROTEIN['name']}: Mean pLDDT = {results['high_confidence']['statistics']['mean']:.1f}")
        print(f"   ‚Ä¢ {LOW_CONFIDENCE_PROTEIN['name']}: Mean pLDDT = {results['low_confidence']['statistics']['mean']:.1f}")
        print(f"   ‚Ä¢ Difference: {results['comparison']['delta_mean_plddt']:.1f} points")
        print(f"\nüéØ AlphaFold successfully distinguishes structured vs disordered proteins!")
        
    else:
        print("\n‚ùå Comparison analysis failed")

üß¨ ALPHAFOLD DATABASE - PROTEIN CONFIDENCE COMPARISON

üìä Comparing Two Extreme Cases:

1Ô∏è‚É£  HIGH CONFIDENCE PROTEIN:
    Protein: Human Ubiquitin
    UniProt: P0CG48
    Length: 76 residues
    Type: Well-folded globular protein

2Ô∏è‚É£  LOW CONFIDENCE PROTEIN:
    Protein: Human Alpha-Synuclein
    UniProt: P37840
    Length: 140 residues
    Type: Intrinsically Disordered Protein (IDP)

üöÄ STARTING ALPHAFOLD PROTEIN COMPARISON


üî¨ ANALYZING: Human Ubiquitin
UniProt ID: P0CG48
Description: Small, highly structured protein - gold standard for protein folding
Expected mean pLDDT: >90

üîç Querying AlphaFold Database API...
   URL: https://alphafold.ebi.ac.uk/api/prediction/P0CG48
‚úÖ AlphaFold metadata retrieved!

üìä Prediction Information:
   UniProt ID: P0CG48
   Gene: UBC
   Organism: Homo sapiens
   Length: 685 residues
   Model version: 6
   Model created: 2025-08-01T00:00:00Z

üì• Downloading AlphaFold structure...
   URL: https://alphafold.ebi.ac.uk/files/AF-P0

TypeError: object of type 'ellipsis' has no len()