In [None]:
# Generate Protein Embeddings using Efficient One-Hot Encoding
# 
# This script implements an optimized one-hot encoding approach for protein sequences:
# 1. Loads kinase target sequences from FASTA file
# 2. Creates position-aware features (N-terminal, C-terminal, central regions)
# 3. Generates interpretable amino acid composition and physicochemical features
# 4. Saves the embeddings to CSV for machine learning (115 composition features plus sequence length, vs 21,000+ traditional)
#
# Key advantages:
# - Highly interpretable features with biological meaning
# - Efficient: ~100x fewer features than traditional one-hot encoding
# - Captures both sequence composition AND positional information
# - Perfect balance for MSc-level drug-target prediction projects

In [None]:
import numpy as np
import os
import pandas as pd
from Bio import SeqIO
from tqdm import tqdm

In [4]:
# Efficient One-Hot Encoding with Interpretability Focus
def _mean_composition(region_matrix):
    """Column-wise mean of a one-hot region; all zeros for an empty region (avoids NaN)."""
    if region_matrix.shape[0] == 0:
        return np.zeros(region_matrix.shape[1])
    return np.mean(region_matrix, axis=0)


def _region_class_fractions(region_matrix, class_columns):
    """Fraction of positions in a one-hot region that fall in each residue class; zeros if empty."""
    n_positions = region_matrix.shape[0]
    if n_positions == 0:
        return [0.0] * len(class_columns)
    residue_counts = region_matrix.sum(axis=0)
    return [residue_counts[columns].sum() / n_positions for columns in class_columns]


def generate_efficient_onehot_embeddings(fasta_path, output_csv="data/step4_efficient_onehot_embeddings.csv"):
    """
    Build pooled one-hot features for every record in a FASTA file and save them as CSV.

    Each sequence is upper-cased and one-hot encoded over the 20 standard amino
    acids, then summarised into a fixed-length row regardless of sequence length:

    - ``id`` and ``length``
    - ``count_<aa>``: whole-sequence residue counts (20)
    - ``nterm_freq_<aa>`` / ``cterm_freq_<aa>``: composition of the first / last
      ``max(1, length // 10)`` residues (20 each)
    - ``central_freq_<aa>``: composition of ``sequence[length // 10 : length - length // 10]`` (20)
    - ``<region>_<class>``: fraction of each region that is hydrophobic / polar /
      positive / negative / special (15)
    - ``gradient_<aa>``: C-terminal minus N-terminal frequency (20)

    Characters outside the 20 standard residues are not counted, but they still
    occupy a position, so region frequencies may sum to less than 1.
    A zero-length sequence produces all-zero features rather than NaN.

    Args:
        fasta_path: Path (or handle) passed to ``SeqIO.parse(..., "fasta")``.
        output_csv: Destination CSV path. Its parent directory is created if missing.

    Returns:
        pandas.DataFrame with one row per FASTA record (the same table written to CSV).
    """
    # Standard amino acids; list order defines the one-hot column order.
    amino_acids = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
                   'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
    # Constant-time residue -> column lookup (replaces list.index per residue).
    aa_to_col = {aa: col for col, aa in enumerate(amino_acids)}

    # Physicochemical groupings, in the order the feature columns are emitted.
    residue_classes = [
        ('hydrophobic', ['A', 'V', 'I', 'L', 'M', 'F', 'Y', 'W']),
        ('polar', ['S', 'T', 'N', 'Q']),
        ('positive', ['K', 'R', 'H']),
        ('negative', ['D', 'E']),
        ('special', ['C', 'G', 'P']),
    ]
    prop_names = [class_name for class_name, _ in residue_classes]
    class_columns = [[aa_to_col[aa] for aa in members] for _, members in residue_classes]

    # Read sequences
    sequences = list(SeqIO.parse(fasta_path, "fasta"))
    print(f"Found {len(sequences)} protein sequences.")

    records = []

    for record in tqdm(sequences, desc="Generating efficient one-hot embeddings"):
        name = record.id
        sequence = str(record.seq).upper()
        seq_length = len(sequence)

        # One-hot matrix (seq_length x 20); rows for unrecognised characters stay all-zero.
        one_hot_matrix = np.zeros((seq_length, len(amino_acids)))
        for position, aa in enumerate(sequence):
            column = aa_to_col.get(aa)
            if column is not None:
                one_hot_matrix[position, column] = 1

        # Strategy 1: whole-sequence amino acid counts.
        aa_counts = one_hot_matrix.sum(axis=0)

        # Strategy 2: regional composition. Termini are the first/last 10% (at least
        # one residue); the central region is what remains after trimming length // 10
        # from each end, so for sequences shorter than 10 it is the whole sequence.
        term_len = max(1, seq_length // 10)
        n_term_region = one_hot_matrix[:term_len]
        c_term_region = one_hot_matrix[-term_len:]
        central_region = one_hot_matrix[seq_length // 10: seq_length - seq_length // 10]

        n_term_composition = _mean_composition(n_term_region)
        c_term_composition = _mean_composition(c_term_region)
        central_composition = _mean_composition(central_region)

        # Strategy 3: physicochemical class fractions per region.
        n_term_props = _region_class_fractions(n_term_region, class_columns)
        c_term_props = _region_class_fractions(c_term_region, class_columns)
        central_props = _region_class_fractions(central_region, class_columns)

        # Positional trend: C-terminal minus N-terminal frequency. This is exactly 0
        # for sequences of length <= 1, where both termini are the same slice.
        gradient = c_term_composition - n_term_composition

        # Build the feature row; insertion order defines the CSV column order.
        features = {"id": name, "length": seq_length}

        for col, aa in enumerate(amino_acids):
            features[f"count_{aa}"] = int(aa_counts[col])

        for col, aa in enumerate(amino_acids):
            features[f"nterm_freq_{aa}"] = n_term_composition[col]

        for col, aa in enumerate(amino_acids):
            features[f"cterm_freq_{aa}"] = c_term_composition[col]

        for col, aa in enumerate(amino_acids):
            features[f"central_freq_{aa}"] = central_composition[col]

        for idx, prop in enumerate(prop_names):
            features[f"nterm_{prop}"] = n_term_props[idx]
            features[f"cterm_{prop}"] = c_term_props[idx]
            features[f"central_{prop}"] = central_props[idx]

        for col, aa in enumerate(amino_acids):
            features[f"gradient_{aa}"] = gradient[col]

        records.append(features)

    # Save to CSV, creating the directory that output_csv actually lives in
    # (not a hard-coded "data" folder).
    df = pd.DataFrame(records)
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_csv, index=False)

    # Note: "Shape" counts every non-id column (including `length`), while the
    # "Total interpretable features" line excludes both `id` and `length`.
    print(f"✓ Efficient one-hot embeddings saved to '{output_csv}'")
    print(f"✓ Shape: {df.shape[0]} sequences × {df.shape[1]-1} features")
    print("✓ Feature breakdown:")
    print("   - Amino acid counts: 20 features")
    print("   - N-terminal composition: 20 features")
    print("   - C-terminal composition: 20 features")
    print("   - Central composition: 20 features")
    print("   - Physicochemical regions: 15 features")
    print("   - Positional gradients: 20 features")
    print(f"   - Total interpretable features: {df.shape[1]-2}")

    return df

In [5]:
# Efficient One-Hot Encoding: MSc Project Focus
def run_efficient_one_hot(fasta_path, output_csv="data/step4_efficient_onehot_embeddings.csv"):
    """
    Generate the efficient one-hot embeddings for a FASTA file and print a summary.

    Args:
        fasta_path: FASTA file forwarded to ``generate_efficient_onehot_embeddings``.
        output_csv: Destination CSV forwarded to the generator and echoed in the
            summary. Defaults to the generator's own default path, so existing
            single-argument calls behave as before.

    Returns:
        None. The embeddings are written to ``output_csv``; only a report is printed.
    """
    print("🧬 Generating Efficient One-Hot Protein Embeddings...")
    print("=" * 50)

    # Generate the embeddings
    result_df = generate_efficient_onehot_embeddings(fasta_path, output_csv=output_csv)
    n_rows, n_cols = result_df.shape

    print("\n✅ Embedding generation complete!")
    # Report the path that was actually used rather than a hard-coded copy of the default.
    print(f"📊 Output file: {output_csv}")
    # n_cols - 1 excludes only `id` (so it counts `length`); the "Total" line below
    # excludes both `id` and `length`.
    print(f"📈 Features: {n_cols-1} interpretable features")
    print(f"🔬 Sequences: {n_rows} kinase targets")

    # Display feature summary
    print("\n📋 Feature Summary:")
    print("   • Amino acid counts: 20 features")
    print("   • N-terminal composition: 20 features")
    print("   • C-terminal composition: 20 features")
    print("   • Central region composition: 20 features")
    print("   • Physicochemical properties: 15 features")
    print("   • Positional gradients: 20 features")
    print(f"   • Total: {n_cols-2} features (vs 21,000+ traditional)")

    print("\n🎯 Ready for machine learning pipeline!")

# Run the efficient one-hot encoding.
# The guard is true when this cell is executed in a notebook kernel or the code is run
# as a script, and false when it is imported as a module.
if __name__ == "__main__":
    # Input FASTA path, relative to the current working directory.
    fasta_path = "data/step3_kinase_target_sequences.fasta"
    
    run_efficient_one_hot(fasta_path)

🧬 Generating Efficient One-Hot Protein Embeddings...
Found 188 protein sequences.


Generating efficient one-hot embeddings: 100%|██████████| 188/188 [00:00<00:00, 500.81it/s]
Generating efficient one-hot embeddings: 100%|██████████| 188/188 [00:00<00:00, 500.81it/s]


✓ Efficient one-hot embeddings saved to 'data/step4_efficient_onehot_embeddings.csv'
✓ Shape: 188 sequences × 116 features
✓ Feature breakdown:
   - Amino acid counts: 20 features
   - N-terminal composition: 20 features
   - C-terminal composition: 20 features
   - Central composition: 20 features
   - Physicochemical regions: 15 features
   - Positional gradients: 20 features
   - Total interpretable features: 115

✅ Embedding generation complete!
📊 Output file: data/step4_efficient_onehot_embeddings.csv
📈 Features: 116 interpretable features
🔬 Sequences: 188 kinase targets

📋 Feature Summary:
   • Amino acid counts: 20 features
   • N-terminal composition: 20 features
   • C-terminal composition: 20 features
   • Central region composition: 20 features
   • Physicochemical properties: 15 features
   • Positional gradients: 20 features
   • Total: 115 features (vs 21,000+ traditional)

🎯 Ready for machine learning pipeline!
