In [1]:
import numpy as np
import pandas as pd
import random
import hashlib
from collections import defaultdict
import networkx as nx


In [2]:
# Restore `train_df` from IPython's %store storage. The variable must have been
# saved with `%store train_df` beforehand; nothing in this notebook creates it.
%store -r train_df


In [4]:
class MinHasher:
    """Compute fixed-length MinHash signatures over the k-mer shingles of a sequence.

    Each of the ``num_hashes`` hash functions is SHA-256 salted with its own
    integer seed and reduced modulo ``max_hash``. Position ``i`` of a signature
    is the smallest value hash function ``i`` produces over all shingles of the
    sequence, so every non-empty signature has exactly ``num_hashes`` entries
    and two sequences with the same shingle set get the same signature.

    Args:
        num_hashes: Number of hash functions, i.e. the signature length.
        seed: Optional seed for drawing the per-function salts. ``None`` (the
            default) draws them from the module-level ``random`` generator.
        shingle_size: Length of each shingle (k-mer). Defaults to 3.
    """

    def __init__(self, num_hashes=50, seed=None, shingle_size=3):
        self.num_hashes = num_hashes
        self.shingle_size = shingle_size
        self.max_hash = (1 << 32) - 1
        # A private generator makes seeded instances reproducible without
        # disturbing the global `random` state; without a seed the global
        # generator is used.
        rng = random if seed is None else random.Random(seed)
        self.hash_seeds = [rng.randint(0, self.max_hash) for _ in range(self.num_hashes)]

    def hash_shingle(self, shingle):
        """Return one hash value per hash function for ``shingle``.

        The result has ``num_hashes`` integers in ``[0, max_hash)``, ordered
        like ``hash_seeds``.
        """
        return [
            int(hashlib.sha256((str(seed) + shingle).encode()).hexdigest(), 16) % self.max_hash
            for seed in self.hash_seeds
        ]

    def compute_signature(self, sequence):
        """Compute the MinHash signature of ``sequence``.

        Returns:
            A list of ``num_hashes`` integers, where entry ``i`` is the minimum
            of hash function ``i`` over every distinct shingle. A sequence
            shorter than ``shingle_size`` has no shingles and yields ``[]``.
        """
        size = self.shingle_size
        shingles = {sequence[i:i + size] for i in range(len(sequence) - size + 1)}
        if not shingles:
            return []
        # Rows are shingles, columns are hash functions. The signature is the
        # column-wise minimum (one value per hash function), not the row-wise
        # minimum (one value per shingle).
        hash_rows = [self.hash_shingle(shingle) for shingle in shingles]
        return [min(column) for column in zip(*hash_rows)]

# Attach a MinHash signature to every peptide in the training frame.
minhasher = MinHasher(num_hashes=50)
peptide_signatures = train_df["peptide_seq"].apply(minhasher.compute_signature)
train_df["minhash_signature"] = peptide_signatures

train_df.head()


Unnamed: 0,parent_protein_id,protein_seq,start_position,end_position,peptide_seq,chou_fasman,emini,kolaskar_tongaonkar,parker,isoelectric_point,aromaticity,hydrophobicity,stability,target,minhash_signature
0,Q07337,MKKNTLSAILMTLFLFISCNNSGKDGNTSANSADESVKGPNLTEIS...,152,164,DADAKEAILKTNG,0.445652,0.022263,0.314149,0.693316,0.545305,0.130639,0.484455,0.101273,0,"[53574742, 55576440, 20817820, 120033596, 5123..."
1,F5HB53,MESRIWCLVVCVNLCIVCLGAAVSSSSTRGTSATHSHHSSHTTSAA...,76,90,LKYGDVVGVNTTKYP,0.524704,0.012585,0.52518,0.60108,0.377196,0.5384,0.50664,0.229353,1,"[28420430, 39882241, 123114014, 353988448, 263..."
2,P40136,MTRNKFIPNKFSIISFSVLLFAISSSQAIEVNAMNEHYTESDIKRN...,421,430,KGKKEIDNGK,0.597826,0.060091,0.17506,0.7785,0.357386,0.548684,0.391274,0.210738,0,"[188958994, 98564168, 49556088, 9255067, 29675..."
3,Q13018,MLLSPSLLLLLLLGAPRGCAEGVAAALTPERLLEWQDKGIFVIQSE...,111,125,VSLRWRCNRKMITGP,0.47332,0.010762,0.405276,0.538267,0.238595,0.708826,0.471259,0.286228,0,"[41684785, 133316561, 120540694, 71313097, 459..."
4,Q39967,MASVEVESAATALPKNETPEVTKAEETKTEEPAAPPASEQETADAT...,66,73,TEKAEEVE,0.224308,0.022534,0.294964,0.776406,0.046191,0.0,0.217079,0.659516,0,"[59049499, 195688861, 112582848, 59136136, 702..."


In [9]:
import pandas as pd

class TrieNode:
    """One position in a character trie: outgoing edges plus an end-of-word flag."""

    def __init__(self):
        # Maps the next character to the child node reached through it.
        self.children: dict = {}
        # Starts False; flipped to True when a stored word terminates here.
        self.is_end: bool = False

class EpitopeTrie:
    """Character trie over peptide strings.

    Every string handed to a method is stripped of surrounding whitespace and
    upper-cased before it is used.
    """

    def __init__(self):
        self.root = TrieNode()

    @staticmethod
    def _normalize(text):
        """Return ``text`` without surrounding whitespace, in upper case."""
        return text.strip().upper()

    def insert(self, word):
        """Store a peptide sequence, creating nodes along its path as needed."""
        cursor = self.root
        for residue in self._normalize(word):
            child = cursor.children.get(residue)
            if child is None:
                child = TrieNode()
                cursor.children[residue] = child
            cursor = child
        cursor.is_end = True

    def search_exact(self, word):
        """Return True only if this exact peptide was inserted (prefixes do not count)."""
        cursor = self.root
        for residue in self._normalize(word):
            cursor = cursor.children.get(residue)
            if cursor is None:
                return False
        return cursor.is_end

    def search_subsequence(self, subseq, peptides):
        """Return True if the normalized ``subseq`` is a substring of any item in ``peptides``.

        This is a plain substring scan over the supplied iterable; the trie
        itself is not consulted, and the items of ``peptides`` are compared
        as given (they are not normalized here).
        """
        needle = self._normalize(subseq)
        for candidate in peptides:
            if needle in candidate:
                return True
        return False



# Normalize peptide sequences so trie lookups and substring checks compare like with like.
train_df["peptide_seq"] = train_df["peptide_seq"].astype(str).str.strip().str.upper()

# Work on distinct peptides: inserting or scanning a duplicate adds nothing.
unique_peptides = list(train_df["peptide_seq"].unique())

# Build Trie from peptide sequences
trie = EpitopeTrie()
for peptide in unique_peptides:
    trie.insert(peptide)

# Every peptide was just inserted, so this lookup is True by construction.
# It is kept as a round-trip check of the trie and so the saved file keeps
# the same columns.
train_df["Exact Match Exists"] = train_df["peptide_seq"].apply(trie.search_exact)


def _contained_in_other(peptide):
    """Return True if `peptide` occurs inside a different peptide of the dataset.

    A peptide always contains itself, so comparing against the full list would
    mark every row True. A different string that contains `peptide` must be
    strictly longer, hence only longer peptides are scanned.
    """
    longer_peptides = (other for other in unique_peptides if len(other) > len(peptide))
    return trie.search_subsequence(peptide, longer_peptides)


# Evaluate once per distinct peptide, then broadcast the result to all rows.
contained_lookup = {peptide: _contained_in_other(peptide) for peptide in unique_peptides}
train_df["Appears as Subsequence (MHC Binding)"] = train_df["peptide_seq"].map(contained_lookup)

# Save updated dataset with new features
train_df.to_csv("train_df_with_trie_features.csv", index=False)

print("\n Analysis complete! Updated dataset saved to 'train_df_with_trie_features.csv'.")



 Analysis complete! Updated dataset saved to 'train_df_with_trie_features.csv'.


In [10]:
# Preview the first rows of train_df to inspect the columns added so far.
train_df.head()


Unnamed: 0,parent_protein_id,protein_seq,start_position,end_position,peptide_seq,chou_fasman,emini,kolaskar_tongaonkar,parker,isoelectric_point,aromaticity,hydrophobicity,stability,target,minhash_signature,Exact Match Exists,Appears as Subsequence (MHC Binding)
0,Q07337,MKKNTLSAILMTLFLFISCNNSGKDGNTSANSADESVKGPNLTEIS...,152,164,DADAKEAILKTNG,0.445652,0.022263,0.314149,0.693316,0.545305,0.130639,0.484455,0.101273,0,"[53574742, 55576440, 20817820, 120033596, 5123...",True,True
1,F5HB53,MESRIWCLVVCVNLCIVCLGAAVSSSSTRGTSATHSHHSSHTTSAA...,76,90,LKYGDVVGVNTTKYP,0.524704,0.012585,0.52518,0.60108,0.377196,0.5384,0.50664,0.229353,1,"[28420430, 39882241, 123114014, 353988448, 263...",True,True
2,P40136,MTRNKFIPNKFSIISFSVLLFAISSSQAIEVNAMNEHYTESDIKRN...,421,430,KGKKEIDNGK,0.597826,0.060091,0.17506,0.7785,0.357386,0.548684,0.391274,0.210738,0,"[188958994, 98564168, 49556088, 9255067, 29675...",True,True
3,Q13018,MLLSPSLLLLLLLGAPRGCAEGVAAALTPERLLEWQDKGIFVIQSE...,111,125,VSLRWRCNRKMITGP,0.47332,0.010762,0.405276,0.538267,0.238595,0.708826,0.471259,0.286228,0,"[41684785, 133316561, 120540694, 71313097, 459...",True,True
4,Q39967,MASVEVESAATALPKNETPEVTKAEETKTEEPAAPPASEQETADAT...,66,73,TEKAEEVE,0.224308,0.022534,0.294964,0.776406,0.046191,0.0,0.217079,0.659516,0,"[59049499, 195688861, 112582848, 59136136, 702...",True,True


In [6]:
import networkx as nx
import pandas as pd
from itertools import combinations
from collections import defaultdict

# Load the dataset written by the trie-feature cell (MinHash + trie columns).
# NOTE(review): this cell's execution count (In [6]) is lower than that of the
# cell that writes this file (In [9]), so the CSV read here may predate the
# latest run of that cell — re-run the notebook top to bottom to confirm.
# NOTE(review): list-valued columns such as `minhash_signature` presumably come
# back from CSV as strings rather than lists — verify before using them.
previous_df = pd.read_csv("train_df_with_trie_features.csv")  

def peptide_similarity(p1, p2, k=3):
    """Score two peptides by the k-mers they have in common.

    The score is the number of distinct shared k-mers divided by the size of
    the larger of the two k-mer sets. If either peptide is too short to
    contain a k-mer, the score is 0.
    """
    def distinct_kmers(sequence):
        found = set()
        for start in range(len(sequence) - k + 1):
            found.add(sequence[start:start + k])
        return found

    kmers_a = distinct_kmers(p1)
    kmers_b = distinct_kmers(p2)
    if not kmers_a or not kmers_b:
        return 0

    shared = kmers_a.intersection(kmers_b)
    larger_size = max(len(kmers_a), len(kmers_b))
    return len(shared) / larger_size

def build_peptide_graph(train_df, threshold=0.3, k=3):
    """Build an undirected graph linking peptides whose k-mer similarity meets a threshold.

    Args:
        train_df: DataFrame with a "peptide_seq" column. Missing values are skipped.
        threshold: Minimum `peptide_similarity` score for two peptides to be linked.
        k: k-mer length used both for candidate lookup and for scoring.

    Returns:
        A `networkx.Graph` whose nodes are peptide strings. Each edge carries
        ``weight = 1 - similarity``, so a lower weight means a closer pair.
        Peptides with no sufficiently similar partner do not appear as nodes.
    """
    G = nx.Graph()
    peptides = train_df["peptide_seq"].dropna().unique()

    # Inverted index: k-mer -> peptides containing it. Only peptides that share
    # at least one k-mer can have a non-zero similarity.
    kmer_map = defaultdict(set)
    kmers_by_peptide = {}
    for peptide in peptides:
        kmers = {peptide[i:i + k] for i in range(len(peptide) - k + 1)}
        kmers_by_peptide[peptide] = kmers
        for kmer in kmers:
            kmer_map[kmer].add(peptide)

    for peptide, kmers in kmers_by_peptide.items():
        # Union of all buckets this peptide falls into: every peptide sharing
        # at least one k-mer with it, each listed once however many k-mers match.
        candidates = set()
        for kmer in kmers:
            candidates.update(kmer_map[kmer])

        # Sorting makes edge insertion order independent of set iteration order.
        for other in sorted(candidates):
            # Handle each unordered pair exactly once (and never pair a peptide with itself).
            if other <= peptide:
                continue
            similarity = peptide_similarity(peptide, other, k)
            if similarity >= threshold:
                G.add_edge(peptide, other, weight=1 - similarity)  # Lower weight = higher similarity

    return G

# Similarity graph over the peptides loaded from the previous step.
G = build_peptide_graph(previous_df)

# Distance from every node to every node it can reach, keyed by source peptide.
# NOTE(review): this materializes a full node-by-node distance table. Edge
# weights are 1 - similarity and thus non-negative, so the nearest other node
# should always be a direct neighbour; a per-node scan of incident edges would
# presumably give the same answer far more cheaply — confirm before changing.
shortest_paths = dict(nx.all_pairs_dijkstra_path_length(G, weight="weight"))

# One record per distinct peptide: its nearest other peptide and the distance to it.
path_results = []
for peptide in previous_df["peptide_seq"].unique():
    # Defaults cover peptides that are absent from the graph or have no reachable partner.
    closest_peptide, min_distance = None, None

    reachable = shortest_paths.get(peptide, {})
    candidates = [(node, dist) for node, dist in reachable.items() if node != peptide]
    if candidates:
        closest_peptide, min_distance = min(candidates, key=lambda pair: pair[1])

    path_results.append({
        "peptide_seq": peptide,
        "Closest Peptide": closest_peptide,
        "Path Length": min_distance
    })

# Tabulate the records and attach them to every row of the loaded dataset.
path_df = pd.DataFrame(path_results)
final_df = previous_df.merge(path_df, on="peptide_seq", how="left")

# Persist the combined result (trie + hashing + shortest-path columns).
final_df.to_csv("peptide_analysis_results.csv", index=False)

# Show the first few rows
final_df.head()


Unnamed: 0,parent_protein_id,protein_seq,start_position,end_position,peptide_seq,chou_fasman,emini,kolaskar_tongaonkar,parker,isoelectric_point,aromaticity,hydrophobicity,stability,target,minhash_signature,Exact Match Exists,Appears as Subsequence (MHC Binding),Closest Peptide,Path Length
0,Q07337,MKKNTLSAILMTLFLFISCNNSGKDGNTSANSADESVKGPNLTEIS...,152,164,DADAKEAILKTNG,0.445652,0.022263,0.314149,0.693316,0.545305,0.130639,0.484455,0.101273,0,"[53574742, 55576440, 20817820, 120033596, 5123...",True,True,VTDADAKEAILKT,0.181818
1,F5HB53,MESRIWCLVVCVNLCIVCLGAAVSSSSTRGTSATHSHHSSHTTSAA...,76,90,LKYGDVVGVNTTKYP,0.524704,0.012585,0.52518,0.60108,0.377196,0.5384,0.50664,0.229353,1,"[28420430, 39882241, 123114014, 353988448, 263...",True,True,VVGVNTTKYPYRVCS,0.384615
2,P40136,MTRNKFIPNKFSIISFSVLLFAISSSQAIEVNAMNEHYTESDIKRN...,421,430,KGKKEIDNGK,0.597826,0.060091,0.17506,0.7785,0.357386,0.548684,0.391274,0.210738,0,"[188958994, 98564168, 49556088, 9255067, 29675...",True,True,KKEIDNGKKY,0.125
3,Q13018,MLLSPSLLLLLLLGAPRGCAEGVAAALTPERLLEWQDKGIFVIQSE...,111,125,VSLRWRCNRKMITGP,0.47332,0.010762,0.405276,0.538267,0.238595,0.708826,0.471259,0.286228,0,"[41684785, 133316561, 120540694, 71313097, 459...",True,True,CDSTLVSLRWRCNRK,0.384615
4,Q39967,MASVEVESAATALPKNETPEVTKAEETKTEEPAAPPASEQETADAT...,66,73,TEKAEEVE,0.224308,0.022534,0.294964,0.776406,0.046191,0.0,0.217079,0.659516,0,"[59049499, 195688861, 112582848, 59136136, 702...",True,True,EKAEEVEK,0.166667
