In [1]:
import os

os.chdir("../../analyses_data/fastas/fetched_02_2024")

In [2]:
from pyteomics import fasta, parser
import itertools

def find_intersecting_peptides(list_of_files):
    peptide_sets = [generate_tryptic_peptides_pyteomics(fasta_file) for fasta_file in list_of_files]

    # Find all unique intersections using combinations to avoid redundant comparisons
    intersecting_peptides = set()
    for set1, set2 in itertools.combinations(peptide_sets, 2):
        intersecting_peptides.update(set1.intersection(set2))
    
    return intersecting_peptides


def generate_tryptic_peptides_pyteomics(fasta_file):
    peptides = set()
    for description, sequence in fasta.read(fasta_file):
        # Generate tryptic peptides for each sequence
        for peptide in parser.cleave(sequence, parser.expasy_rules["trypsin"], missed_cleavages=2, min_length=6):
            if peptide:
                peptides.add(peptide)
    return peptides

list_of_fastas = ["uniprotkb_organism_id_4932_2024_02_23_YEAST.fasta", "uniprotkb_organism_id_6239_2024_02_23_CAEEL.fasta", 
                                                    "uniprotkb_organism_id_83333_2024_02_23_ECOLI.fasta", "uniprotkb_organism_id_9606_2024_02_23_HUMAN.fasta"]
intersecting_peptides = find_intersecting_peptides(list_of_fastas)

print(f"Number of intersecting peptides: {len(intersecting_peptides)}")



Number of intersecting peptides: 23174


In [3]:
list_of_fastas = ["uniprotkb_organism_id_4932_2024_02_23_YEAST.fasta", "uniprotkb_organism_id_6239_2024_02_23_CAEEL.fasta", 
                                                    "uniprotkb_organism_id_83333_2024_02_23_ECOLI.fasta", "uniprotkb_organism_id_9606_2024_02_23_HUMAN.fasta"]
for combo in itertools.combinations(list_of_fastas, 2):
    print(combo)

('uniprotkb_organism_id_4932_2024_02_23_YEAST.fasta', 'uniprotkb_organism_id_6239_2024_02_23_CAEEL.fasta')
('uniprotkb_organism_id_4932_2024_02_23_YEAST.fasta', 'uniprotkb_organism_id_83333_2024_02_23_ECOLI.fasta')
('uniprotkb_organism_id_4932_2024_02_23_YEAST.fasta', 'uniprotkb_organism_id_9606_2024_02_23_HUMAN.fasta')
('uniprotkb_organism_id_6239_2024_02_23_CAEEL.fasta', 'uniprotkb_organism_id_83333_2024_02_23_ECOLI.fasta')
('uniprotkb_organism_id_6239_2024_02_23_CAEEL.fasta', 'uniprotkb_organism_id_9606_2024_02_23_HUMAN.fasta')
('uniprotkb_organism_id_83333_2024_02_23_ECOLI.fasta', 'uniprotkb_organism_id_9606_2024_02_23_HUMAN.fasta')


In [4]:
import pandas as pd

df_intersecting_peptides = pd.DataFrame(intersecting_peptides, columns=["peptide"])
display(df_intersecting_peptides)


df_intersecting_peptides.to_csv("intersecting_peptides_human_yeast_cael_ecoli.tsv", sep="\t", index=False)

Unnamed: 0,peptide
0,ELQLEK
1,RLLQMK
2,LKKDAGR
3,NIGISAHIDSGK
4,GRPSGR
...,...
23169,LPLSLR
23170,KGASRR
23171,MKVAFK
23172,EILDLR
