# IMMUNOPEPTIDOMICS EVIDENCE

This pipeline looks for immunopeptidomics evidence that supports the predicted neoantigens.
At least two datasets (projects) are required.

Based on: Chong et al. (2020), SPENCER and LncPep

In [None]:
import os
import pandas as pd
import glob
import numpy as np

# Root folder that holds one sub-directory per project.
DATADIR = "data"
# GENERAL is prefixed to project names in the cells below, both by plain
# concatenation (GENERAL + project, ${1}${project}) and through os.path.join.
# Joining with an empty component guarantees exactly one trailing separator,
# so both forms resolve to <DATADIR>/<project>.
GENERAL = os.path.join(DATADIR, "")

# Every sub-directory of DATADIR is treated as one project. os.scandir yields
# entries in arbitrary order, so sort to keep the processing order reproducible.
projects = sorted(f.name for f in os.scandir(DATADIR) if f.is_dir())

# dict_projects maps each project folder name (key) to the GEO id (value) used
# as column name in the summary tables. By default the folder names are assumed
# to already be the GEO ids; if they are not, replace the mapping, e.g.:
#dict_projects = {'liver_adjacent_totalRNA':'GSE101432', 'hcc_normal_totalRNA':'GSE77314','zou_hcc_RP_totalRNA':'GSE112705', 'GSE193567':'GSE193567'}
dict_projects = {name: name for name in projects}
GEO = dict_projects.values()
GEO_list = list(GEO)

In [None]:
# Project names flattened into one string, each name followed by a single
# blank, so the %%bash cells can receive them as one positional argument.
bash_list_projects = ''.join(str(item) + ' ' for item in projects)
print(bash_list_projects)

In [None]:
# Reference table from Chong et al. (2020), Supplementary Data 3; the first
# spreadsheet row is skipped so that the second row provides the header.
chong = pd.read_excel(
    "immunopeptidomes_evidences/Chong_etal_2020_SupData3_41467_2020_14968_MOESM5_ESM.xlsx",
    skiprows=1,
)
# Drop the last two characters of every transcript id.
# NOTE(review): presumably removes a ".N" version suffix - confirm that no
# id carries a two-digit version.
chong['Transcript_ID'] = chong['Transcript_ID'].str.slice(stop=-2)
# Peptide sequences that the predicted peptides are compared against.
to_compare_chong = chong['Sequence'].tolist()

In [None]:
# For every project, keep the predicted peptides whose sequence is also listed
# in the Chong et al. reference (to_compare_chong) and write two files:
#   peptides_id_chong.csv   - Peptide, transcript_id, gene_id, type
#                             (rows are de-duplicated per patient only)
#   only_peptides_chong.csv - the distinct peptide sequences, one per line
PEPTIDE_COLUMNS = ['Peptide', 'transcript_id', 'gene_id']

for project in projects:
    print(project)
    # os.path.join supplies the separator that plain concatenation would drop
    # when GENERAL has no trailing slash.
    DIR = os.path.join(GENERAL, project)

    # The patient list is read from patients.txt (tab-separated) or, when that
    # file is absent, from patients.csv; only a missing file triggers the
    # fallback so that other read errors are not hidden.
    try:
        PATIENTS = pd.read_csv(DIR + "/results/patients.txt", sep="\t", header=None)
    except FileNotFoundError:
        PATIENTS = pd.read_csv(DIR + "/results/patients.csv", header=None)

    patients_id = list(PATIENTS.iloc[:, 0])
    folders = ['noncanonical_CIPHER']
    OUTDIR = DIR + "/analysis/14_immunopeptidomics"
    # to_csv does not create missing directories.
    os.makedirs(OUTDIR, exist_ok=True)
    out = OUTDIR + "/peptides_id_chong.csv"
    out_set = OUTDIR + "/only_peptides_chong.csv"

    per_patient = []
    for f in folders:
        for p in patients_id:
            INDIR = DIR + "/analysis/11_PeptideBindingMHC/" + f + "/" + str(p)
            INFILE = pd.read_csv(INDIR + "/" + str(p) + "_peptides_GTEx.csv")
            shared = INFILE[INFILE['Peptide'].isin(to_compare_chong)]
            per_patient.append(shared[PEPTIDE_COLUMNS].drop_duplicates())

    # An empty frame that still carries the expected columns keeps the steps
    # below working when there are no patients or no matches at all.
    if per_patient:
        total_peptides = pd.concat(per_patient)
    else:
        total_peptides = pd.DataFrame(columns=PEPTIDE_COLUMNS)

    # Transcript ids containing 'ENST' are labelled annotated, all others novel.
    total_peptides['type'] = np.where(total_peptides['transcript_id'].str.contains('ENST'), 'annotated', 'novel')
    total_peptides.to_csv(out, index=False)

    # Sorted so the file is identical between runs (set order is not stable).
    exclusive = pd.DataFrame(sorted(total_peptides['Peptide'].dropna().unique()))
    exclusive.to_csv(out_set, index=False, header=False)


In [None]:
%%bash -s "$GENERAL" "$bash_list_projects"

# $1 = data root (GENERAL), $2 = blank-separated project names.
# $2 is left unquoted on purpose so the shell splits it into one word per project.
for project in $2; do
    echo "$project"
    # Count identical rows of the per-project table; tail skips the CSV header.
    # The explicit "/" keeps the path valid when $1 has no trailing slash.
    tail -n +2 "${1}/${project}/analysis/14_immunopeptidomics/peptides_id_chong.csv" | sort | uniq -c
    # Alternative: count by peptide sequence (first column) only.
    #tail -n +2 "${1}/${project}/analysis/14_immunopeptidomics/peptides_id_chong.csv" | cut -d, -f1 | sort | uniq -c
done


**SPENCER**

In [None]:
# SPENCER immunogenic peptide table (tab-separated); its `sequence` column is
# the list the predicted peptides are compared against.
spencer = pd.read_csv(
    "immunopeptidomes_evidences/SPENCER_Immunogenic_peptide_info.txt",
    sep="\t",
)
spencer_to_compare = spencer['sequence'].tolist()

In [None]:
# For every project, keep the predicted peptides whose sequence is also listed
# in the SPENCER reference (spencer_to_compare) and write two files:
#   peptides_id_spencer.csv   - Peptide, transcript_id, gene_id, type
#                               (rows are de-duplicated per patient only)
#   only_peptides_spencer.csv - the distinct peptide sequences, one per line
PEPTIDE_COLUMNS = ['Peptide', 'transcript_id', 'gene_id']

for project in projects:
    print(project)
    # os.path.join supplies the separator that plain concatenation would drop
    # when GENERAL has no trailing slash.
    DIR = os.path.join(GENERAL, project)

    # The patient list is read from patients.txt (tab-separated) or, when that
    # file is absent, from patients.csv; only a missing file triggers the
    # fallback so that other read errors are not hidden.
    try:
        PATIENTS = pd.read_csv(DIR + "/results/patients.txt", sep="\t", header=None)
    except FileNotFoundError:
        PATIENTS = pd.read_csv(DIR + "/results/patients.csv", header=None)

    patients_id = list(PATIENTS.iloc[:, 0])
    folders = ['noncanonical_CIPHER']
    OUTDIR = DIR + "/analysis/14_immunopeptidomics"
    # to_csv does not create missing directories.
    os.makedirs(OUTDIR, exist_ok=True)
    out = OUTDIR + "/peptides_id_spencer.csv"
    out_set = OUTDIR + "/only_peptides_spencer.csv"

    per_patient = []
    for f in folders:
        for p in patients_id:
            INDIR = DIR + "/analysis/11_PeptideBindingMHC/" + f + "/" + str(p)
            INFILE = pd.read_csv(INDIR + "/" + str(p) + "_peptides_GTEx.csv")
            shared = INFILE[INFILE['Peptide'].isin(spencer_to_compare)]
            per_patient.append(shared[PEPTIDE_COLUMNS].drop_duplicates())

    # An empty frame that still carries the expected columns keeps the steps
    # below working when there are no patients or no matches at all.
    if per_patient:
        total_peptides = pd.concat(per_patient)
    else:
        total_peptides = pd.DataFrame(columns=PEPTIDE_COLUMNS)

    # Transcript ids containing 'ENST' are labelled annotated, all others novel.
    total_peptides['type'] = np.where(total_peptides['transcript_id'].str.contains('ENST'), 'annotated', 'novel')
    total_peptides.to_csv(out, index=False)

    # Sorted so the file is identical between runs (set order is not stable).
    exclusive = pd.DataFrame(sorted(total_peptides['Peptide'].dropna().unique()))
    exclusive.to_csv(out_set, index=False, header=False)


In [None]:
%%bash -s "$GENERAL" "$bash_list_projects"

# $1 = data root (GENERAL), $2 = blank-separated project names.
# $2 is left unquoted on purpose so the shell splits it into one word per project.
for project in $2; do
    echo "$project"
    # Count identical rows of the per-project table; tail skips the CSV header.
    # The explicit "/" keeps the path valid when $1 has no trailing slash.
    tail -n +2 "${1}/${project}/analysis/14_immunopeptidomics/peptides_id_spencer.csv" | sort | uniq -c
    # Alternative: count by peptide sequence (first column) only.
    #tail -n +2 "${1}/${project}/analysis/14_immunopeptidomics/peptides_id_spencer.csv" | cut -d, -f1 | sort | uniq -c
done


**LncPep**

In [None]:
%%bash

# Unpack the LncPep peptide table, listing each extracted member.
# NOTE(review): members are extracted relative to the current directory, while
# the next cell reads immunopeptidomes_evidences/pep.info.human.txt - confirm
# the archive stores the file under that path.
tar --extract --verbose --file=immunopeptidomes_evidences/pep.info.human.txt.tar

In [None]:
# LncPep peptide table (tab-separated); its `sequence` column is the list the
# predicted peptides are searched in.
lncpep = pd.read_csv(
    "immunopeptidomes_evidences/pep.info.human.txt",
    sep="\t",
)
lncpep_to_compare = lncpep['sequence'].tolist()

In [None]:
# For every project, keep the predicted peptides that occur as a substring of
# at least one LncPep sequence (lncpep_to_compare) and write two files:
#   peptides_id_lncpep.csv   - Peptide, transcript_id, gene_id, type
#                              (rows are de-duplicated per patient only)
#   only_peptides_lncpep.csv - the distinct peptide sequences, one per line
PEPTIDE_COLUMNS = ['Peptide', 'transcript_id', 'gene_id']

# All reference sequences joined into one newline-separated text, built once.
# A single substring test against it replaces the loop over every sequence;
# the newline separator cannot occur inside a peptide, so a hit never spans
# two sequences. Non-string entries (e.g. missing values) are left out because
# a substring test against them raises TypeError.
lncpep_text = "\n".join(seq for seq in lncpep_to_compare if isinstance(seq, str))

for project in projects:
    print(project)
    # os.path.join supplies the separator that plain concatenation would drop
    # when GENERAL has no trailing slash.
    DIR = os.path.join(GENERAL, project)

    # The patient list is read from patients.txt (tab-separated) or, when that
    # file is absent, from patients.csv; only a missing file triggers the
    # fallback so that other read errors are not hidden.
    try:
        PATIENTS = pd.read_csv(DIR + "/results/patients.txt", sep="\t", header=None)
    except FileNotFoundError:
        PATIENTS = pd.read_csv(DIR + "/results/patients.csv", header=None)

    patients_id = list(PATIENTS.iloc[:, 0])
    folders = ['noncanonical_CIPHER']
    OUTDIR = DIR + "/analysis/14_immunopeptidomics"
    # to_csv does not create missing directories.
    os.makedirs(OUTDIR, exist_ok=True)
    out = OUTDIR + "/peptides_id_lncpep.csv"
    out_set = OUTDIR + "/only_peptides_lncpep.csv"

    per_patient = []
    for f in folders:
        for p in patients_id:
            print(p)
            INDIR = DIR + "/analysis/11_PeptideBindingMHC/" + f + "/" + str(p)
            INFILE = pd.read_csv(INDIR + "/" + str(p) + "_peptides_GTEx.csv")

            # Test each distinct peptide once, then select its rows in one go.
            # Rows come out in INFILE order rather than in reference order.
            peptides = INFILE['Peptide'].dropna().unique()
            found = [pep for pep in peptides if isinstance(pep, str) and pep in lncpep_text]
            shared = INFILE[INFILE['Peptide'].isin(found)]
            per_patient.append(shared[PEPTIDE_COLUMNS].drop_duplicates())

    # An empty frame that still carries the expected columns keeps the steps
    # below working when there are no patients or no matches at all.
    if per_patient:
        total_peptides = pd.concat(per_patient)
    else:
        total_peptides = pd.DataFrame(columns=PEPTIDE_COLUMNS)

    # Transcript ids containing 'ENST' are labelled annotated, all others novel.
    total_peptides['type'] = np.where(total_peptides['transcript_id'].str.contains('ENST'), 'annotated', 'novel')
    total_peptides.to_csv(out, index=False)

    # Sorted so the file is identical between runs (set order is not stable).
    exclusive = pd.DataFrame(sorted(total_peptides['Peptide'].dropna().unique()))
    exclusive.to_csv(out_set, index=False, header=False)


In [None]:
%%bash -s "$GENERAL" "$bash_list_projects"

# $1 = data root (GENERAL), $2 = blank-separated project names.
# $2 is left unquoted on purpose so the shell splits it into one word per project.
for project in $2; do
    echo "${project}"
    # Alternative: count identical full rows instead of peptides.
    #tail -n +2 "${1}/${project}/analysis/14_immunopeptidomics/peptides_id_lncpep.csv" | sort | uniq -c
    # Count rows per peptide sequence (first CSV column); tail skips the header.
    # Paths are quoted so that directories containing blanks still work.
    tail -n +2 "${1}/${project}/analysis/14_immunopeptidomics/peptides_id_lncpep.csv" | cut -d, -f1 | sort | uniq -c
done

Create table: peptide | spencer/chong/lncpep | novel/annotated | patients project 1 | patients project 2 | patients project 3

In [None]:
# Collect the Chong et al. matches of all projects in one table. Each project
# adds a column named after its GEO id (dict_projects[project]) that holds the
# number of rows sharing the same (Peptide, transcript_id) pair in that
# project's peptides_id_chong.csv.
chong_tables = []

for project in projects:
    # os.path.join supplies the separator that plain concatenation would drop
    # when GENERAL has no trailing slash.
    DIR = os.path.join(GENERAL, project)
    INDIR = DIR + "/analysis/14_immunopeptidomics"
    # Glob with the directory in the pattern instead of os.chdir: changing the
    # working directory would invalidate every relative path used afterwards.
    for file in sorted(glob.glob(os.path.join(INDIR, "peptides_id_*csv"))):
        if 'chong' in os.path.basename(file):
            # A separate name keeps the reference table `chong` intact.
            evidence = pd.read_csv(file)
            # Count column first, then collapse the now identical rows.
            evidence[dict_projects[project]] = evidence.groupby(['Peptide', 'transcript_id']).Peptide.transform('size')
            evidence = evidence.drop_duplicates()
            chong_tables.append(evidence)

total_chong = pd.concat(chong_tables) if chong_tables else pd.DataFrame()
# A project without matches file still gets its count column.
for geo in GEO_list:
    if geo not in total_chong.columns:
        total_chong[geo] = 0
# Rows of one project have no value in the other projects' columns.
total_chong = total_chong.fillna(0)
total_chong[GEO_list] = total_chong[GEO_list].astype(int)
total_chong['evidence_source'] = 'Chong et al.'

In [None]:
# Save the Chong et al. evidence table under GENERAL, then display it.
total_chong.to_csv(
    os.path.join(GENERAL, "chong_evidence_peptides.csv"),
    index=False,
)
total_chong

In [None]:
# Collect the SPENCER matches of all projects in one table. Each project adds
# a column named after its GEO id (dict_projects[project]) that holds the
# number of rows sharing the same (Peptide, transcript_id) pair in that
# project's peptides_id_spencer.csv.
spencer_tables = []

for project in projects:
    # os.path.join supplies the separator that plain concatenation would drop
    # when GENERAL has no trailing slash.
    DIR = os.path.join(GENERAL, project)
    INDIR = DIR + "/analysis/14_immunopeptidomics"
    # Glob with the directory in the pattern instead of os.chdir: changing the
    # working directory would invalidate every relative path used afterwards.
    for file in sorted(glob.glob(os.path.join(INDIR, "peptides_id_*csv"))):
        if 'spencer' in os.path.basename(file):
            # A separate name keeps the reference table `spencer` intact.
            evidence = pd.read_csv(file)
            # Count column first, then collapse the now identical rows.
            evidence[dict_projects[project]] = evidence.groupby(['Peptide', 'transcript_id']).Peptide.transform('size')
            evidence = evidence.drop_duplicates()
            spencer_tables.append(evidence)

total_spencer = pd.concat(spencer_tables) if spencer_tables else pd.DataFrame()
# A project without matches file still gets its count column.
for geo in GEO_list:
    if geo not in total_spencer.columns:
        total_spencer[geo] = 0
# Rows of one project have no value in the other projects' columns.
total_spencer = total_spencer.fillna(0)
total_spencer[GEO_list] = total_spencer[GEO_list].astype(int)
total_spencer['evidence_source'] = 'SPENCER database'

In [None]:
# Save the SPENCER evidence table under GENERAL, then display it.
total_spencer.to_csv(
    os.path.join(GENERAL, "spencer_evidence_peptides.csv"),
    index=False,
)
total_spencer

In [None]:
# Collect the LncPep matches of all projects in one table. Each project adds
# a column named after its GEO id (dict_projects[project]) that holds the
# number of rows sharing the same (Peptide, transcript_id) pair in that
# project's peptides_id_lncpep.csv.
lncpep_tables = []

for project in projects:
    # os.path.join supplies the separator that plain concatenation would drop
    # when GENERAL has no trailing slash.
    DIR = os.path.join(GENERAL, project)
    INDIR = DIR + "/analysis/14_immunopeptidomics"
    # Glob with the directory in the pattern instead of os.chdir: changing the
    # working directory would invalidate every relative path used afterwards.
    for file in sorted(glob.glob(os.path.join(INDIR, "peptides_id_*csv"))):
        if 'lncpep' in os.path.basename(file):
            # A separate name keeps the reference table `lncpep` intact.
            evidence = pd.read_csv(file)
            # Count column first, then collapse the now identical rows.
            evidence[dict_projects[project]] = evidence.groupby(['Peptide', 'transcript_id']).Peptide.transform('size')
            evidence = evidence.drop_duplicates()
            lncpep_tables.append(evidence)

total_lncpep = pd.concat(lncpep_tables) if lncpep_tables else pd.DataFrame()
# A project without matches file still gets its count column.
for geo in GEO_list:
    if geo not in total_lncpep.columns:
        total_lncpep[geo] = 0
# Rows of one project have no value in the other projects' columns.
total_lncpep = total_lncpep.fillna(0)
total_lncpep[GEO_list] = total_lncpep[GEO_list].astype(int)
total_lncpep['evidence_source'] = 'LncPep database'

In [None]:
# Save the LncPep evidence table under GENERAL, then display it.
total_lncpep.to_csv(
    os.path.join(GENERAL, "lncpep_evidence_peptides.csv"),
    index=False,
)
total_lncpep

In [None]:
# Stack the three evidence tables (SPENCER, Chong et al., LncPep) and order the
# rows by peptide sequence before saving and displaying the combined table.
table = pd.concat([total_spencer, total_chong, total_lncpep]).sort_values('Peptide')
# Optional post-processing, currently disabled:
#table = table.drop_duplicates(subset=['Peptide', 'type'])
#table.drop('ID', inplace=True, axis=1)
table.to_csv(
    os.path.join(GENERAL, "peptides_with_immunopeptidomics_evidence.csv"),
    index=False,
)
table