In [1]:
# import packages
import os, subprocess
import numpy as np
import pandas as pd
import glob
import sys
# set the NETMHCpan environment variable
os.environ['NETMHCpan'] = '/scratch/prj/kordastilab_neoepitope/gffutil/netMHCpan-4.1/Linux_x86_64'

In [3]:
def ProcessHLA(file_path):
    """Read a whitespace-delimited HLA table and return its first column.

    The file is parsed without a header row; its two columns are labelled
    ``HLA`` and ``n``.

    Parameters
    ----------
    file_path : str or path-like
        Path of the HLA table to read.

    Returns
    -------
    list
        Values of the ``HLA`` column, in file order.
    """
    # sep=r'\s+' is the supported spelling of the deprecated
    # delim_whitespace=True option.
    hla_table = pd.read_csv(file_path, sep=r'\s+', header=None, names=["HLA", "n"])
    return hla_table["HLA"].tolist()

In [16]:
def processPeps(hd_pep, hd_output_path, dis_output_path):
    """Write two FASTA files from the rows of ``hd_pep`` whose column 6 is 'Y'.

    For every kept row, a record headed by column 1 is written to both files.
    Column 5 decides which sequence column goes where: when it is 'Inc',
    column 2 goes to ``hd_output_path`` and column 3 to ``dis_output_path``;
    for any other value the two are swapped.

    Parameters
    ----------
    hd_pep : numpy.ndarray
        2-D string array with at least seven columns. A non-empty 1-D array
        is treated as a single row (``np.loadtxt`` returns a 1-D array for a
        one-line file).
    hd_output_path : str
        FASTA file to (over)write with the "hd" sequences.
    dis_output_path : str
        FASTA file to (over)write with the "dis" sequences.

    Returns
    -------
    None
    """
    rows = hd_pep
    # Promote a single record to a one-row table so the column filter works.
    if isinstance(rows, np.ndarray) and rows.ndim == 1 and rows.size:
        rows = rows.reshape(1, -1)
    rows = rows[rows[:, 6] == 'Y']

    with open(hd_output_path, 'w') as f_hd, open(dis_output_path, 'w') as f_dis:
        for row in rows:
            gene_name = row[1]
            event_type = row[5]  # named event_type to avoid shadowing builtin `type`

            if event_type == 'Inc':
                dis_seq, hd_seq = row[3], row[2]
            else:
                dis_seq, hd_seq = row[2], row[3]

            f_hd.write(f'>{gene_name}\n')
            f_hd.write(f'{hd_seq}\n')
            f_dis.write(f'>{gene_name}\n')
            f_dis.write(f'{dis_seq}\n')

    return

In [27]:
def processSEPeps(hd_pep, hd_output_path, dis_output_path):
    """Write two FASTA files from the rows of ``hd_pep`` whose column 6 is 'Y'.

    Same layout as ``processPeps`` but with the opposite column assignment:
    when column 5 is 'Inc', column 3 goes to ``hd_output_path`` and column 2
    to ``dis_output_path``; for any other value the two are swapped. Each
    record is headed by column 1.

    Parameters
    ----------
    hd_pep : numpy.ndarray
        2-D string array with at least seven columns. A non-empty 1-D array
        is treated as a single row (``np.loadtxt`` returns a 1-D array for a
        one-line file).
    hd_output_path : str
        FASTA file to (over)write with the "hd" sequences.
    dis_output_path : str
        FASTA file to (over)write with the "dis" sequences.

    Returns
    -------
    None
    """
    rows = hd_pep
    # Promote a single record to a one-row table so the column filter works.
    if isinstance(rows, np.ndarray) and rows.ndim == 1 and rows.size:
        rows = rows.reshape(1, -1)
    rows = rows[rows[:, 6] == 'Y']

    with open(hd_output_path, 'w') as f_hd, open(dis_output_path, 'w') as f_dis:
        for row in rows:
            gene_name = row[1]
            event_type = row[5]  # named event_type to avoid shadowing builtin `type`

            if event_type == 'Inc':
                dis_seq, hd_seq = row[2], row[3]
            else:
                dis_seq, hd_seq = row[3], row[2]

            f_hd.write(f'>{gene_name}\n')
            f_hd.write(f'{hd_seq}\n')
            f_dis.write(f'>{gene_name}\n')
            f_dis.write(f'{dis_seq}\n')

    return

In [7]:
def ProcessSeq(dis_pep,pep_path):
    """Write a FASTA file from the rows of ``dis_pep`` whose column 6 is 'N'.

    Each kept row becomes one record headed by column 1. The sequence is
    column 3 when column 5 is 'Inc' and column 2 otherwise.

    Parameters
    ----------
    dis_pep : numpy.ndarray or iterable of rows
        Peptide table; each row must be indexable up to position 6. A
        non-empty 1-D string array is treated as a single row
        (``np.loadtxt`` returns a 1-D array for a one-line file).
    pep_path : str
        FASTA file to (over)write.

    Returns
    -------
    None
    """
    rows = dis_pep
    # Iterating a 1-D string array would yield individual fields, and
    # indexing those yields characters; wrap the record as a single row.
    if (isinstance(rows, np.ndarray) and rows.ndim == 1
            and rows.dtype.kind in 'US' and rows.size):
        rows = rows.reshape(1, -1)

    with open(pep_path, 'w') as f_dis:
        for row in rows:
            if row[6] != 'N':
                continue
            gene_name = row[1]
            event_type = row[5]  # named event_type to avoid shadowing builtin `type`
            sequence = row[3] if event_type == 'Inc' else row[2]
            f_dis.write(f'>{gene_name}\n')
            f_dis.write(f'{sequence}\n')
    return

In [26]:
def ProcessSESeq(dis_pep,pep_path):
    """Write a FASTA file from the rows of ``dis_pep`` whose column 6 is 'N'.

    Same layout as ``ProcessSeq`` but with the opposite column choice: the
    sequence is column 2 when column 5 is 'Inc' and column 3 otherwise. Each
    record is headed by column 1.

    Parameters
    ----------
    dis_pep : numpy.ndarray or iterable of rows
        Peptide table; each row must be indexable up to position 6. A
        non-empty 1-D string array is treated as a single row
        (``np.loadtxt`` returns a 1-D array for a one-line file).
    pep_path : str
        FASTA file to (over)write.

    Returns
    -------
    None
    """
    rows = dis_pep
    # Iterating a 1-D string array would yield individual fields, and
    # indexing those yields characters; wrap the record as a single row.
    if (isinstance(rows, np.ndarray) and rows.ndim == 1
            and rows.dtype.kind in 'US' and rows.size):
        rows = rows.reshape(1, -1)

    with open(pep_path, 'w') as f_dis:
        for row in rows:
            if row[6] != 'N':
                continue
            gene_name = row[1]
            event_type = row[5]  # named event_type to avoid shadowing builtin `type`
            sequence = row[2] if event_type == 'Inc' else row[3]
            f_dis.write(f'>{gene_name}\n')
            f_dis.write(f'{sequence}\n')
    return

In [11]:
def runNetMHCpan(netMHCpan_path, peptide_file, hlaalleles, output):
    """Run netMHCpan on a FASTA file and save its spreadsheet and text output.

    The executable is invoked as
    ``<netMHCpan_path> -BA -f <peptide_file> -a <hlaalleles> -s -xls
    -xlsfile <output>.xls`` and its standard output is written to
    ``<output>.txt``.

    Parameters
    ----------
    netMHCpan_path : str
        Path of the netMHCpan executable.
    peptide_file : str
        FASTA file passed with ``-f``.
    hlaalleles : str
        Allele list passed with ``-a`` as one argument (callers in this
        notebook join the alleles with commas).
    output : str
        Output path prefix; ``.xls`` and ``.txt`` are appended to it.

    Returns
    -------
    None

    Raises
    ------
    subprocess.CalledProcessError
        If the executable exits with a non-zero status.
    FileNotFoundError
        If the executable cannot be found.
    """
    xls_output_file = f'{output}.xls'
    txt_output_file = f'{output}.txt'
    # Pass an argument list instead of a shell string: paths containing
    # spaces or shell metacharacters reach netMHCpan unchanged and nothing
    # is interpreted by a shell.
    netMHCpan_command = [
        str(netMHCpan_path), '-BA',
        '-f', str(peptide_file),
        '-a', str(hlaalleles),
        '-s', '-xls', '-xlsfile', xls_output_file,
    ]
    # Equivalent of the shell redirection `> <output>.txt`.
    with open(txt_output_file, 'w') as txt_out:
        subprocess.run(netMHCpan_command, stdout=txt_out, check=True)
    return

In [10]:
def postProcessing(dis_output,hlaalleles,output):
    """Split a netMHCpan ``.xls`` result table into one Excel sheet per allele.

    ``{dis_output}.xls`` is read as a tab-separated table whose second line
    holds the column names. For the allele at position ``i`` of
    ``hlaalleles`` a sheet is written containing the first three columns plus
    the six columns starting at ``3 + 6 * i``. Sheets are named after the
    allele with ':' replaced by '-'. The workbook is saved as
    ``{output}final_peptides.xlsx``.

    Parameters
    ----------
    dis_output : str
        Path prefix of the netMHCpan result (without the ``.xls`` suffix).
    hlaalleles : list of str
        Allele names; assumed to be in the same order as the per-allele
        column groups of the table (TODO confirm against the netMHCpan run).
    output : str
        Path prefix of the workbook to write.

    Returns
    -------
    None
    """
    columns_per_sheet = 6
    results = pd.read_csv(f'{dis_output}.xls', delimiter='\t', header=1)
    shared_columns = results.columns[0:3].tolist()

    # The context manager saves and closes the workbook even when writing a
    # sheet fails part-way through.
    with pd.ExcelWriter(f'{output}final_peptides.xlsx', engine='xlsxwriter') as writer:
        # enumerate gives each allele's own position, instead of a
        # list.index lookup per allele.
        for position, allele in enumerate(hlaalleles):
            sheet_name = allele.replace(':', '-')
            start_col = position * columns_per_sheet + 3
            end_col = start_col + columns_per_sheet
            columns_to_include = shared_columns + results.columns[start_col:end_col].tolist()
            results[columns_to_include].to_excel(writer, sheet_name=sheet_name, index=False)
    return

In [19]:
def postProcessing2(hd_output,dis_output,hlaalleles,output):
    """Keep peptides absent from the "hd" result and write one sheet per allele.

    Both ``{hd_output}.xls`` and ``{dis_output}.xls`` are read as
    tab-separated tables whose second line holds the column names. Rows of
    the "dis" table whose ``Peptide`` value also occurs in the "hd" table are
    dropped. For the allele at position ``i`` of ``hlaalleles`` a sheet is
    written containing the first three columns plus the six columns starting
    at ``3 + 6 * i``. Sheets are named after the allele with ':' replaced by
    '-'. The workbook is saved as ``{output}final_peptides.xlsx``.

    Parameters
    ----------
    hd_output : str
        Path prefix of the "hd" netMHCpan result (without ``.xls``).
    dis_output : str
        Path prefix of the "dis" netMHCpan result (without ``.xls``).
    hlaalleles : list of str
        Allele names; assumed to be in the same order as the per-allele
        column groups of the table (TODO confirm against the netMHCpan run).
    output : str
        Path prefix of the workbook to write.

    Returns
    -------
    None
    """
    columns_per_sheet = 6
    hd_results = pd.read_csv(f'{hd_output}.xls', delimiter='\t', header=1)
    dis_results = pd.read_csv(f'{dis_output}.xls', delimiter='\t', header=1)
    dis_only = dis_results[~dis_results['Peptide'].isin(hd_results['Peptide'])]
    shared_columns = dis_only.columns[0:3].tolist()

    # The context manager saves and closes the workbook even when writing a
    # sheet fails part-way through.
    with pd.ExcelWriter(f'{output}final_peptides.xlsx', engine='xlsxwriter') as writer:
        # enumerate gives each allele's own position, instead of a
        # list.index lookup per allele.
        for position, allele in enumerate(hlaalleles):
            sheet_name = allele.replace(':', '-')
            start_col = position * columns_per_sheet + 3
            end_col = start_col + columns_per_sheet
            columns_to_include = shared_columns + dis_only.columns[start_col:end_col].tolist()
            dis_only[columns_to_include].to_excel(writer, sheet_name=sheet_name, index=False)
    return

In [None]:
############# RI ###############
# RI peptides with the MF allele list: write the FASTA, run netMHCpan, split the result per allele.
# This cell also defines output_dir, netMHCpan_path, hla_MF and Hla_MF, which later cells reuse.
pep_path = "MPN_neoepitope/neoj_peptides/RI_peptides_MPN.txt"
all_pep_ri = np.loadtxt(pep_path, dtype=str, delimiter='\t')
file_path_MF = 'MPN_neoepitope/HLA_MPN/MF_HLA_PLT_ftr.txt'
hla_MF = ProcessHLA(file_path_MF)
output_dir = 'MPN_neoepitope/output/'
dis_pep_path = f'{output_dir}ri_mf_pep.fsa'
# every row of the table is passed; ProcessSeq itself keeps only rows whose column 6 is 'N'
ProcessSeq(all_pep_ri,dis_pep_path)
# run netMHCpan
# relative path: resolved against the kernel's current working directory
netMHCpan_path = './netMHCpan'
# same value as assigned above
output_dir = 'MPN_neoepitope/output/'
# comma-joined allele string for the -a argument; hla_MF stays a list for postProcessing
Hla_MF=','.join(hla_MF)
dis_output = f'{output_dir}MF_ri_pep'
# same path as dis_pep_path (the FASTA written above)
pv_pep = f'{output_dir}ri_mf_pep.fsa'
runNetMHCpan(netMHCpan_path, pv_pep, Hla_MF, dis_output)
# postprocessing of output files
# same path as dis_output
mf_ri = 'MPN_neoepitope/output/MF_ri_pep'
output = 'MPN_neoepitope/output/MF_ri'
postProcessing(mf_ri,hla_MF,output)


In [None]:
# def main():
#     all_pep_ri = np.loadtxt(pep_path, dtype=str, delimiter='\t')
#     hla_MF = ProcessHLA(file_path_MF)
#     dis_pep_path = f'{output_dir}ri_mf_pep.fsa'
#     ProcessSeq(all_pep_ri,dis_pep_path)
#     Hla_MF=','.join(hla_MF)
#     dis_output = f'{output_dir}MF_ri_pep'
#     pv_pep = f'{output_dir}ri_mf_pep.fsa'
#     runNetMHCpan(netMHCpan_path, pv_pep, Hla_MF, dis_output)
#     postProcessing(mf_ri,hla_MF,output)
    

In [None]:
# if __name__ == '__main__':
    
#     pep_path = "MPN_neoepitope/neoj_peptides/RI_peptides_MPN.txt"
#     file_path_MF = 'MPN_neoepitope/HLA_MPN/MF_HLA_PLT_ftr.txt'
#     output_dir = 'MPN_neoepitope/output/'
#     netMHCpan_path = './netMHCpan'
#     mf_ri = 'MPN_neoepitope/output/MF_ri_pep'
#     output = 'MPN_neoepitope/output/MF_ri'
#     main()

In [5]:
####### A5SS ##########
# Uses output_dir, netMHCpan_path, Hla_MF and hla_MF, which are assigned in the RI cell.
pep_path = "MPN_neoepitope/neoj_peptides/A5SS_peptides_MPN.txt"
all_pep_a5 = np.loadtxt(pep_path, dtype=str, delimiter='\t')
dis_pep_path = f'{output_dir}a5_mf_pep.fsa'
# all_pep_a5 is indexed with single integers, i.e. as one 1-D record; the
# recorded output of this cell shows a single 7-field row.
# NOTE(review): the header is field 0 and the sequence is always field 3,
# whereas ProcessSeq uses field 1 as header and picks field 2 or 3 from
# field 5 — confirm this difference is intended.
with open(dis_pep_path, 'w') as f_dis:
    f_dis.write(f'>{all_pep_a5[0]}\n')
    f_dis.write(f'{all_pep_a5[3]}\n')
dis_output = f'{output_dir}MF_a5ss_pep'
# same path as dis_pep_path
mf_pep = f'{output_dir}a5_mf_pep.fsa'
runNetMHCpan(netMHCpan_path, mf_pep, Hla_MF, dis_output)
output = 'MPN_neoepitope/output/MF_a5ss'
postProcessing(dis_output,hla_MF,output)

['ENSG00000105640' 'RPL18A'
 'LREYKVFEKSPLRVKNFGIWLRYDSRSGTHNMYREYRDLTTAGAVTQC'
 'LREYKVVGRCLPTPKCHTPPLYRMRIFAPNHVVAKSRFWYFVSQLKKMKKSSGEIVYCGQVFEKSPLRVKNFGIWLRYDSRSGTHNMYREYRDLTTAGAVTQC'
 'MF' 'Ex' 'N']


In [17]:
###### A3SS ##########
# Load the A3SS peptide table and split its rows by the label in column 4 ('PV' / 'MF').
pep_path = "MPN_neoepitope/neoj_peptides/A3SS_peptides_MPN.txt"
all_pep_a3 = np.loadtxt(pep_path, dtype=str, delimiter='\t')
PV_pep_a3 = all_pep_a3[all_pep_a3[:,4] == 'PV']
MF_pep_a3 = all_pep_a3[all_pep_a3[:,4] == 'MF']
# one FASTA per label; ProcessSeq keeps only rows whose column 6 is 'N'
pv_a3_pep_path = f'{output_dir}a3_pv_pep.fsa'
mf_a3_pep_path = f'{output_dir}a3_mf_pep.fsa'
ProcessSeq(PV_pep_a3,pv_a3_pep_path)
ProcessSeq(MF_pep_a3,mf_a3_pep_path)
dis_mf_output = f'{output_dir}MF_a3ss_pep'
dis_pv_output = f'{output_dir}PV_a3ss_pep'
# PV allele list is loaded here; output_dir, netMHCpan_path, Hla_MF and hla_MF come from the RI cell
file_path_PV = 'MPN_neoepitope/HLA_MPN/PV_HLA_PLT_ftr.txt'
hla_PV = ProcessHLA(file_path_PV)
Hla_PV=','.join(hla_PV)
runNetMHCpan(netMHCpan_path, pv_a3_pep_path, Hla_PV, dis_pv_output)
runNetMHCpan(netMHCpan_path, mf_a3_pep_path, Hla_MF, dis_mf_output)
# split each netMHCpan result into one sheet per allele
mf_output = 'MPN_neoepitope/output/MF_a3ss'
pv_output = 'MPN_neoepitope/output/PV_a3ss'
postProcessing(dis_mf_output,hla_MF,mf_output)
postProcessing(dis_pv_output,hla_PV,pv_output)


In [18]:
# 'Y' peptides for PV and MF
# processPeps keeps rows whose column 6 is 'Y' and writes a pair of FASTA files (hd / dis) per label.
# PV_pep_a3 and MF_pep_a3 come from the A3SS loading cell.
PV_a3_hd_pep = f'{output_dir}PV_a3_hd_pep2.fsa'
PV_a3_dis_pep = f'{output_dir}PV_a3_dis_pep2.fsa'
processPeps(PV_pep_a3, PV_a3_hd_pep, PV_a3_dis_pep)
MF_a3_hd_pep = f'{output_dir}MF_a3_hd_pep2.fsa'
MF_a3_dis_pep = f'{output_dir}MF_a3_dis_pep2.fsa'
processPeps(MF_pep_a3, MF_a3_hd_pep, MF_a3_dis_pep)


In [None]:
# run netMHCpan for pv on both FASTA files written by processPeps
pv_a3_hd_output = f'{output_dir}pv_a3_hd_pep'
pv_a3_dis_output = f'{output_dir}pv_a3_dis_pep'
runNetMHCpan(netMHCpan_path, PV_a3_hd_pep, Hla_PV,pv_a3_hd_output)
runNetMHCpan(netMHCpan_path, PV_a3_dis_pep, Hla_PV,pv_a3_dis_output)
# postprocessing step to filter out normal proteome
# (postProcessing2 drops "dis" rows whose Peptide also appears in the "hd" result)
pv_final = 'MPN_neoepitope/output/pv_a32'
postProcessing2(pv_a3_hd_output,pv_a3_dis_output,hla_PV,pv_final)

In [None]:
# run netMHCpan for mf on both FASTA files written by processPeps
mf_a3_hd_output = f'{output_dir}mf_a3_hd_pep'
mf_a3_dis_output = f'{output_dir}mf_a3_dis_pep'
runNetMHCpan(netMHCpan_path, MF_a3_hd_pep, Hla_MF,mf_a3_hd_output)
runNetMHCpan(netMHCpan_path, MF_a3_dis_pep, Hla_MF,mf_a3_dis_output)
# postprocessing step to filter out normal proteome
# (postProcessing2 drops "dis" rows whose Peptide also appears in the "hd" result)
mf_final = 'MPN_neoepitope/output/mf_a32'
postProcessing2(mf_a3_hd_output,mf_a3_dis_output,hla_MF,mf_final)


In [22]:
####### MXE ##########
# Load the MXE peptide table and split its rows by the label in column 4.
pep_path = "MPN_neoepitope/neoj_peptides/MXE_peptides_MPN.txt"
all_pep_mxe = np.loadtxt(pep_path, dtype=str, delimiter='\t')
# boolean row masks; the names PV and MF are rebound again in the SE cells
PV = all_pep_mxe[:,4] == 'PV'
MF = all_pep_mxe[:,4] == 'MF'
PV_pep_mxe = all_pep_mxe[PV]
MF_pep_mxe = all_pep_mxe[MF]

In [None]:
# analysing differential ('Y') peptides between HD and disease
# processPeps keeps rows whose column 6 is 'Y' and writes a pair of FASTA files (hd / dis) per label.
PV_mxe_hd_pep = f'{output_dir}PV_mxe_hd_pep2.fsa'
PV_mxe_dis_pep = f'{output_dir}PV_mxe_dis_pep2.fsa'
processPeps(PV_pep_mxe, PV_mxe_hd_pep, PV_mxe_dis_pep)
MF_mxe_hd_pep = f'{output_dir}MF_mxe_hd_pep2.fsa'
MF_mxe_dis_pep = f'{output_dir}MF_mxe_dis_pep2.fsa'
processPeps(MF_pep_mxe, MF_mxe_hd_pep, MF_mxe_dis_pep)


In [None]:
# run netMHCpan in PV mxe(Y) events
# Hla_PV / hla_PV are assigned in the A3SS cell
pv_mxe_hd_output = f'{output_dir}pv_mxe_hd_pep'
pv_mxe_dis_output = f'{output_dir}pv_mxe_dis_pep'
runNetMHCpan(netMHCpan_path, PV_mxe_hd_pep, Hla_PV,pv_mxe_hd_output)
runNetMHCpan(netMHCpan_path, PV_mxe_dis_pep, Hla_PV,pv_mxe_dis_output)
# postprocessing pv mxe files
# pv_final is rebound here (it held the A3SS prefix before)
pv_final = 'MPN_neoepitope/output/pv_mxe'
postProcessing2(pv_mxe_hd_output,pv_mxe_dis_output,hla_PV,pv_final)

In [None]:
# run netMHCpan in MF mxe(Y) events
mf_mxe_hd_output = f'{output_dir}mf_mxe_hd_pep'
mf_mxe_dis_output = f'{output_dir}mf_mxe_dis_pep'
runNetMHCpan(netMHCpan_path, MF_mxe_hd_pep, Hla_MF,mf_mxe_hd_output)
runNetMHCpan(netMHCpan_path, MF_mxe_dis_pep, Hla_MF,mf_mxe_dis_output)
# postprocessing mf mxe files
# mf_final is rebound here (it held the A3SS prefix before)
mf_final = 'MPN_neoepitope/output/mf_mxe'
postProcessing2(mf_mxe_hd_output,mf_mxe_dis_output,hla_MF,mf_final)

In [None]:
# run netMHCpan in MF mxe(N) events
MF_pep_mxe_path = f'{output_dir}MF_mxe_pep2.fsa'
mf_mxe_output2 = f'{output_dir}mf_mxe_pep2'
# ProcessSeq keeps only rows whose column 6 is 'N'
ProcessSeq(MF_pep_mxe,MF_pep_mxe_path)
runNetMHCpan(netMHCpan_path, MF_pep_mxe_path, Hla_MF,mf_mxe_output2)
# postprocessing mf mxe(N) files
output_mf_mxe = 'MPN_neoepitope/output/MF_mxe2'
postProcessing(mf_mxe_output2,hla_MF,output_mf_mxe)

In [25]:
########### SE #############
# Load the SE peptide table and split its rows by the label in column 4 ('PV' / 'MF' / 'ET').
# Unlike the other inputs, this path has no directory prefix, so it is read from the kernel's working directory.
pep_path = "SE_peptides_MPN_1128.txt"
all_pep_se = np.loadtxt(pep_path, dtype=str, delimiter='\t')
# boolean row masks; PV and MF rebind the masks built in the MXE cell
PV = all_pep_se[:,4] == 'PV'
MF = all_pep_se[:,4] == 'MF'
ET = all_pep_se[:,4] == 'ET'
PV_pep_se = all_pep_se[PV]
MF_pep_se = all_pep_se[MF]
ET_pep_se = all_pep_se[ET]


In [None]:
# 'N' se peptides for ET and PV
# Relies on output_dir, netMHCpan_path (RI cell) and Hla_PV / hla_PV (A3SS cell).
pv_se_pep_path = f'{output_dir}se_pv_pep.fsa'
et_se_pep_path = f'{output_dir}se_et_pep.fsa'
# ProcessSESeq keeps only rows whose column 6 is 'N'
ProcessSESeq(PV_pep_se,pv_se_pep_path)
ProcessSESeq(ET_pep_se,et_se_pep_path)
dis_et_output = f'{output_dir}ET_se_pep'
dis_pv_output = f'{output_dir}PV_se_pep'
# ET allele list. The path is bound to file_path_ET: binding it to
# file_path_PV left file_path_ET undefined (NameError on the next line) and
# overwrote the PV path.
file_path_ET = 'MPN_neoepitope/HLA_MPN/ET_HLA_PLT_ftr.txt'
hla_ET = ProcessHLA(file_path_ET)
Hla_ET=','.join(hla_ET)
runNetMHCpan(netMHCpan_path, pv_se_pep_path, Hla_PV, dis_pv_output)
runNetMHCpan(netMHCpan_path, et_se_pep_path, Hla_ET, dis_et_output)

# postprocessing: split each netMHCpan result into one sheet per allele
output_pv_se = 'MPN_neoepitope/output/PV_SE'
output_et_se = 'MPN_neoepitope/output/ET_SE'
postProcessing(dis_pv_output,hla_PV,output_pv_se)
postProcessing(dis_et_output,hla_ET,output_et_se)

In [None]:
# 'Y' se peptides for MF
# processSEPeps keeps rows whose column 6 is 'Y' and writes a pair of FASTA files (hd / dis).
MF_se_hd_pep = f'{output_dir}MF_se_hd_pep2.fsa'
MF_se_dis_pep = f'{output_dir}MF_se_dis_pep2.fsa'
processSEPeps(MF_pep_se, MF_se_hd_pep, MF_se_dis_pep)

In [None]:
# run netMHCpan for the MF SE 'Y' FASTA pair, then keep peptides absent from the "hd" result
mf_se_hd_output = f'{output_dir}mf_se_hd_pep'
mf_se_dis_output = f'{output_dir}mf_se_dis_pep'
# same expressions as in the RI cell; hla_MF must already be defined
Hla_MF=','.join(hla_MF)
netMHCpan_path = './netMHCpan'
runNetMHCpan(netMHCpan_path, MF_se_hd_pep, Hla_MF,mf_se_hd_output)
runNetMHCpan(netMHCpan_path, MF_se_dis_pep, Hla_MF,mf_se_dis_output)
# postprocessing
mf_se_y_final = 'MPN_neoepitope/output/mf_se_y'
postProcessing2(mf_se_hd_output,mf_se_dis_output,hla_MF,mf_se_y_final)

In [None]:
# 'N' se peptides for MF
# Relies on netMHCpan_path, Hla_MF and hla_MF from earlier cells.
output_dir = 'MPN_neoepitope/output/'
mf_se_pep_path = f'{output_dir}mf_n_se_pep.fsa'
pep_path = "SE_peptides_MPN_1128.txt"
all_pep_se = np.loadtxt(pep_path, dtype=str, delimiter='\t')
MF = all_pep_se[:,4] == 'MF'
MF_pep_se = all_pep_se[MF]
# ProcessSESeq keeps only rows whose column 6 is 'N'
ProcessSESeq(MF_pep_se,mf_se_pep_path)
# Run netMHCpan on the FASTA written above. Without this step,
# postProcessing was fed dis_mf_output, which still pointed at the A3SS MF
# result, so A3SS data was written under the MF_SE_N name.
# NOTE(review): the 'MF_se_n_pep' prefix is a new name chosen here — rename
# it if an existing result file should be reused instead.
mf_se_n_output = f'{output_dir}MF_se_n_pep'
runNetMHCpan(netMHCpan_path, mf_se_pep_path, Hla_MF, mf_se_n_output)
# postprocessing
output_mf_se = 'MPN_neoepitope/output/MF_SE_N'
postProcessing(mf_se_n_output,hla_MF,output_mf_se)