In [1]:
# import packages 
import gffutils
from Bio import SeqIO 
import numpy as np
from Bio.Seq import Seq
import pandas as pd

In [2]:
# Create the gffutils database from the GTF file (should be the same one as used for the rMATS analysis).
# ":memory:" is passed as the database filename, so nothing is written to disk here and the
# database is rebuilt from the gzipped GTF each time this cell runs.
# NOTE(review): disable_infer_genes / disable_infer_transcripts presumably rely on the GTF already
# containing explicit gene and transcript lines — confirm against the annotation release in use.
fn1='Homo_sapiens.GRCh38.90.gtf.gz'
db = gffutils.create_db(fn1,":memory:",keep_order=True,disable_infer_genes=True, disable_infer_transcripts=True)

In [3]:
# Parse the reference FASTA and materialise every record into a list.
# getMXESeq later walks this list and matches each record by its `record.id`.
file_path = "Homo_sapiens.GRCh38.dna.primary_assembly.fa"
hg38_sequences = list(SeqIO.parse(file_path, "fasta"))

In [10]:
# Convert the CSV to a tab-separated txt file, then load the txt
def covReadCSVMXE(csv_path,txt_path):
    """Re-export an MXE event CSV as a tab-separated file and load it as strings.

    Parameters
    ----------
    csv_path : path of the CSV file to read.
    txt_path : path the tab-separated copy is written to (overwritten if present).

    Returns
    -------
    numpy.ndarray
        Two-dimensional string array with one row per data line of the file
        and 15 columns: file columns 2-13 followed by columns 20-22.
    """
    data = pd.read_csv(csv_path)
    data.to_csv(txt_path, sep='\t', index=False)
    columns_to_read = list(range(2, 14)) + list(range(20, 23))
    # ndmin=2 keeps the result two-dimensional when the file holds a single
    # event; without it loadtxt squeezes that row to 1-D and callers that
    # iterate "row by row" would iterate over individual strings instead.
    AS = np.loadtxt(txt_path, dtype=str, delimiter='\t', skiprows=1,
                    usecols=columns_to_read, ndmin=2)
    return AS

In [7]:
# Function: getMXEFrame
# Inputs: processed rMATS MXE events (DSSEs) and a gff database (!Note: the release version of the gff file, fasta file, and rMATS file should be the same!)
# Returns: structured array holding, per matched event, the anchoring CDS start/stop/frame, both target exon start/stop positions, and the opposite flanking exon start/stop
# Summary: retrieve the frame information from the gff file for translation

def getMXEFrame(mxe, db):
    """Attach a CDS frame to every MXE event that has a matching CDS.

    Parameters
    ----------
    mxe : iterable of rows; each row is indexed 0-14 as
        gene id, gene name, chromosome, strand, target exon 1 start/end,
        target exon 2 start/end, upstream exon start/end,
        downstream exon start/end, subtype, type, comp.
        Coordinates are converted with ``int``.
    db : object exposing ``children(gene_id, featuretype='CDS')`` that yields
        features with ``id``, ``start``, ``stop`` and ``frame`` attributes.

    Returns
    -------
    numpy.ndarray
        One-dimensional structured array with one record per event for which
        a CDS was found; events without a match are omitted.

    Notes
    -----
    For strand '+', the first CDS that ends at the upstream exon end and
    starts at or after ``UpES + 1`` is used, and the downstream exon is stored
    in ``Down_Start``/``Down_Stop``. For any other strand, the first CDS that
    starts at ``DownES + 1`` and stops at or before the downstream exon end is
    used, and the upstream exon is stored in ``Down_Start``/``Down_Stop``.
    """
    dtype = [('Gene_ID', 'U20'), ('strand', 'U10'), ('CDS_ID', 'U20'), ('CDS_Start', int), ('CDS_Stop', int), ('CDS_Frame', int),('T_Start1', int), ('T_Stop1', int),('T_Start2', int), ('T_Stop2', int),('Down_Start', int), ('Down_Stop', int), ('Chrom', 'U20'), ('Gene_name', 'U20'),
            ('subtype', 'U20'), ('type', 'U10'), ('comp', 'U20')]
    # Collect plain tuples and build the array once at the end; appending to a
    # numpy array inside the loop copies the whole array for every match.
    rows = []
    for event in mxe:
        gene_id = event[0]
        gene_name = event[1]
        chrom = event[2]
        strand = event[3]
        TES1 = int(event[4])
        TEE1 = int(event[5])
        TES2 = int(event[6])
        TEE2 = int(event[7])
        UpES = int(event[8])
        UpEE = int(event[9])
        DownES = int(event[10])
        DownEE = int(event[11])
        subtype = event[12]
        event_type = event[13]  # not named `type`, to avoid shadowing the builtin
        comp = event[14]

        on_plus = strand == '+'
        # The exon that is NOT used for the CDS lookup is what gets recorded
        # in the Down_Start/Down_Stop fields.
        other_flank = (DownES, DownEE) if on_plus else (UpES, UpEE)

        for cds in db.children(gene_id, featuretype='CDS'):
            if on_plus:
                # Exact exon match, or a CDS starting inside the exon.
                matched = cds.stop == UpEE and cds.start >= UpES + 1
            else:
                # Exact exon match, or a CDS stopping inside the exon.
                matched = cds.start == DownES + 1 and cds.stop <= DownEE
            if matched:
                rows.append((gene_id, strand, cds.id, cds.start, cds.stop, cds.frame,
                             TES1, TEE1, TES2, TEE2, other_flank[0], other_flank[1],
                             chrom, gene_name, subtype, event_type, comp))
                break  # only the first matching CDS is kept per event
    return np.array(rows, dtype=dtype)

In [13]:
# Function: getMXESeq
# Inputs: frame array produced by getMXEFrame and the hg38 reference records (!Note: the release version of the gff file, fasta file, and rMATS file should be the same!)
# Returns: structured array with the nucleotide sequences of both MXE isoforms per event
# Summary: retrieve the nucleotide sequence based on the coordinates (start and end position of each exon of the MXE events)
def getMXESeq(gene_frame_array, hg38_sequences):
    """Assemble the two exon-concatenated nucleotide sequences of each event.

    Parameters
    ----------
    gene_frame_array : iterable of records indexed as produced by getMXEFrame
        (0 gene id, 1 strand, 3/4 CDS start/stop, 5 frame, 6/7 target exon 1,
        8/9 target exon 2, 10/11 other flanking exon, 12 chromosome,
        13 gene name, 14 subtype, 15 type, 16 comp).
    hg38_sequences : iterable of records exposing ``id`` and a sliceable ``seq``.

    Returns
    -------
    numpy.ndarray
        One-dimensional structured array. For strand '+', ``Seq`` is
        first flank + target exon 2 + second flank and ``se_Seq`` uses target
        exon 1 instead. For strand '-', every piece is reverse-complemented
        and the target exons swap roles (``Seq`` uses target exon 1,
        ``se_Seq`` target exon 2). Events with any other strand, or whose
        chromosome is not among the records, produce no row. Sequences longer
        than 10000 characters are truncated by the fixed-width field.
    """
    # Define the data type for the structured array
    dtype = [('Gene_ID', 'U20'), ('CDS_Frame', int), ('Chrom', 'U20'), ('Seq', 'U10000'), ('se_Seq', 'U10000'), ('gene_name', 'U20'),('subtype', 'U20'),('type', 'U20'),('comp', 'U20')]  # Adjust the max sequence length as needed

    # Group the reference records by id once instead of scanning the whole
    # list for every event. Lists keep the one-row-per-matching-record
    # behaviour should an id ever occur more than once.
    records_by_id = {}
    for record in hg38_sequences:
        records_by_id.setdefault(record.id, []).append(record)

    # Rows are gathered in a list and converted once: each row carries two
    # U10000 fields, so growing the array with np.append is very costly.
    rows = []
    for event in gene_frame_array:
        gene = event[0]
        strand = event[1]
        UpES = event[3]
        UpEE = event[4]
        frame = event[5]
        TES1 = event[6]
        TEE1 = event[7]
        TES2 = event[8]
        TEE2 = event[9]
        DownES = event[10]
        DownEE = event[11]
        chrom = event[12]
        gene_name = event[13]
        subtype = event[14]
        event_type = event[15]  # not named `type`, to avoid shadowing the builtin
        comp = event[16]

        # Strip a leading "chr" only when it is there, so names that already
        # match the record ids are not mangled and silently dropped.
        chrom_id = chrom[3:] if chrom.startswith('chr') else chrom

        for record in records_by_id.get(chrom_id, ()):
            # The first flank comes from a CDS feature (start is shifted by
            # one before slicing); the other coordinates are sliced as given.
            up_seq = record.seq[UpES - 1:UpEE]
            t1_seq = record.seq[TES1:TEE1]
            t2_seq = record.seq[TES2:TEE2]
            down_seq = record.seq[DownES:DownEE]

            if strand == '+':
                ex_mxe_seq = str(up_seq + t2_seq + down_seq)
                mxe_seq = str(up_seq + t1_seq + down_seq)
            elif strand == '-':
                # Reverse each piece, then complement the concatenation.
                ex_mxe_seq = str(Seq(up_seq[::-1] + t1_seq[::-1] + down_seq[::-1]).complement())
                mxe_seq = str(Seq(up_seq[::-1] + t2_seq[::-1] + down_seq[::-1]).complement())
            else:
                continue
            rows.append((gene, frame, chrom_id, ex_mxe_seq, mxe_seq, gene_name, subtype, event_type, comp))
    return np.array(rows, dtype=dtype)

In [15]:
# Function: getAASeq
# Inputs: seq_array (as returned by getMXESeq)
# Returns: structured array with the amino acid sequences of both isoforms per event
# Summary: translate the nucleotide sequences into amino acid sequences
def getAASeq(seq_array):
    """Translate both nucleotide sequences of every event into peptides.

    Parameters
    ----------
    seq_array : iterable of records indexed as 0 gene id, 1 frame,
        3 first sequence, 4 second sequence, 5 gene name, 6 subtype,
        7 type, 8 comp (index 2 is not used here).

    Returns
    -------
    numpy.ndarray
        One-dimensional structured array with fields Gene_ID, Gene_name,
        aa_seq, se_aa_seq, subtype, type and comp. Each sequence has its
        first ``frame`` characters dropped and is then translated with
        ``to_stop=True``.
    """
    dtype = [('Gene_ID', 'U20'),('Gene_name', 'U20'), ('aa_seq', 'U10000'), ('se_aa_seq', 'U10000'),('subtype','U20'),('type','U20'),('comp','U20')]  # Adjust the max sequence length as needed

    # Rows are gathered in a list and converted once: each row carries two
    # U10000 fields, so growing the array with np.append copies a lot of data.
    rows = []
    for entry in seq_array:
        gene = entry[0]
        frame = entry[1]
        seq = entry[3]
        se_seq = entry[4]
        gene_name = entry[5]
        subtype = entry[6]
        event_type = entry[7]  # not named `type`, to avoid shadowing the builtin
        comp = entry[8]
        # Skip the leading `frame` bases so translation starts on a codon boundary.
        aa = Seq(seq[frame:]).translate(to_stop=True)
        se_aa = Seq(se_seq[frame:]).translate(to_stop=True)
        rows.append((gene, gene_name, str(aa), str(se_aa), subtype, event_type, comp))
    return np.array(rows, dtype=dtype)

In [16]:
def main():
    """Run the MXE pipeline end to end and write the peptides to disk.

    Reads the notebook-level names ``mxe_csv_path``, ``mxe_txt_path``, ``db``
    and ``hg38_sequences``; writes one tab-separated line per event to
    ``MPN_neoepitope/MXE_peptides_MPN.txt`` and prints a confirmation.
    """
    # events -> CDS frames -> nucleotide sequences -> peptides
    events = covReadCSVMXE(mxe_csv_path, mxe_txt_path)
    framed_events = getMXEFrame(events, db)
    nucleotide_rows = getMXESeq(framed_events, hg38_sequences)
    peptide_rows = getAASeq(nucleotide_rows)

    # Seven string columns, matching the seven fields getAASeq returns.
    np.savetxt(
        "MPN_neoepitope/MXE_peptides_MPN.txt",
        peptide_rows,
        fmt="%s\t%s\t%s\t%s\t%s\t%s\t%s",
        delimiter='\t',
    )
    print('file saved!')
    

In [17]:
# Entry point: bind the two paths that main() reads as module-level names, then run the pipeline.
if __name__ == '__main__':
    # CSV input handed to covReadCSVMXE by main()
    mxe_csv_path = 'MPN_neoepitope/neojunction/MXE_all_neoj.csv'
    # tab-separated copy that covReadCSVMXE writes and then loads
    mxe_txt_path = 'MPN_neoepitope/neojunction/MXE_all_neoj.txt'
    main()


file saved!


