In [4]:
import pickle
import gzip
import warnings
warnings.filterwarnings("ignore")
import numpy as np

# Requirements to Run This Notebook

### 1. Input Data
- **Training or Independent Data**: The data should be provided in `.dat` format, one sample per line with tab-separated columns: HLA allele name, peptide sequence, integer label.
- **HLA Alleles**: Aligned HLA allele sequences, for example the file `All_prot_alignseq_C_369.dat`.

### 2. Supporting Files
- **C-alpha Matrix**: A pre-computed matrix that encodes the structural relationship between amino acids based on C-alpha distance.

### 3. Output
- **Amino Acid Interaction Matrix**: 
  - For all used immunogenic peptides, the interaction matrix is generated.
  - This matrix serves as the initial output and is used for selecting the Optimal Number of Features (ONF). 
 




### Note on Code Provenance

This code is a modified version of the code from the following article:  
**"MHC II immunogenicity shapes the neoepitope landscape in human tumors"**  
*Jeong Yeon Kim et al.*

In [18]:
def matchDat(afflst, hladic, aadic):
    """Encode HLA/peptide pairs as flattened residue-interaction feature rows.

    Each entry of ``afflst`` is a tab-separated line of the form
    ``allele<TAB>peptide<TAB>label``. Lines whose allele is not a key of
    ``hladic`` are skipped. For every kept line, each (HLA residue, peptide
    residue) pair is looked up in ``aadic`` and the scores are laid out
    peptide-position-major: all HLA residues for peptide position 0, then all
    HLA residues for peptide position 1, and so on.

    Parameters
    ----------
    afflst : iterable of str
        Input lines (allele, peptide, integer label separated by tabs).
    hladic : dict
        Maps an allele name to its sequence of residues (string or list of
        one-character strings).
    aadic : dict
        Maps ``(hla_residue, peptide_residue)`` to a numeric score. Missing
        pairs score 0.0.

    Returns
    -------
    ((features, labels), header)
        ``features`` is a float32 array of shape
        ``(n_samples, len(peptide) * len(hla_sequence))``; ``labels`` is a
        float32 array of shape ``(n_samples,)``; ``header`` is a list of
        ``(allele, peptide)`` tuples in the same order as the rows.
        When no line matches, ``features`` has shape ``(0, 0)`` and the other
        two are empty (the previous implementation raised ``IndexError``).

    Raises
    ------
    IndexError
        If a matching line has fewer than three tab-separated fields.
    ValueError
        If a label is not an integer, or if the encoded rows have different
        lengths (e.g. peptides of different lengths).
    """
    # Ambiguous peptide codes are scored as the best of the residues they can
    # stand for; 'U' is scored as 'C'.
    substitutes = {
        'U': ('C',),
        'J': ('L', 'I'),
        'Z': ('Q', 'E'),
        'B': ('D', 'N'),
    }
    # HLA symbols that always score zero (unknown residue, '*' and '.').
    blank_hla = ('X', '*', '.')

    def pair_score(hla_res, pep_res):
        # Unknown residue on either side, or a blank HLA column, contributes nothing.
        if pep_res == 'X' or hla_res in blank_hla:
            return 0.0
        candidates = substitutes.get(pep_res, (pep_res,))
        return max(aadic.get((hla_res, cand), 0.0) for cand in candidates)

    rows = []
    labels = []
    header = []
    for line in afflst:
        fields = line.strip().split('\t')
        allele = fields[0]
        if allele not in hladic:
            continue
        hla_seq = hladic[allele]
        peptide = fields[1]
        # Peptide position is the outer index so the flattened layout matches
        # the original (peptide, hla) ordering.
        rows.append([pair_score(hla_res, pep_res)
                     for pep_res in peptide
                     for hla_res in hla_seq])
        labels.append(int(fields[2]))
        header.append((allele, peptide))

    if rows:
        features = np.array(rows, dtype=np.float32)
    else:
        # Nothing matched: return a well-formed empty 2-D array instead of
        # failing on a missing second axis.
        features = np.empty((0, 0), dtype=np.float32)
    label_array = np.array(labels, dtype=np.float32)
    return (features, label_array), header

def HeaderOutput(lstin, outname):
    """Write each record of ``lstin`` to ``outname`` as one tab-separated line.

    Every field of a record is converted with ``str``; the file is created
    (or truncated) even when ``lstin`` is empty.
    """
    with open(outname, 'w') as handle:
        handle.writelines(
            '\t'.join(str(field) for field in record) + '\n'
            for record in lstin
        )

def modifyMatrix(affydatin_test, seqdatin, outfile, calpha_path='../All_Data/Calpha.txt'):
    """Build the interaction feature matrix for a data file and save it.

    Reads the aligned HLA sequences and the C-alpha matrix, encodes every
    sample of ``affydatin_test`` with ``matchDat``, writes the resulting
    ``(features, labels)`` tuple to ``outfile`` as a gzip-compressed pickle
    (protocol 2), and writes the ``(allele, peptide)`` header lines to
    ``affydatin_test + '.header'``.

    Parameters
    ----------
    affydatin_test : str
        Path to the tab-separated sample file (allele, peptide, label).
    seqdatin : str
        Path to the HLA file; each line is ``allele<TAB>aligned_sequence``.
        Blank lines are ignored.
    outfile : str
        Path of the gzip-compressed pickle to create.
    calpha_path : str, optional
        Path to the tab-separated C-alpha matrix. The first line lists the
        column residues; each following line starts with a row residue
        followed by one value per column. Defaults to the location that was
        previously hard-coded.
    """
    # Allele name -> list of aligned residues.
    hladicin = {}
    with open(seqdatin) as hla_file:
        for line in hla_file:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            fields = line.strip().split('\t')
            hladicin[fields[0]] = list(fields[1])

    with open(calpha_path) as calpha_file:
        aalst = calpha_file.readlines()

    # (column residue, row residue) -> matrix value.
    aadicin = {}
    aaseq0 = aalst[0].strip().split('\t')
    for row in aalst[1:]:
        cells = row.strip().split('\t')
        for col, cell in enumerate(cells[1:]):
            aadicin[(aaseq0[col], cells[0])] = float(cell)

    with open(affydatin_test) as sample_file:
        afflst = sample_file.readlines()
    d, test_header = matchDat(afflst, hladicin, aadicin)

    with gzip.open(outfile, 'wb') as f:
        pickle.dump(d, f, protocol=2)

    HeaderOutput(test_header, affydatin_test + '.header')


In [8]:
# Generate the feature matrix for the training data.
modifyMatrix(
    affydatin_test='../MHC_I/our_data_mhc_i_balance_training.dat',
    seqdatin='../All_Data/All_prot_alignseq_C_369.dat',
    outfile='Feature_MHC_i_our_training',
)

In [9]:
# Generate the feature matrix for the independent (external / unseen) data.
# NOTE(review): the original comment ended with "200" — presumably the number
# of samples in this set; confirm.
modifyMatrix(
    affydatin_test='../MHC_I/our_data_mhc_i_balance_independent.dat',
    seqdatin='../All_Data/All_prot_alignseq_C_369.dat',
    outfile='Feature_MHC_i_our_independent',
)