1. collect whole HLA-E/G sequences from hla_prot.fasta

In [5]:
def get_hla_seqs(fasta_name, hla_name):
    """Collect every sequence whose FASTA header mentions ``hla_name``.

    A header line is recognised by the "bp" length tag it carries
    (e.g. ``... G*01:01:01:01 338 bp``). All lines that follow, up to the
    next header or the end of the file, are concatenated into one sequence.

    Parameters
    ----------
    fasta_name : str
        Path of the FASTA file to scan.
    hla_name : str
        Substring searched for in each line (e.g. ``"G*01:01"``).

    Returns
    -------
    list of str
        One sequence per matching header, in file order (duplicates kept).

    Raises
    ------
    AssertionError
        If a sequence's length differs from the length given in its header.
    ValueError
        If the length field of a matching header is not an integer.
    """
    seq_list = []
    # `with` closes the handle even when one of the checks below fails
    with open(fasta_name, "r") as hla_seq_file:
        line = hla_seq_file.readline()
        while line:
            if line.find(hla_name) == -1:
                line = hla_seq_file.readline()
                continue

            # The announced length is the whole token in front of " bp",
            # so lengths with any number of digits are parsed correctly.
            seq_len = int(line[:line.find(" bp")].split()[-1])
            line = hla_seq_file.readline()

            chunks = []
            # Stop at the next header OR at end of file: readline() returns ""
            # at EOF, and "".find("bp") is -1, so `line` must be tested too.
            while line and line.find("bp") == -1:
                chunks.append(line.strip())     # also drops "\r" of CRLF files
                line = hla_seq_file.readline()
            seq = "".join(chunks)

            assert len(seq) == seq_len, (
                f"sequence length {len(seq)} does not match header length {seq_len}"
            )
            seq_list.append(seq)

    return seq_list

In [6]:
# Read, for every HLA-E/G allele of interest, all whole sequences in the .fasta
from collections import defaultdict

data_path = "/data/lujd/neoag_data/raw_data/hla_prot.fasta"
hla_list = ["G*01:01", "G*01:03", "G*01:04", "E*01:01","E*01:03"]

# allele name -> list of sequences whose FASTA header contains that name
hla2seqs_dict = defaultdict(list)
for hla in hla_list:
    hla2seqs_dict[hla] = get_hla_seqs(data_path, hla)

In [7]:
# Per allele: number of sequences read, number of distinct sequences,
# and the length of each distinct sequence.
for hla, seq_list in hla2seqs_dict.items():
    unique_seqs = set(seq_list)
    unique_lens = [len(seq) for seq in unique_seqs]
    print(hla, len(seq_list), len(unique_seqs), unique_lens)

    # Keep only the distinct sequences. Overwriting an existing key does not
    # resize the dict, so this is safe while iterating over .items().
    hla2seqs_dict[hla] = list(unique_seqs)

G*01:01 49 4 [181, 338, 273, 298]
G*01:03 2 1 [338]
G*01:04 11 3 [312, 338, 273]
E*01:01 83 2 [358, 181]
E*01:03 84 1 [358]


2. clip sequences

In [23]:
from collections import defaultdict

# Clip every whole sequence down to a 182-residue window.
# NOTE(review): a partial sequence of >= 182 residues that lacks the leading
# residues gets the same window and therefore a shifted clip (presumably the
# origin of the clips that start with "GYVD") - confirm.
hla2clip_dict = defaultdict(list)
for hla, seq_list in hla2seqs_dict.items():
    # Window depends on the locus; "GSH..." starts from 21 for HLA-E.
    if "G" in hla:
        window = slice(24, 206)
    elif "E" in hla:
        window = slice(21, 203)
    else:
        window = None           # other loci: long sequences are skipped

    clip_list = []
    for seq in seq_list:
        if len(seq) < 182:
            # already shorter than a full window: keep as is
            clip_list.append(seq)
        elif window is not None:
            clip_list.append(seq[window])
    hla2clip_dict[hla] = clip_list

In [24]:
# Per allele: number of clips, number of distinct clips, and their lengths.
for hla, seq_list in hla2clip_dict.items():
    unique_clips = sorted(set(seq_list))    # sorted -> stable order for later indexing
    clip_lens = [len(clip) for clip in unique_clips]
    print(hla, len(seq_list), len(unique_clips), clip_lens)

    # drop duplicate clips, keeping the sorted order
    hla2clip_dict[hla] = list(unique_clips)

G*01:01 4 3 [182, 182, 181]
G*01:03 1 1 [182]
G*01:04 3 2 [182, 182]
E*01:01 2 2 [182, 181]
E*01:03 1 1 [182]


#### Summary

|HLA_name|number|num_whole_seq|num_clip_seq|
|:-:|:-:|:-:|:-:|
|G*01:01|49|4|3|
|G*01:03|2|1|1|
|G*01:04|11|3|2|
|E*01:01|83|2|2|
|E*01:03|84|1|1|

G*01:01 [181, 338, 273, 298]
- 181: 01:01:04, 01:01:13
- 273: 01:01:07, 01:01:15, 01:01:18, 01:01:19   -> clip starts at GYVD...
- 298: 01:01:11
- 181, 273, 298 are subsegments of 338

G*01:04 [312, 338, 273] 
- 273: 01:04:02, 01:04:05
- 312: 01:04:06
- 273, 312 are subsegments of 338

E*01:01 [358, 181]
- E*01:01:02 181
- else 358

clip: _ * _ : _ : 01 (:01)

3. check these clip sequences 

In [25]:
# Display the de-duplicated clip sequences of every allele for visual inspection
hla2clip_dict

defaultdict(list,
            {'G*01:01': ['GSHSMRYFSAAVSRPGRGEPRFIAMGYVDDTQFVRFDSDSACPRMEPRAPWVEQEGPEYWEEETRNTKAHAQTDRMNLQTLRGYYNQSEASSHTLQWMIGCDLGSDGRLLRGYEQYAYDGKDYLALNEDLRSWTAADTAAQISKRKCEAANVAEQRRAYLEGTCVEWLHRYLENGKEMLQRA',
              'GYVDDTQFVRFDSDSACPRMEPRAPWVEQEGPEYWEEETRNTKAHAQTDRMNLQTLRGYYNQSEASSHTLQWMIGCDLGSDGRLLRGYEQYAYDGKDYLALNEDLRSWTAADTAAQISKRKCEAANVAEQRRAYLEGTCVEWLHRYLENGKEMLQRADPPKTHVTHHPVFDYEATLRCWALG',
              'SHSMRYFSAAVSRPGRGEPRFIAMGYVDDTQFVRFDSDSACPRMEPRAPWVEQEGPEYWEEETRNTKAHAQTDRMNLQTLRGYYNQSEASSHTLQWMIGCDLGSDGRLLRGYEQYAYDGKDYLALNEDLRSWTAADTAAQISKRKCEAANVAEQRRAYLEGTCVEWLHRYLENGKEMLQRA'],
             'G*01:03': ['GSHSMRYFSAAVSRPGRGEPRFIAMGYVDDSQFVRFDSDSACPRMEPRAPWVEQEGPEYWEEETRNTKAHAQTDRMNLQTLRGYYNQSEASSHTLQWMIGCDLGSDGRLLRGYEQYAYDGKDYLALNEDLRSWTAADTAAQISKRKCEAANVAEQRRAYLEGTCVEWLHRYLENGKEMLQRA'],
             'G*01:04': ['GSHSMRYFSAAVSRPGRGEPRFIAMGYVDDTQFVRFDSDSACPRMEPRAPWVEQEGPEYWEEETRNTKAHAQTDRMNLQTLRGYYNQSEASSHTLQWMIGCDLGSDGRLIRGYEQYAYDGKDYLALNEDLRSW

(1) HLA - G


In [26]:
# Equality checks between the HLA-G clips
g0101 = hla2clip_dict["G*01:01"]
g0103 = hla2clip_dict["G*01:03"]
g0104 = hla2clip_dict["G*01:04"]

checks = (
    g0101[0][1:] == g0101[2],   # is the 181-residue clip just clip 0 without its first residue?
    g0101[0] == g0103[0],
    g0101[0] == g0104[0],
    g0103[0] == g0104[0],
    g0101[1] == g0104[1],
)
print(*checks)

True False False False False


In [27]:
# Position-by-position differences between the first clip of each HLA-G allele
g_clips = {
    "01": hla2clip_dict["G*01:01"][0],
    "03": hla2clip_dict["G*01:03"][0],
    "04": hla2clip_dict["G*01:04"][0],
}
g_pairs = (("01", "03"), ("01", "04"), ("03", "04"))

for ind in range(len(g_clips["01"])):
    for left, right in g_pairs:
        if g_clips[left][ind] != g_clips[right][ind]:
            print(f"{left} vs {right}: ", ind, g_clips[left][ind], g_clips[right][ind])
# Relative to G*01:01:
#   G*01:03 differs at index 30  (S)
#   G*01:04 differs at index 109 (I)

01 vs 03:  30 T S
03 vs 04:  30 S T
01 vs 04:  109 L I
03 vs 04:  109 L I


In [28]:
# The clips at index 1 start from GYVD, instead of GSHS.
g0101_partial = hla2clip_dict["G*01:01"][1]
g0104_partial = hla2clip_dict["G*01:04"][1]

for ind in range(len(g0101_partial)):
    if g0101_partial[ind] != g0104_partial[ind]:
        print("01 vs 04: ", ind, g0101_partial[ind], g0104_partial[ind])

01 vs 04:  84 L I


(2) HLA - E

In [29]:
# Equality checks between the HLA-E clips
e0101 = hla2clip_dict["E*01:01"]
e0103 = hla2clip_dict["E*01:03"]

drops_first_residue = e0101[0][1:] == e0101[1]  # 181-residue clip vs clip 0 minus its first residue
same_as_0103 = e0101[0] == e0103[0]
print(drops_first_residue, same_as_0103)

True False


In [30]:
# Position-by-position differences between the first clip of E*01:01 and E*01:03.
# Iterate over the length of the sequence that is actually compared (clip 0);
# using the length of clip 1 (one residue shorter) skipped the last position.
for ind in range(len(hla2clip_dict["E*01:01"][0])):
    if hla2clip_dict["E*01:01"][0][ind] != hla2clip_dict["E*01:03"][0][ind]:
        print("01 vs 03: ", ind, hla2clip_dict["E*01:01"][0][ind], hla2clip_dict["E*01:03"][0][ind])

01 vs 03:  106 R G


4. final decision (use the 182-residue clip sequences that start with "GSHS")

In [41]:
import pandas as pd

# Alleles kept for the final table; the first (sorted) clip of each is used.
eg_alleles = ["G*01:01", "G*01:03", "G*01:04", "E*01:01", "E*01:03"]

# "full" and "short" are created empty here; only the name and clip are filled.
hla_eg_seq_df = pd.DataFrame(columns=["HLA_name","full","clip","short"])
hla_eg_seq_df["HLA_name"] = [f"HLA-{allele}" for allele in eg_alleles]
hla_eg_seq_df["clip"] = [hla2clip_dict[allele][0] for allele in eg_alleles]
hla_eg_seq_df

Unnamed: 0,HLA_name,full,clip,short
0,HLA-G*01:01,,GSHSMRYFSAAVSRPGRGEPRFIAMGYVDDTQFVRFDSDSACPRME...,
1,HLA-G*01:03,,GSHSMRYFSAAVSRPGRGEPRFIAMGYVDDSQFVRFDSDSACPRME...,
2,HLA-G*01:04,,GSHSMRYFSAAVSRPGRGEPRFIAMGYVDDTQFVRFDSDSACPRME...,
3,HLA-E*01:01,,GSHSLKYFHTSVSRPGRGEPRFISVGYVDDTQFVRFDNDAASPRMV...,
4,HLA-E*01:03,,GSHSLKYFHTSVSRPGRGEPRFISVGYVDDTQFVRFDNDAASPRMV...,


5. add it to our HLA_sequence_dict.csv

(1) clip -> short

In [37]:
import pandas as pd

# Existing sequence table; the first CSV column is used as the index.
hla_abc_seq_df = pd.read_csv(
    "/data/lujd/neoag_data/main_task/HLA_sequence_dict.csv",
    index_col=0,
)
hla_abc_seq_df.head()

Unnamed: 0,HLA_name,full,clip,short
0,HLA-A*01:01,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...,GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQKME...,YFAMYQENMAHTDANTLYIIYRDYTWVARVYRGY
1,HLA-A*02:01,MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...,GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...,YFAMYGEKVAHTHVDTLYVRYHYYTWAVLAYTWY
2,HLA-A*02:02,MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...,GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASRRME...,YFAMYGEKVAHTHVDTLYLRYHYYTWAVWAYTWY
3,HLA-A*02:03,MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...,GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...,YFAMYGEKVAHTHVDTLYVRYHYYTWAEWAYTWY
4,HLA-A*02:04,MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...,GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...,YFAMYGEKVAHTHVDTLYVMYHYYTWAVLAYTWY


In [38]:
# Pull out the clip / short columns and show the entry with index label 0 of each
hla_abc_clip, hla_abc_short = hla_abc_seq_df["clip"], hla_abc_seq_df["short"]
for column in (hla_abc_clip, hla_abc_short):
    print(column[0])

GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQKMEPRAPWIEQEGPEYWDQETRNMKAHSQTDRANLGTLRGYYNQSEDGSHTIQIMYGCDVGPDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAVHAAEQRRVYLEGRCVDGLRRYLENGKETLQRT
YFAMYQENMAHTDANTLYIIYRDYTWVARVYRGY


In [39]:
# 1-based positions from NetMHCpan
netmhcpan_positions = [
    7, 9, 24, 45, 59, 62, 63, 66, 67, 69, 70, 73, 74, 76, 77, 80, 81, 84, 95, 97, 99,
    114, 116, 118, 143, 147, 150, 152, 156, 158, 159, 163, 167, 171,
]
# convert to 0-based string indices
short_position = [pos - 1 for pos in netmhcpan_positions]

# Rebuild the short sequence of the first A/B/C entry from its clip ...
first_clip = hla_abc_clip[0]
short_seq = "".join(first_clip[ind] for ind in short_position)

# ... and check that it reproduces the stored one
short_seq == hla_abc_short[0]

True

In [42]:
# HLA E/G: derive the short sequence of each clip by picking the residues at
# the 0-based indices in `short_position`.
# The whole column is assigned at once. The previous element-wise
# `df["short"][i] = ...` was a chained assignment, which raises
# SettingWithCopyWarning and does not modify the frame under pandas Copy-on-Write.
hla_eg_seq_df["short"] = [
    "".join(clip[ind] for ind in short_position)
    for clip in hla_eg_seq_df["clip"]
]
hla_eg_seq_df

Unnamed: 0,HLA_name,full,clip,short
0,HLA-G*01:01,,GSHSMRYFSAAVSRPGRGEPRFIAMGYVDDTQFVRFDSDSACPRME...,YSAMYEENTAHTDMNTLYLWIEYYSCAVRAYTWY
1,HLA-G*01:03,,GSHSMRYFSAAVSRPGRGEPRFIAMGYVDDSQFVRFDSDSACPRME...,YSAMYEENTAHTDMNTLYLWIEYYSCAVRAYTWY
2,HLA-G*01:04,,GSHSMRYFSAAVSRPGRGEPRFIAMGYVDDTQFVRFDSDSACPRME...,YSAMYEENTAHTDMNTLYLWIEYYSCAVRAYTWY
3,HLA-E*01:01,,GSHSLKYFHTSVSRPGRGEPRFISVGYVDDTQFVRFDNDAASPRMV...,YHSMYRESADTIFVNTLYLWHEFYSSAEQAYTWY
4,HLA-E*01:03,,GSHSLKYFHTSVSRPGRGEPRFISVGYVDDTQFVRFDNDAASPRMV...,YHSMYRESADTIFVNTLYLWHEFYSSAEQAYTWY


(2) cat and save

In [43]:
# Append the HLA-E/G rows to the existing table.
# ignore_index=True renumbers the rows 0..n-1. Without it the appended rows
# keep their own labels 0-4, which duplicates existing labels and makes
# label-based lookups (and the index written to CSV) ambiguous.
hla_seq_df = pd.concat(
    (hla_abc_seq_df, hla_eg_seq_df),
    axis=0,
    ignore_index=True,
    )

# Report every allele whose clip is not exactly 182 residues long.
# Name and clip are walked together, so no index lookup is needed.
for name, seq in zip(hla_seq_df["HLA_name"], hla_seq_df["clip"]):
    if len(seq) != 182:
        print(name)

HLA-A*02:50
HLA-A*24:06
HLA-A*24:13
HLA-B*45:06


In [45]:
# Write the combined table (index included) to a new CSV file
hla_seq_df.to_csv("/data/lujd/neoag_data/main_task/HLA_sequence_dict_new.csv")

In [87]:
import pandas as pd

# Reload the combined table that was saved above (first CSV column = index).
hla_abc_seq_df = pd.read_csv(
    "/data/lujd/neoag_data/main_task/HLA_sequence_dict_new.csv",index_col=0)

hla_clip_list = hla_abc_seq_df["clip"].to_list()

# Rows 112-114 are written to HLA_G_clip.txt, one clip per line.
# NOTE(review): presumably these are the three HLA-G rows appended after the
# 112 existing entries - confirm against the CSV.
hla_a_clip_list = hla_clip_list[112:115]

# `with` closes the file even if a write fails
with open("HLA_G_clip.txt","w") as HLAA_file:
    for seq in hla_a_clip_list:
        HLAA_file.write(f"{seq}\n")

len(hla_clip_list)

117

In [90]:
# Count, for each of the first 112 clips, the number of positions that differ
# from a reference clip (only clips of the same length are compared).
# NOTE(review): the reference is row 2, which lies inside the 0-111 range it is
# compared against (hence one count of 0). If an HLA-G clip was intended, the
# index should presumably be one of rows 112-114 - confirm.
a_hlag_clip = hla_clip_list[2]
for j in range(112):
    a_hlaa_clip = hla_clip_list[j]

    if len(a_hlaa_clip) == len(a_hlag_clip):
        # The counter is deliberately not called `sum`: a global with that name
        # shadows the builtin for every later cell in the kernel.
        n_mismatch = 0
        for res_a, res_g in zip(a_hlaa_clip, a_hlag_clip):
            if res_a != res_g:
                n_mismatch += 1
        print(n_mismatch)

25
3
0
4
4
1
4
4
5
3
4
4
7
4
16
19
22
20
18
26
21
21
18
20
20
21
18
24
25
25
19
20
12
11
11
9
24
35
35
40
34
35
33
36
35
35
34
33
36
36
34
34
33
34
32
33
31
33
31
31
31
33
37
37
37
32
36
33
35
35
39
33
36
40
40
40
38
31
36
41
38
40
39
39
40
36
36
36
35
36
34
36
32
33
34
33
34
33
34
36
36
37
33
32
32
36
32
36
