In [1]:
from Bio.Align import substitution_matrices
# Calling load() with no argument returns the names of the bundled substitution
# matrices (see the printed list below); one of these names is passed to
# load() in the next cell.
print(substitution_matrices.load())  # doctest: +ELLIPSIS

['BENNER22', 'BENNER6', 'BENNER74', 'BLOSUM45', 'BLOSUM50', 'BLOSUM62', 'BLOSUM80', 'BLOSUM90', 'DAYHOFF', 'FENG', 'GENETIC', 'GONNET1992', 'HOXD70', 'JOHNSON', 'JONES', 'LEVIN', 'MCLACHLAN', 'MDM78', 'NUC.4.4', 'PAM250', 'PAM30', 'PAM70', 'RAO', 'RISLER', 'SCHNEIDER', 'STR', 'TRANS']


In [2]:
# Load the BLOSUM62 substitution matrix; later cells read it as `matrix`.
matrix = substitution_matrices.load("BLOSUM62")

# Optional switch: when True, np.fill_diagonal overwrites the diagonal of
# `matrix` with 0 in place, so every residue paired with itself scores 0.
# Left off (False) here, so the matrix printed below is the unmodified one.
ZERO_DIAG = False
if ZERO_DIAG:
    import numpy as np
    np.fill_diagonal(matrix, 0)

print(matrix)

#  Matrix made by matblas from blosum62.iij
#  * column uses minimum score
#  BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
#  Blocks Database = /data/blocks_5.0/blocks.dat
#  Cluster Percentage: >= 62
#  Entropy =   0.6979, Expected =  -0.5209
     A    R    N    D    C    Q    E    G    H    I    L    K    M    F    P    S    T    W    Y    V    B    Z    X    *
A  4.0 -1.0 -2.0 -2.0  0.0 -1.0 -1.0  0.0 -2.0 -1.0 -1.0 -1.0 -1.0 -2.0 -1.0  1.0  0.0 -3.0 -2.0  0.0 -2.0 -1.0  0.0 -4.0
R -1.0  5.0  0.0 -2.0 -3.0  1.0  0.0 -2.0  0.0 -3.0 -2.0  2.0 -1.0 -3.0 -2.0 -1.0 -1.0 -3.0 -2.0 -3.0 -1.0  0.0 -1.0 -4.0
N -2.0  0.0  6.0  1.0 -3.0  0.0  0.0  0.0  1.0 -3.0 -3.0  0.0 -2.0 -3.0 -2.0  1.0  0.0 -4.0 -2.0 -3.0  3.0  0.0 -1.0 -4.0
D -2.0 -2.0  1.0  6.0 -3.0  0.0  2.0 -1.0 -1.0 -3.0 -4.0 -1.0 -3.0 -3.0 -1.0  0.0 -1.0 -4.0 -3.0 -3.0  4.0  1.0 -1.0 -4.0
C  0.0 -3.0 -3.0 -3.0  9.0 -3.0 -4.0 -3.0 -3.0 -1.0 -1.0 -3.0 -1.0 -2.0 -3.0 -1.0 -1.0 -2.0 -2.0 -1.0 -3.0 -3.0 -2.0 -4.0
Q -1.0  1.0  0.0  0.

---

In [3]:
def cal_blosum_score(matrix, ref_seq, query_seqs):
    """Score ``ref_seq`` against one or more queries with a pairwise aligner.

    Args:
        matrix: Substitution matrix assigned to
            ``PairwiseAligner.substitution_matrix``.
        ref_seq: Reference sequence, passed as the first argument of
            ``aligner.score``.
        query_seqs: Either a single sequence (``str``) or a ``list``/``tuple``
            of sequences.

    Returns:
        list: One ``aligner.score(ref_seq, query)`` value per query, in input
        order. A single ``str`` query yields a one-element list.

    Raises:
        ValueError: If ``query_seqs`` is neither a ``str`` nor a
            ``list``/``tuple``.
    """
    # Normalise the input first so a bad argument fails before any aligner is
    # built. isinstance() (rather than an exact type() comparison) also accepts
    # subclasses of str/list and plain tuples.
    if isinstance(query_seqs, str):
        queries = [query_seqs]
    elif isinstance(query_seqs, (list, tuple)):
        queries = query_seqs
    else:
        raise ValueError(f"Wrong type of query_seqs ({type(query_seqs)})")

    from Bio import Align

    aligner = Align.PairwiseAligner()
    aligner.substitution_matrix = matrix

    return [aligner.score(ref_seq, query_seq) for query_seq in queries]

def blosum_dist(matrix, ref_seq, query_seq):
    """Sum substitution scores position by position, without alignment.

    Residues are paired with ``zip``, so the comparison stops at the end of
    the shorter sequence; trailing residues of the longer one are ignored.

    Args:
        matrix: Substitution matrix exposing an ``alphabet`` string and
            integer indexing ``matrix[i, j]``.
        ref_seq: Reference sequence.
        query_seq: Query sequence, compared position-wise with ``ref_seq``.

    Returns:
        The sum of ``matrix[i, j]`` over all paired positions, including
        identical residues (``0`` when either sequence is empty).

    Raises:
        ValueError: If a residue is not present in ``matrix.alphabet``.
    """
    alphabet = matrix.alphabet
    total = 0
    for position, (ref, query) in enumerate(zip(ref_seq, query_seq)):
        i = alphabet.find(ref)
        j = alphabet.find(query)
        # str.find() returns -1 for a missing symbol; used as an index that
        # would silently read the LAST row/column of the matrix instead of
        # failing, so reject it explicitly.
        if i < 0 or j < 0:
            raise ValueError(
                f"Residue pair ({ref!r}, {query!r}) at position {position} "
                f"is not covered by the matrix alphabet"
            )
        total += matrix[i, j]
    return total

def blosum_weighted_dist(matrix, ref_seq, query_seq):
    """Sum substitution scores over the positions where the sequences differ.

    Residues are paired with ``zip`` (no alignment), so the comparison stops
    at the end of the shorter sequence. Positions holding the same character
    in both sequences contribute nothing and are not looked up in the matrix.

    Args:
        matrix: Substitution matrix exposing an ``alphabet`` string and
            integer indexing ``matrix[i, j]``.
        ref_seq: Reference sequence.
        query_seq: Query sequence, compared position-wise with ``ref_seq``.

    Returns:
        The sum of ``matrix[i, j]`` over mismatched positions only (``0``
        when the paired parts of the sequences are identical).

    Raises:
        ValueError: If a residue at a mismatched position is not present in
            ``matrix.alphabet``.
    """
    alphabet = matrix.alphabet
    total = 0
    for position, (ref, query) in enumerate(zip(ref_seq, query_seq)):
        if ref == query:
            continue
        i = alphabet.find(ref)
        j = alphabet.find(query)
        # str.find() returns -1 for a missing symbol; used as an index that
        # would silently read the LAST row/column of the matrix instead of
        # failing, so reject it explicitly.
        if i < 0 or j < 0:
            raise ValueError(
                f"Residue pair ({ref!r}, {query!r}) at position {position} "
                f"is not covered by the matrix alphabet"
            )
        total += matrix[i, j]
    return total

def cal_blosum_dist(matrix, ref_seq, query_seqs, weighted=False):
    """Compute the position-wise BLOSUM sum of ``ref_seq`` against each query.

    Args:
        matrix: Substitution matrix forwarded to the per-pair function.
        ref_seq: Reference sequence.
        query_seqs: Either a single sequence (``str``) or a ``list``/``tuple``
            of sequences.
        weighted: When True use ``blosum_weighted_dist`` (mismatched positions
            only); otherwise use ``blosum_dist`` (all positions).

    Returns:
        list: One value per query, in input order. A single ``str`` query
        yields a one-element list.

    Raises:
        ValueError: If ``query_seqs`` is neither a ``str`` nor a
            ``list``/``tuple``.
    """
    # Normalise the input up front so a bad argument fails immediately.
    # isinstance() (rather than an exact type() comparison) also accepts
    # subclasses of str/list and plain tuples.
    if isinstance(query_seqs, str):
        queries = [query_seqs]
    elif isinstance(query_seqs, (list, tuple)):
        queries = query_seqs
    else:
        raise ValueError(f"Wrong type of query_seqs ({type(query_seqs)})")

    cal_func = blosum_weighted_dist if weighted else blosum_dist
    return [cal_func(matrix, ref_seq, query_seq) for query_seq in queries]

In [4]:
import pandas as pd

# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this directory exists.
data_path = "/data/lujd/neoag_data/main_task/"
hla_seq_df = pd.read_csv(data_path+"HLA_sequence_dict_ABCEG.csv", index_col=0)
# Lookup table: HLA_name -> value of the "clip" column (unpadded, as stored).
hla_seq_dict = hla_seq_df.set_index(["HLA_name"])["clip"].to_dict()

# First 112 rows only — presumably the HLA-A/B/C alleles, going by the
# variable name; TODO confirm against the CSV ordering.
hla_abc_df = hla_seq_df[:112]
hla_abc_list = hla_abc_df["HLA_name"].to_list()
hla_abc_clip_list = hla_abc_df["clip"].to_list()
# Clips of length 181 get a leading "G" so they line up with the others
# (presumably 182 residues long — TODO confirm). hla_seq_dict is NOT padded.
hla_abc_clip_list = ["G"+seq if len(seq)==181 else seq for seq in hla_abc_clip_list]    # alignment

# Alleles that later cells pick from by position to use as the reference.
target_hla_list = [ "HLA-A*03:02", "HLA-A*11:02", "HLA-B*42:01", "HLA-B*08:02",
                    "HLA-G*01:01", "HLA-E*01:03", "HLA-A*33:03", "HLA-A*34:02",]

In [5]:
import numpy as np

# Reference allele for this run, chosen by position in target_hla_list.
target_hla = target_hla_list[1]
target_hla_clip = hla_seq_dict[target_hla]
print(target_hla)

# NOTE(review): the query clips were left-padded with "G" when 181 residues
# long, but ref_seq comes straight from hla_seq_dict without that padding —
# confirm the chosen target does not need it, otherwise the position-wise
# comparison is shifted by one residue.
ref_seq = target_hla_clip
query_seqs = hla_abc_clip_list

# Exactly one of the three metrics is active; the other two are kept as
# commented alternatives.
blosum_distances = cal_blosum_dist(matrix, ref_seq, query_seqs)
# blosum_distances = cal_blosum_dist(matrix, ref_seq, query_seqs, weighted=True)
# blosum_distances = cal_blosum_score(matrix, ref_seq, query_seqs)
blosum_distances = np.asarray(blosum_distances)

# Candidate thresholds to scan (second list: alternative range, kept for the
# weighted metric — presumably; its values are much smaller).
kdist = [750, 800, 825, 835, 845, 855, 865, 875, 885, 900]
# kdist = [-20, -15, -10, -5, 0, 5, 10]
print("distance\t", kdist[:5], "\t", kdist[5:])

# Number of alleles whose value is strictly above each threshold.
# np.count_nonzero counts in one vectorised pass; int() keeps the printed list
# as plain integers regardless of how the NumPy version reprs its scalars.
num_cluster = [
    int(np.count_nonzero(blosum_distances > threshold)) for threshold in kdist
]
print("number (>dist)\t", num_cluster[:5], "\t", num_cluster[5:])

HLA-A*11:02
distance	 [750, 800, 825, 835, 845] 	 [855, 865, 875, 885, 900]
number (>dist)	 [112, 79, 41, 37, 36] 	 [35, 33, 21, 17, 11]


In [12]:
# Keep every allele whose value for the current target is strictly above the
# chosen cut-off (826), as a set of allele names.
clusterset = {
    allele
    for allele, distance in zip(hla_abc_list, blosum_distances)
    if distance > 826
}
len(clusterset)

38

|Allele|HLA-B*42:01|HLA-A*11:02|HLA-G*01:01|HLA-E*01:03|HLA-A*33:03|HLA-A*34:02|
|-|-|-|-|-|-|-|
|blosum_dist|
|threshold|857|826|784|720|865|855|
|number|55|38|29|29|35|35|
|diff|3|0|8|7|1|0|
|blosum_weighted_dist|
|threshold|-8|-11|-2(-3)|-3|-11|-13|
|number|52|35|26(32)|31|34|35|
|diff|14|11|17(22)|27|11|3|
|blosum_score|
|threshold|883|857|813|772|886|875|
|number|53|40|29|27|34|35|

In [13]:
# Render the cluster as a dict-literal entry: five quoted names per row,
# continuation rows indented with four tabs, closed with "],".
ordered_alleles = sorted(clusterset)
last_pos = len(ordered_alleles) - 1

pieces = [f"'{target_hla}': ["]
for pos, allele in enumerate(ordered_alleles):
    closes_row = pos % 5 == 4 and pos != last_pos
    opens_row = pos % 5 == 0 and pos != 0
    lead = "\t\t\t\t" if opens_row else ""
    tail = ",\n" if closes_row else ", "
    pieces.append(f"{lead}'{allele}'{tail}")
pieces.append("],")

print("".join(pieces))

'HLA-A*11:02': ['HLA-A*01:01', 'HLA-A*02:01', 'HLA-A*02:02', 'HLA-A*02:03', 'HLA-A*02:04',
				'HLA-A*02:05', 'HLA-A*02:06', 'HLA-A*02:07', 'HLA-A*02:11', 'HLA-A*02:12',
				'HLA-A*02:16', 'HLA-A*02:17', 'HLA-A*02:19', 'HLA-A*02:20', 'HLA-A*02:50',
				'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*24:02', 'HLA-A*24:03', 'HLA-A*25:01',
				'HLA-A*26:01', 'HLA-A*26:02', 'HLA-A*26:03', 'HLA-A*29:02', 'HLA-A*30:01',
				'HLA-A*30:02', 'HLA-A*31:01', 'HLA-A*32:01', 'HLA-A*32:07', 'HLA-A*32:15',
				'HLA-A*33:01', 'HLA-A*66:01', 'HLA-A*68:01', 'HLA-A*68:02', 'HLA-A*68:23',
				'HLA-A*69:01', 'HLA-A*80:01', 'HLA-C*16:01', ],


In [14]:
# Hand-written allele list that the following cells compare with clusterset.
l = [
    'HLA-A*01:01', 'HLA-A*02:01', 'HLA-A*02:02', 'HLA-A*02:03', 'HLA-A*02:04',
    'HLA-A*02:05', 'HLA-A*02:06', 'HLA-A*02:07', 'HLA-A*02:11', 'HLA-A*02:12',
    'HLA-A*02:16', 'HLA-A*02:17', 'HLA-A*02:19', 'HLA-A*02:20', 'HLA-A*02:50',
    'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*24:02', 'HLA-A*24:03', 'HLA-A*24:06',
    'HLA-A*24:13', 'HLA-A*25:01', 'HLA-A*26:01', 'HLA-A*26:02', 'HLA-A*26:03',
    'HLA-A*29:02', 'HLA-A*30:01', 'HLA-A*30:02', 'HLA-A*31:01', 'HLA-A*32:01',
    'HLA-A*32:07', 'HLA-A*32:15', 'HLA-A*33:01', 'HLA-A*66:01', 'HLA-A*68:01',
    'HLA-A*68:02', 'HLA-A*68:23', 'HLA-A*69:01', 'HLA-A*80:01', 'HLA-C*16:01',
]

# Same dict-entry layout as the clusterset printout: five quoted names per
# row, continuation rows indented with four tabs, closed with "],".
final_index = len(l) - 1
rendered = f"'{target_hla}': ["
for index, name in enumerate(sorted(l)):
    if index % 5 == 0 and index != 0:
        rendered += "\t\t\t\t"
    rendered += f"'{name}'"
    rendered += ",\n" if (index % 5 == 4 and index != final_index) else ", "
rendered += "],"
print(rendered)

'HLA-A*11:02': ['HLA-A*01:01', 'HLA-A*02:01', 'HLA-A*02:02', 'HLA-A*02:03', 'HLA-A*02:04',
				'HLA-A*02:05', 'HLA-A*02:06', 'HLA-A*02:07', 'HLA-A*02:11', 'HLA-A*02:12',
				'HLA-A*02:16', 'HLA-A*02:17', 'HLA-A*02:19', 'HLA-A*02:20', 'HLA-A*02:50',
				'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*24:02', 'HLA-A*24:03', 'HLA-A*24:06',
				'HLA-A*24:13', 'HLA-A*25:01', 'HLA-A*26:01', 'HLA-A*26:02', 'HLA-A*26:03',
				'HLA-A*29:02', 'HLA-A*30:01', 'HLA-A*30:02', 'HLA-A*31:01', 'HLA-A*32:01',
				'HLA-A*32:07', 'HLA-A*32:15', 'HLA-A*33:01', 'HLA-A*66:01', 'HLA-A*68:01',
				'HLA-A*68:02', 'HLA-A*68:23', 'HLA-A*69:01', 'HLA-A*80:01', 'HLA-C*16:01', ],


In [15]:
# How many alleles of clusterset are absent from the reference list `l`.
num = sum(1 for h in clusterset if h not in l)
num

0

In [16]:
# How many alleles of the reference list `l` are absent from clusterset.
num = sum(1 for h in l if h not in clusterset)
num

2