In [19]:
import os
import re
import random
from collections import defaultdict
os.chdir("/home/ys/ExplainablePPI")
from experiments.utils import organisms
random.seed(100)

from Bio import Align
from Bio.Align import substitution_matrices

# Shared pairwise aligner used by the scoring code below. Only the
# substitution matrix (BLOSUM62) is configured here; every other alignment
# setting is left at whatever the library defaults to.
aligner = Align.PairwiseAligner()
matrix = substitution_matrices.load("BLOSUM62")
aligner.substitution_matrix = matrix

In [20]:
def get_pairs(pair_dir, k):
    """Draw a random sample of ``k`` pair lines for every organism.

    For each name in the module-level ``organisms`` the file
    ``<pair_dir>/<organism>_test.tsv`` is read and ``k`` of its non-blank
    lines are chosen with ``random.sample`` (uniformly, without replacement;
    the sample is not stratified by label).

    Args:
        pair_dir: Directory that holds the ``<organism>_test.tsv`` files.
        k: Number of lines to sample per organism.

    Returns:
        dict mapping organism name to the list of sampled raw lines
        (line endings are kept as read).

    Raises:
        ValueError: If a file contains fewer than ``k`` non-blank lines.
        OSError: If a pair file cannot be opened.
    """
    pairs = {}
    for orga in organisms:
        orga_pair_fp = os.path.join(pair_dir, orga + "_test.tsv")
        # Use a context manager so the handle is closed promptly, and drop
        # blank lines so an empty (e.g. trailing) line can never be sampled
        # and later fail the three-column unpack downstream.
        with open(orga_pair_fp, "r") as fh:
            lines = [line for line in fh if line.strip()]

        if k > len(lines):
            raise ValueError(
                f"cannot sample {k} pairs for {orga!r}: "
                f"{orga_pair_fp} only has {len(lines)} non-blank lines"
            )

        pairs[orga] = random.sample(lines, k=k)

    return pairs

def get_score(seq1, seq2):
    """Return the alignment score of two sequences per residue of the shorter one.

    Every ``U``, ``Z``, ``O`` and ``B`` in either sequence is first replaced
    by ``X`` (presumably so the aligner only sees letters it can score --
    confirm against the substitution matrix in use). The module-level
    ``aligner`` then scores the cleaned sequences and the result is divided
    by the length of the shorter cleaned sequence.

    Args:
        seq1: First sequence string.
        seq2: Second sequence string.

    Returns:
        The aligner score divided by ``min(len(seq1), len(seq2))``.
    """
    cleaned_a, cleaned_b = (re.sub(r"[UZOB]", "X", s) for s in (seq1, seq2))
    raw_score = aligner.score(cleaned_a, cleaned_b)
    shorter_len = min(len(cleaned_a), len(cleaned_b))
    return raw_score / shorter_len

# Per-organism running extremes of the pair scores. They start at sentinel
# values (-1000 / 1000) and are tightened as pairs are scored.
orga_max_scores = {orga: -1000 for orga in organisms}
orga_min_scores = {orga: 1000 for orga in organisms}


def get_samples(seq_dir, pairs):
    """Score every selected pair and collect the sequences those pairs use.

    For each name in the module-level ``organisms`` the file
    ``<seq_dir>/<organism>_test.fasta`` is read as whitespace-separated
    ``<id> <sequence>`` lines. Each pair line in ``pairs[organism]`` is
    scored with ``get_score`` and the module-level ``orga_max_scores`` /
    ``orga_min_scores`` trackers are updated in place with the score.

    Args:
        seq_dir: Directory that holds the ``<organism>_test.fasta`` files.
        pairs: Mapping organism -> list of ``"<id1> <id2> <label>"`` lines.
            The third column is ignored.

    Returns:
        ``(seqs, scored_pairs)``, both ``defaultdict(list)`` keyed by organism:
        ``seqs`` holds ``"<id>\\t<sequence>\\n"`` lines for every protein that
        occurs in a pair, in first-seen order; ``scored_pairs`` holds
        ``"<id1>\\t<id2>\\t<score>\\n"`` lines in input order.

    Raises:
        KeyError: If an organism is missing from ``pairs`` or a protein id is
            missing from the sequence file.
        ValueError: If a non-blank sequence line does not have exactly two
            columns, or a pair line does not have exactly three.
        OSError: If a sequence file cannot be opened.
    """
    seqs = defaultdict(list)
    _pairs = defaultdict(list)
    for orga in organisms:
        orga_fasta_fp = os.path.join(seq_dir, orga + "_test.fasta")

        # Load id -> sequence. Blank lines are skipped instead of crashing the
        # whole load, and the handle is closed as soon as reading is done.
        orga_seq_dict = {}
        with open(orga_fasta_fp, "r") as fh:
            for raw in fh:
                fields = raw.split()
                if not fields:
                    continue
                pro_id, sequence = fields  # exactly two columns expected
                orga_seq_dict[pro_id] = sequence

        # A dict serves as an insertion-ordered set: iterating a plain set of
        # strings would make the output order differ from run to run.
        pros = {}
        for p in pairs[orga]:
            fpro, spro, _ = p.strip().split()
            pros[fpro] = None
            pros[spro] = None
            _score = get_score(orga_seq_dict[fpro], orga_seq_dict[spro])

            # Side effect: track the global per-organism score range.
            orga_max_scores[orga] = max(orga_max_scores[orga], _score)
            orga_min_scores[orga] = min(orga_min_scores[orga], _score)

            _pairs[orga].append(f"{fpro}\t{spro}\t{_score}\n")
        seqs[orga] = [f"{pro}\t{orga_seq_dict[pro]}\n" for pro in pros]
    return seqs, _pairs

In [21]:
# Input locations, relative to the current working directory.
seq_dir = "./data/dscript/processed/seqs"
pair_dir = "./data/dscript/processed/pairs"

# Sampling sizes: N_PAIRS pairs are drawn per organism; after shuffling, the
# first N_TRAIN go to the train split and the remainder to the test split.
N_PAIRS = 2000
N_TRAIN = 1600

# Reset the global score-range trackers so that re-running this cell does not
# normalise against extremes left over from a previous run's sample. On a
# fresh kernel this is a no-op (these are the trackers' initial values).
for orga in organisms:
    orga_max_scores[orga] = -1000
    orga_min_scores[orga] = 1000

pairs = get_pairs(pair_dir=pair_dir, k=N_PAIRS)
train_pairs = {}
test_pairs = {}
for orga in pairs:
    # Kept so the random stream (and hence the split) is unchanged.
    random.shuffle(pairs[orga])
    train_pairs[orga] = pairs[orga][:N_TRAIN]
    test_pairs[orga] = pairs[orga][N_TRAIN:]

# get_samples replaces the raw pair lines with "<id1>\t<id2>\t<score>" lines
# and updates the per-organism min/max trackers across both splits.
train_seqs, train_pairs = get_samples(seq_dir=seq_dir, pairs=train_pairs)
test_seqs, test_pairs = get_samples(seq_dir=seq_dir, pairs=test_pairs)

# Min-max normalisation of the pair scores, per organism, using the score
# range tracked in orga_min_scores / orga_max_scores.
def _rescale_pairs(pair_lines, lo, hi):
    """Rescale the score column of ``"<id1> <id2> <score>"`` lines to [lo, hi] -> [0, 1].

    Args:
        pair_lines: Iterable of whitespace-separated three-column lines.
        lo: Score mapped to 0.
        hi: Score mapped to 1.

    Returns:
        List of ``"<id1>\\t<id2>\\t<score>\\n"`` lines with the rescaled score
        rounded to 5 decimals. When ``hi == lo`` every score becomes 0.0.
    """
    span = hi - lo
    rescaled = []
    for line in pair_lines:
        id1, id2, score = line.strip().split()
        # A zero span means all scores are identical; emit 0.0 instead of
        # dividing by zero.
        norm = (float(score) - lo) / span if span else 0.0
        rescaled.append(f"{id1}\t{id2}\t{round(norm, 5)}\n")
    return rescaled


for orga in organisms:
    lo, hi = orga_min_scores[orga], orga_max_scores[orga]
    # Train and test share the same range so their scores are comparable.
    train_pairs[orga] = _rescale_pairs(train_pairs[orga], lo, hi)
    test_pairs[orga] = _rescale_pairs(test_pairs[orga], lo, hi)

# Make sure both output directories exist before anything is written.
for out_dir in (
    "./experiments/8.align_score_prediction/data/train",
    "./experiments/8.align_score_prediction/data/test",
):
    os.makedirs(out_dir, exist_ok=True)

# For every organism write four files: sequences (.seq) and pairs (.tsv)
# for the test split, then the same for the train split.
for orga in organisms:
    outputs = (
        (f"./experiments/8.align_score_prediction/data/test/{orga}.seq", test_seqs[orga]),
        (f"./experiments/8.align_score_prediction/data/test/{orga}.tsv", test_pairs[orga]),
        (f"./experiments/8.align_score_prediction/data/train/{orga}.seq", train_seqs[orga]),
        (f"./experiments/8.align_score_prediction/data/train/{orga}.tsv", train_pairs[orga]),
    )
    for out_path, rows in outputs:
        with open(out_path, "w") as f:
            f.writelines(rows)
