In [1]:
%cd ..
%pwd

/home/user/Documents/Uni/Master/PP2/pp2-2023


'/home/user/Documents/Uni/Master/PP2/pp2-2023'

In [2]:
from pathlib import Path
from typing import Union

import polars as pl
from Bio import SeqIO

from src.dataset.trizod_scores.parse import read_score_csv

from typing import Set
import h5py
import torch
import numpy as np
from collections import defaultdict

In [3]:
def read_cluster_assignments(filepath: Union[Path, str]) -> pl.DataFrame:
    return pl.read_csv(
        filepath,
        separator="\t",
        has_header=False,
        new_columns=["cluster_representative_id", "sequence_id"]
    )

def filter_scores(score_csv: pl.DataFrame, ids: Set[str]) -> pl.DataFrame:
    return score_csv.filter(score_csv["ID"].is_in(ids))[
        ["ID", "pscores"]
    ].group_by("ID").agg(pl.col("pscores"))

def to_file(data: pl.DataFrame, embs: h5py.File, name: str):
    with h5py.File(f"data/{name}.h5", 'w') as f:
        embedding = f.create_group("embedding")
        trizod = f.create_group("trizod")
        cluster_group: Group = f.create_group("cluster")
        cluster = defaultdict(list)

        for row in data.rows():
            cluster[row[2]].append(row[0])
            trizod[row[0]] = torch.tensor(np.array(row[1], dtype=np.float32))
            embedding[row[0]] = torch.tensor(embs[row[0]])

        for key in cluster:
            cluster_group[key] = cluster[key]
        

In [4]:
datasets = ["unfiltered", "tolerant", "moderate", "strict"]
clusters = {
    dataset: read_cluster_assignments(f"data/clusters/{dataset}_rest_clu.tsv")
    for dataset in datasets
}
score_csv = {dataset: read_score_csv(f"data/{dataset}.csv") for dataset in datasets}
embs = h5py.File("data/embeddings/unfiltered_all_esm2_3b.h5", "r")


In [5]:
ids = {dataset: set(score_csv[dataset]["ID"]) for dataset in score_csv}
test_ids = set(rec.id for rec in SeqIO.parse(f"data/TriZOD_test_set.fasta", "fasta"))

In [6]:
train_ids = {dataset: ids[dataset] - test_ids for dataset in ids}

In [7]:
test_data = filter_scores(score_csv["strict"], test_ids)
training_data = {
    dataset: filter_scores(score_csv[dataset], train_ids[dataset])
    for dataset in datasets
}


In [8]:
training_data = {
    dataset: training_data[dataset].join(clusters[dataset], left_on="ID", right_on="sequence_id")
    for dataset in datasets
}

test_data = test_data.with_columns(test_data["ID"].alias("cluster_representative_id"))

In [9]:
to_file(test_data, embs, "test")

for dataset in datasets:
    to_file(training_data[dataset], embs, f"train_{dataset}")

  embedding[row[0]] = torch.tensor(embs[row[0]])
