In [3]:
import numpy as np
import h5py
import os
import ast

# Create the dataset
- Amino Acid sequence (variable length)
- Z-scores

1. Dataset with AA sequence and protein ID
2. Dataset with Z-scores and protein ID

Dataset is not tabular: different sequence lengths -> different lengths of z-score lists
-> If in-memory: dictionary?

Recap of task:
- Input: AA **Embeddings** of variable shape (N, (1, D_i)), where i is the i-th amino acid embedding
- Output: Z-score array (one score for each AA in the sequence) of variable shape (N, (1, D_i))


1. Create a dataset (and dataloader?)
    - Store embeddings and z_scores in different dictionaries
    - sequence_ids are keys

In [17]:
class Dataset:
    
    def __init__(self, embeddings_path, scores_path, seq_ids_path, split={"train": 0.8, "val": 0.2}):
        
        self.raw_ids = None
        self.raw_seqs = None
        self.embeddings = None
        self.scores = None
        
        assert os.path.exists(embeddings_path), f"Embeddings file not found in path: {embeddings_path}"
        assert os.path.exists(scores_path), f"Scores file not found in path: {scores_path}"
        assert os.path.exists(scores_path), f"Scores file not found in path: {seq_ids_path}"
        assert sum(split.values()) == 1.0, "Invalid split"
        
        self.embeddings_path = embeddings_path
        self.scores_path = scores_path
        self.seq_ids_path = seq_ids_path
        
        self.split = split
        
        self.create_dataset()
        pass
    
    def __getitem__(self, idx):
        
        if isinstance(idx, slice):
            raise(Exception("Not yet implemented :(("))
        elif isinstance(idx, int):
            idx = self.raw_ids[idx]
            print(f"idx: {idx}")
        elif not isinstance(idx, str):
            raise(ValueError("Invalid index type"))
        
        return (self.embeddings[idx], self.scores[idx], self.raw_seqs[idx])    
    
    def __len__(self):
        pass
    
    def create_dataset(self):
        
        with h5py.File(self.embeddings_path, "r") as embeddings_h5:
            self.embeddings = {id: np.array(embs) for id, embs in embeddings_h5.items()}

        with open(self.scores_path, "r") as scores_file:
            lines = scores_file.readlines()
            raw_scores = [np.array(ast.literal_eval(line)) for line in lines]
        with open(self.seq_ids_path, "r") as id_file:
            lines = id_file.readlines()
            self.raw_ids = [str(line.split(" ")[0]) for line in lines]
            raw_seqs = [line.split(" ")[1].strip() for line in lines]
        
        self.scores = dict(zip(self.raw_ids, raw_scores))
        self.raw_seqs = dict(zip(self.raw_ids, raw_seqs))
        
        pass
    

In [18]:
data_dir = "../../data/raw/"
embeddings_path = os.path.join(data_dir, "t5_xl_u50_v3_EncOnlyHalfPrec_CheZOD_1325_nRtoTestAt20_nrWithinAt20.h5")
scores_path = os.path.join(data_dir, "allscores1325newest.txt")
id_path = os.path.join(data_dir, "allseqs1325.txt")

In [6]:
first_dataset = Dataset(embeddings_path=embeddings_path, scores_path=scores_path, seq_ids_path=id_path)

In [15]:
embd, score, seq = first_dataset[0]

idx: 26672


In [10]:
embd.shape

(163, 1024)

In [14]:
score.shape

(163,)

In [24]:
embd.shape

(163, 1024)

In [27]:
len(seq)

163

In [16]:
seq

'MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTDTSGYGQSSYSSYGQSQNTGYGTQSTPQGYGSTGGYGSSQSSQSSYGQQSSYPGYGQQPAPSSTSGSYGSSSQSSSYGQPQSGSYSQQPSYGGQQQSYGQQQSYNPPQGYGQQNQYNS'