## TAPE Models

Trying out: https://github.com/songlab-cal/tape

In [10]:
import json
from pathlib import Path

import torch
from torch.utils.data import DataLoader, Dataset

from tape import ProteinBertModel
from tape.tokenizers import TAPETokenizer

In [11]:
# Load the pretrained TAPE transformer together with its tokenizer.
# 'iupac' is the vocab for TAPE models; use 'unirep' for the UniRep model.
model = ProteinBertModel.from_pretrained('bert-base')
tokenizer = TAPETokenizer(vocab='iupac')

# Example input -- Pfam Family: Hexapep, Clan: CL0536
sequence = 'GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ'

# The encoded sequence is wrapped in a one-element list so the resulting
# tensor has a leading axis of size 1.
token_ids = torch.tensor([tokenizer.encode(sequence)])

output = model(token_ids)
sequence_output, pooled_output = output[0], output[1]

# NOTE: pooled_output is *not* trained for the transformer, so do not use it
# without fine-tuning. A better option for now is to simply take a mean of
# the sequence output.

## Reproducing Secondary Structure Results

We would like to reproduce the Secondary Structure results reported by the TAPE authors, in order to better understand how the model and its training setup work.

In [12]:
# Relative location of the raw JSON training split.
# NOTE(review): this section is about secondary structure, yet the file loaded
# here is the remote-homology training split -- confirm this is intended.
data = Path('data/remote_homology/')
train = data / 'remote_homology_train.json'

# Parse the whole file into memory in one go.
with train.open("r") as f:
    training_set = json.load(f)

In [30]:
class SecondaryStructureDataset(Dataset):
    """Dataset pairing each record's ``primary`` entry with its ``secondary_structure`` labels.

    Every item is returned as a pair of ``torch.long`` tensors ``(x, y)``
    placed on ``self.device``.

    NOTE(review): assumes ``entry['primary']`` already holds integer token ids;
    raw amino-acid strings would have to be tokenized first -- confirm against
    the JSON schema.

    Args:
        raw_json: Iterable of dict records, each providing the keys
            ``'primary'`` and ``'secondary_structure'``.
        device: Optional target device (anything ``torch.device`` accepts).
            When omitted, CUDA is used if it is available and the CPU
            otherwise, so the dataset no longer crashes on machines without
            a GPU.
    """

    def _load_from_json(self, raw_json):
        """Split the records into parallel lists of inputs and labels.

        Raises:
            KeyError: If a record lacks ``'primary'`` or ``'secondary_structure'``.
        """
        x = []
        y = []

        # Single pass, so one-shot iterables (e.g. generators) are supported too.
        for entry in raw_json:
            x.append(entry['primary'])
            y.append(entry['secondary_structure'])

        return x, y

    def __init__(self, raw_json, device=None):
        self.items, self.labels = self._load_from_json(raw_json)

        if device is None:
            # Previous behaviour (always CUDA) is preserved whenever a GPU exists.
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device)

    def __getitem__(self, idx):
        """Return the ``(input, label)`` tensor pair stored at position ``idx``."""
        # Build int64 tensors directly: going through the float32 default of
        # ``torch.Tensor(...)`` silently rounds integers larger than 2**24.
        x = torch.tensor(self.items[idx], dtype=torch.long, device=self.device)
        y = torch.tensor(self.labels[idx], dtype=torch.long, device=self.device)

        return x, y

    def __len__(self):
        """Number of records in the dataset."""
        return len(self.items)

In [31]:
# Wrap the parsed JSON records so they can be indexed as (input, label) pairs.
train_ds = SecondaryStructureDataset(raw_json=training_set)

In [32]:
class SecondaryStructureDatasetModel(torch.nn.Module):
    """Pretrained ``ProteinBertModel`` followed by a 768 -> 3 linear layer.

    The linear layer is applied to the first of the two values returned by
    the wrapped model. Presumably the three outputs correspond to the
    secondary-structure classes -- confirm against the label encoding.
    """

    def __init__(self):
        super().__init__()
        # Pretrained backbone loaded from the 'bert-base' checkpoint.
        self.model = ProteinBertModel.from_pretrained('bert-base')
        # Kept on the instance for convenience; ``forward`` does not use it.
        self.tokenizer = TAPETokenizer(vocab='iupac')
        # Projection from the 768 backbone features to 3 outputs.
        self.linear = torch.nn.Linear(in_features=768, out_features=3)

    def forward(self, x):
        """Run ``x`` through the backbone and project its first output to 3 values."""
        # The backbone is expected to return exactly two values; the second is discarded.
        features, _unused = self.model(x)
        return self.linear(features)

In [33]:
# Build the classifier and move it to the GPU when one is present; fall back to
# the CPU instead of raising on machines without CUDA.
# NOTE: this rebinds `model`, which earlier held the bare ProteinBertModel.
model = SecondaryStructureDatasetModel()
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')