In [1]:
import numpy as np
import tensorflow as tf
from transformers import T5EncoderModel, T5Tokenizer
import torch
import h5py
import time
import argparse
import os

In [2]:
class ConvNet(torch.nn.Module):
    """Small convolutional network with one shared layer and three output heads.

    A shared convolution reduces 1024 input channels to 32; three separate
    convolutions then produce 3, 8 and 2 output channels per sequence position.
    Attribute names are kept stable because they determine the ``state_dict``
    keys used when loading a checkpoint.
    """

    def __init__(self):
        super().__init__()
        nn = torch.nn
        # Every convolution slides a window of 7 positions along the sequence
        # axis; padding of 3 keeps the sequence length unchanged.
        window = dict(kernel_size=(7, 1), padding=(3, 0))
        hidden_channels = 32

        self.elmo_feature_extractor = nn.Sequential(
            nn.Conv2d(1024, hidden_channels, **window),
            nn.ReLU(),
            nn.Dropout(0.25),
        )
        self.dssp3_classifier = nn.Sequential(nn.Conv2d(hidden_channels, 3, **window))
        self.dssp8_classifier = nn.Sequential(nn.Conv2d(hidden_channels, 8, **window))
        self.diso_classifier = nn.Sequential(nn.Conv2d(hidden_channels, 2, **window))

    def forward(self, x):
        """Map ``x`` of shape (B, L, F) to three tensors of shape (B, L, 3), (B, L, 8), (B, L, 2)."""
        # (B, L, F) -> (B, F, L, 1) so Conv2d treats the features as channels.
        channels_first = x.permute(0, 2, 1).unsqueeze(dim=-1)
        shared = self.elmo_feature_extractor(channels_first)  # (B, 32, L, 1)

        def run_head(head):
            # (B, C, L, 1) -> (B, L, C)
            return head(shared).squeeze(dim=-1).permute(0, 2, 1)

        d3_Yhat = run_head(self.dssp3_classifier)
        d8_Yhat = run_head(self.dssp8_classifier)
        diso_Yhat = run_head(self.diso_classifier)
        return d3_Yhat, d8_Yhat, diso_Yhat

In [3]:
def load_sec_struct_model(
    checkpoint_path="D:/htchang/DPCR/Data/PortTrans/protT5/sec_struct_checkpoint/secstruct_checkpoint.pt",
):
    """Build a ``ConvNet`` and load its weights from a checkpoint file.

    Args:
        checkpoint_path: Path to a checkpoint whose ``'state_dict'`` entry holds
            the ``ConvNet`` weights. Defaults to the location used originally.

    Returns:
        The ``ConvNet`` in evaluation mode, moved to the notebook-level ``device``.

    Note:
        Relies on the global ``device`` being defined before this is called.
    """
    # map_location remaps stored tensors onto the active device, so a checkpoint
    # saved from a GPU still loads on a CPU-only machine.
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    state = torch.load(checkpoint_path, map_location=device)
    model = ConvNet()
    model.load_state_dict(state['state_dict'])
    model = model.eval()
    model = model.to(device)

    return model

In [4]:
def read_fasta(fasta_path, split_char="!", id_field=0):
    """Read a FASTA file and return it as a one-element list of ``(id, sequence)``.

    Header lines (those starting with ``>``) are dropped and every remaining
    line is stripped and concatenated, so a file with several records yields
    one joined sequence. The id is the file name without directory or
    extension. ``split_char`` and ``id_field`` are accepted but not used.
    """
    with open(fasta_path, 'r') as handle:
        residue_chunks = [row.strip() for row in handle if not row.startswith('>')]

    # The identifier comes from the file name, not from the FASTA header.
    stem, _extension = os.path.splitext(os.path.basename(fasta_path))
    return [(stem, ''.join(residue_chunks))]
print("hi there")

hi there


In [5]:
def get_T5_model():
    """Load the ProtT5 encoder and its tokenizer.

    Returns:
        ``(model, tokenizer)``: the encoder moved to the notebook-level
        ``device`` and set to evaluation mode, and the matching tokenizer
        created with ``do_lower_case=False``.
    """
    checkpoint = "Rostlab/prot_t5_xl_half_uniref50-enc"
    # Rostlab/prot_t5_xl_uniref50

    # .to() and .eval() both return the module itself, so the calls can be chained.
    encoder = T5EncoderModel.from_pretrained(checkpoint).to(device).eval()
    vocab = T5Tokenizer.from_pretrained(checkpoint, do_lower_case=False)

    return encoder, vocab

In [6]:
def get_embeddings(model, tokenizer, seqs, max_residues=4000, max_seq_len=1000, max_batch=100):
    """Compute per-residue embeddings for a collection of sequences.

    Args:
        model: Callable as ``model(input_ids, attention_mask=...)`` returning an
            object with a ``last_hidden_state`` tensor.
        tokenizer: Object providing ``batch_encode_plus``.
        seqs: Iterable of ``(identifier, sequence)`` pairs.
        max_residues: Process the batch once its residue count reaches this value.
        max_seq_len: A sequence longer than this triggers processing immediately.
        max_batch: Maximum number of sequences per batch.

    Returns:
        ``{"residue_embs": {identifier: ndarray}}``. Each array holds the first
        ``len(sequence)`` rows of the model output for that sequence. Sequences
        in a batch that raised ``RuntimeError`` are missing from the mapping.
        Empty input yields an empty mapping.

    Note:
        Relies on the notebook-level ``device`` being defined.
        NOTE(review): the ProtT5 model card recommends mapping rare residues
        (U, Z, O, B) to X before tokenising; that is not done here. Confirm
        whether inputs are already cleaned.
    """
    results = {"residue_embs": dict()}

    # Longest first: similar lengths end up together, which reduces padding.
    ordered = sorted(seqs, key=lambda x: len(x[1]), reverse=True)
    batch = []

    for seq_idx, (pdb_id, seq) in enumerate(ordered, 1):
        seq_len = len(seq)
        # The tokenizer expects residues separated by spaces.
        batch.append((pdb_id, ' '.join(list(seq)), seq_len))

        # The current length is added a second time as headroom, so a batch is
        # flushed before the next sequence could push it past max_residues.
        n_res_batch = sum(s_len for _, _, s_len in batch) + seq_len
        flush_now = (
            len(batch) >= max_batch
            or n_res_batch >= max_residues
            or seq_idx == len(ordered)
            or seq_len > max_seq_len
        )
        if not flush_now:
            continue

        pdb_ids, batch_seqs, seq_lens = zip(*batch)
        batch = []

        # add_special_tokens appends an extra token to the end of each sequence.
        token_encoding = tokenizer.batch_encode_plus(batch_seqs, add_special_tokens=True, padding="longest")
        input_ids = torch.tensor(token_encoding['input_ids']).to(device)
        attention_mask = torch.tensor(token_encoding['attention_mask']).to(device)

        try:
            with torch.no_grad():
                # (batch size, longest sequence in batch, embedding dim)
                embedding_repr = model(input_ids, attention_mask=attention_mask)
        except RuntimeError:
            # Best effort: report the failure and carry on with the next batch.
            print("RuntimeError during embedding for {} (L={})".format(pdb_id, seq_len))
            continue

        for batch_idx, identifier in enumerate(pdb_ids):
            s_len = seq_lens[batch_idx]
            # Drop padding and the trailing special token.
            emb = embedding_repr.last_hidden_state[batch_idx, :s_len]
            results["residue_embs"][identifier] = emb.detach().cpu().numpy().squeeze()

    return results

In [7]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print("Using {}".format(device))

# Load the encoder and tokenizer once; later cells use these as globals.
model, tokenizer = get_T5_model()

Using cpu


In [8]:
def save_port_map(port_data,output_file):
    """Write ``port_data`` to ``output_file`` as text using ``numpy.savetxt`` defaults."""
    np.savetxt(fname=output_file, X=port_data)

In [9]:
import os
def main(input_folder="D:/Malik/SATs/Sugar vs others/Dataset", output_folder="D:/Malik/SATs/Sugar vs others/Prottrans"):
    """Embed every FASTA file under ``input_folder`` and save the result as text.

    The expected layout is ``<input_folder>/<class>/<split>/<file>`` with class
    in ``{"Other SATs", "Sugar"}`` and split in ``{"Train", "Test"}``. The same
    layout is recreated under ``output_folder``, and each output file keeps the
    name of its input file (extension included).

    Args:
        input_folder: Root folder holding the FASTA files.
        output_folder: Root folder for the embeddings; created if missing.

    Note:
        Uses the notebook-level ``model`` and ``tokenizer``.
    """
    os.makedirs(output_folder, exist_ok=True)

    for data_folder in ["Other SATs", "Sugar"]:
        data_folder_path = os.path.join(input_folder, data_folder)
        output_data_folder_path = os.path.join(output_folder, data_folder)
        os.makedirs(output_data_folder_path, exist_ok=True)

        for folder_name in ["Train", "Test"]:
            input_folder_path = os.path.join(data_folder_path, folder_name)
            output_folder_path = os.path.join(output_data_folder_path, folder_name)
            os.makedirs(output_folder_path, exist_ok=True)

            # os.listdir order is arbitrary; sort so runs are reproducible.
            for file_name in sorted(os.listdir(input_folder_path)):
                fasta_file = os.path.join(input_folder_path, file_name)
                # Skip sub-directories and anything else that is not a regular file.
                if not os.path.isfile(fasta_file):
                    continue
                output_file = os.path.join(output_folder_path, file_name)

                # read_fasta uses the file stem as the sequence id, so the
                # same stem is the key into the results.
                filename = os.path.splitext(os.path.basename(fasta_file))[0]
                seqs = read_fasta(fasta_file)
                results = get_embeddings(model, tokenizer, seqs)

                # get_embeddings omits sequences whose batch raised RuntimeError;
                # report and move on instead of failing with KeyError.
                embeddings = results['residue_embs'].get(filename)
                if embeddings is None:
                    print("No embedding produced for {}; skipping".format(fasta_file))
                    continue

                save_port_map(embeddings, output_file)


In [10]:
# Run the export with main()'s default input and output folders.
main()


## For SLC Data

In [9]:
import os

# read_fasta, get_embeddings, and save_port_map are defined in earlier cells; run those cells first.

def main(input_folder="D:/Malik/SATs/SLC_Data", output_folder="D:/Malik/SATs/Amino acid vs others/Prottrans/SLC"):
    """Embed every FASTA file of the SLC data set and save the result as text.

    The expected layout is ``<input_folder>/<class>/<file>`` with class in
    ``{"Amino acid", "Sodium", "Sugar"}``. The same layout is recreated under
    ``output_folder``, and each output file keeps the name of its input file
    (extension included).

    Args:
        input_folder: Root folder holding the FASTA files.
        output_folder: Root folder for the embeddings; created if missing.

    Note:
        Uses the notebook-level ``model`` and ``tokenizer``.
    """
    os.makedirs(output_folder, exist_ok=True)

    for data_folder in ["Amino acid", "Sodium", "Sugar"]:
        data_folder_path = os.path.join(input_folder, data_folder)
        output_data_folder_path = os.path.join(output_folder, data_folder)
        os.makedirs(output_data_folder_path, exist_ok=True)

        # os.listdir order is arbitrary; sort so runs are reproducible.
        for file_name in sorted(os.listdir(data_folder_path)):
            fasta_file = os.path.join(data_folder_path, file_name)
            # Skip sub-directories and anything else that is not a regular file.
            if not os.path.isfile(fasta_file):
                continue
            output_file = os.path.join(output_data_folder_path, file_name)

            # read_fasta uses the file stem as the sequence id, so the same
            # stem is the key into the results.
            filename = os.path.splitext(os.path.basename(fasta_file))[0]
            seqs = read_fasta(fasta_file)
            results = get_embeddings(model, tokenizer, seqs)

            # get_embeddings omits sequences whose batch raised RuntimeError;
            # report and move on instead of failing with KeyError.
            embeddings = results['residue_embs'].get(filename)
            if embeddings is None:
                print("No embedding produced for {}; skipping".format(fasta_file))
                continue

            save_port_map(embeddings, output_file)

# Run main() with its default folders when this code executes as the top-level module.
if __name__ == "__main__":
    main()


In [3]:
# Data splitting
import os
from sklearn.model_selection import train_test_split
import shutil

def split_data(input_folder, output_train_folder, output_test_folder, test_size=0.2, random_state=42):
    """Copy the files of ``input_folder`` into a train folder and a test folder.

    Files are copied, not moved, so the input folder is left untouched.

    Args:
        input_folder: Folder whose regular files are split.
        output_train_folder: Destination for the training files; created if missing.
        output_test_folder: Destination for the test files; created if missing.
        test_size: Passed through to ``train_test_split``.
        random_state: Passed through to ``train_test_split``.
    """
    os.makedirs(output_train_folder, exist_ok=True)
    os.makedirs(output_test_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order, so sort first: with a fixed
    # random_state the same files then land in the same split on every machine.
    # Sub-directories are excluded because shutil.copyfile cannot copy them.
    files = sorted(
        name for name in os.listdir(input_folder)
        if os.path.isfile(os.path.join(input_folder, name))
    )

    train_files, test_files = train_test_split(files, test_size=test_size, random_state=random_state)

    for destination_folder, names in (
        (output_train_folder, train_files),
        (output_test_folder, test_files),
    ):
        for name in names:
            shutil.copyfile(
                os.path.join(input_folder, name),
                os.path.join(destination_folder, name),
            )

# Source folders holding the complete data for each class.
# Despite their names, the *amino_acids* variables point at the Sugar class.
amino_acids_folder = 'D:/Malik/SATs/Complete Data/Sugar vs Other/Sugar'
other_folder = 'D:/Malik/SATs/Complete Data/Sugar vs Other/Other'

# Destination folders for the Sugar class.
train_amino_acids_folder = 'D:/Malik/SATs/Sugar vs others/Dataset/Sugar/Train'
test_amino_acids_folder = 'D:/Malik/SATs/Sugar vs others/Dataset/Sugar/Test'

# Destination folders for the Other class.
train_other_folder = 'D:/Malik/SATs/Sugar vs others/Dataset/Other/Train'
test_other_folder = 'D:/Malik/SATs/Sugar vs others/Dataset/Other/Test'

# Split the Sugar class into train and test.
split_data(
    input_folder=amino_acids_folder,
    output_train_folder=train_amino_acids_folder,
    output_test_folder=test_amino_acids_folder,
)

# Split the Other class into train and test.
split_data(
    input_folder=other_folder,
    output_train_folder=train_other_folder,
    output_test_folder=test_other_folder,
)


In [None]:
import os

def count_fasta_files(folder_path):
    """Count and print the FASTA files directly inside ``folder_path``.

    An entry counts as FASTA when its lower-cased name ends in ``.fasta`` or
    ``.fa``. The count and each matching name are printed.

    Returns:
        The number of matching entries, or ``None`` (after printing an error)
        when ``folder_path`` is not a directory.
    """
    if os.path.isdir(folder_path):
        matches = []
        for entry in os.listdir(folder_path):
            # Case-insensitive match on the extension.
            if entry.lower().endswith(('.fasta', '.fa')):
                matches.append(entry)
        total = len(matches)

        print(f"Number of FastA files in '{folder_path}': {total}")
        print("List of FastA files:")
        for entry in matches:
            print(entry)

        return total

    print(f"Error: '{folder_path}' is not a valid directory.")
    return None

# Folder whose FASTA files are counted.
folder_path = 'D:/Malik/SATs/Amino acid vs others/Dataset/Other SATs/Train'

# count is None when folder_path is not a directory; otherwise it is the number of FASTA files.
count = count_fasta_files(folder_path)

Number of FastA files in 'D:/Malik/SATs/Amino acid vs others/Dataset/Other SATs/Train': 318
List of FastA files:
A0A3Q7ZPG5.fasta
A0A494BA31.fasta
A0AV02.fasta
A0PJK1.fasta
A2ARP9.fasta
A8MYU2.fasta
B2RXE2.fasta
D3Z291.fasta
D3ZJ86.fasta
D4AD53.fasta
E7FKV8.fasta
E9PQ53.fasta
E9Q3M5.fasta
G3X943.fasta
H1AFJ5.fasta
O00180.fasta
O00476.fasta
O00555.fasta
O08962.fasta
O13001.fasta
O14569.fasta
O14949.fasta
O14957.fasta
O15239.fasta
O35119.fasta
O35174.fasta
O35240.fasta
O35316.fasta
O35458.fasta
O35526.fasta
O43497.fasta
O43526.fasta
O43826.fasta
O43920.fasta
O54982.fasta
O55143.fasta
O55192.fasta
O60721.fasta
O60928.fasta
O70578.fasta
O70594.fasta
O75185.fasta
O88427.fasta
O88454.fasta
O88457.fasta
O88602.fasta
O88704.fasta
O88944.fasta
O95069.fasta
O95139.fasta
O95167.fasta
O95168.fasta
O95182.fasta
O95259.fasta
O95528.fasta
P00158.fasta
P00174.fasta
P03891.fasta
P03892.fasta
P03893.fasta
P03899.fasta
P03903.fasta
P03905.fasta
P03911.fasta
P03915.fasta
P03920.fasta
P03921.fasta
P03923.f