# ***Classification of TRP Channels from Non-TRP Channels***
***20480 features for BERT Large Cased***

In [1]:
# Mount Google Drive at /content/gdrive so files stored there can be read by later cells.
from google.colab import drive 
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# **Check whether the Colab GPU memory is shared with other users**

In [2]:
# Memory footprint support libraries/code
# If in case, the utilization is greater than 0% try to kill using the code (!kill -9 -1). 
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
    """Print host RAM / process size and the memory figures of the module-level ``gpu``."""
    current_process = psutil.Process(os.getpid())
    free_host_ram = humanize.naturalsize(psutil.virtual_memory().available)
    resident_size = humanize.naturalsize(current_process.memory_info().rss)
    # Two positional arguments on purpose: print() inserts its separator between them.
    print("Gen RAM Free: " + free_host_ram, " | Proc size: " + resident_size)

    gpu_values = (gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal)
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(*gpu_values))
printm() 

Collecting gputil
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-cp37-none-any.whl size=7411 sha256=ba599eaeda97a205ce979df47bcd67f0a65ccc5251fa1a69fa1b964575880b67
  Stored in directory: /root/.cache/pip/wheels/3d/77/07/80562de4bb0786e5ea186911a2c831fdd0018bda69beab71fd
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0
Gen RAM Free: 12.7 GB  | Proc size: 118.6 MB
GPU RAM Free: 15109MB | Used: 0MB | Util   0% | Total 15109MB


# **Load Packages**

In [5]:
import pandas as pd 
import time
import csv
import json
import numpy as np
import math

# **Partition the protein sequence into subparts of length**

In [7]:
# Cut string to list
def cut_string(input_str, x):
    """Split ``input_str`` into consecutive pieces of length ``x``.

    The final piece is shorter when the length is not a multiple of ``x``;
    an empty input gives an empty list.
    """
    pieces = []
    for start in range(0, len(input_str), x):
        pieces.append(input_str[start:start + x])
    return pieces

# **Generate N-gram**

In [8]:
# CREATE N-GRAMS DATA

'''
 A function to split the data into n-gram feature
'''
def ngrams(input, n):
  """Return the overlapping n-grams of ``input`` as one space-separated string.

  Only the first 510 characters are considered (the notebook feeds BERT with a
  maximum sequence length of 512). When ``n`` exceeds the kept length the
  result is an empty string.
  """
  # Slicing already copes with inputs shorter than 510 characters.
  window = input[:510]

  # One gram per start position; the last gram ends on the last kept character.
  grams = [window[start:start + n] for start in range(0, (len(window) + 1) - n)]

  return ' '.join(grams)

# **Installation/clone Bert**

In [9]:
import sys
!test -d bert_repo || git clone https://github.com/google-research/bert bert_repo
# Make the cloned BERT sources importable; the guard keeps the entry from being added twice.
if 'bert_repo' not in sys.path:
  sys.path.append('bert_repo')

  

Cloning into 'bert_repo'...
remote: Enumerating objects: 340, done.[K
remote: Total 340 (delta 0), reused 0 (delta 0), pack-reused 340[K
Receiving objects: 100% (340/340), 328.28 KiB | 2.88 MiB/s, done.
Resolving deltas: 100% (182/182), done.


In [11]:
# Download bert model
# BERT-Large, Cased: 24-layer, 1024-hidden, 16-heads, 340M parameters
!wget https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip

--2021-06-05 04:09:44--  https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.141.128, 2607:f8b0:4023:c0b::80
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.141.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1242178883 (1.2G) [application/zip]
Saving to: ‘cased_L-24_H-1024_A-16.zip’


2021-06-05 04:09:59 (82.9 MB/s) - ‘cased_L-24_H-1024_A-16.zip’ saved [1242178883/1242178883]



# **Extract/unzip BERT Large Cased model**

In [12]:
# Extract all files of the downloaded BERT archive.
import zipfile

# Destination directory for the unpacked archive.
folder = 'model_folder'
# The context manager closes the archive once every member has been written to `folder`.
with zipfile.ZipFile("cased_L-24_H-1024_A-16.zip","r") as zip_ref:
    zip_ref.extractall(folder)

In [13]:
!pip install tensorflow-gpu==1.15.2
import modeling
import optimization
import run_classifier
import run_classifier_with_tfhub
import tokenization
import tensorflow as tf
# import tfhub 
import tensorflow_hub as hub
import zipfile
import os




# **Get embeddings for input data files**

In [14]:
# Calculate a logistic sigmoid function
# def sigmoid(x):
#   return 1 / (1 + math.exp(-x))
data_bert_new = pd.DataFrame()
# Extract features for n-gram embeddings
def extractFeatureEmbedingJSONL(input_jsonl_file_path):
  """Convert a BERT feature dump (JSONL) into a per-token embedding table.

  Every non-blank line of the file is parsed as one JSON record. A record's
  ``"features"`` list holds one entry per token with a ``"token"`` string and a
  ``"layers"`` list; only the FIRST entry of ``"layers"`` is used. With the
  ``--layers='-1,-2,-3,-4'`` extraction used in this notebook that is
  presumably the final layer (-1) -- confirm against extract_features.py.

  The first and last token of every record are removed (they are expected to
  be the special tokens added around the sequence).

  Args:
    input_jsonl_file_path: path of the JSONL file to read.

  Returns:
    pandas.DataFrame with one row per kept token: integer-labelled columns
    holding the embedding values plus a ``'residue'`` column holding the token.
    Rows of all records are stacked in file order and keep their token
    position inside the record as index label (so labels repeat across
    records). A file without records gives an empty frame that only has the
    ``'residue'`` column.
  """
  per_record_frames = []

  # Stream the file: one JSON document per line, blank lines are ignored.
  with open(input_jsonl_file_path) as f:
    for line in f:
      if not line.strip():
        continue
      record = json.loads(line)

      tokens = []
      vectors = []
      for feature in record["features"]:
        tokens.append(feature["token"])
        # Only the first exported layer is needed; the others are not touched,
        # so dumps that contain a single layer work as well.
        vectors.append(feature["layers"][0]["values"])

      # One row per token, one column per embedding dimension.
      record_frame = pd.DataFrame(vectors)
      record_frame['residue'] = tokens

      # Positional slice drops the first and last token and, unlike a
      # label-based drop, does not fail on records with fewer than two tokens.
      per_record_frames.append(record_frame.iloc[1:-1].copy())

  if not per_record_frames:
    return pd.DataFrame(columns=['residue'])
  if len(per_record_frames) == 1:
    return per_record_frames[0]
  return pd.concat(per_record_frames)

# **Generate matrix from BERT representations**

In [15]:
def GenerateBERTEmbeddingMatrix(df_protein_bert_embeddings):
  """Collapse per-token embeddings into one fixed-length feature vector.

  Rows of the input are summed per value of the ``'residue'`` column. The
  sums are arranged as a matrix with one row for each of the 20 lower-case
  amino-acid letters (fixed order, zeros for letters that do not occur) and one
  column per embedding dimension; residues outside those 20 letters are
  discarded. The matrix is returned flattened row by row, i.e. 20 x (number of
  embedding columns) values.

  Args:
    df_protein_bert_embeddings: DataFrame with embedding columns and a
      ``'residue'`` column.

  Returns:
    1-D numpy array with the flattened 20-row matrix.
  """
  # Fixed row order of the output matrix (tokens arrive lower-cased).
  amino_acid_order = ['a','c','d','e','f','g','h','i','k','l','m','n','p','q','r','s','t','v','w','y']

  # Add up the embedding rows of identical residues.
  summed_by_residue = df_protein_bert_embeddings.groupby('residue', as_index=False).sum()
  print(summed_by_residue)

  # Residues become the column labels, embedding dimensions the rows.
  residue_columns = summed_by_residue.set_index('residue').T

  # Amino acids absent from this protein get an all-zero column.
  present = set(residue_columns.columns.tolist())
  for amino_acid in amino_acid_order:
    if amino_acid not in present:
      residue_columns[amino_acid] = 0

  # Keep only the 20 amino acids, in the fixed order, then put residues back on the rows.
  ordered_matrix = residue_columns[amino_acid_order].transpose()

  # Row-major flattening: all dimensions of 'a', then of 'c', and so on.
  return ordered_matrix.values.flatten()


In [16]:
# Note: Auto detect for GPU when set use_tpu=False (training will fall on CPU or GPU)
# From the jsonl file you have last 4 layers outputs or -1,-2,-3,-4
# Get embeddings for input data classifiers from Google Colab terminal command
def extractEmbeddingBertFeatures(df_data, bert_model_path='cased_L-24_H-1024_A-16'): 
    """Run BERT's ``extract_features.py`` on ``df_data`` and return the token embeddings.

    ``df_data`` is written (no header, no index, no quoting) to ``input.txt`` in
    the current directory, the extractor script from the cloned repository is
    run on it, and the resulting ``output.jsonl`` is parsed with
    ``extractFeatureEmbedingJSONL``. Both temporary files are removed
    afterwards, also when a step fails.

    Args:
        df_data: DataFrame whose rows are the text lines handed to BERT.
        bert_model_path: directory containing ``vocab.txt``,
            ``bert_config.json`` and the ``bert_model.ckpt`` checkpoint files.

    Returns:
        Whatever ``extractFeatureEmbedingJSONL`` returns for the produced file.

    Raises:
        subprocess.CalledProcessError: the extractor exited with a non-zero
            status (previously the failure went unnoticed until the output
            file was opened).
    """
    # Local import so this cell does not depend on an extra top-level import.
    import subprocess

    start_time = time.time()
    print("Bert Path: {0}".format(bert_model_path));

    input_path = 'input.txt'
    output_path = 'output.jsonl'

    # Argument list instead of a shell string: paths containing spaces or shell
    # metacharacters are passed through verbatim.
    # NOTE(review): tokens are lower-cased although the checkpoint name says
    # "cased"; GenerateBERTEmbeddingMatrix expects lower-case residues, so do
    # not change this without confirming how the classifier was trained.
    command = [
        "python3", "/content/bert_repo/extract_features.py",
        "--input_file={0}".format(input_path),
        "--output_file={0}".format(output_path),
        "--vocab_file={0}/vocab.txt".format(bert_model_path),
        "--bert_config_file={0}/bert_config.json".format(bert_model_path),
        "--init_checkpoint={0}/bert_model.ckpt".format(bert_model_path),
        "--layers=-1,-2,-3,-4",
        "--max_seq_length=512",
        "--do_lower_case=True",
        "--batch_size=8",
        "--use_tpu=False",
    ]

    try:
        # Save temp dataframe and run bert embedding extractor
        df_data.to_csv(input_path, index=False, header=False, quoting=csv.QUOTE_NONE)
        subprocess.run(command, check=True)

        # Extract/generate all bert features from the embedding file
        result_features = extractFeatureEmbedingJSONL(output_path);
    finally:
        # Remove temp files, whether or not the extraction succeeded.
        for temp_path in (input_path, output_path):
            if os.path.exists(temp_path):
                os.remove(temp_path)

    # Timing
    print("[It takes {0} seconds to extract embedding features]".format((time.time() - start_time)))

    return result_features  

#**LOOP DATA FOR EACH PROTEINS & BERT PATH SETTING**


In [17]:
###############################################################################
# LOOP DATA FOR EACH PROTEINS & BERT PATH SETTINGS
# BECAUSE THE MAX LENGTH IS 510 (512 WITH THE 2 SPECIAL TOKENS), REPEAT THE PROCESS PER CHUNK AND APPEND THE DATAFRAMES LATER
###############################################################################

def generate_portionwise_embeddings(df_fasta_format, bert_pretrained_dir='/content/model_folder/cased_L-24_H-1024_A-16'):
  """Build one BERT feature vector per protein, embedding long sequences chunk by chunk.

  Every sequence is cut into pieces of at most 510 residues, each piece is
  turned into a space-separated residue string and embedded with
  ``extractEmbeddingBertFeatures``. The token embeddings of all pieces of a
  protein are stacked and reduced to a single vector with
  ``GenerateBERTEmbeddingMatrix``.

  Args:
    df_fasta_format: DataFrame with a ``'SEQUENCE'`` and an ``'ID'`` column,
      one protein per row. Any index is accepted.
    bert_pretrained_dir: directory of the pretrained BERT model handed to
      ``extractEmbeddingBertFeatures``; defaults to the location used in this
      notebook.

  Returns:
    DataFrame with one row per protein: the feature columns followed by an
    ``'ID'`` column.
  """
  bert_store_feature = []
  bert_prot_id = []

  # Use the row itself rather than its index label as a position, so frames
  # whose index is not 0..n-1 (filtered, re-ordered) are handled correctly.
  for _, row in df_fasta_format.iterrows():
    str_sequence = row['SEQUENCE']
    get_id = row['ID']

    chunk_frames = []
    # Split to max 510 amino acids (with 2 additional special tokens)
    for part_seq in cut_string(str_sequence, 510):
      # 1-grams: the residues of the chunk separated by single spaces.
      subsequence = ngrams(part_seq, 1)
      df_unigram = pd.DataFrame({"unigram": [subsequence]})

      print('>>  BERT pretrained directory: '+bert_pretrained_dir)
      print("SUBSEQUENCE OF PROTEINS:");print(subsequence);
      df_bert_res_return = extractEmbeddingBertFeatures(df_unigram, bert_pretrained_dir);
      print(df_bert_res_return)

      chunk_frames.append(df_bert_res_return)

    # Stack all chunks once (instead of appending inside the loop) and renumber the rows.
    if chunk_frames:
      df_bert_res_new = pd.concat(chunk_frames, ignore_index=True)
    else:
      # An empty sequence produces no chunk; pass the empty frame on unchanged.
      df_bert_res_new = pd.DataFrame()

    print("All BERT EMBDDING BEFORE CALCULATIONS:");
    print(df_bert_res_new);

    # Simple method to calculate the bert feature vector for classifiers
    bert_feature_flattened = GenerateBERTEmbeddingMatrix(df_bert_res_new)
    print(bert_feature_flattened)

    # Store for all proteins in list with their ID
    bert_store_feature.append(bert_feature_flattened);
    bert_prot_id.append(get_id);

  df_results = pd.DataFrame(bert_store_feature)
  df_results['ID'] = bert_prot_id;

  return df_results

#**Read Fasta Files**

In [18]:
def read_fasta_input(fastaSequenceInput):
    # Variables
    store_accesion_id = [];
    store_sequence_prot = [];
    store_seq_Length = [];
    
    data = fastaSequenceInput.replace('\n\n', '\n');
    getProtSeq = data.split(">")
    str_list = list(filter(None, getProtSeq)) # fastest
    
    for data_lst in str_list:
        try:
            each_prot = data_lst.split("\n")
            clear_prot = list(filter(None, each_prot)) # fastest
            # Get ID by first index and set to lowercase
            accesion_id = clear_prot[0];
            # Get sequence of protein by joining list from index
            get_sequence = "".join(clear_prot[1:len(clear_prot)]);
            get_sequence = get_sequence.replace('  ', ' ').replace(' ', '').replace('\t', '').replace('\n', '').replace('<br>', '');
            # Get sequence length
            get_seq_len = len(get_sequence);
            # Store
            store_accesion_id.append(accesion_id);
            store_sequence_prot.append(get_sequence);
            store_seq_Length.append(get_seq_len); 
        except:
            print("Found problem and skip proteins: {0}".format(data_lst));
    all_data = {'ID' : store_accesion_id, 
                'SEQUENCE': store_sequence_prot,
                'length':store_seq_Length
               }
    return all_data; 

# **Input protein sequences in FASTA format**
# ***Note: We selected unseen protein sequences: three TRP channel entries and three other channel proteins (non-TRP channels).***


In [19]:
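# Unseen protein sequences that were not in the training and testing datasets used to build the model.
# NOTE(review): the O35119_trp_channels record appears twice in the input below - confirm whether that is intended.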
fasta_inputs =  ">O35119_trp_channels \
\nMAQFYYKRNVNAPYRDRIPLRIVRAESELSPSEKAYLNAVEKGDYASVKKSLEEAEIYFK \
\nININCIDPLGRTALLIAIENENLELIELLLSFNVYVGDALLHAIRKEVVGAVELLLNHKK \
\nPSGEKQVPPILLDKQFSEFTPDITPIILAAHTNNYEIIKLLVQKGVSVPRPHEVRCNCVE \
\nCVSSSDVDSLRHSRSRLNIYKALASPSLIALSSEDPFLTAFQLSWELQELSKVENEFKSE \
\nYEELSRQCKQFAKDLLDQTRSSRELEIILNYRDDNSLIEEQSGNDLARLKLAIKYRQKEF \
\nVAQPNCQQLLASRWYDEFPGWRRRHWAVKMVTCFIIGLLFPVFSVCYLIAPKSPLGLFIR \
\nKPFIKFICHTASYLTFLFLLLLASQHIDRSDLNRQGPPPTIVEWMILPWVLGFIWGEIKQ \
\nMWDGGLQDYIHDWWNLMDFVMNSLYLATISLKIVAFVKYSALNPRESWDMWHPTLVAEAL \
\nFAIANIFSSLRLISLFTANSHLGPLQISLGRMLLDILKFLFIYCLVLLAFANGLNQLYFY \
\nYEETKGLSCKGIRCEKQNNAFSTLFETLQSLFWSIFGLINLYVTNVKAQHEFTDFVGATM \
\nFGTYNVISLVVLLNMLIAMMNNSYQLIADHADIEWKFARTKLWMSYFEEGGTLPTPFNVI \
\nPSPKSLWYLVKWIWTHLCKKKMRRKPESFGTIGRRAADNLRRHHQYQEVMRNLVKRYVAA \
\nMIREAKTEEGLTEENVKELKQDISSFRFEVLGLLRGSKLSTIQSANAASSASSADSDEKS \
\nHSEGNGKDKRKNLSLFDLTTLIHPRSAVIASERHNLSNGSALVVQEPPREKQRKVNFVAD \
\nIKNFGLFHRRSKQNAAEQNANQIFSVSEEITRQQAAGALERNIQLESKGLASRGDRSIPG \
\nLNEQCVLVDHRERNTDTLGLQVGKRVCSSFKSEKVVVEDTVPIIPKEKHAQEEDSSIDYD \
\nLSPTDTVAHEDYVTTRL \
>O35119_trp_channels \
\nMAQFYYKRNVNAPYRDRIPLRIVRAESELSPSEKAYLNAVEKGDYASVKKSLEEAEIYFK \
\nININCIDPLGRTALLIAIENENLELIELLLSFNVYVGDALLHAIRKEVVGAVELLLNHKK \
\nPSGEKQVPPILLDKQFSEFTPDITPIILAAHTNNYEIIKLLVQKGVSVPRPHEVRCNCVE \
\nCVSSSDVDSLRHSRSRLNIYKALASPSLIALSSEDPFLTAFQLSWELQELSKVENEFKSE \
\nYEELSRQCKQFAKDLLDQTRSSRELEIILNYRDDNSLIEEQSGNDLARLKLAIKYRQKEF \
\nVAQPNCQQLLASRWYDEFPGWRRRHWAVKMVTCFIIGLLFPVFSVCYLIAPKSPLGLFIR \
\nKPFIKFICHTASYLTFLFLLLLASQHIDRSDLNRQGPPPTIVEWMILPWVLGFIWGEIKQ \
\nMWDGGLQDYIHDWWNLMDFVMNSLYLATISLKIVAFVKYSALNPRESWDMWHPTLVAEAL \
\nFAIANIFSSLRLISLFTANSHLGPLQISLGRMLLDILKFLFIYCLVLLAFANGLNQLYFY \
\nYEETKGLSCKGIRCEKQNNAFSTLFETLQSLFWSIFGLINLYVTNVKAQHEFTDFVGATM \
\nFGTYNVISLVVLLNMLIAMMNNSYQLIADHADIEWKFARTKLWMSYFEEGGTLPTPFNVI \
\nPSPKSLWYLVKWIWTHLCKKKMRRKPESFGTIGRRAADNLRRHHQYQEVMRNLVKRYVAA \
\nMIREAKTEEGLTEENVKELKQDISSFRFEVLGLLRGSKLSTIQSANAASSASSADSDEKS \
\nHSEGNGKDKRKNLSLFDLTTLIHPRSAVIASERHNLSNGSALVVQEPPREKQRKVNFVAD \
\nIKNFGLFHRRSKQNAAEQNANQIFSVSEEITRQQAAGALERNIQLESKGLASRGDRSIPG \
\nLNEQCVLVDHRERNTDTLGLQVGKRVCSSFKSEKVVVEDTVPIIPKEKHAQEEDSSIDYD \
\nLSPTDTVAHEDYVTTRL \
>Q99J21_trp_channels \
\nMATPAGRRASETERLLTPNPGYGTQVGTSPAPTTPTEEEDLRRRLKYFFMSPCDKFRAKG \
\nRKPCKLMLQVVKILVVTVQLILFGLSNQLVVTFREENTIAFRHLFLLGYSDGSDDTFAAY \
\nTQEQLYQAIFYAVDQYLILPEISLGRYAYVRGGGGPWANGSALALCQRYYHRGHVDPAND \
\nTFDIDPRVVTDCIQVDPPDRPPDIPSEDLDFLDGSASYKNLTLKFHKLINVTIHFQLKTI \
\nNLQSLINNEIPDCYTFSILITFDNKAHSGRIPIRLETKTHIQECKHPSVSRHGDNSFRLL \
\nFDVVVILTCSLSFLLCARSLLRGFLLQNEFVVFMWRRRGREISLWERLEFVNGWYILLVT \
\nSDVLTISGTVMKIGIEAKNLASYDVCSILLGTSTLLVWVGVIRYLTFFHKYNILIATLRV \
\nALPSVMRFCCCVAVIYLGYCFCGWIVLGPYHVKFRSLSMVSECLFSLINGDDMFVTFAAM \
\nQAQQGHSSLVWLFSQLYLYSFISLFIYMVLSLFIALITGAYDTIKHPGGTGTEKSELQAY \
\nIEQCQDSPTSGKFRRGSGSACSLFCCCGRDSPEDHSLLVN \
>P03646_non_trp_channels \
\nMFGAIAGGIASALAGGAMSKLFGGGQKAASGGIQGDVLATDNNTVGMGDAGIKSAIQGSN \
\nVPNPDEAAPSFVSGAMAKAGKGLLEGTLQAGTSAVSDKLLDLVGLGGKSAADKGKDTRDY \
\nLAAAFPELNAWERAGADASSAGMVDAGFENQKELTKMQLDNQKEIAEMQNETQKEIAGIQ \
\nSATSRQNTKDQVYAQNEMLAYQQKESTARVASIMENTNLSKQQQVSEIMRQMLTQAQTAG \
\nQYFTNDQIKEMTRKVSAEVDLVHQQTQNQRYGSSHIGATAKDISNVVTDAASGVVDIFHG \
\nIDKAVADTWNNFWKDGKADGIGSNLSRK \
>P13583_non_trp_channels \
\nMSRIKAIIASVIICIIVCLSWAVNHYRDNAITYKEQRDKATSIIADMQKRQRDVAELDAR \
\nYTKELADANATIETLRADVSAGRKRLQVSATCPKSTTGASGMGDGESPRLTADAELNYYR \
\nLRSGIDRITAQVNYLQEYIRSQCLK \
>Q6IQ69_non_trp_channels \
\nMGAFIAKMLLPTISSLVFVPAASVAAKRGFHMEAMVYFFTMFFTAIYHACDGPGLSILCF \
\nMKYDILEYFSVYGTAISMWVTLLALGDFDEPKRSSLTMFGVLTAAVRIYQDRLGYGIYSG \
\nPIGTAVFMITVKWLQKMKEKKGLYPDKSVYTQQVGPGCCFGALALMLRFYFEEWDYAYVH \
\nSFYHVSLAMSFILLLPKKNRYAGTGRNAAKLNCYTLCCCV"


# Parse the FASTA text above into a dict of parallel lists (ID / SEQUENCE / length).
get_array_fasta = read_fasta_input(fasta_inputs);


# Store in a dataframe: one row per parsed record.
df_fasta_format = pd.DataFrame(get_array_fasta) 
# IDs in input order; reused later to label the predicted probabilities.
prot_id_test_name  = df_fasta_format['ID'].tolist()
print(prot_id_test_name) 

# Quick sanity checks: the parsed table, the number of records and the shortest sequence.
print(df_fasta_format);
print("Total fasta input: ", len(df_fasta_format.index))
print("Min len test: ", min(df_fasta_format['length'].tolist()))

['O35119_trp_channels ', 'O35119_trp_channels ', 'Q99J21_trp_channels ', 'P03646_non_trp_channels ', 'P13583_non_trp_channels ', 'Q6IQ69_non_trp_channels ']
                         ID  ... length
0      O35119_trp_channels   ...    977
1      O35119_trp_channels   ...    977
2      Q99J21_trp_channels   ...    580
3  P03646_non_trp_channels   ...    328
4  P13583_non_trp_channels   ...    145
5  Q6IQ69_non_trp_channels   ...    220

[6 rows x 3 columns]
Total fasta input:  6
Min len test:  145


#**Call to generate representations for given protein sequences**

In [20]:
# Embed every input protein; the result has one row per protein with the feature columns followed by 'ID'.
full_length_features_trp_channels =  generate_portionwise_embeddings(df_fasta_format)
# Keep the first 20480 columns as the classifier input (the feature count named in the notebook title;
# presumably 20 amino acids x 1024 hidden units - confirm), which leaves out the trailing 'ID' column.
test_x = full_length_features_trp_channels.iloc[:,0:20480].values

>>  BERT pretrained directory: /content/model_folder/cased_L-24_H-1024_A-16
SUBSEQUENCE OF PROTEINS:
M A Q F Y Y K R N V N A P Y R D R I P L R I V R A E S E L S P S E K A Y L N A V E K G D Y A S V K K S L E E A E I Y F K I N I N C I D P L G R T A L L I A I E N E N L E L I E L L L S F N V Y V G D A L L H A I R K E V V G A V E L L L N H K K P S G E K Q V P P I L L D K Q F S E F T P D I T P I I L A A H T N N Y E I I K L L V Q K G V S V P R P H E V R C N C V E C V S S S D V D S L R H S R S R L N I Y K A L A S P S L I A L S S E D P F L T A F Q L S W E L Q E L S K V E N E F K S E Y E E L S R Q C K Q F A K D L L D Q T R S S R E L E I I L N Y R D D N S L I E E Q S G N D L A R L K L A I K Y R Q K E F V A Q P N C Q Q L L A S R W Y D E F P G W R R R H W A V K M V T C F I I G L L F P V F S V C Y L I A P K S P L G L F I R K P F I K F I C H T A S Y L T F L F L L L L A S Q H I D R S D L N R Q G P P P T I V E W M I L P W V L G F I W G E I K Q M W D G G L Q D Y I H D W W N L M D F V M N S L Y L A T I S

# **Path to the prediction model saved on Google Drive**

In [21]:
# Folder holding the saved classifier; the path is relative to the current working directory.
WorkDir = "gdrive/My Drive/prediction_model/"

#**Load the model to output probabilities**

In [22]:
# import joblib
import pickle

# Location of the pickled classifier inside the working directory.
filenamemodel_1 = WorkDir+"finalized_bert_large_cased_model.sav"

# NOTE(review): unpickling can execute arbitrary code - only load model files from a trusted source.
# The context manager closes the file handle as soon as the model is loaded.
with open(filenamemodel_1, 'rb') as model_file:
    loaded_model_class_1 = pickle.load(model_file)

# Class labels in the column order used by predict_proba below.
print(loaded_model_class_1.classes_);

prob_class1 = loaded_model_class_1.predict_proba(test_x)
print(prob_class1)

# Second column of every row, i.e. the probability of the second entry of classes_.
store_prob_class1 = [val[1] for val in prob_class1]

all_results = {'Fasta' : prot_id_test_name, 
                'Probability of a protein sequence to be a TRP channel': store_prob_class1
               }
print(all_results);

[0 1]
[[2.22210958e-01 7.77789042e-01]
 [2.22210958e-01 7.77789042e-01]
 [2.14952989e-01 7.85047011e-01]
 [9.99824259e-01 1.75741497e-04]
 [9.98098547e-01 1.90145310e-03]
 [9.80274353e-01 1.97256473e-02]]
{'Fasta': ['O35119_trp_channels ', 'O35119_trp_channels ', 'Q99J21_trp_channels ', 'P03646_non_trp_channels ', 'P13583_non_trp_channels ', 'Q6IQ69_non_trp_channels '], 'Probability of a protein sequence to be a TRP channel': [0.7777890424589913, 0.7777890424589913, 0.7850470113943702, 0.0001757414966556819, 0.001901453104509814, 0.01972564727573773]}
