<h3> Extracting protein sequences' features using ProtBert-BFD pretrained-model <h3>

<b>1. Load necessry libraries including huggingface transformers<b>

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel, pipeline
import re
import numpy as np
import os
import requests
from tqdm.auto import tqdm
import pandas as pd
from sklearn.manifold import TSNE
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
import skimage.measure

In [3]:
tqdm.pandas()

<b>2. Load the vocabulary and ProtBert-BFD Model<b>

In [4]:
tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_bert_bfd", do_lower_case=False )

In [5]:
model = AutoModel.from_pretrained("Rostlab/prot_bert_bfd")

Some weights of the model checkpoint at Rostlab/prot_bert_bfd were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<b>3. Load the model into the GPU if avilabile<b>

In [6]:
fe = pipeline('feature-extraction', model=model, tokenizer=tokenizer,device=0 ) # device=0 for GPU, -1 for CPU

**4. Load sequences**

Files have to be loaded manually

In [7]:
df = pd.read_csv("knotted_out.csv.gz")
df

Unnamed: 0,ID,Max Topology,Sequence,Taxonomy,Length
0,A0A670K578,3_1,MMACLTVTEMEGPASSTLHQNGGVLGNAAVSMHTEPLLRVYLYHSQ...,Podarcis muralis (Wall lizard) (Lacerta muralis),826.0
1,A0A5E4A9T8,3_1,MASVGLQFQASAGDADPQSRPLLLLGQLHHLHRVPWSHVRGKLQPR...,Marmota monax (Woodchuck),305.0
2,A0A661VMP6,3_1,MHIEYPDRENFGQMLKEAGSNSLDFIVIVTAALNPLDTSRLKAYLS...,Chloroflexi bacterium,575.0
3,A0A817YQ90,3_1,MEVEVLNEPRIKHISVSPTQAESFYKPTTNHGPFIRSCDENNGRVV...,Rotaria sp. Silwood1,676.0
4,A0A5B3GN31,3_1,MEVTMSRLLVTNKSFELLEPYEGKLSRTVLRGESSRKGADLLDIDD...,Alistipes onderdonkii,571.0
...,...,...,...,...,...
644875,A0A7C2QMW0,3_1,MEAVTHLKHPAFRAACRLTESAPREAEHRFLMEGALQIAKALHAPL...,Armatimonadetes bacterium,263.0
644876,A0A7J2X2D7,3_1,MRLYIILVEIEGGINLGLITRLADNFDVEEIRLVNPKLTDEEYELA...,Thermoprotei archaeon,237.0
644877,W0U5M7,3_1,MHINSKDNPRVKLFRKLLSSKKAREEHGLFAVEGARNCVDTAYEAV...,Ruminococcus bicirculans,273.0
644878,W5YKZ6,3_1,MKLNDIRKLHQKKYRQEFRHYLVEGEHLVLELQKAVAQQPALVSAE...,Marinobacter similis,247.0


In [8]:
df['Sequence'] = df['Sequence'].apply(lambda sequence: " ".join(re.sub(r"[UZOB]", "X", sequence)))
print(df.head())
print(df.shape)

           ID Max Topology                                           Sequence  \
0  A0A670K578          3_1  M M A C L T V T E M E G P A S S T L H Q N G G ...   
1  A0A5E4A9T8          3_1  M A S V G L Q F Q A S A G D A D P Q S R P L L ...   
2  A0A661VMP6          3_1  M H I E Y P D R E N F G Q M L K E A G S N S L ...   
3  A0A817YQ90          3_1  M E V E V L N E P R I K H I S V S P T Q A E S ...   
4  A0A5B3GN31          3_1  M E V T M S R L L V T N K S F E L L E P Y E G ...   

                                           Taxonomy  Length  
0  Podarcis muralis (Wall lizard) (Lacerta muralis)   826.0  
1                         Marmota monax (Woodchuck)   305.0  
2                             Chloroflexi bacterium   575.0  
3                              Rotaria sp. Silwood1   676.0  
4                             Alistipes onderdonkii   571.0  
(644880, 5)


<b>5. Extracting sequences' features <b>

In [9]:
def get_embedding(seq):
    embedding = fe(seq)
    features =  np.array(embedding[0][1:len(seq)+1])
    features = skimage.measure.block_reduce(features, (1024, 1), np.average)
    #file = open("embedding_features_tmp.csv", 'a')
    #file.write(','.join(str(e) for e in features[0]))
    #file.write('\n')
    return np.array(features[0], dtype=float)

In [None]:
df['features'] = df['Sequence'].progress_apply(get_embedding)

 58%|█████▊    | 375537/644880 [6:48:23<2:55:03, 25.64it/s] 

In [12]:
df.head(n=10)

Unnamed: 0,ID,Max Topology,Sequence,Taxonomy,Length,features
0,A0A670K578,3_1,M M A C L T V T E M E G P A S S T L H Q N G G ...,Podarcis muralis (Wall lizard) (Lacerta muralis),826.0,"[0.014043582937123489, 0.038621869974324596, 0..."
1,A0A5E4A9T8,3_1,M A S V G L Q F Q A S A G D A D P Q S R P L L ...,Marmota monax (Woodchuck),305.0,"[0.010411723806811324, 0.0018406882605233044, ..."
2,A0A661VMP6,3_1,M H I E Y P D R E N F G Q M L K E A G S N S L ...,Chloroflexi bacterium,575.0,"[0.0062372844051612475, -0.000405240970907883,..."
3,A0A817YQ90,3_1,M E V E V L N E P R I K H I S V S P T Q A E S ...,Rotaria sp. Silwood1,676.0,"[0.010742585794261572, 0.007610015049067442, 0..."
4,A0A5B3GN31,3_1,M E V T M S R L L V T N K S F E L L E P Y E G ...,Alistipes onderdonkii,571.0,"[0.015856875767596534, -0.02628934294972396, -..."
5,A0A673XLT1,4_1,M G N L V F G T L R N H W K K S T V A A C A L ...,Salmo trutta (Brown trout),369.0,"[0.0032996517181480556, 0.005642520267144846, ..."
6,A0A2K5DPI3,3_1,M A P A E V V P W A V R S R Q L P L T S M A L ...,Aotus nancymaae (Ma's night monkey),1018.0,"[0.013117323592579666, 0.012851471518345647, -..."
7,A0A1G3ZHT5,3_1,M K I N I L N S V T I S V S L L S S H I L Y A ...,Verrucomicrobia bacterium RIFCSPHIGHO2_12_FULL...,1185.0,"[0.0019375661176184167, -0.03721625974982601, ..."
8,A0A1I4F7V1,3_1,M D H A A D G M S D P E D L A L W A L H C Y C ...,Streptosporangium canum,330.0,"[0.009935885378666853, -0.009912384758877124, ..."
9,A0A5C6XC45,3_1,M M F R T R P T L I A L L T V A L S A S L I A ...,Lujinxingia vulgaris,538.0,"[0.019024098611073015, 0.004611214507377781, -..."


In [13]:
df.to_csv("knotted_embeddings.csv.gz")



---



---

