# Generate embeddings for the M2 model

In [1]:
from datasets import Dataset, load_dataset
import pandas as pd
from tqdm import tqdm

In [2]:
# Patch size; interpolated into the dataset names and the backup path below.
# Presumably the length of the masked window (the run of 'X' in `sequence_str`) — TODO confirm.
PATCH_SIZE = 40
# Hugging Face Hub dataset that is loaded as the input of this notebook.
HF_DATASET = f'roa7n/patched_1000_test_p_{PATCH_SIZE}'
# Local CSV file the frame with the embeddings is written to as a backup.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
OUTPUT = f'/home/jovyan/data/proteins_m2/patched_{PATCH_SIZE}_embeddings_backup.csv'
# Hugging Face Hub repository the frame with the embeddings is pushed to.
HF_OUTPUT = f'roa7n/patched_1000_test_p_{PATCH_SIZE}_m2_embeddings'

In [3]:
# Register tqdm with pandas so that `progress_apply` (used for the embedding step) is available.
tqdm.pandas()

In [4]:
# Load the input dataset named by HF_DATASET and display its structure (splits, features, row counts).
hf_dataset = load_dataset(HF_DATASET)
hf_dataset

Using custom data configuration roa7n--patched_1000_test_p_40-3d7a1ccf9152eda0
Found cached dataset parquet (/home/jovyan/.cache/huggingface/datasets/roa7n___parquet/roa7n--patched_1000_test_p_40-3d7a1ccf9152eda0/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'sequence_str', 'label'],
        num_rows: 1663294
    })
})

In [5]:
# Convert the 'train' split to a pandas DataFrame and print its (rows, columns) shape.
df = hf_dataset['train'].to_pandas()
print(df.shape)

(1663294, 3)


In [6]:
# Preview the frame (pandas abbreviates the display to the first and last rows).
df

Unnamed: 0,id,sequence_str,label
0,A0A533UME0_40_-1,MKLSIAIPDSSVSDESTQLGKSMKISLIARACAIFRVQTVYIYHES...,1
1,A0A533UME0_40_0,XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXYIYHES...,1
2,A0A533UME0_40_1,MXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXIYHES...,1
3,A0A533UME0_40_2,MKXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXYHES...,1
4,A0A533UME0_40_3,MKLXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXHES...,1
...,...,...,...
1663289,A0A6A4IYK5_40_292,MSYNDGNWCLIESDPGVFSELIREFGCSGVQVEEIWSLEAGQFEDL...,1
1663290,A0A6A4IYK5_40_293,MSYNDGNWCLIESDPGVFSELIREFGCSGVQVEEIWSLEAGQFEDL...,1
1663291,A0A6A4IYK5_40_294,MSYNDGNWCLIESDPGVFSELIREFGCSGVQVEEIWSLEAGQFEDL...,1
1663292,A0A6A4IYK5_40_295,MSYNDGNWCLIESDPGVFSELIREFGCSGVQVEEIWSLEAGQFEDL...,1


In [7]:
!pip install scikit-image



In [8]:
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModel, pipeline
import re
import skimage.measure
import torch
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn

In [9]:
# Check whether PyTorch can see a CUDA-capable GPU.
torch.cuda.is_available()

True

In [10]:
# Choose the compute device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Using {} device'.format(device))

Using cuda device


### Prepare dataset

In [11]:
# Load the ProtBert-BFD tokenizer and encoder from the Hugging Face Hub.
# do_lower_case=False keeps the upper-case amino-acid letters unchanged.
tokenizerM2 = AutoTokenizer.from_pretrained("Rostlab/prot_bert_bfd", do_lower_case=False)
modelM2 = AutoModel.from_pretrained("Rostlab/prot_bert_bfd")

# Feature-extraction pipeline used by `get_embedding` to obtain per-token hidden states.
# The pipeline takes a device index: 0 is the first GPU, -1 is the CPU. The index follows
# CUDA availability so that the cell also runs on a machine without a GPU instead of
# failing on a hard-coded `device=0`.
fe = pipeline(
    'feature-extraction',
    model=modelM2,
    tokenizer=tokenizerM2,
    device=0 if torch.cuda.is_available() else -1,
)

Some weights of the model checkpoint at Rostlab/prot_bert_bfd were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
def prepare_sequence(sequence):
    """Return `sequence` in the space-separated form expected by the ProtBert tokenizer.

    The letters U, Z, O and B are replaced by 'X' and the remaining characters are
    joined with single spaces.

    Existing whitespace is stripped first, which makes the function idempotent:
    this cell overwrites `sequence_str` in place, so without the stripping a second
    run of the cell would insert spaces between the already inserted spaces.

    Args:
        sequence: amino-acid sequence as a string, with or without separating spaces.

    Returns:
        The sequence with one space between consecutive residues.
    """
    residues = re.sub(r'\s+', '', sequence)
    return ' '.join(re.sub(r'[UZOB]', 'X', residues))


df['sequence_str'] = df['sequence_str'].apply(prepare_sequence)
df

Unnamed: 0,id,sequence_str,label
0,A0A533UME0_40_-1,M K L S I A I P D S S V S D E S T Q L G K S M ...,1
1,A0A533UME0_40_0,X X X X X X X X X X X X X X X X X X X X X X X ...,1
2,A0A533UME0_40_1,M X X X X X X X X X X X X X X X X X X X X X X ...,1
3,A0A533UME0_40_2,M K X X X X X X X X X X X X X X X X X X X X X ...,1
4,A0A533UME0_40_3,M K L X X X X X X X X X X X X X X X X X X X X ...,1
...,...,...,...
1663289,A0A6A4IYK5_40_292,M S Y N D G N W C L I E S D P G V F S E L I R ...,1
1663290,A0A6A4IYK5_40_293,M S Y N D G N W C L I E S D P G V F S E L I R ...,1
1663291,A0A6A4IYK5_40_294,M S Y N D G N W C L I E S D P G V F S E L I R ...,1
1663292,A0A6A4IYK5_40_295,M S Y N D G N W C L I E S D P G V F S E L I R ...,1


In [13]:
# test with smaller amount of data:
# df_tmp = df.loc[df['id'].str.contains('A0A533UME0_20')]

In [14]:
def get_embedding(seq):
    """Turn one space-separated sequence string into a single pooled embedding vector.

    Steps:
      1. Run the feature-extraction pipeline `fe` on `seq` and take the first entry
         of its output (presumably the list of per-token vectors — TODO confirm).
      2. Drop index 0 (presumably the [CLS] token) and keep entries up to `len(seq)`.
      3. Pool over the token axis with `block_reduce` using blocks of 1024 rows and
         `np.average`, and return the first pooled row.

    NOTE(review): `seq` is the space-separated string built in the preprocessing cell,
    so `len(seq)` also counts the separating spaces. The slice end therefore lies past
    the residue tokens and the slice appears to keep every token after the first one,
    including a trailing [SEP] — confirm whether that is intended.

    NOTE(review): `block_reduce` pads an incomplete block with its `cval` (0 by default),
    so for inputs with fewer than 1024 rows the result looks like sum / 1024 rather than
    the mean over the actual tokens, and rows beyond the first 1024 do not contribute to
    the returned vector. Confirm against the way the M2 training embeddings were
    produced before changing any of this.

    Args:
        seq: sequence string with residues separated by single spaces.

    Returns:
        1-D NumPy array of dtype float, one value per hidden dimension.
    """
    token_vectors = fe(seq)[0]
    stop = len(seq) + 1
    per_token = np.array(token_vectors[1:stop])
    pooled = skimage.measure.block_reduce(per_token, (1024, 1), np.average)
    first_block = pooled[0]
    return np.array(first_block, dtype=float)

In [15]:
# Compute one embedding per row; `progress_apply` reports progress through tqdm.
# NOTE: rows are processed one at a time. The saved output of this cell shows roughly
# 15 rows/s, i.e. more than a day for the full 1,663,294-row frame, and nothing is
# persisted before the cell has finished.
# df_tmp['features'] = df_tmp['sequence_str'].progress_apply(get_embedding)
df['features'] = df['sequence_str'].progress_apply(get_embedding)

 45%|████▍     | 748427/1663294 [12:19:05<16:45:19, 15.17it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

 47%|████▋     | 778211/1663294 [12:50:52<14:50:33, 16.56it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

 47%|████▋     | 783552/1663294 [12:56:35<15:34:14, 15.69it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`-

In [None]:
import json

# Each 'features' cell holds a NumPy array. `to_csv` writes object cells via `str()`, and
# NumPy abbreviates arrays with more than 1000 elements to "[a b c ... x y z]". The
# embeddings are presumably 1024 values long, so a plain `df.to_csv` would silently drop
# most of every vector. Each vector is therefore serialised as a JSON list
# (restore it with `json.loads`).
# The frame is written in chunks so that the serialised strings of all rows never have to
# be held in memory at once.
# NOTE(review): a binary format such as Parquet would be far more compact for this data.
BACKUP_CHUNK_ROWS = 10_000

# `max(..., 1)` makes an empty frame still produce a header-only file.
for start in range(0, max(len(df), 1), BACKUP_CHUNK_ROWS):
    chunk = df.iloc[start:start + BACKUP_CHUNK_ROWS]
    chunk = chunk.assign(
        features=chunk['features'].map(lambda vector: json.dumps(np.asarray(vector).tolist()))
    )
    first_chunk = start == 0
    chunk.to_csv(
        OUTPUT,
        mode='w' if first_chunk else 'a',  # overwrite on the first chunk, append afterwards
        header=first_chunk,                # write the column names only once
        encoding='utf-8',
        index=False,
    )

In [None]:
import getpass
import os

from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi, HfFolder

# `Dataset` is re-imported from `datasets` here because the earlier
# `from torch.utils.data import Dataset` import rebound the name.
# hf_dataset = Dataset.from_pandas(df_tmp)
hf_dataset = Dataset.from_pandas(df)

# Kept so that the name `api` stays defined for any code that relies on it.
api = HfApi()

# Never store the access token in the notebook: read it from the HF_TOKEN environment
# variable, or prompt for it when the variable is not set.
hf_token = os.environ.get('HF_TOKEN') or getpass.getpass('Hugging Face access token: ')

# The token is passed directly to the upload call, so nothing is written to disk.
hf_dataset.push_to_hub(HF_OUTPUT, token=hf_token)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Pushing dataset shards to the dataset hub:   0%|          | 0/30 [00:00<?, ?it/s]