# Predicting on patched sequences with the M1 model

In [1]:
from datasets import Dataset, load_dataset
import pandas as pd
from tqdm import tqdm

In [5]:
# Patch size; interpolated into the dataset names and the output path below.
PATCH_SIZE = 40
# Hugging Face Hub dataset that is loaded as the input of this notebook.
HF_DATASET = f'roa7n/patched_1000_test_p_{PATCH_SIZE}'
# Local CSV file the DataFrame with predictions is written to.
OUTPUT = f'/home/jovyan/data/proteins_m1/patched_{PATCH_SIZE}_preds_backup.csv'
# Hugging Face Hub dataset name the predictions are pushed to.
HF_OUTPUT = f'roa7n/patched_1000_test_p_{PATCH_SIZE}_m1_predictions'

In [6]:
# Enable tqdm's pandas integration.
# NOTE(review): no progress_* call is visible in this notebook section — confirm it is still needed.
tqdm.pandas()

## Load data:

In [7]:
# Download the dataset and keep its 'train' split; `dss` is tokenized further down.
dss = load_dataset(HF_DATASET)['train']

# pandas copy of the same rows; the prediction column is attached to this frame later.
df = pd.DataFrame(dss)
df

Using custom data configuration roa7n--patched_1000_test_p_40-1698dae7c9e6de75


Downloading and preparing dataset None/None to /home/jovyan/.cache/huggingface/datasets/roa7n___parquet/roa7n--patched_1000_test_p_40-1698dae7c9e6de75/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/28.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.5M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1663294 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /home/jovyan/.cache/huggingface/datasets/roa7n___parquet/roa7n--patched_1000_test_p_40-1698dae7c9e6de75/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,id,sequence_str,label
0,A0A533UME0_40_-1,MKLSIAIPDSSVSDESTQLGKSMKISLIARACAIFRVQTVYIYHES...,1
1,A0A533UME0_40_0,XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXYIYHES...,1
2,A0A533UME0_40_1,MXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXIYHES...,1
3,A0A533UME0_40_2,MKXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXYHES...,1
4,A0A533UME0_40_3,MKLXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXHES...,1
...,...,...,...
1663289,A0A6A4IYK5_40_292,MSYNDGNWCLIESDPGVFSELIREFGCSGVQVEEIWSLEAGQFEDL...,1
1663290,A0A6A4IYK5_40_293,MSYNDGNWCLIESDPGVFSELIREFGCSGVQVEEIWSLEAGQFEDL...,1
1663291,A0A6A4IYK5_40_294,MSYNDGNWCLIESDPGVFSELIREFGCSGVQVEEIWSLEAGQFEDL...,1
1663292,A0A6A4IYK5_40_295,MSYNDGNWCLIESDPGVFSELIREFGCSGVQVEEIWSLEAGQFEDL...,1


# Evaluate M1

In [8]:
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollator, Trainer, TrainingArguments
from datasets import load_metric, Features, Value
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, recall_score
from math import exp

In [9]:
def tokenize_function(s):
    """Tokenize one dataset example with the M1 tokenizer.

    The characters of ``s['sequence_str']`` are joined with single spaces
    and the resulting string is passed to the global ``tokenizerM1``.

    Args:
        s: Mapping with a ``'sequence_str'`` entry holding the sequence.

    Returns:
        Whatever ``tokenizerM1`` returns for the space-separated sequence.
    """
    sequence = s['sequence_str']
    spaced_sequence = ' '.join(sequence)
    return tokenizerM1(spaced_sequence)

In [10]:
# Tokenizer and classifier are loaded from the same Hub checkpoint.
M1_CHECKPOINT = 'EvaKlimentova/knots_distillprotbert_alphafold'

tokenizerM1 = AutoTokenizer.from_pretrained(M1_CHECKPOINT)
modelM1 = AutoModelForSequenceClassification.from_pretrained(M1_CHECKPOINT)

In [11]:
# Tokenize every example with 4 worker processes; the 'id' and 'sequence_str'
# columns are removed from the result.
tokenized_dataset = dss.map(
    tokenize_function,
    num_proc=4,
    remove_columns=['id', 'sequence_str'],
)

# Switch the dataset's output format to 'pt' before handing it to the Trainer.
tokenized_dataset.set_format('pt')
tokenized_dataset

       

#1:   0%|          | 0/415824 [00:00<?, ?ex/s]

#2:   0%|          | 0/415823 [00:00<?, ?ex/s]

#0:   0%|          | 0/415824 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/415823 [00:00<?, ?ex/s]

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1663294
})

In [13]:
# Inference-only configuration: half precision, batches of 50, no experiment reporting.
training_args = TrainingArguments(
    '/home/jovyan/models/m1/outputs',
    fp16=True,
    per_device_eval_batch_size=50,
    report_to='none',
)

trainer = Trainer(
    modelM1,
    training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizerM1
)

# trainer.predict returns a 3-tuple; only its first element (the raw model
# outputs, one row of class scores per example) is used here.
logits, _, _ = trainer.predict(tokenized_dataset)
logits = np.asarray(logits)

# Softmax over the class axis, keeping the probability of class index 1.
# The per-row maximum is subtracted first so np.exp cannot overflow to inf
# (which would turn the ratio into NaN); the shift cancels out in the ratio.
# Done on the whole array at once instead of a Python loop over every row.
shifted = logits - logits.max(axis=1, keepdims=True)
exp_shifted = np.exp(shifted)
predictions = exp_shifted[:, 1] / exp_shifted.sum(axis=1)
df['m1_preds'] = predictions

Using cuda_amp half precision backend
***** Running Prediction *****
  Num examples = 1663294
  Batch size = 50
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [16]:
# Write the frame (including the m1_preds column) to the CSV at OUTPUT, without the index column.
df.to_csv(OUTPUT, encoding='utf-8', index=False)

In [17]:
# Cast the prediction column to 32-bit floats before the frame is converted to a
# Hugging Face Dataset in the next cell. Note the CSV above was written before this cast.
df['m1_preds'] = df['m1_preds'].astype(np.float32)

In [18]:
from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi, HfFolder

# Wrap the DataFrame (now including m1_preds) in a Hugging Face Dataset for upload.
hf_dataset = Dataset.from_pandas(df)
hf_dataset

Dataset({
    features: ['id', 'sequence_str', 'label', 'm1_preds'],
    num_rows: 1663294
})

In [19]:
# Authenticate against the Hugging Face Hub and upload the predictions dataset.
#
# SECURITY: the access token must never be written into the notebook. It is read
# from the HF_TOKEN environment variable, falling back to an interactive prompt
# whose input is not echoed or stored in the notebook. The token that was
# previously committed here is compromised and must be revoked.
import os
from getpass import getpass

hf_token = os.environ.get('HF_TOKEN') or getpass('Hugging Face access token: ')

# set api for login and save token
api = HfApi()
api.set_access_token(hf_token)

hf_dataset.push_to_hub(HF_OUTPUT)



Pushing dataset shards to the dataset hub:   0%|          | 0/2 [00:00<?, ?it/s]