In [1]:
!pip install protein-bert pyfastx

Collecting protein-bert
  Downloading protein_bert-1.0.1-py3-none-any.whl (39 kB)
Collecting pyfastx
  Downloading pyfastx-1.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (903 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m903.2/903.2 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyfaidx
  Downloading pyfaidx-0.7.2.1-py3-none-any.whl (28 kB)
Collecting protobuf<3.20,>=3.9.2
  Downloading protobuf-3.19.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyfastx, pyfaidx, protobuf, protein-bert
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.3
    Uninstalling protobuf-3.20.3:
      Successfully uninstalled protobuf-3.20.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviou

In [2]:
import pyfastx
import pandas as pd

from tensorflow import keras

from sklearn.model_selection import train_test_split

from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, finetune
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

# Prepare training data

In [3]:
# Read the tab-separated term annotations, then discard incomplete and repeated rows.
train_set_file_path = '../input/cafa-5-protein-function-prediction/Train/train_terms.tsv'
train_set = pd.read_csv(train_set_file_path, sep='\t')
train_set = train_set.dropna()
train_set = train_set.drop_duplicates()

# Distinct values of the 'term' column; used later as the label set of the output spec.
UNIQUE_LABELS = train_set['term'].unique()

## Load FASTA sequences into the train set

In [4]:
# Copy train FASTA to writable directory to build index later
![ ! -f train_sequences.fasta ] && cp ../input/cafa-5-protein-function-prediction/Train/train_sequences.fasta ./

# This will build an index file at /kaggle/working/train_sequences.fasta.fxi
fa = pyfastx.Fasta('train_sequences.fasta')

# Each fa[x].seq still does a disk queries so we avoid repeating them
# This will only make 140k queries instead of 5 millions (99.97% of them are duplicated)
seqs = {x: fa[x].seq for x in train_set['EntryID'].unique()}
train_set['seq'] = train_set['EntryID'].map(lambda x: seqs[x])

# Train/validate split

In [5]:
# Hold out 10% of the rows for validation, stratified on 'term', with a fixed seed.
# NOTE: this rebinds train_set, so re-running the cell splits the already-reduced frame again;
# run it once per kernel (or re-run the loading cells first).
# NOTE(review): rows appear to be (EntryID, term) pairs, so the same sequence can presumably land
# on both sides of the split — confirm this is intended.
train_set, valid_set = train_test_split(
    train_set,
    test_size = 0.1,
    stratify = train_set['term'],
    random_state = 0,
)

# Finetune ProteinBERT

In [None]:
# Pretrained ProteinBERT weights
![ ! -f ./epoch_92400_sample_23500000.pkl ] && wget ftp://ftp.cs.huji.ac.il/users/nadavb/protein_bert/epoch_92400_sample_23500000.pkl

OUTPUT_TYPE = OutputType(is_seq = False, output_type = 'categorical')
OUTPUT_SPEC = OutputSpec(OUTPUT_TYPE, UNIQUE_LABELS)

pretrained_model_generator, input_encoder = load_pretrained_model(
    local_model_dump_dir = './',
    local_model_dump_file_name = 'epoch_92400_sample_23500000.pkl'
)

model_generator = FinetuningModelGenerator(pretrained_model_generator, OUTPUT_SPEC, pretraining_model_manipulation_function = \
        get_model_with_hidden_layers_as_outputs, dropout_rate = 0.5)

training_callbacks = [
    keras.callbacks.ReduceLROnPlateau(patience = 1, factor = 0.25, min_lr = 1e-05, verbose = 1),
    keras.callbacks.EarlyStopping(patience = 2, restore_best_weights = True),
]

finetune(model_generator, input_encoder, OUTPUT_SPEC, train_set['seq'], train_set['term'], valid_set['seq'], valid_set['term'], \
        seq_len = 2048, batch_size = 32, max_epochs_per_stage = 40, lr = 1e-04, begin_with_frozen_pretrained_layers = True, \
        lr_with_frozen_pretrained_layers = 1e-02, n_final_epochs = 1, final_seq_len = 1024, final_lr = 1e-05, callbacks = training_callbacks)


--2023-04-19 19:04:21--  ftp://ftp.cs.huji.ac.il/users/nadavb/protein_bert/epoch_92400_sample_23500000.pkl
           => ‘epoch_92400_sample_23500000.pkl’
Resolving ftp.cs.huji.ac.il (ftp.cs.huji.ac.il)... 132.65.116.15
Connecting to ftp.cs.huji.ac.il (ftp.cs.huji.ac.il)|132.65.116.15|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /users/nadavb/protein_bert ... done.
==> SIZE epoch_92400_sample_23500000.pkl ... 191800918
==> PASV ... done.    ==> RETR epoch_92400_sample_23500000.pkl ... done.
Length: 191800918 (183M) (unauthoritative)


2023-04-19 19:04:36 (13.1 MB/s) - ‘epoch_92400_sample_23500000.pkl’ saved [191800918]

[2023_04_19-19:04:40] Training set: Filtered out 2113976 of 4827476 (43.8%) records of lengths exceeding 510.
[2023_04_19-19:09:18] Validation set: Filtered out 235324 of 536387 (43.9%) records of lengths exceeding 510.
[2023_04_19-19:09:44] Training with frozen pretrained layers...


In [None]:
# Disabled scratch example, kept for reference: builds the pretrained (not fine-tuned) model
# at seq_len = 512 and calls predict() on a single encoded protein sequence.
# To run it, uncomment the lines below; it needs pretrained_model_generator and input_encoder
# as returned by load_pretrained_model.
# indices = pretrained_model_generator.create_model(seq_len = 512).predict(
# input_encoder.encode_X(['MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS\
# WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLILNATQAESKVFY\
# LKMKGDYFRYLSEVASGENKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY\
# YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD\
# AGEGEN'], seq_len = 512)
# )