<a href="https://colab.research.google.com/github/MartinekV/DL-for-bio-course/blob/master/05_protein_localization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Library installation

In [None]:
# Switch to a GPU runtime before running this notebook.
! pip install transformers evaluate datasets requests pandas -q
# Starter notebook source: https://huggingface.co/blog/deep-learning-with-proteins

# Data creation and preparation

We classify proteins by their cellular localization: given only a protein's sequence, can we predict whether it will be found in the cytosol (the fluid inside the cell) or embedded in the cell membrane?

In [3]:
import requests
# UniProt (https://www.uniprot.org/) REST query, written one URL component per
# line. Adjacent string literals are joined by Python into a single string.
query_url = (
    "https://rest.uniprot.org/uniprotkb/stream"
    "?compressed=true"
    "&fields=accession%2Csequence%2Ccc_subcellular_location"
    "&format=tsv"
    "&query=%28%28organism_id%3A9606%29"
    "%20AND%20%28reviewed%3Atrue%29"
    "%20AND%20%28length%3A%5B80%20TO%20500%5D%29%29"
)

In [None]:
from io import BytesIO
import pandas

# Download the gzip-compressed TSV export from UniProt. The timeout keeps the
# cell from hanging indefinitely if the server stops responding.
uniprot_request = requests.get(query_url, timeout=120)
# Stop here on an HTTP error (4xx/5xx) instead of letting pandas try to
# decompress an error page in the next step.
uniprot_request.raise_for_status()
# Wrap the raw bytes in a file-like object so pandas can decompress in memory.
bio = BytesIO(uniprot_request.content)
df = pandas.read_csv(bio, compression='gzip', sep='\t')
df

In [5]:
# Discard proteins that are missing any of the requested columns.
df = df.dropna()
location_text = df['Subcellular location [CC]']
# Case-sensitive keyword match on the free-text location annotation; the
# alternation matches when either keyword is present.
cytosolic = location_text.str.contains("Cytosol|Cytoplasm")
membrane = location_text.str.contains("Membrane|Cell membrane")
# Proteins annotated with both locations are ambiguous and end up in neither set.
cytosolic_df = df.loc[cytosolic & ~membrane]
membrane_df = df.loc[~cytosolic & membrane]

In [6]:
# Cytosolic proteins are class 0: one label per sequence.
cytosolic_sequences = cytosolic_df["Sequence"].tolist()
cytosolic_labels = [0] * len(cytosolic_sequences)

In [7]:
# Membrane proteins are class 1: one label per sequence.
membrane_sequences = membrane_df["Sequence"].tolist()
membrane_labels = [1] * len(membrane_sequences)

In [8]:
# Merge both classes into parallel lists; position i of `labels` belongs to
# position i of `sequences`.
sequences = [*cytosolic_sequences, *membrane_sequences]
labels = [*cytosolic_labels, *membrane_labels]

In [None]:
# Preview the first five entries of the combined sequence list.
sequences[:5]

## Splitting the data

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the proteins for evaluation. A fixed random_state makes the
# split reproducible across kernel restarts, and stratifying on the labels keeps
# the cytosolic/membrane ratio the same in the train and test subsets.
train_sequences, test_sequences, train_labels, test_labels = train_test_split(
    sequences,
    labels,
    test_size=0.25,
    shuffle=True,
    random_state=42,
    stratify=labels,
)

## PyTorch dataset and tokenization

In [None]:
from torch.utils.data import Dataset, TensorDataset
from transformers import AutoTokenizer

class MyProteinDataset(Dataset):
    """Map-style dataset that tokenizes one protein sequence per lookup.

    Args:
        data: Indexable, sized collection of protein sequences.
        targets: Indexable, sized collection of labels aligned with ``data``.
        tokenizer: Callable applied to a single sequence; its result is
            unpacked as a mapping into the returned sample.

    Raises:
        ValueError: If ``data`` and ``targets`` differ in length.
    """

    def __init__(self, data, targets, tokenizer):
        # A length mismatch would otherwise silently drop samples or raise an
        # IndexError in the middle of training, so reject it up front.
        if len(data) != len(targets):
            raise ValueError(
                f"data and targets must have the same length, "
                f"got {len(data)} and {len(targets)}"
            )
        self.data = data
        self.targets = targets
        self.tokenizer = tokenizer

    def __getitem__(self, index):
        """Return the tokenizer output for item ``index`` plus its ``'labels'`` entry."""
        sequence = self.data[index]
        label = self.targets[index]
        # Tokenization happens lazily, one sequence at a time.
        encoded = self.tokenizer(sequence)
        return {**encoded, 'labels': label}

    def __len__(self):
        """Return the number of samples (one per target)."""
        return len(self.targets)

# Hugging Face checkpoints usually ship with a matching tokenizer.
model_checkpoint = "facebook/esm2_t6_8M_UR50D"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
train_dataset = MyProteinDataset(
    data=train_sequences,
    targets=train_labels,
    tokenizer=tokenizer,
)

# Look at a single tokenized training example.
sample = train_dataset[0]
print(sample)

In [None]:
# Show each field of the tokenized sample on its own line.
for field_name, field_value in sample.items():
    print(field_name, field_value)

In [38]:
# Wrap the held-out split in the same dataset class, sharing the tokenizer.
test_dataset = MyProteinDataset(
    data=test_sequences,
    targets=test_labels,
    tokenizer=tokenizer,
)

## Model loading

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# Load the pretrained checkpoint for sequence classification with two output
# labels (0 = cytosolic, 1 = membrane).
model_checkpoint = "facebook/esm2_t6_8M_UR50D"
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
)
# model.init_weights()

In [3]:
# Other larger protein language models
# https://huggingface.co/facebook/esm2_t36_3B_UR50D
# https://huggingface.co/Rostlab/prot_bert
# https://huggingface.co/yarongef/DistilProtBert
# https://github.com/nstrodt/UDSMProt

## Training

In [30]:
import inspect

# The evaluation-schedule argument is named `eval_strategy` in recent
# transformers releases and `evaluation_strategy` in older ones. The install
# cell does not pin a version, so use whichever name the installed
# TrainingArguments accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_name = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

args = TrainingArguments(
    "protein-model-finetuned",  # our own directory name for saving the model
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    **{_eval_strategy_name: "epoch"},  # evaluate at the end of every epoch
)

In [31]:
from evaluate import load
import numpy as np

# Hugging Face provides its own metric implementations via `evaluate`.
metric = load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy from a ``(predictions, labels)`` pair.

    The predictions are reduced to class indices by taking the argmax along
    axis 1 before being compared with the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [32]:
import inspect

# Recent transformers releases take the tokenizer as `processing_class`; older
# ones name the same argument `tokenizer`. The install cell does not pin a
# version, so use whichever name the installed Trainer accepts.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_arg_name = (
    "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"
)

# The Trainer shuffles the training data automatically between epochs.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    **{_tokenizer_arg_name: tokenizer},
)

In [None]:
# Start fine-tuning using the trainer configured in the previous cell.
trainer.train()