# TRANSFER LEARNING




## Imbalanced Data

### Model Fine-tuning & Prediction

In [None]:
import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification
from torch.utils.data import Dataset
import os
import pandas as pd
import requests
from tqdm.auto import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import re

model_name = 'Rostlab/prot_bert_bfd' # pretrained ProtBert-BFD checkpoint id; used below for both the tokenizer and the classifier

class DeepLocDataset(Dataset):
    """Peptide-sequence classification dataset backed by the CSV files in ``Bcell_data/``.

    Each item is a dict of tokenizer outputs (as tensors) plus a ``'labels'``
    tensor, which is the format handed to the Trainer in this notebook.

    Args:
        split: ``"train"``, ``"valid"`` or ``"test"``; selects
            ``input_train.csv``, ``input_val.csv`` or ``input_test.csv``.
        tokenizer_name: Name or path given to ``AutoTokenizer.from_pretrained``.
        max_length: Token length every sequence is truncated/padded to.

    Raises:
        ValueError: If ``split`` is not one of the three names above. (Previously
            any unknown value silently loaded the test file.)
    """

    # Display names for the integer class ids. Kept exactly as in the original
    # code; nothing in the visible notebook reads this mapping.
    labels_dic = {0: 'Soluble', 1: 'Membrane'}

    def __init__(self, split="train", tokenizer_name='Rostlab/prot_bert_bfd', max_length=64):
        # Locations of the three CSV splits.
        self.datasetFolderPath = 'Bcell_data/'
        self.trainFilePath = os.path.join(self.datasetFolderPath, 'input_train.csv')
        self.valFilePath = os.path.join(self.datasetFolderPath, 'input_val.csv')
        self.testFilePath = os.path.join(self.datasetFolderPath, 'input_test.csv')

        split_paths = {
            "train": self.trainFilePath,
            "valid": self.valFilePath,
            "test": self.testFilePath,
        }
        # Fail fast on a typo such as "val" instead of silently evaluating on test data.
        if split not in split_paths:
            raise ValueError(
                "Unknown split %r; expected one of %s" % (split, sorted(split_paths))
            )

        self.max_length = max_length

        # Pretrained tokenizer; casing is preserved (do_lower_case=False).
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=False)

        self.seqs, self.labels = self.load_dataset(split_paths[split])

    def load_dataset(self, path):
        """Read one CSV split and return ``(sequences, labels)`` as two lists.

        The first row of the file is skipped (treated as a header) and the two
        columns are named ``peptide_seq`` and ``labels``.
        """
        df = pd.read_csv(path, names=['peptide_seq', 'labels'], skiprows=1)
        seq = list(df['peptide_seq'])
        label = list(df['labels'])
        assert len(seq) == len(label)
        return seq, label

    def __len__(self):
        """Number of samples in the split."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of tensors."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Drop any whitespace, then separate every character with a single space.
        seq = " ".join("".join(self.seqs[idx].split()))
        # Map the residue codes U, Z, O and B to the unknown-residue code X.
        seq = re.sub(r"[UZOB]", "X", seq)

        # Truncate/pad so every sample has exactly max_length tokens.
        seq_ids = self.tokenizer(seq, truncation=True, padding='max_length', max_length=self.max_length)

        sample = {key: torch.tensor(val) for key, val in seq_ids.items()}
        sample['labels'] = torch.tensor(self.labels[idx])

        return sample

# Build the three splits with the same tokenizer and sequence length,
# in the order train -> valid -> test.
train_dataset, val_dataset, test_dataset = (
    DeepLocDataset(split=split_name, tokenizer_name=model_name, max_length=64)
    for split_name in ("train", "valid", "test")
)

def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for the Trainer's evaluation.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores); the predicted class is the arg-max over the
            last axis of ``predictions``.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``. Precision, recall and F1 use ``average='binary'``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0: when no sample is predicted (or labelled) positive the
    # ratio is undefined; report 0 explicitly instead of relying on sklearn's
    # warn-and-return-0 default, which floods the evaluation logs.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

def model_init():
    """Create the sequence-classification model from the ``model_name`` checkpoint.

    Handed to the Trainer as ``model_init`` so the Trainer builds the model itself.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(model_name)
    return classifier

# NOTE(review): warmup_steps and logging_steps count optimizer steps. With an
# effective batch of 4 * 64 = 256 samples per device, a small training set can
# finish in fewer than 1000 (or even 200) steps, in which case the learning rate
# never reaches its peak and no training loss is logged - confirm against the
# size of input_train.csv.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=4,   # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=1000,               # number of warmup steps for the learning-rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=200,               # how often (in steps) to print logs
    do_train=True,                   # perform training
    do_eval=True,                    # perform evaluation
    evaluation_strategy="epoch",     # evaluate after each epoch
    gradient_accumulation_steps=64,  # batches accumulated before each optimizer step
    fp16=True,                       # use mixed precision
    fp16_opt_level="O2",             # Apex AMP level; must be the letter "O" + digit (was the invalid "02")
    run_name="ProBert-BFD-MS",       # experiment name
    seed=3                           # random seed for reproducibility
)

# Fine-tune with the stock Trainer; the model is created through model_init.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model('models/')

# Run the fine-tuned model on the test split and show the raw scores.
predictions, label_ids, metrics = trainer.predict(test_dataset)
print(predictions)

# Persist the predicted class of every test sample as one digit per sample,
# written back to back with no separators.
with open('predictions.txt', 'w') as out_file:
    out_file.write("".join(str(np.argmax(scores, axis=0)) for scores in predictions))

### Evaluation Metrics

In [None]:
import numpy as np

# Confusion-matrix evaluation of the predictions file against the ground truth.
metrics = {'Precision':[], 'Recall': [], 'F1':[], 'Accuracy': []}

# Both files are parsed as one digit per sample with no separators. Whitespace
# (e.g. a trailing newline added by an editor) is removed before parsing so it
# cannot break the int conversion.
with open('groundtruth.txt','r') as g:
  groundtruth = np.array(list("".join(g.read().split())), dtype=int)

with open('predictions.txt','r') as p:
  predictions = np.array(list("".join(p.read().split())), dtype=int)

# A length mismatch means the two files do not describe the same samples.
if len(groundtruth) != len(predictions):
  raise ValueError(
    "groundtruth has %d labels but predictions has %d" % (len(groundtruth), len(predictions))
  )

# Label 0 is the negative class; any other value counts as positive.
actual_pos = groundtruth != 0
predicted_pos = predictions != 0
true_p = int(np.sum(actual_pos & predicted_pos))
false_p = int(np.sum(~actual_pos & predicted_pos))
true_n = int(np.sum(~actual_pos & ~predicted_pos))
false_n = int(np.sum(actual_pos & ~predicted_pos))

# Every ratio falls back to 0.0 when its denominator is zero, which happens
# here whenever the model never predicts the positive class.
prec = true_p / (true_p + false_p) if (true_p + false_p) else 0.0
rec = true_p / (true_p + false_n) if (true_p + false_n) else 0.0
metrics['Precision'].append(prec)
metrics['Recall'].append(rec)
metrics['F1'].append(2*prec*rec / (prec + rec) if (prec + rec) else 0.0)
metrics['Accuracy'].append((true_p+true_n) / len(groundtruth) if len(groundtruth) else 0.0)
print(true_p, false_p, true_n, false_n)
print(metrics)

Result: 

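All predictions = 0: the model assigns every test sample to class 0, so the accuracy below is simply the proportion of class-0 samples in the test set.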

Accuracy: 0.7307692307692307

## Weighted Classification

### Model Fine-tuning & Prediction

In [None]:
# pip -q install transformers seqeval
import torch
from torch import nn
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification
from torch.utils.data import Dataset
import os
import pandas as pd
import requests
from tqdm.auto import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import re

model_name = 'Rostlab/prot_bert_bfd' # pretrained ProtBert-BFD checkpoint id; used below for both the tokenizer and the classifier

class DeepLocDataset(Dataset):
    """Peptide-sequence classification dataset backed by the CSV files in ``Bcell_data/``.

    Each item is a dict of tokenizer outputs (as tensors) plus a ``'labels'``
    tensor, which is the format handed to the Trainer in this notebook.

    Args:
        split: ``"train"``, ``"valid"`` or ``"test"``; selects
            ``input_train.csv``, ``input_val.csv`` or ``input_test.csv``.
        tokenizer_name: Name or path given to ``AutoTokenizer.from_pretrained``.
        max_length: Token length every sequence is truncated/padded to.

    Raises:
        ValueError: If ``split`` is not one of the three names above. (Previously
            any unknown value silently loaded the test file.)
    """

    # Display names for the integer class ids, kept exactly as in the original code.
    # NOTE(original author): changing this mapping was reported to collapse all
    # predictions to a single class - unverified; nothing in the visible notebook
    # reads this mapping.
    labels_dic = {0: 'Soluble', 1: 'Membrane'}

    def __init__(self, split="train", tokenizer_name='Rostlab/prot_bert_bfd', max_length=64):
        # Locations of the three CSV splits.
        self.datasetFolderPath = 'Bcell_data/'
        self.trainFilePath = os.path.join(self.datasetFolderPath, 'input_train.csv')
        self.valFilePath = os.path.join(self.datasetFolderPath, 'input_val.csv')
        self.testFilePath = os.path.join(self.datasetFolderPath, 'input_test.csv')

        split_paths = {
            "train": self.trainFilePath,
            "valid": self.valFilePath,
            "test": self.testFilePath,
        }
        # Fail fast on a typo such as "val" instead of silently evaluating on test data.
        if split not in split_paths:
            raise ValueError(
                "Unknown split %r; expected one of %s" % (split, sorted(split_paths))
            )

        self.max_length = max_length

        # Pretrained tokenizer; casing is preserved (do_lower_case=False).
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=False)

        self.seqs, self.labels = self.load_dataset(split_paths[split])

    def load_dataset(self, path):
        """Read one CSV split and return ``(sequences, labels)`` as two lists.

        The first row of the file is skipped (treated as a header) and the two
        columns are named ``peptide_seq`` and ``labels``.
        """
        df = pd.read_csv(path, names=['peptide_seq', 'labels'], skiprows=1)
        seq = list(df['peptide_seq'])
        label = list(df['labels'])
        assert len(seq) == len(label)
        return seq, label

    def __len__(self):
        """Number of samples in the split."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of tensors."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Drop any whitespace, then separate every character with a single space.
        seq = " ".join("".join(self.seqs[idx].split()))
        # Map the residue codes U, Z, O and B to the unknown-residue code X.
        seq = re.sub(r"[UZOB]", "X", seq)

        # Truncate/pad so every sample has exactly max_length tokens.
        seq_ids = self.tokenizer(seq, truncation=True, padding='max_length', max_length=self.max_length)

        sample = {key: torch.tensor(val) for key, val in seq_ids.items()}
        sample['labels'] = torch.tensor(self.labels[idx])

        return sample

# Build the three splits with the same tokenizer and sequence length,
# in the order train -> valid -> test.
train_dataset, val_dataset, test_dataset = (
    DeepLocDataset(split=split_name, tokenizer_name=model_name, max_length=64)
    for split_name in ("train", "valid", "test")
)

def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for the Trainer's evaluation.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores); the predicted class is the arg-max over the
            last axis of ``predictions``.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``. Precision, recall and F1 use ``average='binary'``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0: when no sample is predicted (or labelled) positive the
    # ratio is undefined; report 0 explicitly instead of relying on sklearn's
    # warn-and-return-0 default, which floods the evaluation logs.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

def model_init():
    """Create the sequence-classification model from the ``model_name`` checkpoint.

    Handed to the Trainer as ``model_init`` so the Trainer builds the model itself.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(model_name)
    return classifier

# NOTE(review): warmup_steps and logging_steps count optimizer steps. With an
# effective batch of 4 * 64 = 256 samples per device, a small training set can
# finish in fewer than 1000 (or even 200) steps, in which case the learning rate
# never reaches its peak and no training loss is logged - confirm against the
# size of input_train.csv.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=4,   # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=1000,               # number of warmup steps for the learning-rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=200,               # how often (in steps) to print logs
    do_train=True,                   # perform training
    do_eval=True,                    # perform evaluation
    evaluation_strategy="epoch",     # evaluate after each epoch
    gradient_accumulation_steps=64,  # batches accumulated before each optimizer step
    fp16=True,                       # use mixed precision
    fp16_opt_level="O2",             # Apex AMP level; must be the letter "O" + digit (was the invalid "02")
    run_name="ProBert-BFD-MS",       # experiment name
    seed=3                           # random seed for reproducibility
)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Per-class loss weights, indexed by class id: errors on class 0 count 0.27x,
# errors on class 1 count 1x.
class_weights = torch.tensor([0.27, 1], dtype=torch.float, device=device)

class WeightedTrainer(Trainer):
    """Trainer that replaces the default loss with class-weighted cross-entropy."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute cross-entropy between the model logits and labels, weighted by ``class_weights``.

        Args:
            model: Model to run the forward pass on.
            inputs: Batch dict; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of just the loss.
            num_items_in_batch: Accepted and ignored. Newer Trainer versions pass
                this keyword, and the old three-argument signature raised a
                TypeError there.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Follow the logits' device rather than the device chosen when the cell
        # ran, so the weights and logits can never end up on different devices.
        loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss
        
# Fine-tune with the class-weighted Trainer; the model is created through model_init.
trainer = WeightedTrainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model('models/')

# Run the fine-tuned model on the test split and show the raw scores.
predictions, label_ids, metrics = trainer.predict(test_dataset)
print(predictions)

# Persist the predicted class of every test sample as one digit per sample,
# written back to back with no separators.
with open('predictions_w.txt', 'w') as out_file:
    out_file.write("".join(str(np.argmax(scores, axis=0)) for scores in predictions))

### Evaluation Metrics

In [None]:
import numpy as np

# Confusion-matrix evaluation of the weighted model's predictions against the ground truth.
metrics = {'Precision':[], 'Recall': [], 'F1':[], 'Accuracy': []}

# Both files are parsed as one digit per sample with no separators. Whitespace
# (e.g. a trailing newline added by an editor) is removed before parsing so it
# cannot break the int conversion.
with open('groundtruth.txt','r') as g:
  groundtruth = np.array(list("".join(g.read().split())), dtype=int)

with open('predictions_w.txt','r') as p:
  predictions = np.array(list("".join(p.read().split())), dtype=int)

# A length mismatch means the two files do not describe the same samples.
if len(groundtruth) != len(predictions):
  raise ValueError(
    "groundtruth has %d labels but predictions has %d" % (len(groundtruth), len(predictions))
  )

# Label 0 is the negative class; any other value counts as positive.
actual_pos = groundtruth != 0
predicted_pos = predictions != 0
true_p = int(np.sum(actual_pos & predicted_pos))
false_p = int(np.sum(~actual_pos & predicted_pos))
true_n = int(np.sum(~actual_pos & ~predicted_pos))
false_n = int(np.sum(actual_pos & ~predicted_pos))

# Every ratio falls back to 0.0 when its denominator is zero (e.g. the model
# never predicts the positive class) instead of raising ZeroDivisionError.
prec = true_p / (true_p + false_p) if (true_p + false_p) else 0.0
rec = true_p / (true_p + false_n) if (true_p + false_n) else 0.0
metrics['Precision'].append(prec)
metrics['Recall'].append(rec)
metrics['F1'].append(2*prec*rec / (prec + rec) if (prec + rec) else 0.0)
metrics['Accuracy'].append((true_p+true_n) / len(groundtruth) if len(groundtruth) else 0.0)
print(true_p, false_p, true_n, false_n)
print(metrics)

Results (max_length=64):

Precision: 0.375609756097561

Recall: 0.55

F1: 0.4463768115942029

Accuracy: 0.6326923076923077

Results (max_length=50): 

Precision: 0.39805825242718446

Recall: 0.29285714285714287

F1: 0.3374485596707819

Accuracy: 0.6903846153846154

EPOCH1

'eval_loss': 0.6847864389419556, 

'eval_accuracy': 0.2710215427380125, 

'eval_f1': 0.42646254784034987, 

'eval_precision': 0.2710215427380125, 

'eval_recall': 1.0

EPOCH2

'eval_loss': 0.6855406165122986, 

'eval_accuracy': 0.2710215427380125, 

'eval_f1': 0.42646254784034987, 

'eval_precision': 0.2710215427380125, 

'eval_recall': 1.0

EPOCH3

'eval_loss': 0.6859998106956482, 

'eval_accuracy': 0.2710215427380125, 

'eval_f1': 0.42646254784034987, 

'eval_precision': 0.2710215427380125, 

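'eval_recall': 1.0

Note: in all three epochs recall is 1.0 and precision equals accuracy, which means the model predicted class 1 for every validation sample.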