# Protein Secondary Structure Prediction

## Environment Setup

In [1]:
# Copy everything under /kaggle/input/nlpfinal5 into /kaggle/working.
# NOTE(review): presumably this is what provides the ./t5tunned checkpoint
# that is loaded further down — confirm the input dataset contains it.
!cp -r /kaggle/input/nlpfinal5/* /kaggle/working/

### Install required packages

In [2]:
!pip install datasets



## Dataset Preparation

### Load dataset

In [3]:
# Location of the CSV file that is passed to load_dataset("csv", ...).
# NOTE: this is a placeholder value — point it at the real file before running.
dataset_path = "/path/to/dataset"

In [4]:
from datasets import load_dataset, DatasetDict

# Load the CSV, keep only the two columns used in this notebook, and hold out
# 20% of the rows. The split is seeded so that re-running the notebook yields
# the same train/held-out partition; without a seed every run reshuffles the
# rows, so a checkpoint resumed from an earlier run could be evaluated on
# examples it was trained on.
ds = (
    load_dataset("csv", data_files=dataset_path)
    .select_columns(['Functionality', 'Secondary Structures Q8'])['train']
    .train_test_split(test_size=0.2, seed=42)
)

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
# Split the held-out 20% in half: one half becomes the validation set and the
# other the test set. Seeded so the partition is identical on every run.
ds_testval = ds['test'].train_test_split(test_size=0.5, seed=42)

In [6]:
# Assemble the final three-way split.
# Note the naming: the 'train' half of ds_testval is used as *validation*,
# and its 'test' half is used as the final *test* set.
dataset = DatasetDict({
    'train': ds['train'],
    'test': ds_testval['test'],
    'validation': ds_testval['train']
})

### Preprocess the dataset

In [7]:
import ast

def get_sec_struct(example):
    """Extract the Q8 labels of one example into a 'Q8_labels' list.

    The 'Secondary Structures Q8' field is expected to be a string containing
    a Python literal sequence; the element at index 1 of every item is kept.
    (Presumably each item is a (residue, label) pair — confirm against the
    dataset.)

    Args:
        example: mapping for a single dataset row.

    Returns:
        ``{'Q8_labels': [...]}``. The list is empty when the field is missing,
        is ``None``, or contains the text 'None' anywhere.
    """
    raw_annotation = example.get('Secondary Structures Q8')

    # Only parse rows that carry a complete annotation.
    if raw_annotation is not None and 'None' not in raw_annotation:
        parsed_items = ast.literal_eval(raw_annotation)
        return {'Q8_labels': [item[1] for item in parsed_items]}

    return {'Q8_labels': []}

In [8]:
# Apply get_sec_struct to every row of every split; this adds the
# 'Q8_labels' column alongside the original columns.
extended_dataset = dataset.map(get_sec_struct)

Map:   0%|          | 0/10831 [00:00<?, ? examples/s]

Map:   0%|          | 0/1354 [00:00<?, ? examples/s]

Map:   0%|          | 0/1354 [00:00<?, ? examples/s]

In [9]:
# Display the columns and row counts of each split as a sanity check.
extended_dataset

DatasetDict({
    train: Dataset({
        features: ['Functionality', 'Secondary Structures Q8', 'Q8_labels'],
        num_rows: 10831
    })
    test: Dataset({
        features: ['Functionality', 'Secondary Structures Q8', 'Q8_labels'],
        num_rows: 1354
    })
    validation: Dataset({
        features: ['Functionality', 'Secondary Structures Q8', 'Q8_labels'],
        num_rows: 1354
    })
})

## Training


### Loading model

In [10]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Candidate hub checkpoints; only checkpoints[0] is used below.
checkpoints = ['google-t5/t5-small', 'google/flan-t5-small', 'microsoft/kosmos-2.5', 'google-t5/t5-base']

# The tokenizer comes from the hub checkpoint, while the model weights are
# loaded from the local "./t5tunned" directory (i.e. training is resumed from
# a previously saved model rather than started from the hub weights).
# NOTE(review): this assumes ./t5tunned exists in the working directory and
# was fine-tuned from checkpoints[0], so that the vocabularies match — confirm.
# The commented-out lines are the alternatives: start from the hub weights,
# or load the tokenizer from the local directory as well.
tokenizer = AutoTokenizer.from_pretrained(checkpoints[0])
# model = AutoModelForSeq2SeqLM.from_pretrained(checkpoints[0])
# tokenizer = AutoTokenizer.from_pretrained("./t5tunned")
model = AutoModelForSeq2SeqLM.from_pretrained("./t5tunned")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

### Prepare data

In [11]:
import torch
def preprocess_function(examples):
    """Tokenize one example into model inputs and target labels.

    The 'Functionality' text is tokenized (with truncation) as the model
    input. Every entry of 'Q8_labels' is mapped directly to a vocabulary id
    with ``tokenizer.convert_tokens_to_ids`` and the resulting sequence is
    terminated with the tokenizer's EOS id.

    Targets that would exceed ``tokenizer.model_max_length`` are truncated
    *before* EOS is appended, so every label sequence ends with EOS and is at
    most ``model_max_length`` ids long. (Truncating after appending EOS would
    cut the EOS off exactly for the long targets.)

    Relies on the notebook-level ``tokenizer``.

    Args:
        examples: a single dataset row (the function is used with a
            non-batched ``map``) providing 'Functionality' and 'Q8_labels'.

    Returns:
        The tokenizer output for the input text with an added 'labels' list.
    """
    inputs = tokenizer(examples['Functionality'], truncation=True)

    # Reserve one position for the terminating EOS token.
    max_label_tokens = tokenizer.model_max_length - 1
    label_ids = tokenizer.convert_tokens_to_ids(examples['Q8_labels'])[:max_label_tokens]

    inputs['labels'] = label_ids + [tokenizer.eos_token_id]
    return inputs

In [12]:
# Tokenize every split row by row, then drop the raw text/label columns so
# only the fields produced by preprocess_function remain.
tokenized_dataset = extended_dataset.map(preprocess_function).remove_columns(['Functionality', 'Secondary Structures Q8', 'Q8_labels'])

Map:   0%|          | 0/10831 [00:00<?, ? examples/s]

Map:   0%|          | 0/1354 [00:00<?, ? examples/s]

Map:   0%|          | 0/1354 [00:00<?, ? examples/s]

In [13]:
# Inspect the column names and dtypes of the tokenized training split.
tokenized_dataset['train'].features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

In [14]:
from transformers import DataCollatorForSeq2Seq

# Collator used by the Trainer to turn the variable-length tokenized examples
# into batches (see the DataCollatorForSeq2Seq docs for its padding rules).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

2024-08-13 05:38:56.412016: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-13 05:38:56.412121: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-13 05:38:56.546376: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Train

In [15]:
from transformers import TrainingArguments

# Hyper-parameters for the fine-tuning run.
# Gradients from 16 batches of 8 examples are accumulated before each
# optimizer step, i.e. 8 * 16 = 128 examples per device per update.
training_args = TrainingArguments(
    output_dir="/kaggle/working/results/",
    evaluation_strategy="epoch",  # evaluate on the validation split once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    num_train_epochs=70,
    # NOTE(review): predict_with_generate is a Seq2SeqTrainingArguments
    # option, which is presumably why it is disabled here — confirm.
    # predict_with_generate=True,
    logging_steps=50,  # log the training loss every 50 optimizer steps
    gradient_accumulation_steps=16,
    report_to=["tensorboard"],
)



In [16]:
from transformers import Trainer

# Build the Trainer: it trains on the 'train' split and evaluates on the
# 'validation' split; the 'test' split is not passed to it.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [17]:
# Run the fine-tuning loop. With evaluation_strategy="epoch" a validation
# loss is reported after every epoch.
trainer.train()

Epoch,Training Loss,Validation Loss
0,0.7722,0.730827
1,0.7691,0.730414
2,0.7704,0.728175
3,0.7693,0.730582
4,0.7665,0.730651
5,0.7678,0.729263
6,0.7678,0.729151
8,0.7685,0.726932
9,0.7656,0.726331
10,0.7645,0.728508


TrainOutput(global_step=5880, training_loss=0.756540400316926, metrics={'train_runtime': 34368.5393, 'train_samples_per_second': 22.06, 'train_steps_per_second': 0.171, 'total_flos': 8.420607121588224e+16, 'train_loss': 0.756540400316926, 'epoch': 69.48301329394387})

## Samples

In [18]:
# Reference Q8 string for one training example.
# Index the row first and then the column: `ds['Q8_labels'][i]` would
# materialize the whole column just to read a single entry.
sample_index = 3
''.join(extended_dataset['train'][sample_index]['Q8_labels'])

'-----PPPP-HHHHTTEEEEEETTEEEEEEEETTEEEEEGGGG--GGG-SS--HHHHHHT--GGG-EEEETTEEE-EEEEEEETTEEEEEESS--TT--EEEE-PPPTT-EEEEEEEETTEEEEEEEEE--TTS-B-----TT-TT-EEEEEETTEEEEEEEEEEE-TTS-EEEE-TTSPBSSS--SS-S--------B-HHHHHHHHHHHHHTT--TT--S----HHHHHHHHHHTTBPPP-HHHHHHHHHHHHHH---HHHHHHHHHHHHHH--TT--BTTBSS------HHHHHHH-----'

In [19]:
# Generate a prediction for the same training example.
# The row is indexed before the column so that only this example's input ids
# are read (instead of materializing the whole 'input_ids' column); the ids
# are turned into a batch of size 1 on the model's device.
generated_out = model.generate(
    input_ids=torch.tensor(
        tokenized_dataset['train'][sample_index]['input_ids'],
        device=model.device,
    ).unsqueeze(0),
    max_length=512,
)
generated_out[0]

tensor([ 0, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 

In [20]:
# Turn the generated ids back into text, dropping special tokens.
tokenizer.decode(generated_out[0], skip_special_tokens=True)

'-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------'

In [21]:
# Show the individual token strings for the generated ids, special tokens
# included.
# NOTE(review): this prints one line per generated token (up to 512);
# consider slicing, e.g. generated_out[0][:20], to keep the output short.
tokenizer.convert_ids_to_tokens(generated_out[0])

['<pad>',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',
 '-',


In [22]:
# Persist the fine-tuned model to /kaggle/working/t5tunned.
# NOTE(review): only the model is saved here, not the tokenizer — loading a
# tokenizer from this directory would additionally need
# tokenizer.save_pretrained(...) on the same path.
model.save_pretrained("/kaggle/working/t5tunned")