In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and echo the full path of every file found, one per line.
input_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mha-pe-ln-adam-0-0001lr-4bz-4head-16embed/1_bestmodel.pth
/kaggle/input/mha-pe-ln-adam-0-0001lr-4bz-4head-16embed/3_bestmodel.pth
/kaggle/input/mha-pe-ln-adam-0-0001lr-4bz-4head-16embed/model.py
/kaggle/input/mha-pe-ln-adam-0-0001lr-4bz-4head-16embed/4_bestmodel.pth
/kaggle/input/mha-pe-ln-adam-0-0001lr-4bz-4head-16embed/0_bestmodel.pth
/kaggle/input/mha-pe-ln-adam-0-0001lr-4bz-4head-16embed/2_bestmodel.pth
/kaggle/input/siatprotein2023/sample_submission.csv
/kaggle/input/siatprotein2023/training.csv
/kaggle/input/siatprotein2023/test.csv
/kaggle/input/siatprotein2023/Rhla.xlsx


In [2]:
import numpy as np
import pandas as pd
import os
import shutil
import torch as th
from torch.utils.data import DataLoader, Dataset

In [3]:
class MyDataset(Dataset):
    """Minimal map-style dataset that serves pre-computed feature rows.

    ``feats`` is stored exactly as given; it is only ever indexed and
    measured with ``len``, so no copy or conversion is made.
    """

    def __init__(self, feats):
        self.feats = feats

    def __len__(self):
        # The number of samples is the length of the wrapped container.
        n_samples = len(self.feats)
        return n_samples

    def __getitem__(self, idx):
        # Indexing is delegated straight to the wrapped container.
        sample = self.feats[idx]
        return sample
    
class ProcessData():
    """Load the test sequences from a CSV file and expose them as a DataLoader.

    The first CSV column is used as the index (kept in ``test_index``); every
    remaining cell value is flattened into ``test_values``.
    NOTE(review): the flattening presumably relies on the file having a single
    sequence column -- confirm against the test CSV.

    Args:
        test_fpath: Path of the CSV file to read.
        batch_size: Batch size handed to the DataLoader built by
            ``process_data``.
    """

    def __init__(self, test_fpath, batch_size):
        # Each letter is mapped to its position in this string (A -> 0 ... Y -> 19).
        voc_str = "ACDEFGHIKLMNPQRSTVWY"
        self.voc_dict = {symbol: idx for idx, symbol in enumerate(voc_str)}

        test_df = pd.read_csv(test_fpath, index_col=0)
        self.test_index = test_df.index.tolist()
        self.test_values = test_df.values.ravel()
        self.batch_size = batch_size

    def voc_to_idx(self, seqs_array):
        """Translate an array of sequence strings into a 2-D array of indices.

        Args:
            seqs_array: Array-like with a ``tolist()`` method whose items are
                sequences of symbols present in ``self.voc_dict``.

        Returns:
            Integer ``numpy.ndarray`` with one row per sequence and one column
            per position. Empty input yields an array of shape ``(0, 0)``.

        Raises:
            KeyError: If a sequence contains a symbol missing from the
                vocabulary; the message names the sequence and the symbol.
            ValueError: If the sequences do not all have the same length.
        """
        seqs_list = seqs_array.tolist()
        if not seqs_list:
            # Nothing to encode; return an empty 2-D array instead of letting
            # the array construction fail on an empty list.
            return np.empty((0, 0), dtype=int)

        rows = []
        for pos, seq in enumerate(seqs_list):
            try:
                rows.append([self.voc_dict[symbol] for symbol in seq])
            except KeyError as err:
                # Same exception type as a plain dict lookup, but say where
                # the offending symbol was found.
                raise KeyError(
                    f"sequence {pos} contains symbol {err.args[0]!r} "
                    "that is not in the vocabulary"
                ) from None

        seq_lengths = {len(row) for row in rows}
        if len(seq_lengths) > 1:
            raise ValueError(
                "all sequences must have the same length, got lengths "
                f"{sorted(seq_lengths)}"
            )
        return np.array(rows, dtype=int)

    def process_data(self):
        """Build a non-shuffled DataLoader over the encoded test sequences.

        Returns:
            ``DataLoader`` yielding int32 tensors of encoded sequences in the
            same order as ``test_index``.
        """
        encoded = self.voc_to_idx(self.test_values).astype(np.int32)
        test_feats = th.from_numpy(encoded)
        datasets = MyDataset(test_feats)
        # shuffle=False keeps predictions aligned with ``test_index``.
        test_dataloader = DataLoader(datasets, batch_size=self.batch_size, shuffle=False)
        return test_dataloader

def run_a_batch_predict(model, dataloader, device):
    """Run ``model`` over every batch of ``dataloader`` and collect predictions.

    The model is switched to eval mode and evaluated without gradient
    tracking. It must return a pair ``(pred_activity, pred_selectivity)`` of
    tensors that can be concatenated along dimension 1.

    Args:
        model: Callable model exposing ``eval()``.
        dataloader: Iterable yielding feature tensors; each batch is cast to
            ``torch.long`` before being fed to the model.
        device: Device string (e.g. ``"cpu"``, ``"cuda"``, ``"cuda:1"``) or a
            ``torch.device``. Batches are moved only when it is a CUDA
            device; the model itself is not moved by this function.

    Returns:
        ``numpy.ndarray`` with the per-batch predictions stacked along axis 0.

    Raises:
        ValueError: If ``dataloader`` yields no batches (nothing to
            concatenate).
    """
    # Normalising through torch.device accepts both strings and device
    # objects and keeps the device index, which a bare ``.cuda()`` would drop.
    target = th.device(device)

    model.eval()
    pred_values_list = []
    with th.no_grad():
        for data in dataloader:
            feats = data.to(th.long)
            if target.type == "cuda":
                feats = feats.to(target)
            pred_activity, pred_selectivity = model(feats)
            pred_values = th.cat([pred_activity, pred_selectivity], dim=1)
            pred_values_list.append(pred_values.detach().cpu().numpy())

    return np.concatenate(pred_values_list, axis=0)

In [4]:
# --- Inference configuration ---
test_fpath = "/kaggle/input/siatprotein2023/test.csv"
batch_size = 32
device = "cpu"
model_dir = "/kaggle/input/mha-pe-ln-adam-0-0001lr-4bz-4head-16embed"
n_folds = 5  # checkpoints are named 0_bestmodel.pth ... 4_bestmodel.pth

# The checkpoints are whole pickled model objects, so the module that defines
# the model class is copied into the working directory before they are loaded.
shutil.copyfile(f"{model_dir}/model.py", "/kaggle/working/model.py")

# --- Load and encode the test sequences ---
test_data = ProcessData(test_fpath, batch_size)
test_dataloader = test_data.process_data()
test_index = test_data.test_index

# --- Predict with every fold model and average the predictions ---
all_pred_values_list = []
for i in range(n_folds):
    model_fpath = f"{model_dir}/{i}_bestmodel.pth"
    # weights_only=False is required to unpickle a full model object on
    # PyTorch releases where weights_only defaults to True.
    # SECURITY: unpickling can execute arbitrary code -- only load trusted
    # checkpoints this way.
    model = th.load(model_fpath, map_location=th.device(device), weights_only=False)
    pred_values = run_a_batch_predict(model, test_dataloader, device)
    # Fail loudly here instead of letting the reshape below silently
    # rearrange an unexpected output shape.
    assert pred_values.shape == (len(test_index), 2), (
        f"unexpected prediction shape {pred_values.shape} for fold {i}"
    )
    all_pred_values_list.append(pred_values.reshape(1, -1, 2))
all_pred_values_array = np.concatenate(all_pred_values_list, axis=0)
mean_pred_values = np.mean(all_pred_values_array, axis=0)

# --- Write the averaged predictions ---
results_df = pd.DataFrame({
    "SequenceID": test_index,
    "Activity": mean_pred_values[:, 0],
    "Selectivity": mean_pred_values[:, 1],
})
results_df.to_csv('predictions.csv', index=False)