In [None]:
import torch

USE_GPU = True

# Resolve the compute device once so that later cells can move tensors with
# `.to(device)` instead of hard-coding 'cuda'. Falls back to the CPU when no
# GPU is visible or when USE_GPU is switched off.
device = torch.device('cuda' if USE_GPU and torch.cuda.is_available() else 'cpu')
print('using device:', device.type)

In [None]:
import numpy as np
import pandas as pd
from Bio import SeqIO
from tqdm import tqdm
import torch
import pickle
import gc

# EDA

- Train, Test의 단백질 종류 개수 확인
- 단백질 하나당 go term의 최대, 최소 개수
- 단백질 간 거리 벡터의 용량 확인 (142246 x 141864)

## Train Data

In [None]:
# Load the Gene Ontology definition file (go-basic.obo) as a single string.
obo_file_path = "/kaggle/input/cafa-5-protein-function-prediction/Train/go-basic.obo"
with open(obo_file_path, mode="r") as obo_file:
    obo_content = obo_file.read()

# Show only the first 1000 characters to get a feel for the file format.
preview_length = 1000
print(obo_content[:preview_length])

In [None]:
# Preview the raw training FASTA file (train_sequences.fasta).
fasta_file_path = "/kaggle/input/cafa-5-protein-function-prediction/Train/train_sequences.fasta"
with open(fasta_file_path, mode="r") as fasta_file:
    lines = fasta_file.readlines()

# Print the first 10 lines with surrounding whitespace removed.
head_lines = lines[:10]
for raw_line in head_lines:
    print(raw_line.strip())

In [None]:
# Parse the training FASTA into a list of (protein ID, sequence) pairs.
fasta_file = '/kaggle/input/cafa-5-protein-function-prediction/Train/train_sequences.fasta'

sequences = [
    (record.id, record.seq)
    for record in SeqIO.parse(fasta_file, "fasta")
]

# Example element:
# sequences[0] : ('P20536', Seq('MNSVTVSHAPYTITYHDDWEPVMSQLVEFYNEVASWLLRDETSPIPDKFFIQLK...FIY'))

In [None]:
# Load the two tab-separated training tables: taxonomy and protein -> GO term
# annotations. The cell displays the first annotation rows.
train_taxonomy = pd.read_csv(
    "/kaggle/input/cafa-5-protein-function-prediction/Train/train_taxonomy.tsv",
    sep="\t",
)
train_terms = pd.read_csv(
    "/kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv",
    sep="\t",
)
train_terms.head()

In [None]:
# Count the distinct proteins and distinct GO terms in the training annotations.
# nunique() yields plain integers; `.unique().shape` printed one-element tuples
# such as "(N,)" instead of the counts themselves.
print("Protein #: {}, term #: {}".format(train_terms['EntryID'].nunique(), train_terms['term'].nunique()))

In [None]:
# Number of GO terms annotated to each protein.
# Collect every protein's terms into one list per EntryID.
train_terms_collected = train_terms.groupby('EntryID')['term'].apply(list).reset_index(name='terms_collected')

# Compute the per-protein term count once and reuse it for the new column
# (previously the same .apply(len) ran twice and a disabled loop version was
# kept around as a string literal).
value_counts = train_terms_collected['terms_collected'].apply(len)
train_terms_collected['value_counts'] = value_counts
print("value_count min: {}, max: {}".format(value_counts.min(), value_counts.max()))

In [None]:
# Compare the two pre-computed training embedding sets: number of IDs and the
# shape of the embedding array.

# Embeddings from the "cafa-5-ems-2-embeddings-numpy" dataset.
ids_train_ems = np.load("/kaggle/input/cafa-5-ems-2-embeddings-numpy/train_ids.npy")
embeds_ems_train = np.load("/kaggle/input/cafa-5-ems-2-embeddings-numpy/train_embeddings.npy")
print(f"ems       ids# : {len(ids_train_ems)}, embedding_dim : {embeds_ems_train.shape}")

# Embeddings from the "protbert-embeddings-for-cafa5" dataset.
ids_train_protbert = np.load("/kaggle/input/protbert-embeddings-for-cafa5/train_ids.npy")
embeds_protbert_train = np.load("/kaggle/input/protbert-embeddings-for-cafa5/train_embeddings.npy")
print(f"protbert  ids# : {len(ids_train_protbert)}, embedding_dim : {embeds_protbert_train.shape}")

In [None]:
# Check how much memory the embedding array occupies
import numpy as np

# Alias the `embeds_ems_train` array loaded earlier (no copy is made)
data = embeds_ems_train

# Size of the array's data buffer in bytes
memory_consumed_bytes = data.nbytes

# Convert a byte count into a human-readable string
def format_bytes(size):
    """Render a byte count with two decimals and a binary (1024-based) unit.

    The value is divided by 1024 until it drops below 1024 or the units run
    out at TB; anything still larger is reported in PB.

    Args:
        size: Number of bytes (int or float).

    Returns:
        A string such as "1.50 KB".
    """
    units = ('B', 'KB', 'MB', 'GB', 'TB')
    unit_index = 0
    while unit_index < len(units):
        if size < 1024.0:
            return f"{size:.2f} {units[unit_index]}"
        size /= 1024.0
        unit_index += 1
    return f"{size:.2f} PB"

# Format the byte count computed above and report it
memory_consumed_human_readable = format_bytes(memory_consumed_bytes)

print("Memory consumed by array:", memory_consumed_human_readable)


## Test Data

In [None]:
# Load the test-set taxonomy list (testsuperset-taxon-list.tsv).
# The file is read as ISO-8859-1 rather than the default UTF-8.
test_taxonomy = pd.read_csv("/kaggle/input/cafa-5-protein-function-prediction/Test (Targets)/testsuperset-taxon-list.tsv", sep="\t", encoding="ISO-8859-1")

In [None]:
# Preview the raw test FASTA file (testsuperset.fasta).
fasta_file_path = "/kaggle/input/cafa-5-protein-function-prediction/Test (Targets)/testsuperset.fasta"
with open(fasta_file_path, mode="r") as fasta_file:
    lines = fasta_file.readlines()

# Print the first 10 lines with surrounding whitespace removed.
head_lines = lines[:10]
for raw_line in head_lines:
    print(raw_line.strip())

In [None]:
# Parse the test FASTA into a list of (protein ID, sequence) pairs.
# Note: this rebinds `sequences`, which previously held the training records.
fasta_file = '/kaggle/input/cafa-5-protein-function-prediction/Test (Targets)/testsuperset.fasta'

sequences = [
    (record.id, record.seq)
    for record in SeqIO.parse(fasta_file, "fasta")
]

# Example element format:
# sequences[0] : ('P20536', Seq('MNSVTVSHAPYTITYHDDWEPVMSQLVEFYNEVASWLLRDETSPIPDKFFIQLK...FIY'))

'''
len(sequences) 141865 인데, dictionary form으로 전환하면  141864가 나옴. 
중복 데이터가 있다고 판단하여 아래 코드를 수행해봄
* 중복 데이터는 다른 사람이 만들어 놓은 embedding vector 갯수가 141854라 알게되었음.
'''

In [None]:
# Find records that occur more than once in the test FASTA.
from tqdm import tqdm

x = []      # distinct records in first-seen order
new_a = []  # each duplicated record, listed once

# Hash-based membership keeps the scan linear; the earlier `in list` checks
# compared every record against all previously seen ones (quadratic).
seen_keys = set()
duplicate_keys = set()

for i in tqdm(sequences):
    # Key on (ID, sequence text). This assumes Seq equality is by sequence
    # content, which is what the tuple comparison relied on before.
    key = (i[0], str(i[1]))
    if key not in seen_keys:
        seen_keys.add(key)
        x.append(i)
    elif key not in duplicate_keys:  # skip records already reported as duplicates
        duplicate_keys.add(key)
        new_a.append(i)

print(new_a)  # records that appear two or more times
# Recorded result of a previous run:
# [('A0A1D6E0S8', Seq('MPSRSPACRPRGRNRRSAADAVARPLALALILVSTLPRAAHSQDLALPPVQPRG...SFC'))]

# Go term 예측: 1과 2 방법을 혼합하여 예측한다.
- Vector distance가 가까운 단백질은 같은 go term을 공유한다고 간주하고, 나머지는 Neural network로 예측한다.
- 가까운 vector distance의 명확한 기준은 없으므로 이용자가 임의로 정한다.

1. Sequence vector distances.
2. Neural Network

# 1. Sequence vector distances.
- 단백질 sequence vector간 거리가 가까울 수록 비슷한 단백질.
- 비슷한 단백질은 하는 역할도 비슷함.
- 단백질 간 거리가 가까우면 비슷한 go term을 공유한다.
- 참조 문헌: https://github.com/Rostlab/goPredSim

In [None]:
# Estimate the memory needed for the full Train x Test (142246 x 141864) distance matrix
import torch

# Shape and element type of the hypothetical matrix
tensor_shape = (142246, 141864)
data_type = torch.float32

# Take the per-element size from a zero-dimensional tensor of the target dtype.
# The full matrix is never allocated: creating it only to measure it would
# itself need the ~75 GB being estimated.
bytes_per_element = torch.empty((), dtype=data_type).element_size()
num_elements = tensor_shape[0] * tensor_shape[1]

# Total size in bytes, then in gibibytes
memory_bytes = bytes_per_element * num_elements
memory_gb = memory_bytes / (1024 ** 3)

print("Memory Consumption of the Tensor: {:.2f} GB".format(memory_gb))

# Memory Consumption of the Tensor: 75.17 GB

## Pickle 파일 만들기
- Output 용량의 한계로 0 - 15k 까지만 만들었음.

In [None]:
# Load the ProtBERT training IDs and embeddings.
ids_train = np.load("/kaggle/input/protbert-embeddings-for-cafa5/train_ids.npy")
embeds_protbert_train = np.load("/kaggle/input/protbert-embeddings-for-cafa5/train_embeddings.npy")

# Map each protein ID to its embedding vector (a repeated ID keeps its first
# position but takes the last embedding seen).
dict_train_id_embeds = dict(zip(ids_train, embeds_protbert_train))
print(type(dict_train_id_embeds), len(dict_train_id_embeds))

In [None]:
# The ProtBERT test embeddings contain 141865 entries.
# len(ids_test) differs from the number of dictionary keys because one ID is
# duplicated; the competition's own test set also has 141864 distinct IDs.
ids_test = np.load("/kaggle/input/protbert-embeddings-for-cafa5/test_ids.npy")
embeds_protbert_test = np.load("/kaggle/input/protbert-embeddings-for-cafa5/test_embeddings.npy")

# Map each protein ID to its embedding vector; building the dict collapses the
# duplicated ID into a single key.
dict_test_id_embeds = dict(zip(ids_test, embeds_protbert_test))
print(type(dict_test_id_embeds), len(dict_test_id_embeds))

In [None]:
import numpy as np
import gc
import torch

# Pick the GPU when one is available and fall back to the CPU otherwise,
# instead of failing on a hard-coded `.to('cuda')`.
target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Stack the dictionary values into 2-D arrays (row order = dict insertion order,
# i.e. one row per distinct protein ID).
raw_data = np.array(list(dict_train_id_embeds.values()))
raw_data_query = np.array(list(dict_test_id_embeds.values()))

# from_numpy wraps the arrays without the extra host-side copy torch.tensor makes.
tensor_raw_data = torch.from_numpy(raw_data).to(target_device)
tensor_raw_data_query = torch.from_numpy(raw_data_query).to(target_device)

# The data now lives in the tensors, so drop the host-side names that are no
# longer needed and let the garbage collector reclaim what it can.
del raw_data
del raw_data_query
del dict_train_id_embeds
del ids_train
del embeds_protbert_train
del dict_test_id_embeds
del ids_test
del embeds_protbert_test
gc_collect = gc.collect()

print(torch.cuda.is_available(), tensor_raw_data.is_cuda, tensor_raw_data_query.is_cuda)
print(tensor_raw_data.shape, tensor_raw_data_query.shape)
print(gc_collect)

In [None]:
from Bio import SeqIO

file_path = "/kaggle/input/cafa-5-protein-function-prediction/Test (Targets)/testsuperset.fasta"

# Collect every record ID into a set so that a repeated ID is counted once.
with open(file_path, "r") as fasta_file:
    id_set = {record.id for record in SeqIO.parse(fasta_file, "fasta")}

# Report how many distinct IDs the test FASTA contains.
num_ids = len(id_set)
print("Number of unique IDs:", num_ids)

In [None]:
import os
import subprocess
from IPython.display import FileLink, display

def download_file(path, download_file_name):
    """Zip `path` into /kaggle/working and display a download link for the archive.

    Args:
        path: File or directory to archive (directories are added recursively).
        download_file_name: Archive name without the ``.zip`` suffix.

    Returns:
        None. If zip cannot be run or fails, the error is printed and no link
        is displayed.

    Side effects:
        Changes the process working directory to /kaggle/working/ so that the
        relative FileLink resolves to the new archive.
    """
    os.chdir('/kaggle/working/')
    zip_name = f"/kaggle/working/{download_file_name}.zip"
    # Pass the arguments as a list with no shell involved, so paths containing
    # spaces or shell metacharacters reach zip as single arguments.
    try:
        result = subprocess.run(["zip", "-r", zip_name, path], capture_output=True, text=True)
    except FileNotFoundError as exc:
        # Without a shell, a missing zip binary raises instead of returning non-zero.
        print("Unable to run zip command!")
        print(exc)
        return
    if result.returncode != 0:
        print("Unable to run zip command!")
        print(result.stderr)
        return
    display(FileLink(f'{download_file_name}.zip'))

In [None]:
# Compute Euclidean (p=2) distances between batches of train embeddings and all
# test embeddings, writing each batch to its own pickle file.
with torch.no_grad():
    batch_size_d = 5000

    # Only train rows [0, 20000) are processed here; the full range would be
    # len(tensor_raw_data).
    pbar = tqdm(range(0, 20000, batch_size_d))

    for i in pbar:
        batch_data = tensor_raw_data[i:i + batch_size_d]
        # Distances from this batch to every test embedding, copied to host
        # memory before pickling.
        query_tensor = torch.cdist(batch_data, tensor_raw_data_query, p=2).cpu()

        distance_tensor_name = 'distance_tensor_{}.pkl'.format(i)
        with open(distance_tensor_name, 'wb') as f:
            pickle.dump(query_tensor, f)
        print(distance_tensor_name)

        # Drop the batch result and release cached GPU memory before the next batch.
        del query_tensor, distance_tensor_name
        torch.cuda.empty_cache()
        collected = gc.collect()
        print("Garbage collector: collected {} objects".format(collected))

In [None]:
# Show the driver's view of GPU usage
!nvidia-smi
# Bytes currently allocated by PyTorch on the GPU, and the peak so far
torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()

In [None]:
# Zip the last distance batch (train rows starting at 15000) and show a download link
download_file('/kaggle/working/distance_tensor_15000.pkl', 'distance_tensor_15000')

In [None]:
#del download_file('/kaggle/working/distance_tensor_0.pkl', 'distance_tensor_0')
# Run a garbage-collection pass; the cell displays the number of objects collected
gc.collect()

In [None]:
''' 
download_file('/kaggle/working/distance_tensor_0.pkl', 'distance_tensor_0')
download_file('/kaggle/working/distance_tensor_5000.pkl', 'distance_tensor_5000')
download_file('/kaggle/working/distance_tensor_10000.pkl', 'distance_tensor_10000')
download_file('/kaggle/working/distance_tensor_15000.pkl', 'distance_tensor_15000')
download_file('/kaggle/working/distance_tensor_20000.pkl', 'distance_tensor_20000')
download_file('/kaggle/working/distance_tensor_25000.pkl', 'distance_tensor_25000')
download_file('/kaggle/working/distance_tensor_30000.pkl', 'distance_tensor_30000')
download_file('/kaggle/working/distance_tensor_35000.pkl', 'distance_tensor_35000')

'''


In [None]:
# Sanity check: reload the first distance batch from the working directory.
# pickle executes code on load, so only open files produced by this notebook.
pickle_file_path = '/kaggle/working/distance_tensor_0.pkl'
with open(pickle_file_path, mode='rb') as f:
    loaded_data = pickle.load(f)

# Show the tensor values together with its shape.
print(loaded_data, loaded_data.shape)

## Pickle 파일 연결

In [None]:
import pickle

'''
file_path_1 = '/kaggle/input/distance-tensor-0to20k/distance_tensor_0.pkl'
with open(file_path_1, 'rb') as file:
    distance_tensor_0 = pickle.load(file)

file_path_2 = '/kaggle/input/distance-tensor-0to20k/distance_tensor_5000.pkl'
with open(file_path_2, 'rb') as file:
    distance_tensor_5000 = pickle.load(file)
        
file_path_1 = '/kaggle/input/distance-tensor-0to20k/distance_tensor_15000.pkl'
with open(file_path_1, 'rb') as file:
    distance_tensor_15000 = pickle.load(file) 
'''

# Load the distance batch for train rows starting at 10000 from the uploaded dataset
file_path_1 = '/kaggle/input/distance-tensor-0to20k/distance_tensor_10000.pkl'
with open(file_path_1, 'rb') as file:
    distance_tensor_10000 = pickle.load(file)

   
# Only the 10000 batch is loaded in this run, so only its shape is printed
#print(distance_tensor_0.shape, distance_tensor_5000.shape)
print(distance_tensor_10000.shape)
#print(distance_tensor_15000.shape)

In [None]:
# Concatenate the per-batch distance tensors along the train-row axis (dim=0).
#distance_tensor_0to10k = torch.cat((distance_tensor_0, distance_tensor_5000), dim=0)
# NOTE(review): `distance_tensor_0to10k` is only assigned by the commented-out
# line above, so on a fresh kernel the next line raises NameError. It looks like
# the steps were run one at a time (loading and freeing batches between them) —
# confirm and restore the intended sequence.
distance_tensor_0to15k = torch.cat((distance_tensor_0to10k, distance_tensor_10000), dim=0)
#distance_tensor_0to20k = torch.cat((distance_tensor_0to15k, distance_tensor_15000), dim=0)

#print(distance_tensor_0to10k.shape)
print(distance_tensor_0to15k.shape)
#print(distance_tensor_0to20k.shape)

In [None]:
#del distance_tensor_5000
#gc.collect()

In [None]:
# Output file name (relative to the current working directory)
file_path = 'distance_tensor_0to15k.pkl'

# Save the concatenated 0-15k distance tensor to a pickle file
with open(file_path, 'wb') as f:
    pickle.dump(distance_tensor_0to15k, f)

In [None]:
# Zip the concatenated tensor; the archive is named distance_tensor_0to15k.pkl.zip
download_file('/kaggle/working/distance_tensor_0to15k.pkl', 'distance_tensor_0to15k.pkl')

In [None]:
# NOTE(review): this zips distance_tensor_0to20k.pkl, but the cells above only
# write distance_tensor_0to15k.pkl — confirm which file is intended.
!zip -r file.zip /kaggle/working/distance_tensor_0to20k.pkl

In [None]:
# Tensor 자르고 붙이는 연습
'''
#garbage collector가 gpu에서 삭제
#no grad 적용/ # tensor_raw_data_query 쪼개서 이중 for문으로 만들기
#garbage collector가 gpu에서 삭제

with torch.no_grad():
    result_tensor_1 = []
    
    batch_size_d = 100
    batch_size_q = 100
    
    pbar = tqdm(range (5000,10000, batch_size_d))
    for i in pbar:
        batch_data = tensor_raw_data[i:i+batch_size_d]
        query_tensor = [] # query_tensor는 비워줘야 한다.
        
        for j in range(0,len(tensor_raw_data_query), batch_size_q):
            batch_query = tensor_raw_data_query[j:j+batch_size_q]
            distance_query_seg = torch.cdist(batch_data, batch_query, p=2)
            query_tensor.append(distance_query_seg)
            
        distance_data_seg = torch.cat(query_tensor, dim=1)#.cpu()
        result_tensor_1.append(distance_data_seg)
        
    pbar.close()
    distance_tensor_2 = torch.cat(result_tensor_1, dim=0)#.cpu()
    #print("distance_tensor_1:", distance_tensor_1)

with open('distance_tensor_2.pkl', 'wb') as f:
	pickle.dump(distance_tensor_2, f)
'''


## Embedding vector 거리 구하기
- goPredSim에서 구한 방식으로는 computing 능력이 부족하여 할 수가 없음.
- 따라서, torch에서 batch로 나누어 약 14만 개중 2만개의 train protein vector와 약 14만개의 test vector간의 거리를 구함

In [None]:
# Load the pre-computed distance tensor (train rows 0-15k vs. test proteins).
file_path = '/kaggle/input/distance-tensor-con-0to15k/distance_tensor_0to15k.pkl'
with open(file_path, mode='rb') as file:
    distance_tensor_0to15k = pickle.load(file)

# Report the shape together with the smallest and largest distance.
print(
    distance_tensor_0to15k.shape,
    distance_tensor_0to15k.min(),
    distance_tensor_0to15k.max(),
)

In [None]:
# Load the test protein IDs and build a dict keyed by ID (every value is 0).
# The dict keeps one key per distinct ID; the cell displays that count.
ids_test = np.load("/kaggle/input/protbert-embeddings-for-cafa5/test_ids.npy")
ids_test_dict = dict.fromkeys(ids_test, 0)
len(ids_test_dict)

In [None]:
# Lookup table from a row index to the train protein ID at that position,
# restricted to the first 15000 train proteins.
ids_train = np.load("/kaggle/input/protbert-embeddings-for-cafa5/train_ids.npy")
ids_train_list = ids_train[:15000]
ids_train_numbering = dict(enumerate(ids_train_list))

In [None]:
# Load the pickled lookup that is indexed by train protein ID to obtain its GO terms.
file_path = '/kaggle/input/dict-train-id-goterm/dict_train_id_go_term'
with open(file_path, mode='rb') as file:
    dict_train_id_go_term = pickle.load(file)

In [None]:
# Fetch the goPredSim sources into the current working directory
!git clone https://github.com/Rostlab/goPredSim

In [None]:
# Import goPredSim's GeneOntology class; it is used later to look up the parent
# terms of a GO term
import sys
# Make the cloned repository importable
sys.path.append('/kaggle/working/goPredSim')
from gene_ontology import GeneOntology

# Build the ontology object from the competition's go-basic.obo file
onto_file = "/kaggle/input/cafa-5-protein-function-prediction/Train/go-basic.obo"
go = GeneOntology(onto_file)

In [None]:
# Distance-based GO term transfer, adapted from goPredSim: every train protein
# within distance `h` of a test protein contributes its GO terms, weighted by
# similarity.
# Known issue: this loop is slow; a GPU formulation is probably possible.

from collections import defaultdict
from tqdm import tqdm

predictions = defaultdict(dict)  # threshold -> test protein ID -> {GO term: score}
hit_ids = defaultdict(dict)      # threshold -> test protein ID -> {train protein ID: similarity}
hits = [0.7]                     # distance thresholds to evaluate

# Index the test IDs through the de-duplicated key list. The distance columns
# were built from a dict keyed by test ID (one column per distinct ID), whereas
# raw `ids_test` still holds the duplicated ID, so indexing `ids_test[i]`
# drifts by one position after the duplicate.
# NOTE(review): assumes the loaded tensor came from the batching cell in this
# notebook — confirm its column count equals len(ids_test_dict).
unique_test_ids = list(ids_test_dict)

# This run covers the second part of the test set; the first run used range(0, 80000).
START_INDEX = 80000

for i in tqdm(range(START_INDEX, len(unique_test_ids))):
    query = unique_test_ids[i].split()[0]  # keep only the first whitespace-separated token
    # Distances from every loaded train protein to this test protein, as float64.
    dists = distance_tensor_0to15k[:, i].detach().cpu().numpy().astype(np.float64)

    for h in hits:
        h = float(h)
        prediction = {}

        # Train proteins within the distance threshold ([0] unwraps the index tuple).
        # (A k-nearest-neighbour variant based on np.argpartition was tried here
        # and disabled.)
        indices = np.nonzero(dists <= h)[0]
        num_hits = len(indices)

        for ind in indices:
            lookup_id = ids_train_numbering[ind]
            go_terms = dict_train_id_go_term[lookup_id]
            # Map the distance to a similarity in (0, 1]; a distance of 0 gives 1.
            similarity = 0.5 / (0.5 + dists[ind])

            # Each hit contributes an equal share of its similarity to its terms.
            for g in go_terms:
                prediction[g] = prediction.get(g, 0.0) + similarity / num_hits

            # Record every hit for this query. Previously the assignment sat
            # inside the "first time we see this query" branch, so only the
            # first hit was ever stored.
            hit_ids[h].setdefault(query, {})[lookup_id] = round(similarity, 2)

        # Round scores to two decimals and drop those that round to zero.
        rounded = {}
        for term, score in prediction.items():
            ri = round(float(score), 2)
            if ri != 0.00:
                rounded[term] = ri
        prediction = rounded

        # Drop any predicted term that is a parent of another predicted term,
        # keeping only the more specific annotations.
        parent_terms = set()
        for term in prediction:
            parent_terms.update(go.get_parent_terms(term))
        prediction = {term: score for term, score in prediction.items() if term not in parent_terms}

        predictions[h][query] = prediction

# Print a summary rather than the full dictionaries, which hold an entry for
# every processed test protein.
for h in predictions:
    print("threshold {}: {} proteins predicted, {} proteins with hits".format(
        h, len(predictions[h]), len(hit_ids[h]) if h in hit_ids else 0))

7만개 = 10 기가 2 시간 50%

In [None]:
# Write the distance-based predictions as a header-less TSV:
# protein ID, GO term, score with two decimals.
out_file = "/kaggle/working/submission_2 .tsv"  # Change extension to .tsv

try:
    with open(out_file, 'w') as out:
        # predictions: threshold -> protein ID -> {GO term: score}
        for per_protein in predictions.values():
            for protein_ID, go_terms_ri in per_protein.items():
                for go_term, ri in go_terms_ri.items():
                    out.write('{}\t{}\t{:0.2f}\n'.format(protein_ID, go_term, float(ri)))
    print(f"Data written to '{out_file}' successfully.")
except Exception as e:
    # Errors are reported but not re-raised, so the notebook keeps running.
    print("An error occurred:", e)

In [None]:
# Count the lines (one prediction per line) in the uploaded submission_2 file.
file_path = '/kaggle/input/submission-2/submission_2 .tsv'

with open(file_path, 'r') as file:
    line_count = sum(1 for _ in file)

print("Number of lines in the file:", line_count)

## Sequence vector 거리 기준 TSV 파일

In [None]:
# Concatenate the two distance-based prediction files into one submission TSV.
input_file_paths = [
    '/kaggle/input/submission-1/submission_1.tsv',
    '/kaggle/input/submission-2/submission_2 .tsv'
]

output_file_path = '/kaggle/working/submission.tsv'

with open(output_file_path, 'w') as output_file:
    for input_file_path in input_file_paths:
        with open(input_file_path, 'r') as input_file:
            # Copy the file line by line, unchanged.
            output_file.writelines(input_file)

print("Combined TSV files into:", output_file_path)

In [None]:
# Load the combined TSV into a pandas DataFrame.
file_path = '/kaggle/input/combined-submission/submission.tsv'

# Column names for the three tab-separated fields
column_names = ['Protein Id', 'GO Term Id', 'Prediction']

# The submission files in this notebook are written without a header row, so
# read with header=None; otherwise the first prediction is consumed as column
# names and silently dropped.
vector_distance_df = pd.read_csv(file_path, sep='\t', header=None, names=column_names)
print(vector_distance_df['Protein Id'].nunique(), len(vector_distance_df))
'''
Test protein ID 갯수가 141864임을 감안하면 약 3만개 정도는 거리가 0.01 미만으로 go term이 예측
되지 않았음을 알 수 있다.
'''

In [None]:
# Keep only the distance-based predictions whose score exceeds 0.8.
high_score_mask = vector_distance_df['Prediction'] > 0.8
selected_vd_df = vector_distance_df[high_score_mask]

# Number of distinct proteins and number of rows that remain.
print(selected_vd_df['Protein Id'].nunique(), len(selected_vd_df))

# 2. Neural Network
- 같은 팀원의 코드를 참고하여 작성함 (@ YIMINJAE98)
- Tensorflow 기반으로 작성 하였음.
- 1번과 2번을 독립적으로 활용 할 수 있도록 data load 부터 예측까지 모든 코드를 넣음.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
import tensorflow as tf
from tensorflow import keras
import seaborn as sns
import matplotlib.pyplot as plt
import progressbar
import gc

### Data 불러오기

In [None]:
# Load the training annotations plus the pre-computed training embeddings and
# IDs from the "t5embeds" dataset.
train_terms = pd.read_csv(
    '/kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv',
    sep='\t',
)
train_embeddings = np.load('/kaggle/input/t5embeds/train_embeds.npy')
train_id = np.load('/kaggle/input/t5embeds/train_ids.npy')

# Shapes of the ID array and the embedding matrix.
print(train_id.shape, train_embeddings.shape)

In [None]:
# Wrap the embeddings in a DataFrame: one "Column_<k>" feature column per
# embedding dimension (numbered from 1) plus an 'ID' column.
column_num = train_embeddings.shape[1]  # embedding dimensionality
feature_columns = [f"Column_{i}" for i in range(1, column_num + 1)]
train_df = pd.DataFrame(train_embeddings, columns=feature_columns)
train_df['ID'] = train_id

In [None]:
'''
go term 정답레이블을 0 ,1 형태의 행렬로 나타내는 코드.
go term label이 1500개인 이유는 정답 제출시 하나의 단백질당 예측가능한 최대 go-term 숫자가 1500개로 제한
(실제로 가장 자주 사용되는 go term 1500 개를 활용하면 더 유용할 것같다고 생각이 드나, 
대회 일정상 임의로 1500 개를 자름)
'''

'''
# Set the limit for label
num_of_labels = 1500
train_size = train_id.shape[0] # len(X)

# Take value counts in descending order and fetch first 1500 `GO term ID` as labels
labels = train_terms['term'].value_counts().index[:num_of_labels].tolist()

# Fetch the train_terms data for the relevant labels only
train_terms_updated = train_terms.loc[train_terms['term'].isin(labels)]

# Setup progressbar settings.
bar = progressbar.ProgressBar(maxval=num_of_labels, \
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])

# Create an empty dataframe of required size for storing the labels,
train_labels = np.zeros((train_size ,num_of_labels))
series_train_protein_ids = pd.Series(train_id)

# Loop through each label
for i in range(num_of_labels):
    # For each label, fetch the corresponding train_terms data
    n_train_terms = train_terms_updated[train_terms_updated['term'] ==  labels[i]]
    
    # Fetch all the unique EntryId aka proteins related to the current label(GO term ID)
    label_related_proteins = n_train_terms['EntryID'].unique()
    
    # In the series_train_protein_ids pandas series, if a protein is related
    # to the current label, then mark it as 1, else 0.
    # Replace the ith column of train_Y with with that pandas series.
    train_labels[:,i] =  series_train_protein_ids.isin(label_related_proteins).astype(float)
    
    # Progress bar percentage increase
    bar.update(i+1)

# Notify the end of progress bar 
bar.finish()

# Convert train_Y numpy into pandas dataframe
labels_df = pd.DataFrame(data = train_labels, columns = labels)
print(labels_df.shape)
'''

In [None]:
# Load the pre-computed label matrix (labels_df) from its pickle file.
file_path = "/kaggle/input/labels-df/labels_df_ver1.pkl"
with open(file_path, mode='rb') as file:
    labels_df = pickle.load(file)

# Model inputs: every column except 'ID' as a NumPy array.
feature_column_mask = train_df.columns != 'ID'
features_input = train_df.loc[:, feature_column_mask].values
# Targets: the label matrix as a NumPy array.
labels_input = labels_df.values

### 신경망 설계

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Conv1D, MaxPooling1D, Flatten, Dense, Bidirectional, Dropout, Add,Input, Embedding
import tensorflow as tf


# Build the model under MirroredStrategy so it can train on multiple GPUs.
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    model_CNN_LSTM = Sequential([
        # Convolution over the 1024-long embedding treated as a 1-channel sequence
        Conv1D(32, kernel_size=3, input_shape = (1024,1), activation='relu'),
        MaxPooling1D(pool_size=2),
        # Recurrent layer over the pooled feature sequence
        LSTM(units=64, dropout=0.2, recurrent_dropout=0.2),
        # Hidden dense layer for non-linear transformation
        Dense(128, activation='relu'),
        # One sigmoid output per GO term label (multi-label classification)
        Dense(1500, activation='sigmoid'),
    ])

    # Binary cross-entropy treats each of the 1500 outputs as an independent label.
    model_CNN_LSTM.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Print the layer-by-layer summary.
    model_CNN_LSTM.summary()

### Data loading 및 학습

In [None]:
# Train under the same MirroredStrategy scope used to build the model.
# All rows are used for training; no validation data is passed to fit().
with strategy.scope():
    history_CNN = model_CNN_LSTM.fit(features_input, labels_input, epochs=20, batch_size=1024)

In [None]:
# Plot the per-epoch training loss and accuracy, one figure each.
history_df = pd.DataFrame(history_CNN.history)
num_epochs = len(history_df)

# (history column, figure title, y-axis label, upper y-limit)
plot_specs = [
    ('loss', 'Cross-entropy', 'Loss', max(history_df['loss'])),
    ('accuracy', 'Accuracy', 'Accuracy', 1),
]

for column, title, y_label, y_upper in plot_specs:
    plt.figure(figsize=(10, 5))
    plt.plot(history_df[column])
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.xlim(0, num_epochs)  # x-axis spans the epochs
    plt.ylim(0, y_upper)     # y-axis starts at zero
    plt.show()

In [None]:
# Save the trained model to an HDF5 file
# NOTE(review): the file name says "softmax" but the output layer uses a sigmoid activation.
from tensorflow.keras.models import save_model
model_CNN_LSTM.save("model_CNN_LSTM_ReLU_softmax.h5")

In [None]:
# Load a previously saved model, replacing the in-memory `model_CNN_LSTM`
# NOTE(review): this file name differs from the one written by the save cell
# ("model_CNN_LSTM_ReLU_softmax.h5") — confirm it is the intended model.
from tensorflow.keras.models import load_model
model_CNN_LSTM = load_model("/kaggle/input/cafa5-test/model_CNN_LSTM_softmax.h5")

In [None]:
# Drop the training-only objects before loading the test data, then run a
# garbage-collection pass and report how many objects it collected.
del train_terms, train_embeddings, train_id
del features_input, labels_input, labels_df

gc_collect = gc.collect()
print(gc_collect)

### 예측

In [None]:
# Load the test protein IDs and their pre-computed embeddings ("t5embeds" dataset)
test_protein_ids = np.load('/kaggle/input/t5embeds/test_ids.npy')
test_embeddings = np.load('/kaggle/input/t5embeds/test_embeds.npy')

In [None]:
# Wrap the test embeddings in a DataFrame with one "Column_<k>" feature column
# per embedding dimension (numbered from 1).
column_num = test_embeddings.shape[1]
feature_columns = [f"Column_{i}" for i in range(1, column_num + 1)]
test_df = pd.DataFrame(test_embeddings, columns=feature_columns)
print(test_df.shape)

In [None]:
# Run the model on the test features.
# Note: this rebinds `predictions`, which the distance-based section used for its own results.
predictions =  model_CNN_LSTM.predict(test_df)

In [None]:
# Save the prediction array to a pickle file in the current working directory
with open('predictions.pkl', 'wb') as predictions_file:
    pickle.dump(predictions, predictions_file)

# Zip the pickle and display a download link
download_file('/kaggle/working/predictions.pkl', 'predictions')

In [None]:
# Reload the saved neural-network predictions from the uploaded dataset.
predictions_path = '/kaggle/input/predictions/predictions.pkl'
with open(predictions_path, mode='rb') as predictions_file:
    loaded_predictions = pickle.load(predictions_file)

print(loaded_predictions.shape)

In [None]:
# Pick the 1500 GO terms used as labels: the first 1500 entries of value_counts(),
# which orders terms by descending frequency
num_of_labels = 1500
train_terms = pd.read_csv('/kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv',sep='\t')
labels = train_terms['term'].value_counts().index[:num_of_labels].tolist()

In [None]:
# Build a long-format table with one row per (test protein, GO term) pair.
# Rows are ordered protein by protein, matching loaded_predictions.ravel()
# (row-major: all terms of protein 0, then all terms of protein 1, ...).
num_proteins, num_terms = loaded_predictions.shape

# Object arrays hold references to the existing ID/term strings, so repeating
# and tiling them avoids growing Python lists with one element per row through
# repeated concatenation.
protein_id_values = np.asarray(test_protein_ids).astype(object)
go_term_values = np.asarray(labels).astype(object)

neural_network_df = pd.DataFrame({
    'Protein Id': np.repeat(protein_id_values, num_terms),   # each ID repeated once per term
    'GO Term Id': np.tile(go_term_values, num_proteins),     # the term list repeated per protein
    'Prediction': loaded_predictions.ravel(),
})

# neural_network_df.to_csv("submission.tsv",header=False, index=False, sep="\t")
# Run the line above to submit the neural-network predictions on their own.

In [None]:
# Number of distinct proteins and total number of (protein, term) rows
print(neural_network_df['Protein Id'].nunique(), len(neural_network_df))

In [None]:
# Keep only the neural-network predictions whose score exceeds 0.64.
confident_mask = neural_network_df['Prediction'] > 0.64
selected_nn_df = neural_network_df[confident_mask]

# Distinct proteins and rows remaining. Recorded earlier: (0.64, 141864) // (0.65, 141795)
print(selected_nn_df['Protein Id'].nunique(), len(selected_nn_df))

# Answer Note
- sequence vector distance와 neural network 예측을 합친다.

In [None]:
# Apply the score cut-offs to both prediction sources:
# vector distance > 0.8, neural network > 0.64.
vd_mask = vector_distance_df['Prediction'] > 0.8
nn_mask = neural_network_df['Prediction'] > 0.64
selected_vd_df = vector_distance_df[vd_mask]
selected_nn_df = neural_network_df[nn_mask]

# Distinct proteins and row counts for each selection.
print(selected_vd_df['Protein Id'].nunique(), len(selected_vd_df))
print(selected_nn_df['Protein Id'].nunique(), len(selected_nn_df))

In [None]:
# Spot check: show the selected distance-based predictions for protein Q9DCD0
selected_vd_df[selected_vd_df['Protein Id'] == 'Q9DCD0']

In [None]:
# Preview the first rows of the selected neural-network predictions.
# (A stray closing parenthesis made this cell a syntax error.)
selected_nn_df.head()

In [None]:
# Convert the selected neural-network rows into {protein ID: {GO term: score}}.
# Iterating the three columns in parallel avoids building a Series per row,
# which is what DataFrame.iterrows() does.
selected_nn_dict = {}

nn_rows = zip(
    selected_nn_df['Protein Id'],
    selected_nn_df['GO Term Id'],
    selected_nn_df['Prediction'],
)
for protein_id, go_term_id, prediction in tqdm(
        nn_rows, total=len(selected_nn_df), desc="Converting to Dictionary"):
    # Create the per-protein dict on first sight, then record the term's score.
    selected_nn_dict.setdefault(protein_id, {})[go_term_id] = prediction

#print(selected_nn_dict)

In [None]:
# Convert the selected vector-distance rows into {protein ID: {GO term: score}}.
# Iterating the three columns in parallel avoids building a Series per row,
# which is what DataFrame.iterrows() does.
selected_vd_dict = {}

vd_rows = zip(
    selected_vd_df['Protein Id'],
    selected_vd_df['GO Term Id'],
    selected_vd_df['Prediction'],
)
for protein_id, go_term_id, prediction in tqdm(
        vd_rows, total=len(selected_vd_df), desc="Converting to Dictionary"):
    # Create the per-protein dict on first sight, then record the term's score.
    selected_vd_dict.setdefault(protein_id, {})[go_term_id] = prediction

#print(selected_vd_dict)

In [None]:
# Merge the two prediction sources: start from the neural-network predictions
# and overlay the vector-distance ones, which win when both score the same
# (protein, GO term) pair.

# Copy the inner dicts as well. A plain `.copy()` is shallow, so the updates
# below would also have modified selected_nn_dict / selected_vd_dict.
vd_nn_dict = {protein_id: dict(terms) for protein_id, terms in selected_nn_dict.items()}

for protein_id, vd_terms in tqdm(
        selected_vd_dict.items(), total=len(selected_vd_dict), desc="Updating Dictionary"):
    # Add the protein if it is new, then overwrite/insert its vector-distance scores.
    vd_nn_dict.setdefault(protein_id, {}).update(vd_terms)

# Number of proteins with at least one prediction from either source
print(len(vd_nn_dict))

In [None]:
# Write the merged predictions as a header-less TSV:
# protein ID, GO term, score with two decimals.
out_file = "/kaggle/working/submission.tsv"  # Change extension to .tsv

try:
    with open(out_file, 'w') as out:
        # vd_nn_dict: protein ID -> {GO term: score}
        for protein_ID, go_terms_ri in vd_nn_dict.items():
            for go_term, ri in go_terms_ri.items():
                out.write('{}\t{}\t{:0.2f}\n'.format(protein_ID, go_term, float(ri)))
    print(f"Data written to '{out_file}' successfully.")
except Exception as e:
    # Errors are reported but not re-raised, so the notebook keeps running.
    print("An error occurred:", e)

In [None]:
# Re-read the written submission to check its size.
file_path = "/kaggle/working/submission.tsv"
# The file is written without a header row, so header=None is required;
# otherwise the first prediction becomes the column names and the count is one short.
df = pd.read_csv(file_path, sep='\t', header=None)

# Number of prediction rows
len(df)

In [100]:
import numpy as np
import pandas as pd 

import os
# Walk the Kaggle input directory and print the full path of every file in it
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/predictions/predictions.pkl
/kaggle/input/vd-nn-submission/submission.tsv
/kaggle/input/protbert-embeddings-for-cafa5/train_ids.npy
/kaggle/input/protbert-embeddings-for-cafa5/train_embeddings.npy
/kaggle/input/protbert-embeddings-for-cafa5/test_ids.npy
/kaggle/input/protbert-embeddings-for-cafa5/test_embeddings.npy
/kaggle/input/cafa-5-ems-2-embeddings-numpy/train_ids.npy
/kaggle/input/cafa-5-ems-2-embeddings-numpy/train_embeddings.npy
/kaggle/input/cafa-5-ems-2-embeddings-numpy/test_ids.npy
/kaggle/input/cafa-5-ems-2-embeddings-numpy/test_embeddings.npy
/kaggle/input/distance-tensor-con-0to15k/distance_tensor_0to15k.pkl
/kaggle/input/labels-df/labels_df_ver1.pkl
/kaggle/input/submission-1/submission_1.tsv
/kaggle/input/submission-2/submission_2 .tsv
/kaggle/input/distance-tensor-0to20k/distance_tensor_15000.pkl
/kaggle/input/distance-tensor-0to20k/distance_tensor_5000.pkl
/kaggle/input/distance-tensor-0to20k/distance_tensor_10000.pkl
/kaggle/input/distance-tensor-0to20k

In [101]:
# Read the header-less submission file (columns are numbered 0, 1, 2)
submission = pd.read_csv('/kaggle/working/submission.tsv',
    sep='\t', header=None)

# Write it back as 'submission.tsv' relative to the current working directory,
# without a header row or index column
submission.to_csv('submission.tsv',
    sep='\t', header=False, index=False)

In [102]:
# Display the final submission table
submission

Unnamed: 0,0,1,2
0,Q9CQV8,GO:0005575,0.66
1,Q9CQV8,GO:0110165,0.65
2,P62259,GO:0005575,0.65
3,P62259,GO:0110165,0.64
4,P62259,GO:0050815,1.00
...,...,...,...
319835,C0HK73,GO:0110165,0.64
319836,C0HK74,GO:0005575,0.66
319837,C0HK74,GO:0110165,0.65
319838,A0A3G2FQK2,GO:0005575,0.66
