# EDA 목표
- Train, Test의 단백질 종류 갯수 확인
- 단백질 하나당 go term의 최대, 최소 갯수 
- 단백질 간 거리 벡터의 용량 확인 (142246 x )

In [1]:
import pandas as pd
import numpy as np
from Bio import SeqIO

## Train Data

In [2]:
#go-basic.obo

obo_file_path = "/kaggle/input/cafa-5-protein-function-prediction/Train/go-basic.obo"

with open(obo_file_path, "r") as obo_file:
    obo_content = obo_file.read()

# Print the first 1000 characters as an example
print(obo_content[:1000])

format-version: 1.2
data-version: releases/2023-01-01
subsetdef: chebi_ph7_3 "Rhea list of ChEBI terms representing the major species at pH 7.3."
subsetdef: gocheck_do_not_annotate "Term not to be used for direct annotation"
subsetdef: gocheck_do_not_manually_annotate "Term not to be used for direct manual annotation"
subsetdef: goslim_agr "AGR slim"
subsetdef: goslim_aspergillus "Aspergillus GO slim"
subsetdef: goslim_candida "Candida GO slim"
subsetdef: goslim_chembl "ChEMBL protein targets summary"
subsetdef: goslim_drosophila "Drosophila GO slim"
subsetdef: goslim_flybase_ribbon "FlyBase Drosophila GO ribbon slim"
subsetdef: goslim_generic "Generic GO slim"
subsetdef: goslim_metagenomics "Metagenomics GO slim"
subsetdef: goslim_mouse "Mouse GO slim"
subsetdef: goslim_pir "PIR GO slim"
subsetdef: goslim_plant "Plant GO slim"
subsetdef: goslim_pombe "Fission yeast GO slim"
subsetdef: goslim_synapse "synapse GO slim"
subsetdef: goslim_yeast "Yeast GO slim"
subsetdef: prokaryote_subset

In [3]:
# train_sequence.fast

fasta_file_path = "/kaggle/input/cafa-5-protein-function-prediction/Train/train_sequences.fasta"

with open(fasta_file_path, "r") as fasta_file:
    lines = fasta_file.readlines()

for line in lines[:10]:  
    print(line.strip())

>P20536 sp|P20536|UNG_VACCC Uracil-DNA glycosylase OS=Vaccinia virus (strain Copenhagen) OX=10249 GN=UNG PE=1 SV=1
MNSVTVSHAPYTITYHDDWEPVMSQLVEFYNEVASWLLRDETSPIPDKFFIQLKQPLRNK
RVCVCGIDPYPKDGTGVPFESPNFTKKSIKEIASSISRLTGVIDYKGYNLNIIDGVIPWN
YYLSCKLGETKSHAIYWDKISKLLLQHITKHVSVLYCLGKTDFSNIRAKLESPVTTIVGY
HPAARDRQFEKDRSFEIINVLLELDNKVPINWAQGFIY
>O73864 sp|O73864|WNT11_DANRE Protein Wnt-11 OS=Danio rerio OX=7955 GN=wnt11 PE=2 SV=1
MTEYRNFLLLFITSLSVIYPCTGISWLGLTINGSSVGWNQTHHCKLLDGLVPDQQQLCKR
NLELMHSIVRAARLTKSACTSSFSDMRWNWSSIESAPHFTPDLAKGTREAAFVVSLAAAV
VSHAIARACASGDLPSCSCAAMPSEQAAPDFRWGGCGDNLRYYGLQMGSAFSDAPMRNRR
SGPQDFRLMQLHNNAVGRQVLMDSLEMKCKCHGVSGSCSVKTCWKGLQDISTISADLKSK


In [4]:
# Fasta 파일 중 ID, sequence data 불러오기
fasta_file = '/kaggle/input/cafa-5-protein-function-prediction/Train/train_sequences.fasta'

sequences = []
for record in SeqIO.parse(fasta_file, "fasta"):
    sequence_id = record.id
    sequence_data = record.seq
    sequences.append((sequence_id, sequence_data))

# sequences[0] : ('P20536', Seq('MNSVTVSHAPYTITYHDDWEPVMSQLVEFYNEVASWLLRDETSPIPDKFFIQLK...FIY'))

In [5]:
#train_taxonomy.tsv, train_terms.tsv
train_taxonomy = pd.read_csv("/kaggle/input/cafa-5-protein-function-prediction/Train/train_taxonomy.tsv", sep= "\t")
train_terms = pd.read_csv("/kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv", sep= "\t") 
train_terms.head()

Unnamed: 0,EntryID,term,aspect
0,A0A009IHW8,GO:0008152,BPO
1,A0A009IHW8,GO:0034655,BPO
2,A0A009IHW8,GO:0072523,BPO
3,A0A009IHW8,GO:0044270,BPO
4,A0A009IHW8,GO:0006753,BPO


In [6]:
# 총 단백질 갯수 및 go term 갯수 확인
print("Protein #: {}, term #: {}".format(train_terms['EntryID'].unique().shape, train_terms['term'].unique().shape))

Protein #: (142246,), term #: (31466,)


In [7]:
# 하나의 단백질이 갖는 go term의 갯수 확인
train_terms_collected = train_terms.groupby('EntryID')['term'].apply(list).reset_index(name='terms_collected')
value_counts = train_terms_collected['terms_collected'].apply(len)

'''
value_counts = []
for i in range(0, len(train_terms_collected)):
    counts = len(train_terms_collected.loc[i, 'terms_collected'])
    value_counts.append(counts)
len(value_counts)
'''
train_terms_collected['value_counts'] = train_terms_collected['terms_collected'].apply(len)
print("value_count min: {}, max: {}".format(train_terms_collected['value_counts'].min(), train_terms_collected['value_counts'].max()))

value_count min: 2, max: 815


In [8]:
# Embedding vector shape 비교

ids_train_ems = np.load("/kaggle/input/cafa-5-ems-2-embeddings-numpy/train_ids.npy")
embeds_ems_train = np.load("/kaggle/input/cafa-5-ems-2-embeddings-numpy/train_embeddings.npy")
print("ems       ids# : {}, embedding_dim : {}".format(len(ids_train_ems), embeds_ems_train.shape))

ids_train_protbert = np.load("/kaggle/input/protbert-embeddings-for-cafa5/train_ids.npy")
embeds_protbert_train = np.load("/kaggle/input/protbert-embeddings-for-cafa5/train_embeddings.npy")
print("protbert  ids# : {}, embedding_dim : {}".format(len(ids_train_protbert), embeds_protbert_train.shape))

ems       ids# : 142246, embedding_dim : (142246, 1280)
protbert  ids# : 142246, embedding_dim : (142246, 1024)


In [9]:
# 메모리 확인 
import numpy as np

# Create a numpy array
data = embeds_ems_train

# Calculate the memory capacity consumed by the array
memory_consumed_bytes = data.nbytes

# Convert bytes to more human-readable format
def format_bytes(size):
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if size < 1024.0:
            return f"{size:.2f} {unit}"
        size /= 1024.0
    return f"{size:.2f} PB"

memory_consumed_human_readable = format_bytes(memory_consumed_bytes)

print("Memory consumed by array:", memory_consumed_human_readable)


Memory consumed by array: 694.56 MB


## Test Data

In [10]:
# testsuperset-taxon-list.tsv
test_taxonomy = pd.read_csv("/kaggle/input/cafa-5-protein-function-prediction/Test (Targets)/testsuperset-taxon-list.tsv", sep="\t", encoding="ISO-8859-1")

In [11]:
# testsuperset.fasta
fasta_file_path = "/kaggle/input/cafa-5-protein-function-prediction/Test (Targets)/testsuperset.fasta"

with open(fasta_file_path, "r") as fasta_file:
    lines = fasta_file.readlines()

for line in lines[:10]:
    print(line.strip())

>Q9CQV8	10090
MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS
WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLILNATQAESKVFY
LKMKGDYFRYLSEVASGENKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY
YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD
AGEGEN
>P62259	10090
MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLSVAYKNVIGARRASW
RIISSIEQKEENKGGEDKLKMIREYRQMVETELKLICCDILDVLDKHLIPAANTGESKVF
YYKMKGDYHRYLAEFATGNDRKEAAENSLVAYKAASDIAMTELPPTHPIRLGLALNFSVF


In [12]:
# Fasta 파일 중 ID, sequence data 불러오기
fasta_file = '/kaggle/input/cafa-5-protein-function-prediction/Test (Targets)/testsuperset.fasta'

sequences = []
for record in SeqIO.parse(fasta_file, "fasta"):
    sequence_id = record.id
    sequence_data = record.seq
    sequences.append((sequence_id, sequence_data))

# sequences[0] : ('P20536', Seq('MNSVTVSHAPYTITYHDDWEPVMSQLVEFYNEVASWLLRDETSPIPDKFFIQLK...FIY'))

'''
len(sequences) 141865 인데, dictionary form으로 전환하면  141864가 나옴. 
중복 데이터가 있다고 판단하여 아래 코드를 수행해봄
* 중복 데이터는 다른 사람이 만들어 놓은 embedding vector 갯수가 141854라 알게되었음.
'''

'\nlen(sequences) 141865 인데, dictionary form으로 전환하면  141864가 나옴. \n중복 데이터가 있다고 판단하여 아래 코드를 수행해봄\n* 중복 데이터는 다른 사람이 만들어 놓은 embedding vector 갯수가 141854라 알게되었음.\n'

In [13]:
# 중복 데이터 확인 
from tqdm import tqdm

x = [] # 처음 등장한 값인지 판별하는 리스트
new_a = [] # 중복된 원소만 넣는 리스트

for i in tqdm(sequences):
    if i not in x: 
        x.append(i)
    else:
        if i not in new_a: # 이미 중복 원소로 판정된 경우는 제외
            new_a.append(i)

print(new_a) # [1, 2] # 2회 이상 등장한 값들만 담긴 리스트
# 중복 데이터: [('A0A1D6E0S8', Seq('MPSRSPACRPRGRNRRSAADAVARPLALALILVSTLPRAAHSQDLALPPVQPRG...SFC'))]

100%|██████████| 141865/141865 [09:11<00:00, 257.11it/s]

[('A0A1D6E0S8', Seq('MPSRSPACRPRGRNRRSAADAVARPLALALILVSTLPRAAHSQDLALPPVQPRG...SFC'))]





# Go term 예측

## Go term 예측 전략
- 단백질 sequence vector간 거리가 가까울 수록 비슷한 단백질.
- 비슷한 단백질을 하는 역할이 비슷함.
- 단백질 간 거리가 가까우면 비슷한 go term을 공유한다.

## Go term 예측을 위해 필요한 사항 
- 효과적 예측을 위해서는 단백질 간의 거리가 까운 샘플이 train data에 존재 해야한다.
- 즉, 최대한 다양한 종류의 단백질(벡터간의 거리가 다양한 단백질)이 train data 안에 있어야 한다.
- 시간 관계상 다양한 단백질을 샘플링하는 코드를 짜기 어려워 단순히 최대한 많은 데이터를 넣기로 함.
- pretest 결과 Train - Test간 모든 거리는 142246 * 141864 tensor로 용량을 많이 차지함.

In [14]:
# Train x Test (142246 * 141864) tensor 용량 계산
import torch

# Define the tensor shape and data type
tensor_shape = (142246, 141864)
data_type = torch.float32

# Create a tensor of the specified shape and data type
tensor = torch.empty(tensor_shape, dtype=data_type)

# Calculate the memory consumption in bytes
memory_bytes = tensor.element_size() * tensor.numel()

# Convert bytes to gigabytes
memory_gb = memory_bytes / (1024 ** 3)

print("Memory Consumption of the Tensor: {:.2f} GB".format(memory_gb))

# Memory Consumption of the Tensor: 75.17 GB

Memory Consumption of the Tensor: 75.17 GB
