In [1]:
import numpy as np
import pandas as pd
import os
import pickle
import sentencepiece as spm

In [2]:
DATA_DIR = "../data"
all_models_sorted_file = os.path.join(DATA_DIR, 'CD_models_curated_sorted')
all_curnames_file = os.path.join(DATA_DIR, 'simplified_Curname_set.csv')
all_superfamily_file = os.path.join(DATA_DIR, 'super_families_curated_sorted')
all_title_strings_file = os.path.join(DATA_DIR, 'Title_Strings_sorted_uniq')

In [3]:
# Idea: Creating a numerical representation for domain models
# by assigning each model an index
# ----------------------------------------------------------------------
# By collecting all domain models into a list and assigning each model
# an index, we create a numerical representation for each domain model.
# This approach allows us to represent categorical data (domain models)
# in a numerical format, which is essential for many machine learning
# algorithms. It reduces the dimensionality of the feature space compared
# to one-hot encoding, while preserving the inherent ordering or relationship
# between domain models. Additionally, numeric indices are more memory-efficient
# and easier to interpret than one-hot encoded vectors. However, it's crucial
# to ensure consistent index assignment across different datasets and experiments
# to maintain the integrity of our data preprocessing pipeline.


In [4]:
print(all_models_sorted_file)

all_domain_models = []
with open(all_models_sorted_file, 'r') as f:
    for line in f:
        all_domain_models.append(line.strip())

print(len(all_domain_models))
print(all_domain_models[:10])

domain_model_indices = {value: index + 1 for index, value in enumerate(all_domain_models)}

with open(os.path.join(DATA_DIR, 'CD_models_indices.pickle'), 'wb') as f:
    pickle.dump(domain_model_indices, f)
    print('Saved as pickle file')


../data/CD_models_curated_sorted
28055
['CHL00001', 'CHL00002', 'CHL00004', 'CHL00005', 'CHL00008', 'CHL00009', 'CHL00011', 'CHL00012', 'CHL00013', 'CHL00014']
Saved as pickle file


In [5]:
print(all_superfamily_file)

all_superfamilies = []
with open(all_superfamily_file, 'r') as f:
     for line in f:
         all_superfamilies.append(line.strip())

superfamily_indices = {value: index + 1 for index, value in enumerate(all_superfamilies)}

with open(os.path.join(DATA_DIR, 'super_families_indices.pickle'), 'wb') as f:
    pickle.dump(domain_model_indices, f)
    print('Saved as pickle file')

../data/super_families_curated_sorted
Saved as pickle file


In [6]:
# prepare to vetorize title strings

# --- Train the SentencePiece Model ---
spm.SentencePieceTrainer.train(
    input=all_title_strings_file,
    model_prefix='titles',
    vocab_size=6566,
    character_coverage=0.9995
)


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: ../data/Title_Strings_sorted_uniq
  input_format: 
  model_prefix: titles
  model_type: UNIGRAM
  vocab_size: 6566
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential

In [7]:
# --- Train the SentencePiece Model ---
spm.SentencePieceTrainer.train(
    input=all_curnames_file,
    model_prefix='curnames',
    vocab_size=6566,
    character_coverage=0.9995
)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: ../data/simplified_Curname_set.csv
  input_format: 
  model_prefix: curnames
  model_type: UNIGRAM
  vocab_size: 6566
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_different

In [8]:
# --- Load the Trained Model ---
sp_titles = spm.SentencePieceProcessor(model_file='titles.model')

# Get the pieces and their corresponding IDs
vocab_titles = {sp_titles.id_to_piece(i): i for i in range(sp_titles.get_piece_size())}

sp_curnames = spm.SentencePieceProcessor(model_file='curnames.model')
vocab_curnames = {sp_curnames.id_to_piece(i): i for i in range(sp_curnames.get_piece_size())}

# Function to convert a protein name into a target vector
def name_to_target_vector(name, sp, vocab):
  subwords = sp.encode(name, out_type=str)
  target_vector = [vocab.get(token, vocab['<unk>']) for token in subwords]
  return target_vector

In [9]:
# Specify the file path of the CSV file
sparcle_data_file_path = os.path.join(DATA_DIR, 'SPARCLE_IDS_curated_simplified_modTitleStrings.csv')

# Specify the columns you want to read from the CSV file
columns_to_read = ['CurName_simplified', 'SpecificArch', 'superfamilyarch', 'TitleStrings'] 

# Read specific columns of the CSV file into a DataFrame
df = pd.read_csv(sparcle_data_file_path, usecols=columns_to_read)

# Display the DataFrame
df.head(20)


Unnamed: 0,CurName_simplified,superfamilyarch,SpecificArch,TitleStrings
0,"(2,3-dihydroxybenzoyl)adenylate synthase",amp-binding,cog1021,non-ribosomal peptide synthetase component e (...
1,"(2,3-dihydroxybenzoyl)adenylate synthase",amp-binding,tigr02275,"2,3-dihydroxybenzoate-amp ligase"
2,"(2,3-dihydroxybenzoyl)adenylate synthase",dhb_amp_lig,tigr02275,"2,3-dihydroxybenzoate-amp ligase"
3,"(2,3-dihydroxybenzoyl)adenylate synthase",ente,cog1021,non-ribosomal peptide synthetase component e (...
4,"(2,3-dihydroxybenzoyl)adenylate synthase",ente,prk10946,"(2,3-dihydroxybenzoyl)adenylate synthase"
5,"(2,3-dihydroxybenzoyl)adenylate synthase",prk07788,prk10946,"(2,3-dihydroxybenzoyl)adenylate synthase"
6,"(2E,6E)-farnesyl diphosphate synthase",isoprenoid_biosyn_c1,prk10581,"(2e,6e)-farnesyl diphosphate synthase"
7,(2Fe-2S) ferredoxin domain-containing protein,trx_superfamily,cd02980,
8,(2Fe-2S) ferredoxin domain-containing protein,trx_superfamily chelatase_class_ii,cl02784 cd02980,chelatase_class_ii |
9,(2Fe-2S)-binding protein,coxs,cog2080,"aerobic- carbon monoxide dehydrogenase, small ..."


In [10]:
# to get the length of the longest vector that the encoded specific arch generated

max_len = 0
for dms in df['SpecificArch']:
    if not pd.isna(dms):
        a = [domain_model_indices.get(m, 0) for m in dms.split(' ')]
        if len(a) > max_len:
            max_len = len(a)

print(max_len)

20


In [11]:
# to get the length of the longest vector that the encoded superfamilyarch generated

max_len = 0

for sps in df['superfamilyarch']:
    if not pd.isna(sps):
        vec = sps.split(' ')
        if max_len < len(vec):
            max_len = len(vec)

print(max_len)
    

9


In [12]:
# to get the length of the longest vector that the encoded titlestring generated
max_len = 0
for title in df['TitleStrings']:
    if not pd.isna(title):
        vec = name_to_target_vector(title, sp_titles, vocab_titles)
        if max_len < len(vec):
            max_len = len(vec)

print(max_len)

297


In [13]:
# to get the length of the longest vector that the encoded curname generated
max_len = 0
for name in df['CurName_simplified']:
    if not pd.isna(name):
        vec = name_to_target_vector(name, sp_curnames, vocab_curnames)
        if max_len < len(vec):
            max_len = len(vec)

print(max_len)

37


In [14]:
# This code converts domain-based architecture strings (from a DataFrame's 'SpecificArch' column) into fixed-length numerical representations. 
# It assumes the following:
#   * Each architecture string consists of space-separated domain names.
#   * A dictionary 'domain_model_indices' exists, mapping domain names to numerical indices. 
#   * A dictionary 'superfamily_indices' exists, mapping domain names to numerical indices. 
#   * A sentencepiece model is loaded


result = []


for index, row in df.iterrows():
    dms = row['SpecificArch']
    a1 = [0] * 30  # we use 30 positions to hold the indices of individual SpecificArch
    if not pd.isna(dms):
        arr = [domain_model_indices.get(m, 0) for m in dms.split(' ')]
        a1[:len(arr)] = arr

    sfa = row['superfamilyarch']
    a2 = [0] * 20 # we use 20 positions to hold the numeric values representing the tokenized superfamilyarch
    if not pd.isna(sfa):
        arr = [superfamily_indices.get(s, 0) for s in sfa.split(' ')]
        a2[:len(arr)] = arr

    title = row['TitleStrings']
    a3 = [0] * 400
    if not pd.isna(title):
        vec = name_to_target_vector(title.lower(), sp_titles, vocab_titles)
        a3[0:len(vec)] = vec
    
    curname = row['CurName_simplified']
    a4 = [0] * 60
    if not pd.isna(curname):
        vec = name_to_target_vector(curname, sp_curnames, vocab_curnames)
        a4[0:len(vec)] = vec

    result.append([a1 + a2 + a3, a4])


In [15]:
print(result[20:25])

[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 872, 3249, 4642, 2414, 497, 501, 8, 341, 437, 43, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [16]:

with open(os.path.join(DATA_DIR, 'input_output_list.pickle'), 'wb') as f:
    pickle.dump(result, f)

In [17]:
# with open(os.path.join(DATA_DIR, 'input_output_list.pickle'), 'rb') as f:
#     a = pickle.load(f)

In [None]:
sparcle_data_uncurated_file_path = os.path.join(DATA_DIR, 'SPARCLE_IDS_UNcurated_TITLES_modTitleStrings.csv')

columns_to_read = ['ArchId', 'CurName', 'SpecificArch', 'superfamilyarch', 'TitleStrings']

# Read specific columns of the CSV file into a DataFrame
df = pd.read_csv(sparcle_data_uncurated_file_path, usecols=columns_to_read)

encoded_features = {}
encoded_outputs = {}

for index, row in df.iterrows():
    dms = row['SpecificArch']
    a1 = [0] * 30  # we use 30 positions to hold the indices of individual SpecificArch
    if not pd.isna(dms):
        arr = [domain_model_indices.get(m, 0) for m in dms.split(' ')]
        a1[:len(arr)] = arr

    sfa = row['superfamilyarch']
    a2 = [0] * 20  # we use 20 positions to hold the numeric values representing the tokenized superfamilyarch
    if not pd.isna(sfa):
        arr = [superfamily_indices.get(s, 0) for s in sfa.split(' ')]
        a2[:len(arr)] = arr

    title = row['TitleStrings']
    a3 = [0] * 400
    if not pd.isna(title):
        vec = name_to_target_vector(title.lower(), sp_titles, vocab_titles)
        a3[0:len(vec)] = vec

    curname = row['CurName']
    a4 = [0] * 60
    if not pd.isna(curname):
        vec = name_to_target_vector(curname, sp_curnames, vocab_curnames)
        a4[0:len(vec)] = vec

    encoded_features[str(int(row['ArchId']))] = a1 + a2 + a3
    encoded_outputs[str(int(row['ArchId']))] = a4

with open(os.path.join(DATA_DIR, 'encoded_features_uncurated.pickle'), 'wb') as f:
    pickle.dump(encoded_features, f)

with open(os.path.join(DATA_DIR, 'encoded_outputs_uncurated.pickle'), 'wb') as f:
    pickle.dump(encoded_outputs, f)