In [1]:
import numpy as np
import pandas as pd
import os
import pickle
import sentencepiece as spm

In [2]:
# Idea: Creating a numerical representation for domain models
# by assigning each model an index
# ----------------------------------------------------------------------
# By collecting all domain models into a list and assigning each model
# an index, we create a numerical representation for each domain model.
# This approach allows us to represent categorical data (domain models)
# in a numerical format, which is essential for many machine learning
# algorithms. It reduces the dimensionality of the feature space compared
# to one-hot encoding, and numeric indices are more memory-efficient and
# easier to inspect than one-hot encoded vectors. Note that an index only
# reflects a model's position in the sorted model list: it is a categorical
# ID and does not by itself express any ordering or relationship between
# domain models. It is crucial to ensure consistent index assignment across
# different datasets and experiments to maintain the integrity of our data
# preprocessing pipeline.


In [3]:
DATA_DIR = "../data"
all_models_sorted_file = os.path.join(DATA_DIR, 'CD_models_curated_sorted')

print(all_models_sorted_file)

# Load the model list: one entry per line, surrounding whitespace removed.
with open(all_models_sorted_file, 'r') as models_file:
    all_domain_models = [line.strip() for line in models_file]

print(len(all_domain_models))
print(all_domain_models[:10])

# Number the models from 1 in file order, so index 0 is never assigned to a model.
domain_model_indices = {
    model_name: position
    for position, model_name in enumerate(all_domain_models, start=1)
}

# Persist the mapping so the same index assignment can be reloaded later.
with open(os.path.join(DATA_DIR, 'CD_models_indices.pickle'), 'wb') as pickle_file:
    pickle.dump(domain_model_indices, pickle_file)
    print('Saved as pickle file')


../data/CD_models_curated_sorted
28055
['CHL00001', 'CHL00002', 'CHL00004', 'CHL00005', 'CHL00008', 'CHL00009', 'CHL00011', 'CHL00012', 'CHL00013', 'CHL00014']
Saved as pickle file


In [4]:
# Encode Superfamily Strings
# ----------------------------------------------------------------------
# Read the superfamily strings (one per line, whitespace-stripped) and assign
# each a 1-based integer index in file order. Indices start at 1, so 0 is
# never assigned to a real superfamily string.

superfamily_strings_file = os.path.join(DATA_DIR, 'super_families_curated_sorted')

with open(superfamily_strings_file, 'r') as f:
    all_superfamily_strings = [d.strip() for d in f.readlines()]

print(len(all_superfamily_strings))
print(all_superfamily_strings[:10])

superfamily_string_indices = {value: index + 1 for index, value in enumerate(all_superfamily_strings)}

with open(os.path.join(DATA_DIR, 'super_families_indices.pickle'), 'wb') as f:
    # Persist the superfamily mapping built in this cell (not
    # domain_model_indices), so the pickle's content matches its file name.
    pickle.dump(superfamily_string_indices, f)
    print('Saved as pickle file')



13883
['/NonD', '1-cysPrx_C', '10', '14-3-3', '16S_RimM', '17', '18S_RNA_Rcl1p', '2-oxoacid_dh', '2-oxogl_dehyd_N', '2-ph_phosp']
Saved as pickle file


In [5]:
# Only these columns are loaded from the CSV; any other columns are skipped.
columns_to_read = [
    'ArchId',
    'CurName',
    'SpecificArch',
    'superfamilyarch',
    'TitleStrings',
]

# Location of the SPARCLE CSV file under DATA_DIR.
sparcle_data_file_path = os.path.join(DATA_DIR, 'SPARCLE_IDS_curated.csv')

# Load the selected columns into a DataFrame.
df = pd.read_csv(sparcle_data_file_path, usecols=columns_to_read)

# Preview the first 20 rows.
df.head(20)


Unnamed: 0,ArchId,CurName,SpecificArch,superfamilyarch,TitleStrings
0,3,FtsZ/tubulin family protein,,Tubulin_FtsZ_Cetz-like,
1,16,TCP-1/cpn60 chaperonin family protein,,chaperonin_like,
2,2,envelope glycoprotein 120,,GP120,
3,30,ABC transporter permease,,TM_PBP2,
4,21,homeobox domain-containing protein,,homeodomain,
5,37,surface antigen,,vMSA,
6,76,PTS sugar transporter subunit IIA,,PTS_IIA_fru,
7,7,protein Nef,,F-protein,
8,13,rhodanese-like domain-containing protein,,RHOD,
9,22,thiamine pyrophosphate-dependent enzyme,,TPP_enzymes,


In [6]:
# Encode each row's architecture as a fixed-length list of integers, keyed by
# str(ArchId) in the dict `encoded_features`.
#   * 'SpecificArch' is split on single spaces into domain names; each name is
#     looked up in 'domain_model_indices' (0 when the name is not in the dict).
#   * The first MAX_SPECIFIC_ARCH_LEN positions hold those domain indices,
#     right-padded with zeros (all zeros when 'SpecificArch' is missing).
#   * The last position holds the 'superfamilyarch' index from
#     'superfamily_string_indices' (0 when missing or not in the dict).
#   * 'max_len' tracks the longest architecture seen, so the slot count can be
#     checked against the data.

MAX_SPECIFIC_ARCH_LEN = 31  # number of positions reserved for SpecificArch domain indices

encoded_features = {}
max_len = 0

# Iterate the three needed columns directly instead of building a Series per row.
for arch_id, dms, spa in zip(df['ArchId'], df['SpecificArch'], df['superfamilyarch']):
    domain_ids = []
    if not pd.isna(dms):
        domain_ids = [domain_model_indices.get(m, 0) for m in dms.split(' ')]
        max_len = max(max_len, len(domain_ids))
        if len(domain_ids) > MAX_SPECIFIC_ARCH_LEN:
            # Slice-assigning a longer list into the padded vector would silently
            # grow it and shift the superfamily slot; fail loudly instead.
            raise ValueError(
                f"ArchId {arch_id}: SpecificArch has {len(domain_ids)} domains, "
                f"more than the {MAX_SPECIFIC_ARCH_LEN} reserved positions"
            )

    arr = domain_ids + [0] * (MAX_SPECIFIC_ARCH_LEN - len(domain_ids))

    # The position after the domain slots (the 32nd) is the numeric
    # representation of the superfamily arch.
    arr.append(0 if pd.isna(spa) else superfamily_string_indices.get(spa, 0))

    encoded_features[str(arch_id)] = arr

print(max_len)
print(encoded_features['55'])

20
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6278]


In [7]:
# Prepare to vectorize title strings

# --- Load the trained SentencePiece model ---
sp = spm.SentencePieceProcessor(model_file='titles_and_curnames.model')

# --- Build the piece -> id lookup table ---
# Number of pieces known to the loaded model.
vocab_size = sp.get_piece_size()

# Map every piece string to its integer id.
vocab = dict(
    (sp.id_to_piece(piece_id), piece_id)
    for piece_id in range(vocab_size)
)

# Function to convert a protein name into a target vector
def name_to_target_vector(name):
    """Tokenize `name` with the loaded SentencePiece model and return piece ids.

    The name is encoded into piece strings via `sp`, and each piece is mapped
    to its id through `vocab`. A piece that is not a key of `vocab` is mapped
    to the id stored under '<unk>'.

    Returns a list of integer ids, one per piece.
    """
    token_ids = []
    for piece in sp.encode(name, out_type=str):
        token_ids.append(vocab.get(piece, vocab['<unk>']))
    return token_ids


In [8]:
# Find the longest tokenized title, to decide how many positions are needed
# to hold the numeric representation of a title. Missing titles are skipped;
# the result is 0 when there is no title at all.

max_len = max(
    (
        len(name_to_target_vector(title))
        for title in df['TitleStrings']
        if not pd.isna(title)
    ),
    default=0,
)

print(max_len)

322


In [9]:
# Append a fixed-width, zero-padded block of title token ids to every
# feature vector in `encoded_features` (keyed by str(ArchId)).
TITLE_VECTOR_LEN = 400  # positions reserved for title token ids
ARCH_FEATURE_LEN = 32   # 31 SpecificArch positions + 1 superfamily position already stored per ArchId

for arch_id, title in zip(df['ArchId'], df['TitleStrings']):
    title_ids = [] if pd.isna(title) else name_to_target_vector(title)
    if len(title_ids) > TITLE_VECTOR_LEN:
        # Slice-assigning a longer list into the padded block would silently
        # grow it and yield vectors of differing lengths; fail loudly instead.
        raise ValueError(
            f"ArchId {arch_id}: title encodes to {len(title_ids)} tokens, "
            f"more than the {TITLE_VECTOR_LEN} reserved positions"
        )
    padded_title = title_ids + [0] * (TITLE_VECTOR_LEN - len(title_ids))

    key = str(arch_id)
    # Rebuild from the architecture prefix instead of using `+=`, so that
    # re-running this cell (or a repeated ArchId) replaces the title block
    # rather than appending another one.
    encoded_features[key] = encoded_features[key][:ARCH_FEATURE_LEN] + padded_title

print(encoded_features['55'])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6278, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [10]:
# Persist the str(ArchId) -> feature-vector dictionary under DATA_DIR.
encoded_features_pickle = os.path.join(DATA_DIR, 'archid_encoded_features_dict.pickle')
with open(encoded_features_pickle, 'wb') as out_file:
    pickle.dump(encoded_features, out_file)