In [1]:
import sys
import os


# Make the project root (the parent of the notebook's working directory)
# importable so that `src.*` modules resolve.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Guard the append so re-running this cell does not stack duplicate entries
# on sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

# Print the computed root itself rather than sys.path[-1], which is only the
# root when the append above actually happened.
print(project_root)

/home/cotsios/dsit/2nd-semester/algos-in-struct-bio/project/MLtopKin


In [2]:
import pandas as pd
import numpy as np
from src.utils import (
    prep_kinase_data_list,
    create_kinase_dataset,
    extract_features_for_ml
)

In [3]:
# Project layout, all derived from the parent of the current working directory.
BASE_DIR = os.path.dirname(os.getcwd())
DATA_DIR, METADATA_DIR, FEATURES_DIR = (
    os.path.join(BASE_DIR, subdir)
    for subdir in ('data', 'metadata', 'features')
)

# The features directory is an output location, so create it when missing.
os.makedirs(FEATURES_DIR, exist_ok=True)

# Echo the resolved locations, one per line.
print(BASE_DIR, DATA_DIR, METADATA_DIR, FEATURES_DIR, sep='\n')

/home/cotsios/dsit/2nd-semester/algos-in-struct-bio/project/MLtopKin
/home/cotsios/dsit/2nd-semester/algos-in-struct-bio/project/MLtopKin/data
/home/cotsios/dsit/2nd-semester/algos-in-struct-bio/project/MLtopKin/metadata
/home/cotsios/dsit/2nd-semester/algos-in-struct-bio/project/MLtopKin/features


In [4]:
# Load the tab-separated KinCore metadata table and preview its first rows.
kincore_tab_path = os.path.join(METADATA_DIR, 'kincore.tab')
df = pd.read_csv(kincore_tab_path, sep='\t')
df.head()

Unnamed: 0,Organism,Group,Gene,UniprotID,PDB,Method,Resolution,Rfac,FreeRfac,DomainBoundary,...,DihedralLabel,ActivityLabel,Chelix-Saltbr,ActLoop,Ligand,LigandType,DFG_Phe,Phosphorylation,Mutations,ProteinName
0,Aplysia californica,CAMK,TWITCHIN,Q16980_APLCA,1KOBA,XRAY,2.3,0.2,0.282,47-302,...,ABAminus,Inactive,out-out,in-out,No_ligand,No_ligand,189,,,Twitchin-like protein
1,Aplysia californica,CAMK,TWITCHIN,Q16980_APLCA,1KOBB,XRAY,2.3,0.2,0.282,47-302,...,ABAminus,Inactive,out-out,in-out,No_ligand,No_ligand,189,,,Twitchin-like protein
2,Arabidopsis thaliana,CAMK,CIPK23,CIPKN_ARATH,4CZTA,XRAY,2.3,0.179,0.234,31-286,...,BLBminus,Inactive,out-out,out-out,CPS:61316,Type1,173,,,CBL-interacting serine/threonine-protein kinas...
3,Arabidopsis thaliana,CAMK,CIPK23,CIPKN_ARATH,4CZTB,XRAY,2.3,0.179,0.234,31-286,...,BLBminus,Inactive,out-out,out-out,CPS:61314,Type1,173,,,CBL-interacting serine/threonine-protein kinas...
4,Arabidopsis thaliana,CAMK,CIPK23,CIPKN_ARATH,4CZUA,XRAY,1.9,0.194,0.233,31-286,...,BLBminus,Inactive,out-out,out-out,CPS:61321,Type1,173,,T190D,CBL-interacting serine/threonine-protein kinas...


In [5]:
# Prepare kinase data list from the directory structure under DATA_DIR,
# matched against the KinCore metadata table.
kinase_data_list = prep_kinase_data_list(
    in_path=DATA_DIR,
    df=df,
    gene_col='Gene',        # column name containing individual gene names
    group_col='Group',      # column name containing gene groups/families
    pdb_col='PDB'           # column name containing PDB IDs
)

# Create the kinase objects from ALL atoms (C_alpha_only=False).
# In the recorded run, structures that failed to parse were reported as
# "Failed to create kinase ..." and the final feature matrix had fewer rows
# than the input list, so the output can be shorter than kinase_data_list.
kinases = create_kinase_dataset(
    kinase_data_list,
    max_workers=None,       # no explicit worker count; presumably falls back to the function's default -- TODO confirm
    C_alpha_only=False,     # keep every atom, not only the C-alpha atoms
    pixel_size=0.5,         # persistence-image pixel size
    birth_range=(0, 15),
    pers_range=(0, 10)
)

# Extract features for machine learning
# (include_dims presumably selects homology dimensions 0 and 1 -- TODO confirm).
X, y, metadata = extract_features_for_ml(kinases, include_dims=[0, 1])

# Structures can be dropped upstream, so verify that features, labels and
# metadata still describe the same samples before anything is saved.
assert len(X) == len(y) == len(metadata), (
    f"misaligned outputs: X={len(X)}, y={len(y)}, metadata={len(metadata)}"
)

Scanning directory: /home/cotsios/dsit/2nd-semester/algos-in-struct-bio/project/MLtopKin/data
DataFrame shape: (9697, 25)

=== Processing Summary ===
Total PDB files found: 2608
Successfully processed: 2608
Skipped - insufficient path depth: 0
Skipped - invalid activity label: 0
Skipped - parsing errors: 0
Skipped - not found in DataFrame: 0
Skipped - file access errors: 0

=== Activity Distribution ===
Inactive: 1300
Active: 1308


Processing kinases:  67%|██████▋   | 1743/2608 [19:23<09:18,  1.55it/s]  

File /home/cotsios/dsit/2nd-semester/algos-in-struct-bio/project/MLtopKin/data/CDK2/Active/8VQ4_chainA.pdb could not be parsed with exception:
invalid literal for int() with base 10: 'A'
Exiting...
Failed to create kinase for CDK2_8VQ4_A: 'Kinase' object has no attribute 'mw'


Processing kinases:  68%|██████▊   | 1775/2608 [19:42<05:58,  2.32it/s]

File /home/cotsios/dsit/2nd-semester/algos-in-struct-bio/project/MLtopKin/data/CDK2/Active/8VQ3_chainA.pdb could not be parsed with exception:
invalid literal for int() with base 10: 'A'
Exiting...
Failed to create kinase for CDK2_8VQ3_A: 'Kinase' object has no attribute 'mw'


Processing kinases: 100%|██████████| 2608/2608 [27:36<00:00,  1.57it/s]


In [6]:
# Persist the feature matrix, labels and per-sample metadata in one compressed
# archive. The metadata records are stored as an object array.
kincore_npz_path = os.path.join(FEATURES_DIR, 'kincore.npz')
metadata_as_objects = np.array(metadata, dtype=object)
np.savez_compressed(kincore_npz_path, X=X, y=y, metadata=metadata_as_objects)

In [7]:
# To load:
# The archive is opened as a context manager so the underlying file handle is
# closed once the three arrays have been read into memory.
# NOTE: allow_pickle=True is required because `metadata` was saved as an
# object array; unpickling can execute arbitrary code, so only load archives
# produced by this project.
with np.load(os.path.join(FEATURES_DIR, 'kincore.npz'), allow_pickle=True) as data:
    X, y, metadata = data['X'], data['y'], data['metadata']

In [8]:
# Display the feature matrix reloaded from kincore.npz (one row per kinase structure).
X

array([[ 4.01040683e+01,  4.06351437e+01,  3.22987920e+01, ...,
         1.89018042e-18,  1.89018042e-18,  1.89018042e-18],
       [ 3.91975976e+01,  3.97664762e+01,  3.16800348e+01, ...,
        -7.48967080e-18, -7.48967080e-18, -7.48967080e-18],
       [ 4.08737981e+01,  4.12745176e+01,  3.26818206e+01, ...,
         4.37951885e-17,  4.37951885e-17,  4.37951885e-17],
       ...,
       [ 5.29056475e+01,  5.35387471e+01,  4.24837907e+01, ...,
         7.91285278e-18,  7.91285278e-18,  7.91285278e-18],
       [ 5.26759555e+01,  5.33410951e+01,  4.23556362e+01, ...,
         1.45928203e-17,  1.45928203e-17,  1.45928203e-17],
       [ 1.47304657e+02,  1.44955115e+02,  1.12070638e+02, ...,
        -1.27129031e-17, -1.27129031e-17, -1.27129031e-17]],
      shape=(2606, 1200))

In [9]:
# Display the label vector reloaded from kincore.npz
# (presumably 1 = active, 0 = inactive -- TODO confirm the encoding).
y

array([1, 0, 0, ..., 1, 1, 0], shape=(2606,))

In [10]:
# Display the per-sample metadata reloaded from kincore.npz (an object array of dicts).
metadata

array([{'name': 'ULK1_8SV9_A', 'activity': 'active', 'pdb_id': '8SV9', 'chain': 'A', 'gene': 'ULK1', 'group': 'OTHER', 'n_atoms': 2156, 'sasa': 13439.264702432181, 'normalized_sasa': 0.01828926146715473, 'molecular_weight': 734817.2437999999},
       {'name': 'ULK1_6MNH_A', 'activity': 'inactive', 'pdb_id': '6MNH', 'chain': 'A', 'gene': 'ULK1', 'group': 'OTHER', 'n_atoms': 2104, 'sasa': 13825.38159328916, 'normalized_sasa': 0.019155003765938358, 'molecular_weight': 721763.4495},
       {'name': 'PIM1_4RC4_A', 'activity': 'inactive', 'pdb_id': '4RC4', 'chain': 'A', 'gene': 'PIM1', 'group': 'CAMK', 'n_atoms': 2218, 'sasa': 12987.290012468957, 'normalized_sasa': 0.017235817790979067, 'molecular_weight': 753505.8777},
       ...,
       {'name': 'SRPK1_5XV7_A', 'activity': 'active', 'pdb_id': '5XV7', 'chain': 'A', 'gene': 'SRPK1', 'group': 'CMGC', 'n_atoms': 2849, 'sasa': 17350.916379337636, 'normalized_sasa': 0.0176782028196462, 'molecular_weight': 981486.4416},
       {'name': 'SRPK1_7ZK

In [11]:
# ========================================== #
# Same as above, but only carbon alpha atoms #
# ========================================== #

# Create the kinase objects again, this time from C-alpha atoms only.
# NOTE: X, y and metadata are rebound below, replacing the all-atom results
# held in those names.
kinases = create_kinase_dataset(
    kinase_data_list,
    max_workers=None,       # no explicit worker count; presumably falls back to the function's default -- TODO confirm
    C_alpha_only=True,      # Use only C-alpha atoms for faster computation
    pixel_size=0.5,         # persistence-image pixel size
    birth_range=(0, 15),
    pers_range=(0, 10)
)

# Extract features for machine learning
X, y, metadata = extract_features_for_ml(kinases, include_dims=[0, 1])

# In the recorded run some structures failed to parse ("Failed to create
# kinase ..."), so verify that features, labels and metadata are still aligned
# before writing them to disk.
assert len(X) == len(y) == len(metadata), (
    f"misaligned outputs: X={len(X)}, y={len(y)}, metadata={len(metadata)}"
)

# Save under a separate file name so the all-atom archive is not overwritten.
np.savez_compressed(
    os.path.join(FEATURES_DIR, 'kincore_Calphaonly.npz'),
    X=X, y=y, metadata=np.array(metadata, dtype=object)
)

Processing kinases:  67%|██████▋   | 1744/2608 [05:23<01:55,  7.48it/s]

File /home/cotsios/dsit/2nd-semester/algos-in-struct-bio/project/MLtopKin/data/CDK2/Active/8VQ4_chainA.pdb could not be parsed with exception:
invalid literal for int() with base 10: 'A'
Exiting...
Failed to create kinase for CDK2_8VQ4_A: 'Kinase' object has no attribute 'mw'


Processing kinases:  68%|██████▊   | 1775/2608 [05:28<02:08,  6.48it/s]

File /home/cotsios/dsit/2nd-semester/algos-in-struct-bio/project/MLtopKin/data/CDK2/Active/8VQ3_chainA.pdb could not be parsed with exception:
invalid literal for int() with base 10: 'A'
Exiting...
Failed to create kinase for CDK2_8VQ3_A: 'Kinase' object has no attribute 'mw'


Processing kinases: 100%|██████████| 2608/2608 [07:44<00:00,  5.61it/s]
