# **Prepare Strain-space (S-space) feature matrix**

This notebook post-processes the Strain-space **type II signatures** (`S_sign2.tsv`) into a modeling-ready table.

## Inputs
- `data/features/strain_space_ss/S_sign2.tsv`

## Outputs
- `data/features/strain_space_ss/sspace.csv`

  One row per compound (InChIKey) with 128 S-space features renamed to `s_0..s_127`, scaled to [-1, 1].


### Notes / assumptions

- `S_sign2.tsv` may contain extra index columns depending on how it was exported; in this export the unnamed first column (`Unnamed: 0`) holds the InChIKey and is renamed to `inchikey`, and a stray `index` column is dropped if present.
- S-space embeddings are rescaled with MinMaxScaler to [-1, 1] for compatibility with other feature blocks and similarity computations.
- Feature columns are renamed to `s_0..s_127` and InChIKeys are standardized to uppercase.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

from halo.paths import SS_FEATURES

In [None]:
# Read the tab-separated type II signature export into a DataFrame.
sign2_path = SS_FEATURES / "S_sign2.tsv"
sspace = pd.read_csv(sign2_path, sep="\t").copy()

In [None]:
# Preview the first rows of the table exactly as loaded from the TSV.
sspace.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,118,119,120,121,122,123,124,125,126,127
0,actoxuheucptew-ceuobaopsa-n,0.279253,0.325789,0.300881,0.108358,-0.614722,0.033203,-0.195606,0.296044,-0.082143,...,-0.556732,-0.165233,0.162611,0.153988,0.135849,-0.179818,-0.030269,-0.248934,-0.170565,-0.069774
1,agoydepgaoxock-kcbohyoisa-n,0.004763,0.15986,-0.141753,-0.121233,-0.061586,-0.001962,-0.351597,0.098202,-0.32459,...,-0.26641,0.096389,-0.172638,0.15444,-0.074052,0.000581,0.395834,-0.211503,-0.304872,-0.089315
2,aojjsuzboxzqnb-tzssrymlsa-n,0.018066,0.595996,0.131569,0.072174,-0.420807,0.198102,-0.009198,0.248987,0.111922,...,-0.219347,-0.369608,0.119339,0.378647,0.27726,-0.274133,0.29298,-0.324133,0.04115,0.025046
3,aujrcfubupvwsz-xtzhgvarsa-m,0.017075,0.159697,-0.111766,-0.188458,-0.047327,-0.029275,-0.316442,0.094704,-0.334855,...,-0.211532,0.113873,-0.20551,0.119897,-0.066508,-0.01451,0.412465,-0.173437,-0.355429,-0.067068
4,bjnllbuohpvgft-cayrisatsa-n,-0.011996,0.586473,0.077567,0.069967,-0.351931,0.171041,-0.126305,0.236439,0.170339,...,-0.261116,-0.327031,0.07283,0.361365,0.258141,-0.260423,0.268768,-0.304416,-0.036788,0.008771


In [None]:
# The InChIKey column arrives without a header and is read as "Unnamed: 0";
# give it its real name.
sspace = sspace.rename(columns={"Unnamed: 0": "inchikey"})
if "inchikey" not in sspace.columns:
    # Fail here with a clear message instead of a bare KeyError further down.
    raise KeyError("Expected an 'Unnamed: 0' (InChIKey) column in S_sign2.tsv.")

# Drop a stray `index` column if the export included one. Feature labels read
# from the header are strings ("0", "1", ...), and "0" is a real feature, so
# nothing labelled 0 is dropped.
sspace = sspace.drop(columns=["index"], errors="ignore")
sspace.head()

Unnamed: 0,inchikey,0,1,2,3,4,5,6,7,8,...,118,119,120,121,122,123,124,125,126,127
0,actoxuheucptew-ceuobaopsa-n,0.279253,0.325789,0.300881,0.108358,-0.614722,0.033203,-0.195606,0.296044,-0.082143,...,-0.556732,-0.165233,0.162611,0.153988,0.135849,-0.179818,-0.030269,-0.248934,-0.170565,-0.069774
1,agoydepgaoxock-kcbohyoisa-n,0.004763,0.15986,-0.141753,-0.121233,-0.061586,-0.001962,-0.351597,0.098202,-0.32459,...,-0.26641,0.096389,-0.172638,0.15444,-0.074052,0.000581,0.395834,-0.211503,-0.304872,-0.089315
2,aojjsuzboxzqnb-tzssrymlsa-n,0.018066,0.595996,0.131569,0.072174,-0.420807,0.198102,-0.009198,0.248987,0.111922,...,-0.219347,-0.369608,0.119339,0.378647,0.27726,-0.274133,0.29298,-0.324133,0.04115,0.025046
3,aujrcfubupvwsz-xtzhgvarsa-m,0.017075,0.159697,-0.111766,-0.188458,-0.047327,-0.029275,-0.316442,0.094704,-0.334855,...,-0.211532,0.113873,-0.20551,0.119897,-0.066508,-0.01451,0.412465,-0.173437,-0.355429,-0.067068
4,bjnllbuohpvgft-cayrisatsa-n,-0.011996,0.586473,0.077567,0.069967,-0.351931,0.171041,-0.126305,0.236439,0.170339,...,-0.261116,-0.327031,0.07283,0.361365,0.258141,-0.260423,0.268768,-0.304416,-0.036788,0.008771


In [None]:
# List the column labels: the key column followed by the feature columns.
sspace.columns

Index(['inchikey', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       '118', '119', '120', '121', '122', '123', '124', '125', '126', '127'],
      dtype='object', length=129)

In [None]:
# Collect every row that has at least one missing value.
na_rows = sspace[sspace.isna().any(axis=1)]
# Only the last expression of a cell is displayed, so print the shape explicitly.
print(na_rows.shape)
na_rows.head()

Unnamed: 0,inchikey,0,1,2,3,4,5,6,7,8,...,118,119,120,121,122,123,124,125,126,127


In [None]:
# True if any cell anywhere in the table is missing.
sspace.isna().values.any()

np.False_

### **Checking if features are scaled**

In [None]:
# Everything except the key column is treated as a feature.
sspace_feat_cols = [col for col in sspace.columns if col != 'inchikey']

# Force the feature block to numeric; values that cannot be parsed become NaN.
coerced = sspace[sspace_feat_cols].apply(pd.to_numeric, errors='coerce')
sspace[sspace_feat_cols] = coerced

# Keep a copy of the keys so they can be re-attached after scaling.
inchies = sspace['inchikey'].copy()

# Total number of NaN values in the feature block after coercion.
coerced.isna().sum().sum()

np.int64(0)

In [None]:
# Preview the table again after the numeric coercion of the feature columns.
sspace.head()

Unnamed: 0,inchikey,0,1,2,3,4,5,6,7,8,...,118,119,120,121,122,123,124,125,126,127
0,actoxuheucptew-ceuobaopsa-n,0.279253,0.325789,0.300881,0.108358,-0.614722,0.033203,-0.195606,0.296044,-0.082143,...,-0.556732,-0.165233,0.162611,0.153988,0.135849,-0.179818,-0.030269,-0.248934,-0.170565,-0.069774
1,agoydepgaoxock-kcbohyoisa-n,0.004763,0.15986,-0.141753,-0.121233,-0.061586,-0.001962,-0.351597,0.098202,-0.32459,...,-0.26641,0.096389,-0.172638,0.15444,-0.074052,0.000581,0.395834,-0.211503,-0.304872,-0.089315
2,aojjsuzboxzqnb-tzssrymlsa-n,0.018066,0.595996,0.131569,0.072174,-0.420807,0.198102,-0.009198,0.248987,0.111922,...,-0.219347,-0.369608,0.119339,0.378647,0.27726,-0.274133,0.29298,-0.324133,0.04115,0.025046
3,aujrcfubupvwsz-xtzhgvarsa-m,0.017075,0.159697,-0.111766,-0.188458,-0.047327,-0.029275,-0.316442,0.094704,-0.334855,...,-0.211532,0.113873,-0.20551,0.119897,-0.066508,-0.01451,0.412465,-0.173437,-0.355429,-0.067068
4,bjnllbuohpvgft-cayrisatsa-n,-0.011996,0.586473,0.077567,0.069967,-0.351931,0.171041,-0.126305,0.236439,0.170339,...,-0.261116,-0.327031,0.07283,0.361365,0.258141,-0.260423,0.268768,-0.304416,-0.036788,0.008771


In [None]:
# Per-feature extremes across all rows.
feature_block = sspace[sspace_feat_cols]
min_vals = feature_block.min(axis=0).values
max_vals = feature_block.max(axis=0).values

# A feature counts as scaled when its minimum is >= -1 and its maximum is <= 1.
scaled_check = np.all((min_vals >= -1) & (max_vals <= 1))
print("All features are within [-1, 1]." if scaled_check else "Some features fall outside [-1, 1].")

# Number of features whose range leaves [-1, 1] on either side.
out_of_range = np.sum((min_vals < -1) | (max_vals > 1))
print(f"{out_of_range} out of {len(sspace_feat_cols)} features exceed [-1, 1].")

Some features fall outside [-1, 1].
1 out of 128 features exceed [-1, 1].


In [None]:
# Identify the feature columns and make sure they are numeric before scaling.
sspace_feat_cols = [c for c in sspace.columns if c != 'inchikey']
sspace[sspace_feat_cols] = sspace[sspace_feat_cols].apply(pd.to_numeric, errors='coerce')

# `errors='coerce'` turns unparseable values into NaN, and MinMaxScaler keeps
# NaN in its output, so stop here rather than write NaN features to disk.
n_missing = int(sspace[sspace_feat_cols].isna().sum().sum())
if n_missing:
    raise ValueError(f"Found {n_missing} missing/non-numeric feature values; cannot scale.")

# Capture the keys in this cell so it does not depend on a variable created by
# an earlier cell.
inchies = sspace['inchikey'].copy()

# Rescale every feature column independently to [-1, 1].
scaler = MinMaxScaler(feature_range=(-1, 1))
X_scaled = scaler.fit_transform(sspace[sspace_feat_cols])

# Rebuild the frame from the scaled array and put the key column back first.
sspace = pd.DataFrame(X_scaled, columns=sspace_feat_cols, index=sspace.index)
sspace.insert(0, 'inchikey', inchies)

In [None]:
# Tolerance for floating-point round-off at the interval edges.
eps = 1e-8
lower_bound, upper_bound = -1 - eps, 1 + eps

# Per-feature extremes of the rescaled table.
scaled_block = sspace[sspace_feat_cols]
min_vals = scaled_block.min(axis=0).values
max_vals = scaled_block.max(axis=0).values

# Same range test as before scaling, widened by `eps` on both sides.
scaled_check = np.all((min_vals >= lower_bound) & (max_vals <= upper_bound))
print(
    "All features are within [-1, 1] up to numerical precision."
    if scaled_check
    else "Some features fall outside [-1, 1] beyond tolerance."
)

# Number of features still outside the tolerated interval.
out_of_range = np.sum((min_vals < lower_bound) | (max_vals > upper_bound))
print(f"{out_of_range} out of {len(sspace_feat_cols)} features exceed [-1, 1] beyond tolerance.")

All features are within [-1, 1] up to numerical precision.
0 out of 128 features exceed [-1, 1] beyond tolerance.


In [None]:
def rename_sspace_columns(df: pd.DataFrame, n_features: int = 128, key_col: str = "inchikey") -> pd.DataFrame:
    """Rename the feature columns of an S-space table to ``s_0..s_{n-1}``.

    Every column other than ``key_col`` is treated as a feature and renamed
    by its position among the feature columns (left to right).

    Parameters
    ----------
    df : pd.DataFrame
        Table holding the key column and the feature columns.
    n_features : int, default 128
        Number of feature columns the table is required to have.
    key_col : str, default "inchikey"
        Name of the identifier column that is left unrenamed.

    Returns
    -------
    pd.DataFrame
        A new frame with renamed feature columns; ``df`` is not modified.

    Raises
    ------
    ValueError
        If the number of feature columns differs from ``n_features``.
    """
    feature_cols = [col for col in df.columns if col != key_col]
    if len(feature_cols) != n_features:
        raise ValueError(f"S-space must have {n_features} feature columns, found {len(feature_cols)}.")

    # Position-based names: the i-th feature column becomes "s_i".
    rename_map = {old: f"s_{i}" for i, old in enumerate(feature_cols)}
    return df.rename(columns=rename_map)

# Give the feature columns their final s_<i> names.
sspace = rename_sspace_columns(sspace)

# Normalise the keys: cast to string, trim surrounding whitespace, upper-case.
normalized_keys = sspace['inchikey'].astype(str).str.strip().str.upper()
sspace['inchikey'] = normalized_keys
sspace.head()

Unnamed: 0,inchikey,s_0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,...,s_118,s_119,s_120,s_121,s_122,s_123,s_124,s_125,s_126,s_127
0,ACTOXUHEUCPTEW-CEUOBAOPSA-N,0.619295,-0.527338,0.717995,0.703418,-0.895585,-0.166294,-0.25888,0.526837,-0.347351,...,-0.729726,0.093762,0.392845,-0.59213,0.144605,0.361739,-0.820366,0.169121,-0.08926,-0.318074
1,AGOYDEPGAOXOCK-KCBOHYOISA-N,-0.374471,-0.846434,-0.449727,-0.352559,0.457922,-0.293561,-0.764916,-0.126592,-0.941954,...,0.217401,0.867442,-0.543921,-0.590901,-0.626865,0.931273,0.133112,0.382996,-0.561204,-0.38332
2,AOJJSUZBOXZQNB-TZSSRYMLSA-N,-0.326308,-0.007706,0.27133,0.536994,-0.421081,0.430491,0.34583,0.371418,0.128595,...,0.370936,-0.510624,0.271933,0.01884,0.664347,0.063978,-0.097041,-0.260554,0.654689,-0.001472
3,AUJRCFUBUPVWSZ-XTZHGVARSA-M,-0.329896,-0.846747,-0.370618,-0.661752,0.492813,-0.392411,-0.650873,-0.138144,-0.967129,...,0.396431,0.919146,-0.635774,-0.684842,-0.59914,0.88363,0.170327,0.600499,-0.738857,-0.309036
4,BJNLLBUOHPVGFT-CAYRISATSA-N,-0.435147,-0.026019,0.128866,0.526843,-0.252543,0.332554,-0.034067,0.329974,0.271863,...,0.234672,-0.384713,0.141976,-0.028159,0.594077,0.107262,-0.15122,-0.147894,0.380821,-0.055811


In [None]:
# Write the processed table to CSV without the DataFrame index.
sspace_csv_path = SS_FEATURES / "sspace.csv"
sspace.to_csv(sspace_csv_path, index=False)