In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

# Loading data

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin
from sklearn.feature_extraction.text import CountVectorizer


class LabelVectorizer(CountVectorizer):
    def __init__(self):
        super().__init__(stop_words=None)

    def fit(self, X, y=None):
        super().fit(''.join(X).split('|'))
        return self
    

class IndexVectorizer(BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        self._n_features_out = X.map(lambda x: max(eval(x))).max() + 1
        return self

    def transform(self, X):
        result = np.zeros((len(X), self._n_features_out))
        for i, x in enumerate(X):
            result[i, eval(x)] = 1
        return result

In [3]:
from sklearn.compose import ColumnTransformer

data_path = Path('dataset/sflln')

features = pd.read_csv(data_path / 'drug_features.csv', header=0)
  
ct = ColumnTransformer([
    ('Structure', IndexVectorizer(), 'STRUCTURE'),
    ('Target', LabelVectorizer(), 'TARGET'),
    ('Enzyme', LabelVectorizer(), 'ENZYME'),
    ('Pathway', LabelVectorizer(), 'PATH'),
])
processed_features = ct.fit_transform(features) == 0

with open(data_path / 'interactions.txt') as f:
    interactions = np.array(eval(f.read()), dtype=bool)
    # make sure it is symmetric
    interactions = np.maximum(interactions, interactions.transpose())

In [4]:
def drug_pair_to_index(first_id: int, second_id: int) -> int:
    return first_id * interactions.shape[0] + second_id

def index_to_drug_pair(index: int) -> tuple[int, int]:
    return index // interactions.shape[0], index % interactions.shape[0]

def get_feature_name_from_index(ct: ColumnTransformer, index: int) -> str:
    return ct.get_feature_names_out()[index % len(ct.get_feature_names_out())]

In [71]:
index = np.triu_indices_from(interactions, k=1)
y = interactions[index]

In [74]:
X = np.hstack([
    np.concatenate((processed_features[a], processed_features[b]))
    for a, b in zip(*index)
], dtype=bool).reshape((y.shape[0], -1))

In [7]:
from sklearn.model_selection import train_test_split

train_X, X_test, y_train, y_test, train_indices, test_indices = train_test_split(X, y, np.arange(X.shape[0]), test_size=0.2, random_state=42)

In [8]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=42).fit(train_X, y_train)
pred = clf.predict(X_test)

In [9]:
from sklearn.metrics import matthews_corrcoef

print(matthews_corrcoef(y_test, pred))

0.6721664735756001


In [86]:
sample_id = 1
feature = clf.tree_.feature
threshold = clf.tree_.threshold
node_indicator = clf.decision_path(X_test)
leaf_id = clf.apply(X_test)

# obtain ids of the nodes `sample_id` goes through, i.e., row `sample_id`
node_index = node_indicator.indices[
    node_indicator.indptr[sample_id] : node_indicator.indptr[sample_id + 1]
]

first_id, second_id = index[0][test_indices[1]], index[1][test_indices[1]]
first_name, second_name = features['DRUG ID'][first_id], features['DRUG ID'][second_id]
print(f'Predicting interaction {first_name} <==> {second_name}')
for node_id in node_index:
    # continue to the next node if it is a leaf node
    if leaf_id[sample_id] == node_id:
        continue

    feature_names = ct.get_feature_names_out()
    feature_id = feature[node_id]
    belongs_to_first = feature_id >= len(feature_names)
    feature_id %= len(feature_names)

    print('{drug_name} {has} {feature}'.format(
            drug_name=first_name if belongs_to_first else second_name,
            has='has' if belongs_to_first else 'does not have',
            feature=feature_names[feature_id]
        )
    )

    print(
        'decision node {node} : (X_test[{sample}, {feature}] = {value}) '
        ' {threshold})'.format(
            node=node_id,
            sample=sample_id,
            feature=feature[node_id],
            value=X_test[sample_id, feature[node_id]],
            threshold=threshold[node_id],
        )
    )

Predicting interaction DB00822 <==> DB00851
Enzyme__p08684
decision node 0 : (X_test[1, 3663] = True)  0.5)
Structure__indexvectorizer570
decision node 49602 : (X_test[1, 2606] = False)  0.5)
Pathway__hsa04726
decision node 49603 : (X_test[1, 1920] = True)  0.5)
Enzyme__p10632
decision node 54651 : (X_test[1, 1634] = False)  0.5)
Enzyme__p11712
decision node 54652 : (X_test[1, 3676] = False)  0.5)
Enzyme__p33261
decision node 54653 : (X_test[1, 1687] = False)  0.5)
Pathway__hsa04727
decision node 54654 : (X_test[1, 1921] = True)  0.5)
Structure__indexvectorizer797
decision node 54688 : (X_test[1, 2833] = True)  0.5)
Enzyme__p33261
decision node 54786 : (X_test[1, 3723] = False)  0.5)
Structure__indexvectorizer791
decision node 54787 : (X_test[1, 2827] = True)  0.5)
Structure__indexvectorizer421
decision node 54797 : (X_test[1, 421] = True)  0.5)
Structure__indexvectorizer650
decision node 54809 : (X_test[1, 650] = True)  0.5)
Pathway__hsa04725
decision node 54831 : (X_test[1, 1919] = T

In [85]:
get_feature_name_from_index(ct, feature[node_id])

'Pathway__hsa05416'

In [None]:
ct.get_feature_names_out()

array(['structure_vectorizer__indexvectorizer0',
       'structure_vectorizer__indexvectorizer1',
       'structure_vectorizer__indexvectorizer2', ...,
       'pathway_vectorizer__hsa05414', 'pathway_vectorizer__hsa05416',
       'pathway_vectorizer__hsa05418'], dtype=object)

In [None]:
ct.n_features_out_

AttributeError: 'ColumnTransformer' object has no attribute 'n_features_out_'