In [1]:
import pandas as pd
import networkx as nx
from node2vec import Node2Vec
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, MACCSkeys
from sklearn.preprocessing import StandardScaler
import numpy as np
from rdkit.Chem import rdmolops
from tqdm import tqdm
from IPython.display import clear_output
import seaborn as sns 
import matplotlib.pyplot as plt
import os

In [2]:
# Assay columns of the Tox21 table that are used as prediction targets.
label_columns = [
    'SR-HSE', 'NR-AR', 'SR-ARE', 'NR-Aromatase', 'NR-ER-LBD', 'NR-AhR',
    'SR-MMP', 'NR-ER', 'NR-PPAR-gamma', 'SR-p53', 'SR-ATAD5', 'NR-AR-LBD',
]

# Read the raw table, then split it into the SMILES strings (model input)
# and the label columns (model output).
tox21 = pd.read_csv('data/tox21.csv')
smiles_list = tox21['SMILES'].to_list()
targets = tox21[label_columns]


In [3]:
# Functions for Feature Extraction
def compute_descriptors(smiles):
    """Compute a small set of RDKit descriptors for one molecule.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        dict with the keys "MolecularWeight", "LogP", "HBD", "HBA" and
        "TPSA", each holding the value of the corresponding RDKit
        ``Descriptors`` function.

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with a readable message instead of inside the descriptor calls.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    return {
        "MolecularWeight": Descriptors.MolWt(mol),
        "LogP": Descriptors.MolLogP(mol),
        "HBD": Descriptors.NumHDonors(mol),
        "HBA": Descriptors.NumHAcceptors(mol),
        "TPSA": Descriptors.TPSA(mol),
    }

def compute_fingerprints(smiles):
    """Compute Morgan and MACCS fingerprints for one molecule.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        dict with two entries, each a Python list obtained by expanding the
        RDKit bit vector:
        "MorganFP" (radius 2, 2048 bits) and "MACCSFP" (MACCS keys).

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with a readable message instead of inside the fingerprint calls.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    morgan_fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    maccs_fp = MACCSkeys.GenMACCSKeys(mol)
    return {
        "MorganFP": list(morgan_fp),
        "MACCSFP": list(maccs_fp),
    }

def compute_graph_features(graph):
    """Summarise a networkx graph with four topology statistics.

    Args:
        graph: networkx graph to describe.

    Returns:
        dict with "AverageDegree", "Density", "ClusteringCoefficient" and
        "Diameter". "Diameter" is None when the graph is not connected.
    """
    degree_total = sum(degree for _, degree in graph.degree())
    average_degree = degree_total / graph.number_of_nodes()
    density = nx.density(graph)
    clustering = nx.average_clustering(graph)

    # The diameter is only computed for connected graphs.
    diameter = nx.diameter(graph) if nx.is_connected(graph) else None

    return {
        "AverageDegree": average_degree,
        "Density": density,
        "ClusteringCoefficient": clustering,
        "Diameter": diameter,
    }

def compute_node2vec_embeddings(graph, dimensions=64, walk_length=30,
                                num_walks=50, workers=12, seed=42, window=10):
    """Embed a graph as the mean of its Node2Vec node vectors.

    Args:
        graph: networkx graph to embed.
        dimensions: size of each node vector (and of the returned vector).
        walk_length: forwarded to ``Node2Vec(walk_length=...)``.
        num_walks: forwarded to ``Node2Vec(num_walks=...)``.
        workers: forwarded to ``Node2Vec(workers=...)``.
        seed: forwarded to ``Node2Vec(seed=...)``.
        window: forwarded to ``Node2Vec.fit(window=...)``.

    Returns:
        1-D numpy array: the element-wise mean of the vectors of all nodes
        in ``graph``.
    """
    node2vec = Node2Vec(
        graph,
        dimensions=dimensions,
        walk_length=walk_length,
        num_walks=num_walks,
        workers=workers,
        seed=seed,
    )
    model = node2vec.fit(window=window, min_count=1, batch_words=4)

    # The fitted model is indexed by the string form of each node id.
    node_embeddings = np.array([model.wv[str(node)] for node in graph.nodes()])
    return node_embeddings.mean(axis=0)

def smiles_to_graph(smiles):
    """Build a networkx graph from the adjacency matrix of a molecule.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        A networkx graph with one node per atom, each carrying an
        "atom_type" attribute set to the atom's symbol, or None when RDKit
        cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        graph = nx.from_numpy_array(rdmolops.GetAdjacencyMatrix(mol))
        # Node i of the graph corresponds to the i-th atom of the molecule.
        symbols = {
            position: atom.GetSymbol()
            for position, atom in enumerate(mol.GetAtoms())
        }
        nx.set_node_attributes(graph, symbols, 'atom_type')
        return graph
    return None

In [4]:
# Build the per-molecule feature table once and cache it as CSV; later runs
# load the cache instead of recomputing.
# NOTE: CSV stores the fingerprint lists as their string representation, so
# 'MorganFP' / 'MACCSFP' come back as strings when read from the cache.
if not os.path.exists('data/features.csv'):
    features = []
    failed_smiles = []  # (smiles, error) pairs, reported after the loop
    for smile in tqdm(smiles_list):
        try:
            graph = smiles_to_graph(smile)
            if graph is None:
                raise ValueError("SMILES could not be parsed")

            descriptors = compute_descriptors(smile)
            fingerprints = compute_fingerprints(smile)
            graph_features = compute_graph_features(graph)
            node2vec_embeddings = compute_node2vec_embeddings(graph)

            combined_features = {
                **descriptors,
                **fingerprints,
                **graph_features,
                **{f"Node2Vec{i}": value for i, value in enumerate(node2vec_embeddings)},
            }
        except Exception as e:
            print(f"Tus muertos pisados {smile}: {e}")
            failed_smiles.append((smile, repr(e)))
            # Keep an empty placeholder (an all-NaN row in the DataFrame) so
            # row k of feature_df always describes smiles_list[k] and stays
            # aligned with `targets`. Downstream code must impute or drop it.
            combined_features = {}
        features.append(combined_features)
        clear_output(wait=True)

    feature_df = pd.DataFrame(features)
    feature_df.to_csv('data/features.csv', index=False)

    # clear_output() wipes the per-molecule messages above, so repeat the
    # failures here where they remain visible.
    if failed_smiles:
        print(f"{len(failed_smiles)} of {len(smiles_list)} SMILES failed feature extraction (rows left as NaN):")
        for failed_smile, error in failed_smiles:
            print(f"  {failed_smile}: {error}")
else:
    feature_df = pd.read_csv('data/features.csv')

# A cache written by an earlier run may have skipped failed molecules, which
# shifts every following row relative to `targets`.
if len(feature_df) != len(smiles_list):
    print(
        f"WARNING: feature_df has {len(feature_df)} rows but there are "
        f"{len(smiles_list)} SMILES; features are not row-aligned with the labels. "
        "Delete data/features.csv and re-run this cell."
    )


In [5]:
#feature_df.drop(
#    ['MorganFP', 'MACCSFP'], axis=1, inplace=True
#)


In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# The fingerprint columns are set aside before imputation and kept in
# `morgan` / `maccs`. The guards make the cell safe to re-run: on a second
# run the columns are already gone and the saved Series are kept as they are.
fingerprint_columns = ['MorganFP', 'MACCSFP']
if 'MorganFP' in feature_df.columns:
    morgan = feature_df['MorganFP']
if 'MACCSFP' in feature_df.columns:
    maccs = feature_df['MACCSFP']
numeric_features = feature_df.drop(columns=fingerprint_columns, errors='ignore')

# Labels and features are imputed independently, each with its own imputer.
# NOTE(review): imputing the label columns produces estimated, not measured,
# labels - confirm this is intended.
target_imputer = IterativeImputer(max_iter=10, random_state=42)
targets = pd.DataFrame(
    target_imputer.fit_transform(targets),
    columns=targets.columns,
    index=targets.index,
)

# `iterative_imputer` ends up fitted on the features, as before.
iterative_imputer = IterativeImputer(max_iter=10, random_state=42)
feature_df = pd.DataFrame(
    iterative_imputer.fit_transform(numeric_features),
    columns=numeric_features.columns,
    index=numeric_features.index,
)


In [None]:
# Display the column names of the feature table as the cell output.
feature_df.columns

In [8]:
#feature_df = pd.DataFrame(df[: , :feature_df.shape[1]] , columns=feature_df.columns)
#targets = pd.DataFrame(df[: , feature_df.shape[1]:] , columns=targets.columns)

In [None]:
# Turn the (possibly fractional, imputed) label values into hard 0/1 labels
# with a 0.5 threshold. The result is assigned back: without the assignment
# the thresholded frame is only displayed and `targets` stays unchanged.
# Re-running the cell is harmless because 0/1 values map to themselves.
targets = (targets >= 0.5).astype(int)
targets.head()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold
from sklearn.metrics import roc_auc_score, make_scorer, f1_score, classification_report
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from collections import Counter

fingerprint_columns = ['MorganFP', 'MACCSFP']

# Each label has its own class imbalance, so SMOTE yields a different number
# of rows per label and the resampled sets cannot be merged into a single
# X_train / y_train. Instead SMOTE is placed inside an imbalanced-learn
# pipeline: MultiOutputClassifier fits one clone of the pipeline per label,
# so every label is oversampled independently, during fit only. X_train,
# y_train and X_test keep the shape produced by the split.
# NOTE(review): scale_pos_weight=10 is kept from the original configuration;
# combined with SMOTE it up-weights the positive class twice - confirm.
classifier = XGBClassifier(
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.1,
    n_jobs=-1,
    random_state=42,
    scale_pos_weight=10,
    enable_categorical=True,
    eval_metric='logloss',
)
model = MultiOutputClassifier(
    ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('classifier', classifier),
    ]),
    n_jobs=-1,
)

# Features: the numeric columns only. The fingerprint columns are excluded
# explicitly so the cell can be re-run after they have been re-attached to
# feature_df below (SMOTE cannot interpolate them).
X = feature_df.drop(columns=fingerprint_columns, errors='ignore')
# Labels: thresholded at 0.5. A plain astype(int) would truncate instead,
# e.g. an imputed 0.9 would become 0.
y = (targets >= 0.5).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Class balance of each label in the training split. SMOTE runs inside
# model.fit, so these are the counts before oversampling.
for column in y_train.columns:
    print(f"{column}: training class counts before SMOTE {dict(Counter(y_train[column].tolist()))}")

# Re-attach the fingerprint columns that were set aside before imputation.
# Each column is restored from its own saved Series (the previous code
# overwrote 'MorganFP' with the MACCS column).
feature_df['MorganFP'] = morgan.astype('category')
feature_df['MACCSFP'] = maccs.astype('category')




In [None]:
# Disabled experiment, kept for reference. If re-enabled, the concat calls
# below would append X_test / y_test to the training data, so the test split
# would no longer be held out.
#X_train_80, X_test_80, y_train_80, y_test_80 = train_test_split(X_train, y_train, test_size=0.01, random_state=42)
#X_train = pd.concat([X_train, X_train_80, X_test_80, X_test], axis=0)
#y_train = pd.concat([y_train, y_train_80, y_test_80, y_test], axis=0)

# Fit `model` on the training split; the fitted estimator is the last
# expression of the cell and is therefore displayed as its output.
model.fit(X_train, y_train)


In [None]:
# Predict all labels for the held-out split; column k of y_pred belongs to
# the k-th label column.
y_pred = model.predict(X_test)

# Macro-averaged F1 for every label, computed first and printed afterwards.
f1_scores = [
    f1_score(y_test.iloc[:, position], y_pred[:, position], average="macro", zero_division=0)
    for position in range(len(y.columns))
]
for column, f1 in zip(y.columns, f1_scores):
    print(f"F1-Score for {column}: {f1:.4f}")

mean_f1 = sum(f1_scores) / len(f1_scores)
print(f"\nMean F1-Score across all outputs: {mean_f1:.4f}")

# Full precision / recall / F1 breakdown, one report per label.
for i, column in enumerate(y.columns):
    true_labels = y_test.iloc[:, i]
    predicted_labels = y_pred[:, i]
    print(f"\nClassification Report for {column}:\n")
    print(classification_report(true_labels, predicted_labels))

In [None]:
from sklearn.metrics import multilabel_confusion_matrix

# One 2x2 confusion matrix per label.
mat = multilabel_confusion_matrix(y_test, y_pred)

# Size the grid from the number of labels instead of assuming exactly 12:
# four panels per row and as many rows as needed. With 12 labels this gives
# the same 3x4 grid and 15x10 figure as before.
n_panels = min(len(label_columns), len(mat))
n_cols = 4
n_rows = max(1, -(-n_panels // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 10 * n_rows / 3), squeeze=False)
axes = axes.ravel()

for ax, label, matrix in zip(axes, label_columns, mat):
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax, cmap='coolwarm', cbar=False)
    ax.set_title(label)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')

# Hide any panels of the grid that have no label to show.
for ax in axes[n_panels:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()
