<a href="https://colab.research.google.com/github/MehrdadJalali-KIT/InverseLinkPredcition/blob/main/GCN_Sparcification_PredcitUnseen_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.5


In [2]:
# Attach Google Drive to the Colab runtime, then make the project folder the
# working directory so the relative file names used later resolve inside it.
import os

from google.colab import drive

drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Research/MOF/InverseLinkPredcition')

Mounted at /content/drive


In [76]:
import pandas as pd
import numpy as np
import networkx as nx
from rdkit import Chem
from rdkit.Chem import AllChem
from tensorflow.keras.models import load_model
def generate_fingerprint(smiles, radius=2, n_bits=1024):
    """Generate a Morgan fingerprint for a SMILES string as a float vector.

    Args:
        smiles: SMILES string describing the molecule.
        radius: Morgan fingerprint radius (defaults to 2, the value
            previously hard-coded here).
        n_bits: Length of the fingerprint bit vector (defaults to 1024, the
            value previously hard-coded here).

    Returns:
        A 1-D float ``numpy`` array of length ``n_bits``. An all-zero vector
        is returned when the SMILES cannot be turned into a molecule or when
        fingerprint generation raises, so callers always get a vector of the
        same length and never ``None``.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # No molecule was produced (no exception is raised in this case),
            # so fall back to the same zero vector used for errors.
            return np.zeros((n_bits,), dtype=float)
        return np.array(
            AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits),
            dtype=float,
        )
    except Exception:
        # Best-effort: any failure (e.g. a non-string input) maps to the zero
        # vector. `Exception` instead of a bare `except` keeps Ctrl-C working.
        return np.zeros((n_bits,), dtype=float)

def label_encode_metal_names(metal_names, metal_dict=None):
    """Encode metal names as integer codes.

    Args:
        metal_names: Iterable of metal names to encode.
        metal_dict: Optional mapping ``name -> integer code``. When omitted,
            a mapping is built from the unique names in ``metal_names``
            (codes follow the order returned by ``np.unique``), which is the
            original behaviour. Supplying the mapping used elsewhere (e.g. at
            training time) keeps the codes consistent across datasets.

    Returns:
        ``numpy`` array with one integer code per entry of ``metal_names``.

    Raises:
        KeyError: If ``metal_dict`` is supplied and a name is missing from it.
    """
    if metal_dict is None:
        # Codes depend only on which names occur in this particular input.
        metal_dict = {metal: idx for idx, metal in enumerate(np.unique(metal_names))}
    return np.array([metal_dict[metal] for metal in metal_names])

def preprocess_new_data(new_data):
    """Build the feature matrix for prediction from a DataFrame of new samples.

    The columns read are ``'linker SMILES'``, ``'metal'``,
    ``'Largest Cavity Diameter'`` and ``'Largest Free Sphere'``.

    Args:
        new_data: DataFrame containing the columns listed above.

    Returns:
        2-D ``numpy`` array with one row per row of ``new_data``: the linker
        fingerprint, followed by the integer metal code, followed by the two
        pore-geometry columns.

    Notes:
        Rows are never dropped. A missing or unusable SMILES contributes an
        all-zero fingerprint so that the three feature groups stay
        row-aligned (dropping such rows made the concatenation fail).
        NOTE(review): metal codes are derived from ``new_data`` alone; confirm
        they match the encoding the model was trained with.
    """
    fingerprint_bits = 1024  # length of the vector generate_fingerprint returns

    def _fingerprint_or_zeros(smiles):
        # Guard against a None result so every row has the same width.
        fingerprint = generate_fingerprint(smiles)
        if fingerprint is None:
            return np.zeros((fingerprint_bits,), dtype=float)
        return fingerprint

    fingerprints = [_fingerprint_or_zeros(smiles) for smiles in new_data['linker SMILES']]
    if fingerprints:
        linker_features = np.vstack(fingerprints)
    else:
        # Empty input: keep a well-formed (0, n_bits) block for concatenation.
        linker_features = np.empty((0, fingerprint_bits), dtype=float)

    # Encode metal names without needing the metal dictionary
    metal_features = label_encode_metal_names(new_data['metal']).reshape(-1, 1)

    other_features = new_data[['Largest Cavity Diameter', 'Largest Free Sphere']].values.astype('float32')
    return np.concatenate((linker_features, metal_features, other_features), axis=1)

# Tag shared by the saved model and adjacency-matrix file names (presumably
# the sparsification threshold - confirm). Defined once so the two artifacts
# loaded below cannot get out of sync when switching to another variant.
ARTIFACT_TAG = '0.9'

# Load the previously saved model
model = load_model(f'gcn_model_{ARTIFACT_TAG}.h5')

# new_data is assumed to be a DataFrame similar to the training summary_data
new_data = pd.read_csv('new_data.csv')

# Preprocess the new data into the feature matrix fed to the model
features = preprocess_new_data(new_data)

# Adjacency matrix saved under the same tag as the model
adjacency_matrix = np.load(f'adjacency_matrix_{ARTIFACT_TAG}.npy')


In [77]:
# The adjacency matrix determines how many rows the feature matrix must have.
num_nodes = adjacency_matrix.shape[0]

if features.shape[0] > num_nodes:
    # Only the "too few rows" case can be repaired by padding; fail early
    # with a clear message instead of an opaque error inside model.predict.
    raise ValueError(
        f"features has {features.shape[0]} rows but the adjacency matrix "
        f"only has {num_nodes} nodes"
    )

if features.shape[0] < num_nodes:
    # Pad the feature matrix with zero rows up to the number of nodes
    padding = np.zeros((num_nodes - features.shape[0], features.shape[1]))
    features = np.vstack([features, padding])

# One row of class scores per node; the highest-scoring class is the label
predictions = model.predict([features, adjacency_matrix])
predicted_labels = np.argmax(predictions, axis=1)





In [78]:
# Integer class index -> human-readable pore category
label_dict = dict(enumerate(['nonporous', 'small pore', 'medium pore', 'large pore']))

# The new instances are assumed to be the leading rows, one per row of new_data
num_new_instances = len(new_data)

# Keep only the predictions belonging to the new instances
predicted_labels_new = predicted_labels[:num_new_instances]

# Translate the integer labels into their readable names
readable_labels_new = list(map(label_dict.__getitem__, predicted_labels_new))

# Show each new instance's feature row followed by its predicted label
for i in range(num_new_instances):
    print("Features:", features[i])
    print("Predicted Label:", readable_labels_new[i], end="\n\n")



Features: [ 0.          0.          0.         ...  1.         27.74954033
 27.7488308 ]
Predicted Label: large pore

Features: [0.      0.      0.      ... 3.      8.58041 8.58041]
Predicted Label: small pore

Features: [0.         0.         0.         ... 0.         5.64118004 5.64118004]
Predicted Label: small pore

Features: [ 0.          0.          0.         ...  2.         11.39286041
 11.39286041]
Predicted Label: small pore



Features: [ 0.          0.          0.         ...  1.         27.74954033
 27.7488308 ]
Predicted Label: large pore

Features: [0.      0.      0.      ... 3.      8.58041 8.58041]
Predicted Label: large pore

Features: [0.         0.         0.         ... 0.         5.64118004 5.64118004]
Predicted Label: small pore

Features: [ 0.          0.          0.         ...  2.         11.39286041
 11.39286041]
Predicted Label: small pore    **for original**

Features: [ 0.          0.          0.         ...  1.         27.74954033
 27.7488308 ]
Predicted Label: large pore

Features: [0.      0.      0.      ... 3.      8.58041 8.58041]
Predicted Label: small pore

Features: [0.         0.         0.         ... 0.         5.64118004 5.64118004]
Predicted Label: small pore

Features: [ 0.          0.          0.         ...  2.         11.39286041
 11.39286041]
Predicted Label: small pore   **for 0.9**

Features: [ 0.          0.          0.         ...  1.         27.74954033
 27.7488308 ]
Predicted Label: large pore

Features: [0.      0.      0.      ... 3.      8.58041 8.58041]
Predicted Label: medium pore

Features: [0.         0.         0.         ... 0.         5.64118004 5.64118004]
Predicted Label: small pore

Features: [ 0.          0.          0.         ...  2.         11.39286041
 11.39286041]
Predicted Label: small pore    **for 0.95**

Features: [ 0.          0.          0.         ...  1.         27.74954033
 27.7488308 ]
Predicted Label: large pore

Features: [0.      0.      0.      ... 3.      8.58041 8.58041]
Predicted Label: large pore

Features: [0.         0.         0.         ... 0.         5.64118004 5.64118004]
Predicted Label: large pore

Features: [ 0.          0.          0.         ...  2.         11.39286041
 11.39286041]
Predicted Label: small pore      **for 0.98**