<a href="https://colab.research.google.com/github/MehrdadJalali-KIT/InverseLinkPredcition/blob/main/GCN_Sparcification_PredcitUnseen_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import load_model

# Function to generate a molecular fingerprint given a SMILES string
def generate_fingerprint(smiles, radius=2, n_bits=1024):
    """Return a Morgan fingerprint bit vector for a SMILES string.

    Args:
        smiles: SMILES string describing the molecule.
        radius: Morgan radius handed to RDKit. Defaults to 2, the value that
            was previously hard-coded.
        n_bits: Length of the bit vector. Defaults to 1024, the value that
            was previously hard-coded.

    Returns:
        A 1-D float numpy array of length ``n_bits``. An all-zero vector is
        returned when RDKit cannot parse ``smiles`` (``MolFromSmiles`` gives
        ``None``) or when any exception is raised while building the
        fingerprint; in the latter case the error is printed.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
            return np.array(bit_vect, dtype=float)
    except Exception as e:
        # Best-effort by design: one bad SMILES must not abort a whole batch.
        print(f"SMILES Parse Error: {e}")
    # Single fallback for both "unparseable" and "raised" so the length
    # always agrees with n_bits.
    return np.zeros((n_bits,), dtype=float)

# Function to preprocess unseen MOF data to match the format used in model training
def preprocess_unseen_data(unseen_data, metal_mapping=None):
    """Build the model input matrix for a table of unseen MOFs.

    The columns of the result are, in order: the linker fingerprint produced
    by ``generate_fingerprint`` for each ``'SMILES'`` entry, one integer code
    for ``'Metal'``, then ``'Largest Cavity Diameter'`` and
    ``'Largest Free Sphere'``.

    Args:
        unseen_data: DataFrame with the columns ``'SMILES'``, ``'Metal'``,
            ``'Largest Cavity Diameter'`` and ``'Largest Free Sphere'``.
        metal_mapping: Optional ``{metal_name: integer_code}`` dict. When
            omitted the codes are derived from the metals present in
            ``unseen_data`` (the original behaviour), which means a metal's
            code changes with the batch contents. Pass the mapping used at
            training time to keep codes stable.

    Returns:
        2-D numpy array with one row per MOF.

    Raises:
        ValueError: if ``metal_mapping`` is given and a metal in
            ``unseen_data`` is missing from it.
    """
    linker_features = np.stack(unseen_data['SMILES'].apply(generate_fingerprint).tolist())

    metals = unseen_data['Metal']
    if metal_mapping is None:
        metal_codes = label_encode_metal_names(metals)
    else:
        unknown = sorted({m for m in metals if m not in metal_mapping}, key=str)
        if unknown:
            raise ValueError(f"Metals missing from metal_mapping: {unknown}")
        metal_codes = np.array([metal_mapping[m] for m in metals])
    metal_features = metal_codes.reshape(-1, 1)

    other_features = unseen_data[['Largest Cavity Diameter', 'Largest Free Sphere']].values.astype('float32')
    return np.concatenate((linker_features, metal_features, other_features), axis=1)

# Function to preprocess features
def preprocess_features(features, scaler=None):
    """Standardise a feature matrix.

    Args:
        features: 2-D array-like of raw features.
        scaler: Optional already-fitted scaler exposing ``transform``. When
            given, ``features`` are transformed with its stored statistics
            and nothing is re-fitted.

    Returns:
        The scaled feature matrix.

    Note:
        With ``scaler=None`` a new ``StandardScaler`` is fitted on
        ``features`` itself (the original behaviour). For inference that
        means the data is standardised with the statistics of the batch being
        predicted rather than those of the training set, and a single-row
        batch collapses to all zeros. Prefer passing the scaler that was
        fitted during training.
    """
    if scaler is None:
        return StandardScaler().fit_transform(features)
    return scaler.transform(features)

# Function to predict PLD sizes for unseen MOF data
def predict_PLD(model, unseen_data, scaler=None):
    """Predict a PLD size category for every row of ``unseen_data``.

    Args:
        model: Object exposing ``predict(features)`` that returns one row of
            class scores per sample.
        unseen_data: DataFrame accepted by ``preprocess_unseen_data``.
        scaler: Optional already-fitted scaler exposing ``transform``. When
            given it is applied to the features directly. When omitted the
            features go through ``preprocess_features`` exactly as before,
            which standardises them using the inference batch's own
            statistics.

    Returns:
        List of labels, one per row, each drawn from ``'nonporous'``,
        ``'small pore'``, ``'medium pore'`` and ``'large pore'``.
    """
    features = preprocess_unseen_data(unseen_data)
    if scaler is None:
        features = preprocess_features(features)
    else:
        # Re-use training statistics instead of re-fitting on unseen data.
        features = scaler.transform(features)

    predictions = model.predict(features)
    PLD_categories = np.argmax(predictions, axis=1)
    # Position in this list is the class index chosen by argmax.
    # NOTE(review): assumes the model emits four scores in this order -- confirm against training code.
    PLD_labels = ['nonporous', 'small pore', 'medium pore', 'large pore']
    return [PLD_labels[idx] for idx in PLD_categories]

# Function to label encode metal names as integers
def label_encode_metal_names(metal_names, mapping=None):
    """Encode metal names as integers.

    Args:
        metal_names: Iterable of metal names (list, array or Series).
        mapping: Optional ``{metal_name: integer_code}`` dict. When omitted,
            codes are assigned by position in ``np.unique(metal_names)``
            (sorted order of the names present), so they depend on which
            metals appear in this particular call. Supply the mapping used
            at training time to obtain stable codes.

    Returns:
        1-D integer numpy array with one code per input name. Empty input
        yields an empty integer array.

    Raises:
        ValueError: if ``mapping`` is given and a name is missing from it.
    """
    if mapping is None:
        mapping = {metal: idx for idx, metal in enumerate(np.unique(metal_names))}
    else:
        unknown = sorted({metal for metal in metal_names if metal not in mapping}, key=str)
        if unknown:
            raise ValueError(f"Metals missing from mapping: {unknown}")
    # Explicit dtype keeps the empty case integer instead of numpy's float default.
    return np.array([mapping[metal] for metal in metal_names], dtype=int)

if __name__ == "__main__":
    # Restore the saved model from the current working directory.
    model = load_model("gcn_model.h5")

    # Example MOFs, one tuple per structure:
    # (linker SMILES, metal, largest cavity diameter, largest free sphere).
    unseen_data = pd.DataFrame(
        [
            ('OC(=O)/C=C/c1ccc(cc1)', 'Fe', 11.05942, 9.05073),
            ('CC(C)(C)OC(=O)NC1=CC=C(C=C1)N', 'Fe', 6.0, 11.0),
            ('COC(=O)C1=CC=C(C=C1)NC(=O)OCC(C)C', 'Zn', 7.0, 12.0),
            ('C1=CC=C(C=C1)NC(=O)OCC2=CC=CC=C2', 'Ni', 8.0, 13.0),
        ],
        columns=['SMILES', 'Metal', 'Largest Cavity Diameter', 'Largest Free Sphere'],
    )

    # Run the full pipeline and show the predicted category per row.
    PLD_predictions = predict_PLD(model, unseen_data)
    print("Predicted PLD sizes for unseen data:", PLD_predictions, sep="\n")


Predicted PLD sizes for unseen data:
['small pore', 'nonporous', 'small pore', 'medium pore']


In [2]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.5


In [1]:
# Attach Google Drive to the Colab runtime, then work from the project folder
# so that relative paths resolve inside it.
import os
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
PROJECT_DIR = '/content/drive/MyDrive/Research/MOF/InverseLinkPredcition'

drive.mount(DRIVE_MOUNT_POINT)
os.chdir(PROJECT_DIR)

Mounted at /content/drive
