<a href="https://colab.research.google.com/github/MZiaAfzal71/Average_Weighted_Path_Vector/blob/main/Data%20Files/Descriptor%20Generators/GenerateProposedDescriptors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the project repository into the current working directory.
!git clone https://github.com/MZiaAfzal71/Average_Weighted_Path_Vector.git

In [None]:
# Only RDKit is installed here; the other packages imported below
# (numpy, networkx, pandas, tqdm) are assumed to be available in the runtime.
!pip install rdkit

In [None]:
# Switch into the repo's "Data Files" folder; the input and output paths used
# in later cells are relative and presumably resolve from here.
%cd Average_Weighted_Path_Vector/Data\ Files

In [6]:
from rdkit import Chem
from rdkit.Chem import rdmolops
import numpy as np
import networkx as nx
from collections import defaultdict
import pandas as pd
from tqdm import tqdm
from typing import Tuple
import os

In [7]:
def mol_from_smile(sm: str) -> "Chem.Mol | None":
    """Parse a SMILES string into an RDKit molecule.

    Args:
        sm: SMILES string. Non-string values (for example the ``NaN`` that
            pandas produces for an empty spreadsheet cell) are treated as
            unparseable.

    Returns:
        The parsed ``Chem.Mol``, or ``None`` when ``sm`` is not a string,
        RDKit cannot parse it, or the parser raises.
    """
    # Reject non-strings up front instead of relying on the parser to raise.
    if not isinstance(sm, str):
        return None
    try:
        # MolFromSmiles itself returns None for SMILES it cannot parse.
        return Chem.MolFromSmiles(sm)
    except Exception:
        # Still best-effort, but no longer a bare ``except``: KeyboardInterrupt
        # and SystemExit must be able to abort a long batch run.
        return None

def count_atom_types(mol: "Chem.Mol | None") -> dict[str, int]:
    """Count how many atoms of each symbol a molecule contains.

    Args:
        mol: RDKit molecule, or ``None`` (which is what ``mol_from_smile``
            returns for input it could not turn into a molecule).

    Returns:
        Mapping ``atom symbol -> number of atoms`` in first-seen order.
        An empty dict is returned for ``None`` so that a batch run is not
        aborted by a single unparseable structure.
    """
    if mol is None:
        return {}

    atom_counts = defaultdict(int)
    for atom in mol.GetAtoms():
        atom_counts[atom.GetSymbol()] += 1

    # Hand back a plain dict so callers do not get auto-inserting lookups.
    return dict(atom_counts)


def mol_to_nx_graph(mol : Chem.Mol) -> nx.Graph:
    """Convert an RDKit molecule to a NetworkX graph with bond weights.

    Args:
        mol: RDKit molecule to convert.

    Returns:
        Undirected graph whose nodes are atom indices and whose edges are the
        molecule's bonds. Each edge carries a ``weight`` attribute holding the
        value of ``bond.GetBondTypeAsDouble()``.
    """
    G = nx.Graph()

    # Register every atom first: atoms that take part in no bond (single-atom
    # molecules, isolated ions) would otherwise be missing from the graph
    # because nodes were only created implicitly through edges.
    G.add_nodes_from(range(mol.GetNumAtoms()))

    # add_weighted_edges_from stores the third tuple item under "weight".
    G.add_weighted_edges_from(
        (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(), bond.GetBondTypeAsDouble())
        for bond in mol.GetBonds()
    )
    return G


def get_max_distance_atom_pairs(mol : Chem.Mol) -> Tuple[int, np.ndarray]:
    """Get atom index pairs with the maximum topological distance.

    Args:
        mol: RDKit molecule.

    Returns:
        ``(max_dist, max_indices)`` where ``max_dist`` is the largest
        distance between two mutually reachable atoms and ``max_indices`` is
        the ``np.argwhere`` result (shape ``(k, 2)``) listing every
        ``(row, col)`` position of the distance matrix that holds that value.
        For a molecule without atoms the result is ``(0, <empty (0, 2) array>)``.
    """
    dmat = rdmolops.GetDistanceMatrix(mol)

    # np.max raises on an empty array, so an atom-less molecule (e.g. parsed
    # from an empty SMILES string) needs its own answer.
    if dmat.size == 0:
        return 0, np.empty((0, 2), dtype=np.intp)

    # A shortest path between two connected atoms has at most n_atoms - 1
    # bonds. Anything larger is the very large placeholder RDKit stores for
    # atom pairs in different fragments and must not count as a distance.
    reachable = dmat < dmat.shape[0]

    # The diagonal is always reachable, so this selection is never empty.
    max_dist = int(dmat[reachable].max())
    max_indices = np.argwhere(reachable & (dmat == max_dist))
    return max_dist, max_indices


def compute_avg_weighted_path_vector(mol: "Chem.Mol | None") -> list[float]:
    """Compute the averaged weighted path vector descriptor of a molecule.

    Every atom that takes part in a maximum-topological-distance pair is used
    as a start atom. For each start atom, the edge weights along every
    shortest path to every other reachable atom are summed and accumulated by
    path length, giving a vector whose entry ``k`` (0-based) is the total
    weight of all shortest paths of length ``k + 1``. The per-start-atom
    vectors are zero-padded to a common length and averaged element-wise.

    Args:
        mol: RDKit molecule, or ``None`` for a structure that failed to parse.

    Returns:
        The averaged vector as a plain list of floats. An empty list is
        returned when ``mol`` is ``None`` or when no start atom yields any
        path (for example a molecule without bonds).
    """
    # Unparseable structures arrive as None; give them an empty descriptor
    # instead of failing with AttributeError in the middle of a batch.
    if mol is None:
        return []

    G = mol_to_nx_graph(mol)
    _, max_pairs = get_max_distance_atom_pairs(mol)

    if max_pairs.size == 0:
        return []

    # Unique atoms involved in the longest paths, as plain ints and in a
    # fixed order so runs are reproducible.
    max_dist_atoms = sorted({int(idx) for idx in max_pairs.ravel()})
    path_vectors = []

    for start_atom in max_dist_atoms:
        len_to_total_weight = defaultdict(float)

        try:
            lengths = dict(nx.single_source_shortest_path_length(G, start_atom))
        except nx.NodeNotFound:
            continue  # atom not in graph

        for end_atom, path_len in lengths.items():
            if start_atom == end_atom or path_len == 0:
                continue
            try:
                for path in nx.all_shortest_paths(G, source=start_atom, target=end_atom):
                    # Consecutive nodes of a path are always joined by an edge.
                    len_to_total_weight[path_len] += sum(
                        (G[u][v]['weight'] for u, v in zip(path, path[1:])),
                        0.0,
                    )
            except (nx.NetworkXNoPath, nx.NodeNotFound):
                continue

        if not len_to_total_weight:
            continue

        max_len = max(len_to_total_weight)
        path_vectors.append(
            [len_to_total_weight.get(i, 0.0) for i in range(1, max_len + 1)]
        )

    if not path_vectors:
        return []

    # Start atoms can reach different maximum lengths; pad with zeros so the
    # vectors can be stacked and averaged.
    max_vector_len = max(len(vec) for vec in path_vectors)
    padded_vectors = [
        vec + [0.0] * (max_vector_len - len(vec)) for vec in path_vectors
    ]

    # Average across all start atoms.
    return np.mean(padded_vectors, axis=0).tolist()

In [12]:
# Excel workbook that is read one sheet at a time by the processing cell
# (path is relative to the current working directory).
input_file = "Excel Files/Zang_Data.xlsx"

# Sheet names to process.
# NOTE: this name shadows the builtin ``property``; it is kept because the
# processing cell iterates over it under exactly this name.
property = [
    "Log VP",
    "MP",
    "BP",
    "LogBCF",
    "LogS",
    "LogP",
]

# Folder that receives the generated parquet files; created if missing.
output_dir = "Descriptors Data"
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Register ``progress_apply`` on pandas objects so each pass shows a tqdm bar.
tqdm.pandas()

for prop in property:
    # Each entry of ``property`` names one sheet of the workbook.
    df = pd.read_excel(input_file, sheet_name=prop)
    df['Preferred_name'] = df['Preferred_name'].astype(str)

    print(f"\nProcessing SMILES from sheet:{prop} .... \n")

    # Keep this order: the new columns are appended to the right of the
    # sheet's own columns, and the positional slice below depends on that.
    df['mol'] = df['SMILES'].progress_apply(mol_from_smile)
    df['Desc'] = df['mol'].progress_apply(compute_avg_weighted_path_vector)
    df['AtomsC'] = df['mol'].progress_apply(count_atom_types)

    # Leading columns that accompany every exported descriptor table.
    # NOTE(review): assumes the first nine columns are the sheet's own
    # metadata columns - confirm against the workbook layout.
    leading_columns = df.iloc[:, :9]

    # --- path descriptors: one column per vector position, gaps become 0 ---
    desc_list = df['Desc'].tolist()
    Desc_df = pd.DataFrame(desc_list)
    Desc_df.columns = [f"{prop}_{i}" for i in range(Desc_df.shape[1])]
    Desc_df = Desc_df.fillna(0)

    output_file = os.path.join(output_dir, f"{prop}_Desc_Paths.parquet")
    paths_table = pd.concat([leading_columns, Desc_df], axis=1)
    paths_table.to_parquet(output_file, index=False)
    print(f"\nThe process of sheet:{prop} is done! and the result is saved to the file {output_file}.")

    # --- atom counts: one column per atom symbol, absent symbols become 0 ---
    desc_list_atoms_count = df['AtomsC'].tolist()
    Desc_Atoms_df = pd.DataFrame(desc_list_atoms_count)
    Desc_Atoms_df = Desc_Atoms_df.fillna(0)

    output_file = os.path.join(output_dir, f"{prop}_Desc_Atoms_Count.parquet")
    atoms_table = pd.concat([leading_columns, Desc_Atoms_df], axis=1)
    atoms_table.to_parquet(output_file, index=False)
    print(f"The process of sheet:{prop} is done! and the result is saved to the file {output_file}.")

    # --- both descriptor sets side by side ---
    Final_Desc_df = pd.concat([Desc_df, Desc_Atoms_df], axis=1)

    output_file = os.path.join(output_dir, f"{prop}_Desc_Full.parquet")
    full_table = pd.concat([leading_columns, Final_Desc_df], axis=1)
    full_table.to_parquet(output_file, index=False)
    print(f"The process of sheet:{prop} is done! and the result is saved to the file {output_file}.\n")

print(f'\nAll sheets have been processed successfully!')

## To inspect the values of some computed descriptors

In [None]:
# Load the combined descriptor file (path descriptors plus atom counts) that
# the processing cell writes for the "MP" sheet.
desc_data = pd.read_parquet('Descriptors Data/MP_Desc_Full.parquet')
# Left as the cell's last expression so the notebook displays the first rows.
desc_data.head()