In [None]:
import rdkit
import pandas as pd

In [None]:
# Load the QM9 table from a CSV in the working directory; later cells read its 'smiles' column.
df_QM9 = pd.read_csv('qm9.csv')

In [None]:
# Load the ZINC table from a CSV in the working directory.
df_Zinc = pd.read_csv('250k_rndm_zinc_drugs_clean_3.csv')
# Presumably each SMILES in this file carries a trailing newline (TODO confirm).
# Strip surrounding whitespace instead of slicing off the last character, so a
# value that has no trailing newline is left intact rather than losing an atom.
df_Zinc['smiles'] = df_Zinc['smiles'].str.strip()

In [None]:
# Show the SMILES stored under index label 0 as a quick sanity check.
df_Zinc['smiles'].loc[0]

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import rdchem
from collections import Counter

def get_cycle_lengths(smiles):
    """Return the size of every ring RDKit reports for a molecule.

    Args:
        smiles: SMILES string to parse.

    Returns:
        A list with one integer per ring returned by ``Chem.GetSymmSSSR``
        (the number of atoms in that ring). Empty when no ring is reported.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending string instead of an opaque error further down.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
    return [len(ring) for ring in Chem.GetSymmSSSR(mol)]

def get_molecule_size(smiles):
    """Return the number of atoms in the molecule parsed from ``smiles``.

    Args:
        smiles: SMILES string to parse.

    Returns:
        The value of ``GetNumAtoms()`` on the parsed molecule. No hydrogens
        are added here, so only atoms present in the parsed graph are counted.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # A None result means the SMILES was rejected; report which one.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
    return mol.GetNumAtoms()

In [None]:
# QM9: ring sizes and atom count for every molecule.
df_QM9['CycleLengths'] = df_QM9['smiles'].map(get_cycle_lengths)
df_QM9['MolSize'] = df_QM9['smiles'].map(get_molecule_size)

# ZINC: the same two descriptors.
df_Zinc['CycleLengths'] = df_Zinc['smiles'].map(get_cycle_lengths)
df_Zinc['MolSize'] = df_Zinc['smiles'].map(get_molecule_size)

In [None]:
from rdkit.Chem import Draw

def plot_molecules(smiles_list, mols_per_row=10):
    """Render the given SMILES strings as one grid image.

    Each SMILES is parsed with RDKit and drawn in a 250x250 cell, with
    ``mols_per_row`` cells per row; the grid is requested as SVG.
    """
    parsed = list(map(Chem.MolFromSmiles, smiles_list))
    return Draw.MolsToGridImage(
        parsed,
        molsPerRow=mols_per_row,
        subImgSize=(250, 250),
        useSVG=True,
    )



In [None]:
def plot_histogram_for_cycle_length_over_mol_size(data, cycle_length, dataset_name="QM9"):
    """Plot a histogram of (rings of a given size) / (molecule size).

    Only molecules that contain at least one ring of ``cycle_length`` atoms
    contribute to the histogram.

    Args:
        data: DataFrame with a 'CycleLengths' column (list of ring sizes per
            molecule) and a 'MolSize' column (atom count per molecule).
        cycle_length: Ring size to count.
        dataset_name: Name inserted into the plot title. Defaults to "QM9",
            which reproduces the previously hard-coded title.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Count rings of the requested size once per molecule.
    ring_counts = data['CycleLengths'].apply(lambda lengths: lengths.count(cycle_length))

    # A molecule contains the ring exactly when that count is positive.
    has_ring = ring_counts > 0

    # Work on standalone Series rather than assigning a new column onto a
    # filtered slice, which triggers pandas' SettingWithCopyWarning.
    ratio = ring_counts[has_ring] / data.loc[has_ring, 'MolSize']

    fig, ax = plt.subplots()
    ratio.plot.hist(bins=20, ax=ax)
    ax.set_xlabel(f"Cycle Count / Molecule Size (Cycle Length: {cycle_length})")
    ax.set_ylabel("Frequency")
    ax.set_title(
        f"Distribution of Cycle Count / Molecule Size for Cycle Length {cycle_length} in {dataset_name} Dataset"
    )
    plt.show()

In [None]:
# Plot histograms for cycle length in different rows of the same figure
def plot_histograms(data):
    """Show a 3x2 grid of histograms of per-molecule ring counts.

    One panel per ring size from 3 to 8; each panel histograms how many rings
    of that size every molecule in ``data['CycleLengths']`` contains.
    """
    fig, axs = plt.subplots(3, 2, figsize=(10, 10))
    # axs.flat walks the grid row by row: sizes 3,4 on the first row, 5,6 on
    # the second, 7,8 on the third.
    for panel, ring_size in zip(axs.flat, range(3, 9)):
        per_molecule = data['CycleLengths'].apply(
            lambda lengths, size=ring_size: lengths.count(size)
        )
        panel.hist(per_molecule, bins=20)
        panel.set_title(f'Cycle Length {ring_size}')
    plt.show()


In [None]:
# Ring-count histograms (ring sizes 3 to 8) for the QM9 frame.
plot_histograms(df_QM9)

In [None]:
# Ring-count histograms (ring sizes 3 to 8) for the ZINC frame.
plot_histograms(df_Zinc)

In [None]:
# Plot the distribution of the type of atom in the molecules

def get_atom_counts(smiles):
    """Count the atoms of each element in the molecule parsed from ``smiles``.

    Args:
        smiles: SMILES string to parse.

    Returns:
        A ``Counter`` mapping element symbol (e.g. 'C', 'O') to the number of
        atoms with that symbol in the parsed molecule.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles returns None on a parse failure; name the bad input.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
    return Counter(atom.GetSymbol() for atom in mol.GetAtoms())

df_QM9['AtomCounts'] = df_QM9['smiles'].apply(get_atom_counts)

# Merge the per-molecule Counters directly. Expanding every row into its own
# pandas Series just to sum them afterwards is very slow on a dataset this
# size and gives the same totals.
qm9_atom_totals = Counter()
for molecule_counts in df_QM9['AtomCounts']:
    qm9_atom_totals.update(molecule_counts)

# Bars appear in the order each element was first seen.
pd.Series(qm9_atom_totals).plot.bar()




In [None]:
df_Zinc['AtomCounts'] = df_Zinc['smiles'].apply(get_atom_counts)

# Merge the per-molecule Counters directly. Expanding every row into its own
# pandas Series just to sum them afterwards is very slow on a dataset this
# size and gives the same totals.
zinc_atom_totals = Counter()
for molecule_counts in df_Zinc['AtomCounts']:
    zinc_atom_totals.update(molecule_counts)

# Bars appear in the order each element was first seen.
pd.Series(zinc_atom_totals).plot.bar()

In [None]:
def count_bonding_types(smiles):
    """Count the bonds of each RDKit bond type in the molecule.

    Args:
        smiles: SMILES string to parse.

    Returns:
        A ``Counter`` keyed by the value of ``bond.GetBondType()`` for every
        bond in the parsed molecule.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles returns None on a parse failure; name the bad input.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
    return Counter(bond.GetBondType() for bond in mol.GetBonds())


In [None]:
# Per-molecule bond-type counters for QM9.
df_QM9['BondingTypes'] = df_QM9['smiles'].map(count_bonding_types)

# Per-molecule bond-type counters for ZINC.
df_Zinc['BondingTypes'] = df_Zinc['smiles'].map(count_bonding_types)

In [None]:
# Sum the per-molecule bond-type Counters directly instead of expanding every
# row into its own pandas Series first, which is very slow at this row count.
qm9_bond_totals = Counter()
for molecule_bonds in df_QM9['BondingTypes']:
    qm9_bond_totals.update(molecule_bonds)

pd.Series(qm9_bond_totals).plot.bar()


In [None]:
# Sum the per-molecule bond-type Counters directly instead of expanding every
# row into its own pandas Series first, which is very slow at this row count.
zinc_bond_totals = Counter()
for molecule_bonds in df_Zinc['BondingTypes']:
    zinc_bond_totals.update(molecule_bonds)

pd.Series(zinc_bond_totals).plot.bar()

In [None]:
# plot size distribution of molecules

# Both histograms share one set of axes, so draw them semi-transparent with a
# legend and axis labels; otherwise the two datasets cannot be told apart.
fig, ax = plt.subplots()
df_QM9['MolSize'].plot.hist(bins=20, alpha=0.6, label='QM9', ax=ax)
df_Zinc['MolSize'].plot.hist(bins=20, alpha=0.6, label='ZINC', ax=ax)
ax.set_xlabel('Molecule size (RDKit atom count)')
ax.set_ylabel('Number of molecules')
ax.set_title('Molecule size distribution')
ax.legend();

In [None]:


def get_distribution_bonds(smiles, atom_type):
    """Count, by RDKit bond type, the bonds that involve an ``atom_type`` atom.

    A bond is counted once if either of its two atoms has the symbol
    ``atom_type``. Which atom RDKit stores as the "begin" atom depends on the
    atom order in the SMILES, so checking only that end would miss bonds such
    as the C-O bond in "CO" when asking for 'O'.

    Hydrogens contribute only if they are explicit atoms in the parsed
    molecule; this function does not add them.

    Args:
        smiles: SMILES string to parse.
        atom_type: Element symbol to look for, e.g. 'O'.

    Returns:
        A ``Counter`` keyed by the value of ``bond.GetBondType()``.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles returns None on a parse failure; name the bad input.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
    return Counter(
        bond.GetBondType()
        for bond in mol.GetBonds()
        if atom_type in (bond.GetBeginAtom().GetSymbol(), bond.GetEndAtom().GetSymbol())
    )
    



def plot_distribution_neighbors_for_each_atom_type(data):
    """Plot, per element, how many bonds of each bond type were counted.

    For each of C, O, N, S and F, the bond-type counters returned by
    ``get_distribution_bonds`` are summed over every SMILES in ``data`` and
    drawn as a bar chart in a 3x2 grid.

    Args:
        data: DataFrame with a 'smiles' column.

    Returns:
        ``data``, unmodified. Totals are accumulated locally, so the caller's
        'AtomCounts' column (per-element atom counts) is no longer overwritten
        with bond counts.
    """
    atom_list = ['C', 'O', 'N', 'S', 'F']
    # Integer codes of RDKit's BondType enum, in display order:
    # 1 = SINGLE, 12 = AROMATIC, 2 = DOUBLE, 3 = TRIPLE.
    possible_values = [1, 12, 2, 3]
    fig, axs = plt.subplots(3, 2, figsize=(10, 10))

    for i, atom in enumerate(atom_list):
        # Accumulate under plain-int keys so the lookup against
        # possible_values does not depend on how the enum hashes or compares.
        totals = Counter()
        for smiles in data['smiles']:
            for bond_type, count in get_distribution_bonds(smiles, atom).items():
                totals[int(bond_type)] += count

        # Bond counts in the order given by possible_values (0 when absent).
        height = [totals.get(value, 0) for value in possible_values]

        ax = axs[i // 2, i % 2]  # row-major position of this element's panel
        ax.bar(possible_values, height)
        ax.set_title(f'Distribution of Number of Neighbors for {atom}')
        ax.set_xticks(possible_values)  # tick exactly at the bond-type codes

    # The grid has more panels than elements; hide the ones left empty.
    for unused in axs.ravel()[len(atom_list):]:
        unused.set_visible(False)

    fig.tight_layout()
    plt.show()
    return data

In [None]:
df_QM9['BondingTypes_ox'] = df_QM9['smiles'].apply(lambda x: get_distribution_bonds(x, atom_type='O'))

# Sum the per-molecule Counters directly instead of expanding every row into
# its own pandas Series first, which is very slow at this row count.
qm9_oxygen_bond_totals = Counter()
for molecule_bonds in df_QM9['BondingTypes_ox']:
    qm9_oxygen_bond_totals.update(molecule_bonds)

pd.Series(qm9_oxygen_bond_totals).plot.bar()


In [None]:
# Draw the first molecule that has a bond of a given type (e.g. 12, aromatic) on a given atom type (e.g. 'O')

def draw_molecule_with_bond_type(data, atom_type, bond_type):
    """Draw the first molecule containing a ``bond_type`` bond on an ``atom_type`` atom.

    Molecules are scanned in the order of ``data['smiles']``. A bond matches
    when its type equals ``bond_type`` and either of its two atoms has the
    symbol ``atom_type``. Which atom RDKit stores as "begin" depends on the
    SMILES atom order, so both ends are checked. The SMILES of the first match
    is printed.

    Args:
        data: DataFrame with a 'smiles' column.
        atom_type: Element symbol, e.g. 'O'.
        bond_type: Value to compare against ``bond.GetBondType()``.

    Returns:
        The image from ``Draw.MolToImage`` with the matching bond and its two
        atoms highlighted, or None if no molecule matches.
    """
    for smiles in data['smiles']:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparsable SMILES cannot match; keep searching the other rows.
            continue
        for bond in mol.GetBonds():
            if bond.GetBondType() != bond_type:
                continue
            end_symbols = (bond.GetBeginAtom().GetSymbol(), bond.GetEndAtom().GetSymbol())
            if atom_type in end_symbols:
                print(smiles)
                return Draw.MolToImage(mol, highlightAtoms=[bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()], highlightBonds=[bond.GetIdx()], useSVG=True)
    return None

In [None]:
# Per-element bond-type bar charts for QM9; the function's return value is kept in `data`.
data = plot_distribution_neighbors_for_each_atom_type(df_QM9)

In [None]:
# Per-element bond-type bar charts for ZINC; rebinds `data` to this call's return value.
data = plot_distribution_neighbors_for_each_atom_type(df_Zinc)

In [None]:
# Plot the number of oxygen at the center of the molecule and at the end of the molecule

def get_oxygen_double_bonds(data):
    """Return the share of oxygen atoms without / with exactly two neighbours.

    Despite the name, no bond orders are inspected. Every oxygen atom is
    classified only by ``len(atom.GetNeighbors())``: exactly two neighbours
    versus any other number. An oxygen with one neighbour is therefore put in
    the first group whether that bond is single or double.

    Args:
        data: DataFrame with a 'smiles' column.

    Returns:
        A tuple ``(fraction_not_two_neighbours, fraction_two_neighbours)`` over
        all oxygen atoms in all molecules. ``(0.0, 0.0)`` when the data
        contains no oxygen atom at all.

    Raises:
        ValueError: If RDKit cannot parse one of the SMILES strings.
    """
    two_neighbours = 0
    other = 0
    for smiles in data['smiles']:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles returns None on a parse failure; name the bad input.
            raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
        for atom in mol.GetAtoms():
            if atom.GetSymbol() != 'O':
                continue
            if len(atom.GetNeighbors()) == 2:
                two_neighbours += 1
            else:
                other += 1

    total = two_neighbours + other
    if total == 0:
        # No oxygen anywhere: avoid dividing by zero.
        return 0.0, 0.0
    return other / total, two_neighbours / total


In [None]:
# A notebook cell only displays its last expression, so two bare calls would
# silently drop the first result. Collect both so each dataset's fractions show.
{
    'QM9': get_oxygen_double_bonds(df_QM9),
    'ZINC': get_oxygen_double_bonds(df_Zinc),
}

In [None]:
# NOTE(review): this bare name only echoes the function object; nothing is called here.
# Looks like a leftover cell; confirm whether it can be removed.
get_oxygen_double_bonds