In [81]:
import pandas as pd
from rdkit import Chem

In [82]:
def read_file_lines(file_path):
    """
    Reads a text file and returns its lines with surrounding whitespace removed.

    Parameters:
        file_path (str): Path of the text file to read.

    Returns:
        list: One stripped string per line of the file (blank lines become '').
        If the file is missing or cannot be read, an error message is printed
        and whatever was collected so far (possibly nothing) is returned.
    """
    stripped_lines = []
    try:
        with open(file_path, 'r') as handle:
            # The file is consumed lazily, line by line; strip() drops the
            # trailing newline along with any leading/trailing whitespace.
            stripped_lines.extend(raw.strip() for raw in handle)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except IOError:
        print(f"Error: Unable to read from file '{file_path}'.")
    return stripped_lines


def count_compounds_with_cycles(dataframe):
    """
    Tallies how many entries of a DataFrame's 'SMILES' column describe a
    molecule that contains at least one ring.

    Parameters:
        dataframe (pandas.DataFrame): Frame with a 'SMILES' column of SMILES strings.

    Returns:
        int: Number of rows whose SMILES parses to a molecule with one or more rings.
        Entries for which Chem.MolFromSmiles gives back a falsy result are not counted.
    """
    def _has_ring(smiles):
        # Parse first; a falsy parse result means there is nothing to inspect.
        parsed = Chem.MolFromSmiles(smiles)
        if not parsed:
            return False
        return parsed.GetRingInfo().NumRings() > 0

    return sum(1 for smiles in dataframe['SMILES'] if _has_ring(smiles))


In [83]:
# Locations of the per-source text dumps (one WLN string per line).
chembl_path = "../data/chembl_wln_only.txt"
pubchem_path = "../data/pubchem_wln_only.txt"
chemspider_path = "../data/chemspider_wln_only.txt"
book_path = "../data/smith_wln_only.txt"

# Load every dump into its own single-column frame, in the same order as the
# paths are listed; a source that cannot be read yields an empty frame because
# read_file_lines returns an empty list in that case.
chembl, pubchem, chemspider, book = (
    pd.DataFrame(read_file_lines(source_path), columns=["WLN"])
    for source_path in (chembl_path, pubchem_path, chemspider_path, book_path)
)

# Per-source row counts.
print(f"{len(chembl)} compounds in chembl")
print(f"{len(pubchem)} compounds in pubchem")
print(f"{len(chemspider)} compounds in chemspider")
print(f"{len(book)} compounds in encoding book")

print()
# Combined size across all four sources (no de-duplication is done here).
pre_total = sum(len(frame) for frame in (chembl, chemspider, pubchem, book))
print(f"{pre_total} total compounds")

2935 compounds in chembl
6589 compounds in pubchem
15942 compounds in chemspider
422 compounds in encoding book

25888 total compounds
