In [1]:
import h5py

with h5py.File('alkanes_data_500.hdf5', "r") as f:
    sorted_keys = sorted(f.keys(), key=lambda x: int(x.split('_')[1]))
    print(sorted_keys[-1])
    print(f[sorted_keys[-1]].get('atomic_numbers')[()])
    print(f"nat: {len(f[sorted_keys[-1]].get('atomic_numbers')[()])}")
    print(f"number of C atoms: {len([x for x in f[sorted_keys[-1]].get('atomic_numbers')[()] if x == 6])}")
    print(f"shape of coordinates: {f[sorted_keys[-1]].get('coordinates').shape}")

alkane_355_carbons
[6 6 6 ... 1 1 1]
nat: 1067
number of C atoms: 355
shape of coordinates: (1067, 3)


In [25]:
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import AllChem

# Function to generate alkane chains with hydrogens added
def generate_alkane_chain(length, max_iters=100):
    smiles = "C" * length  # Create a string of 'C's for the desired alkane length
    alkane = Chem.MolFromSmiles(smiles)
    alkane = Chem.AddHs(alkane)  # Add hydrogens to satisfy valency
    AllChem.EmbedMolecule(alkane, useRandomCoords=True)  # Generate 3D coordinates
    # AllChem.UFFOptimizeMolecule(alkane, maxIters=max_iters)  # Optimize 3D structure
    AllChem.UFFOptimizeMolecule(alkane)  # Optimize 3D structure
    return alkane

# Generate a list of alkanes from methane to decane
# alkane_chain_list = [generate_alkane_chain(i) for i in range(1, 11)]  # Lengths 1 to 10
alkane_chain_list = []
for i in tqdm(range(55, 100)):
    alkane_chain_list.append(generate_alkane_chain(i))

for i, alkane in enumerate(alkane_chain_list):
    print(f"Alkane with {len(alkane.GetAtoms())} atoms")

# Display atom positions and element numbers for each alkane
# for i, alkane in enumerate(alkane_chain_list, 1):
#     print(f"\nAlkane with {i} carbons:")
#     conf = alkane.GetConformer()
#     atom_positions = [conf.GetAtomPosition(atom.GetIdx()) for atom in alkane.GetAtoms()]
#     element_numbers = [atom.GetAtomicNum() for atom in alkane.GetAtoms()]
#     print("Element numbers:", element_numbers)

#     # Print positions and element numbers including hydrogens
#     for idx, (pos, elem_num) in enumerate(zip(atom_positions, element_numbers)):
#         print(f"Atom {idx+1}: Element {elem_num}, Position ({pos.x:.3f}, {pos.y:.3f}, {pos.z:.3f})")


100%|██████████| 45/45 [01:24<00:00,  1.89s/it]

Alkane with 167 atoms
Alkane with 170 atoms
Alkane with 173 atoms
Alkane with 176 atoms
Alkane with 179 atoms
Alkane with 182 atoms
Alkane with 185 atoms
Alkane with 188 atoms
Alkane with 191 atoms
Alkane with 194 atoms
Alkane with 197 atoms
Alkane with 200 atoms
Alkane with 203 atoms
Alkane with 206 atoms
Alkane with 209 atoms
Alkane with 212 atoms
Alkane with 215 atoms
Alkane with 218 atoms
Alkane with 221 atoms
Alkane with 224 atoms
Alkane with 227 atoms
Alkane with 230 atoms
Alkane with 233 atoms
Alkane with 236 atoms
Alkane with 239 atoms
Alkane with 242 atoms
Alkane with 245 atoms
Alkane with 248 atoms
Alkane with 251 atoms
Alkane with 254 atoms
Alkane with 257 atoms
Alkane with 260 atoms
Alkane with 263 atoms
Alkane with 266 atoms
Alkane with 269 atoms
Alkane with 272 atoms
Alkane with 275 atoms
Alkane with 278 atoms
Alkane with 281 atoms
Alkane with 284 atoms
Alkane with 287 atoms
Alkane with 290 atoms
Alkane with 293 atoms
Alkane with 296 atoms
Alkane with 299 atoms





In [7]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

# Function to generate an alkane in a zigzag pattern
def generate_zigzag_alkane(length, bond_length=1.54, angle_deg=109.5):
    mol = Chem.RWMol()  # Create an editable molecule
    carbon_atom = Chem.Atom(6)  # Atomic number for carbon
    
    # Add the first carbon atom
    mol.AddAtom(carbon_atom)
    positions = [(0, 0, 0)]  # Initial position for the first atom
    
    # Calculate angle and bond vector for zigzag pattern
    angle_rad = np.radians(angle_deg)
    bond_vector = np.array([bond_length, 0, 0])  # Initial bond along the x-axis
    
    # Add subsequent carbons in zigzag
    for i in range(1, length):
        if i % 2 == 0:
            bond_vector = np.array([bond_length * np.cos(angle_rad), bond_length * np.sin(angle_rad), 0])
        else:
            bond_vector = np.array([bond_length * np.cos(angle_rad), -bond_length * np.sin(angle_rad), 0])
        
        # Calculate new position by adding bond vector to previous position
        new_position = np.array(positions[-1]) + bond_vector
        positions.append(tuple(new_position))  # Save new position
        
        # Add a new carbon atom to the molecule
        mol.AddAtom(carbon_atom)
        mol.AddBond(i - 1, i, Chem.BondType.SINGLE)
    
    # Update properties before adding hydrogens
    mol.UpdatePropertyCache(strict=False)
    mol = mol.GetMol()  # Convert to a regular RDKit molecule
    mol = Chem.AddHs(mol)  # Add hydrogens to satisfy carbon valency
    
    # Embed positions into RDKit molecule with hydrogens
    conf = Chem.Conformer(mol.GetNumAtoms())  # Update conformer to match all atoms, including hydrogens
    for i, pos in enumerate(positions):
        conf.SetAtomPosition(i, pos)
    
    # Add the conformer to the molecule
    mol.AddConformer(conf)
    
    return mol, positions

# Generate an alkane with 100 carbons
alkane, positions = generate_zigzag_alkane(1000)

# Display some positions for verification
for i, pos in enumerate(positions[:10]):  # Print first 10 positions for a quick check
    print(f"Atom {i + 1}: Position {pos}")
print(f"\nGenerated alkane with {alkane.GetNumAtoms()} atoms (including hydrogens).")


Atom 1: Position (0, 0, 0)
Atom 2: Position (-0.5140625632200073, -1.4516678962819547, 0.0)
Atom 3: Position (-1.0281251264400146, 0.0, 0.0)
Atom 4: Position (-1.542187689660022, -1.4516678962819547, 0.0)
Atom 5: Position (-2.0562502528800293, 0.0, 0.0)
Atom 6: Position (-2.570312816100037, -1.4516678962819547, 0.0)
Atom 7: Position (-3.0843753793200444, 0.0, 0.0)
Atom 8: Position (-3.598437942540052, -1.4516678962819547, 0.0)
Atom 9: Position (-4.112500505760059, 0.0, 0.0)
Atom 10: Position (-4.626563068980067, -1.4516678962819547, 0.0)

Generated alkane with 3002 atoms (including hydrogens).
