<a href="https://colab.research.google.com/github/Luis-Menezes/CI-Chemistry/blob/main/handson_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Computational Intelligence in Chemistry - Hands-on dataset analysis

## Needed packages

In [25]:
!pip install --quiet rdkit

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.1/36.1 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [27]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from rdkit import Chem

## 1. Load and unpack QM9 dataset

In [1]:
!wget -O data.xyz.tar.bz2 https://www.dropbox.com/scl/fi/2ugqxr9fa9nob1byc8ura/dsgdb9nsd.xyz.tar.bz2?rlkey=pp2k6fy4360yldrypwghwbi6d&st=1cohswqh&dl=0

--2025-09-18 16:05:53--  https://www.dropbox.com/scl/fi/2ugqxr9fa9nob1byc8ura/dsgdb9nsd.xyz.tar.bz2?rlkey=pp2k6fy4360yldrypwghwbi6d
Resolving www.dropbox.com (www.dropbox.com)... 162.125.1.18, 2620:100:6016:18::a27d:112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.1.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://ucecb17112347310ba933353761e.dl.dropboxusercontent.com/cd/0/inline/CxmoluCoJ9NjwR6Dhy0ct0cNMH1u472kLLRsA4czVa7XqCwafO9ovdp4gWp44MWDrSGhVoGhhe7Y9HCggg7u7kN6CYSVSSmrbpDBu8CP8DnQYDbKAWo1lv-TEJ2Y9YQE8lmIAhZDo1Z3Htar6KGRZ0Yz/file# [following]
--2025-09-18 16:05:53--  https://ucecb17112347310ba933353761e.dl.dropboxusercontent.com/cd/0/inline/CxmoluCoJ9NjwR6Dhy0ct0cNMH1u472kLLRsA4czVa7XqCwafO9ovdp4gWp44MWDrSGhVoGhhe7Y9HCggg7u7kN6CYSVSSmrbpDBu8CP8DnQYDbKAWo1lv-TEJ2Y9YQE8lmIAhZDo1Z3Htar6KGRZ0Yz/file
Resolving ucecb17112347310ba933353761e.dl.dropboxusercontent.com (ucecb17112347310ba933353761e.dl.dropboxusercontent.com)... 162.125.1.1

In [2]:
!mkdir qm9_files
!tar -xjf data.xyz.tar.bz2 -C qm9_files/

## 2. Open XYZ files and extract the properties and SMILES

In [3]:
qm9_folder = 'qm9_files/'
with open(qm9_folder+'dsgdb9nsd_113885.xyz') as file:
  for line in file:
    print(line)

19

gdb 113885	3.70802	0.80999	0.7081	2.6975	73.65	-0.2639	0.0141	0.278	1593.5236	0.156316	-460.174889	-460.164623	-460.163679	-460.212278	35.326	

C	-0.008898706	 1.435343511	-0.1229342875	-0.480889

C	-0.097211668	-0.0611740136	 0.0533750899	 0.431334

O	-0.522805097	-0.6329577977	 1.0223608396	-0.34442

O	 0.3874153201	-0.692641184	-1.0454419183	-0.247509

C	 0.3502508225	-2.1099666389	-1.0216443391	 0.118249

C	-0.6452778548	-2.8049992501	-1.9010293635	-0.305338

C	 0.8299955957	-2.806393386	-2.257736521	-0.04378

C	 1.3090797685	-2.0281448361	-3.4552388748	-0.093611

O	 1.0953742585	-2.8320000843	-4.6085691448	-0.419869

H	-0.3728027153	 1.9278328937	 0.7777832851	 0.149542

H	 1.0248947898	 1.7315350968	-0.3210999853	 0.149596

H	-0.6070917791	 1.7459702376	-0.9844780335	 0.149165

H	 0.576972916	-2.5190329042	-0.0453019279	 0.12476

H	-1.1102986284	-3.7153217605	-1.5406769745	 0.122235

H	-1.2860640388	-2.1730656326	-2.5065209428	 0.123942

H	 1.3703384329	-3.7344941519	-2.09474

In [47]:
def load_validate_mol(file):
  with open(file, 'r') as file:
    n_atoms = int(file.readline())

    properties_list = file.readline().split()
    properties = [float(prop) for prop in properties_list[2:]]
    prop_dict = {""}
    # print(properties)
    atoms = []
    for nlines in range(n_atoms):
      atom = file.readline().split()
      atoms.append(atom[0]) # get only the 'C'

      # Since we doesn't need coordinates for this assignment, we will ignore it for now
      # atom_coords = atom[1:3]
      # atom_coords = [float(coord) for coord in atom_coords]
      # print(atom)
    # print(atoms)
    file.readline() # ignores the frequencies

    smiles_list = file.readline().split()

    mol = Chem.MolFromSmiles(smiles_list[0]) # If it's None then smiles is invalid
    if mol is None:
        print("Invalid smiles for arquive: ", properties_list[1])
        return
    true_smiles = Chem.CanonSmiles(smiles_list[0])

    return n_atoms, atoms, properties, true_smiles


def get_full_dataset(folder):
  dataset_list = []
  # print(f"Lendo arquivos da pasta: {folder}")

  files_to_process = [f for f in os.listdir(folder) if f.endswith('.xyz')]
  for i, file in enumerate(files_to_process):

      if (i + 1) % 10000 == 0:
          print(f"  Processando arquivo {i+1}/{len(files_to_process)}...")

      file_path = os.path.join(folder, file)

      # Chama a função de processamento para um arquivo
      molecule_data = load_validate_mol(file_path)

      # Se a função retornou dados válidos (não None), adiciona à lista
      if molecule_data:
          dataset_list.append(molecule_data)
  return dataset_list




if __name__ == '__main__':
  full_dataset = get_full_dataset(qm9_folder)

  df = pd.DataFrame(full_dataset)
  # load_validate_mol(qm9_folder+'dsgdb9nsd_113885.xyz')

  Processando arquivo 10000/133885...
  Processando arquivo 20000/133885...
  Processando arquivo 30000/133885...
  Processando arquivo 40000/133885...
  Processando arquivo 50000/133885...
  Processando arquivo 60000/133885...
  Processando arquivo 70000/133885...
  Processando arquivo 80000/133885...
  Processando arquivo 90000/133885...
  Processando arquivo 100000/133885...
  Processando arquivo 110000/133885...
  Processando arquivo 120000/133885...
  Processando arquivo 130000/133885...


In [50]:
df.explode()

Unnamed: 0,0,1,2,3
0,16,"[O, C, O, C, C, N, C, C, C, H, H, H, H, H, H, H]",3.53937,O=C1OC2CN3CC2C13
0,16,"[O, C, O, C, C, N, C, C, C, H, H, H, H, H, H, H]",1.71866,O=C1OC2CN3CC2C13
0,16,"[O, C, O, C, C, N, C, C, C, H, H, H, H, H, H, H]",1.54859,O=C1OC2CN3CC2C13
0,16,"[O, C, O, C, C, N, C, C, C, H, H, H, H, H, H, H]",4.5699,O=C1OC2CN3CC2C13
0,16,"[O, C, O, C, C, N, C, C, C, H, H, H, H, H, H, H]",67.19,O=C1OC2CN3CC2C13
...,...,...,...,...
133884,14,"[N, C, N, C, O, C, N, C, O, H, H, H, H, H]",-470.072229,Nc1nc(O)c(C=O)[nH]1
133884,14,"[N, C, N, C, O, C, N, C, O, H, H, H, H, H]",-470.064402,Nc1nc(O)c(C=O)[nH]1
133884,14,"[N, C, N, C, O, C, N, C, O, H, H, H, H, H]",-470.063458,Nc1nc(O)c(C=O)[nH]1
133884,14,"[N, C, N, C, O, C, N, C, O, H, H, H, H, H]",-470.104336,Nc1nc(O)c(C=O)[nH]1
