In [1]:
! pip install rdkit



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [2]:
import pandas as pd
import numpy as np
from rdkit import Chem


In [3]:
#Import df
df_train = pd.read_csv("../Data/Clean_dataframe.csv")

#Create smiles to delete
df_train["smiles_to_delete"] = df_train["smiles"]


1 - We create one column per element

In [4]:
# There is no He, Li, Be, Ne, Ar, K, Sc, V, Mn, Fe, Ni, Cu, Kr, Rb, Sr, Y,
# Zr, Nb, Mo, Tc, Ru, Rh, Pd (and after) in the dataset

#One column per element
df_train["Number of B"] = 0
df_train["Number of C"] = 0
df_train["Number of N"] = 0
df_train["Number of O"] = 0
df_train["Number of F"] = 0
df_train["Number of Si"] = 0
df_train["Number of P"] = 0
df_train["Number of S"] = 0
df_train["Number of Cl"] = 0
df_train["Number of Br"] = 0
df_train["Number of I"] = 0

#In the dataset but very few amount (so drop these lines):
#Na (2 lines) Mg (1 line) Al (1 line) Ca (1 line) Ti (1 line) Cr (1 line)
# [Co] (1 line) Zn (1 line) Ge (1 line) As (4 lines) Se (7 lines) Ag (1 line)
# [Hg] (2 lines)


2 - We remove in "smiles_to_delete", the elements that appears less than 10 times in the dataset:

In [5]:
element_to_remove = ["Na", "Mg", "Al", "Ca", "Ti", "Cr", "[Co]", "Zn", "Ge",
                     "As", "Se", "Ag", "[Hg]"]

indices_to_remove = []

for index, row in df_train.iterrows():
    for element in element_to_remove:
        if element in row["smiles_to_delete"]:
            indices_to_remove.append(index)

# Delete the corresponding rows
df_train.drop(indices_to_remove, inplace=True)

# Reset index
df_train.reset_index(drop=True, inplace=True)


3 - We remove the special characters

In [6]:
char_to_remove = ["(", ")", "=", "1", "2", "3", "4", "5", "6", "7", "8", "9",
                  "0", "@", "[", "]", "-", "+", "H", "/", "#", ".", "\\", "%",
                  " "]

for index, row in df_train.iterrows():
    cleaned_smiles = ''.join([char for char in row["smiles_to_delete"] if char not in char_to_remove])
    df_train.loc[index, "smiles_to_delete"] = cleaned_smiles


4- We remove the important elements from smiles_to_delete and add the count in the column

In [7]:
element_to_count = ["Br", "B", "Cl", "C", "N", "O", "F", "Si", "P", "S", "I"]
# The order matters

for index, row in df_train.iterrows():
    for element in element_to_count:
        count = row["smiles_to_delete"].count(element.lower()) + \
            row["smiles_to_delete"].count(element)
        df_train.loc[index, "Number of " + element] = count
        row["smiles_to_delete"] = row["smiles_to_delete"].\
            replace(element.lower(), "").replace(element, "")

    df_train.loc[index, "smiles_to_delete"] = row["smiles_to_delete"]


5- Let's count the number and types of bond (Not H)

In [8]:
# Function for counting bonds by type
def count_bond_types(smiles):
    mol = Chem.MolFromSmiles(smiles)
    bond_counts = {}

    if mol:
        for bond in mol.GetBonds():
            begin_atom = bond.GetBeginAtom().GetSymbol()
            end_atom = bond.GetEndAtom().GetSymbol()
            bond_type = bond.GetBondTypeAsDouble()

            bond_key = f"{begin_atom}-{end_atom} ({bond_type})"
            if bond_key in bond_counts:
                bond_counts[bond_key] += 1
            else:
                bond_counts[bond_key] = 1

    return bond_counts

# Apply the function count_bond_types to every smiles
df_train['Bond Counts'] = df_train['smiles'].apply(count_bond_types)


[18:06:18] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 22 23 24
[18:06:18] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 7 8 10 11 12 13 15 16 17 19
[18:06:18] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 6 7 8 9 11 12 13 15 16 17 18
[18:06:18] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 21 22 23
[18:06:18] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 20 21 22
[18:06:18] Explicit valence for atom # 20 C, 5, is greater than permitted
[18:06:19] Can't kekulize mol.  Unkekulized atoms: 16 17 18 19 20 21 22 23 24
[18:06:19] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 17 23
[18:06:19] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6
[18:06:19] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5
[18:06:19] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8
[18:06:19] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
[18:06:19] Can't kekulize mol.  Unkekulized atoms: 0 1 2 

In [9]:
df_train['Bond Counts'] = df_train['Bond Counts'].apply(lambda x: {k: x.get(k, 0) for k in set().union(*df_train['Bond Counts'])})
df_train_bondtomerge = pd.DataFrame(df_train['Bond Counts'].to_list())

df_train = df_train.merge(df_train_bondtomerge, left_index=True, right_index=True)


6-Let's count the number of bond involving H

In [10]:
ch3_pattern = Chem.MolFromSmarts('[CH3]')
ch2_pattern = Chem.MolFromSmarts('[CH2]')
oh_pattern = Chem.MolFromSmarts('[OH]')
ch_pattern = Chem.MolFromSmarts('[CH]')
ch_pattern_ar = Chem.MolFromSmarts('[cH]')
bh_pattern = Chem.MolFromSmarts('[BH]')
bh_pattern_ar = Chem.MolFromSmarts('[bH]')
bh2_pattern = Chem.MolFromSmarts('[BH2]')
bh3_pattern = Chem.MolFromSmarts('[BH3]')
nh_pattern = Chem.MolFromSmarts('[NH]')
nh_pattern_ar = Chem.MolFromSmarts('[nH]')
nh2_pattern = Chem.MolFromSmarts('[NH2]')
nh3_pattern = Chem.MolFromSmarts('[NH3]')
sih_pattern = Chem.MolFromSmarts('[SiH]')
sih2_pattern = Chem.MolFromSmarts('[SiH2]')
sih3_pattern = Chem.MolFromSmarts('[SiH3]')
sih4_pattern = Chem.MolFromSmarts('[SiH4]')
ph_pattern = Chem.MolFromSmarts('[PH]')
ph_pattern_ar = Chem.MolFromSmarts('[pH]')
ph2_pattern = Chem.MolFromSmarts('[PH2]')
ph3_pattern = Chem.MolFromSmarts('[PH3]')
sh_pattern = Chem.MolFromSmarts('[SH]')
sh_pattern_ar = Chem.MolFromSmarts('[sH]')

# Function to count some functionnal groups
def count_h_containing_groups(smiles):
    mol = Chem.MolFromSmiles(smiles)

    if mol:
        group_counts = {
            'CH3': len(mol.GetSubstructMatches(ch3_pattern)),
            'CH2': len(mol.GetSubstructMatches(ch2_pattern)),
            'OH': len(mol.GetSubstructMatches(oh_pattern)),
            'CH': len(mol.GetSubstructMatches(ch_pattern)) + len(mol.GetSubstructMatches(ch_pattern_ar)),
            'BH': len(mol.GetSubstructMatches(bh_pattern)) + len(mol.GetSubstructMatches(bh_pattern_ar)),
            'Bh2': len(mol.GetSubstructMatches(bh2_pattern)),
            'Bh3': len(mol.GetSubstructMatches(bh3_pattern)),
            'NH': len(mol.GetSubstructMatches(nh_pattern)) + len(mol.GetSubstructMatches(nh_pattern_ar)),
            'NH2': len(mol.GetSubstructMatches(nh2_pattern)),
            'NH3': len(mol.GetSubstructMatches(nh3_pattern)),
            'SiH': len(mol.GetSubstructMatches(sih_pattern)),
            'SiH2': len(mol.GetSubstructMatches(sih2_pattern)),
            'SiH3': len(mol.GetSubstructMatches(sih3_pattern)),
            'SiH4': len(mol.GetSubstructMatches(sih4_pattern)),
            'PH': len(mol.GetSubstructMatches(ph_pattern)) + len(mol.GetSubstructMatches(ph_pattern_ar)),
            'PH2': len(mol.GetSubstructMatches(ph2_pattern)),
            'PH3': len(mol.GetSubstructMatches(ph3_pattern)),
            'SH': len(mol.GetSubstructMatches(sh_pattern))+ len(mol.GetSubstructMatches(sh_pattern_ar))
        }
        #Mixing the aliphatic and aromatic is not a very good idea,
        # but let's keep it for now
        return group_counts

    else:
        return {}

# Apply the function count_h_containing_groups to every smiles
df_train['Functional Groups with H'] = df_train['smiles'].apply(count_h_containing_groups)

# Replace missing values by 0
df_train['Functional Groups with H'] = df_train['Functional Groups with H'].apply(lambda x: {k: x.get(k, 0) for k in set().union(*df_train['Functional Groups with H'])})

# Create column for every key in the dictionnary
df_train_bondtomerge = pd.DataFrame(df_train['Functional Groups with H'].to_list())
df_train = df_train.merge(df_train_bondtomerge, left_index=True, right_index=True)


[18:09:02] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 22 23 24
[18:09:02] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 7 8 10 11 12 13 15 16 17 19
[18:09:02] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 6 7 8 9 11 12 13 15 16 17 18
[18:09:02] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 21 22 23
[18:09:02] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 20 21 22
[18:09:03] Explicit valence for atom # 20 C, 5, is greater than permitted
[18:09:03] Can't kekulize mol.  Unkekulized atoms: 16 17 18 19 20 21 22 23 24
[18:09:03] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 17 23
[18:09:03] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6
[18:09:03] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5
[18:09:03] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8
[18:09:03] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
[18:09:03] Can't kekulize mol.  Unkekulized atoms: 0 1 2 

7- Add molecular weight

In [11]:
from rdkit.Chem import Descriptors

def calculate_molecular_weight(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        return Descriptors.MolWt(mol)
    else:
        return None

df_train['Molecular_weight'] = df_train['smiles'].apply(calculate_molecular_weight)


[18:11:36] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 22 23 24
[18:11:36] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 7 8 10 11 12 13 15 16 17 19
[18:11:36] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 6 7 8 9 11 12 13 15 16 17 18
[18:11:36] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 21 22 23
[18:11:36] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 20 21 22
[18:11:36] Explicit valence for atom # 20 C, 5, is greater than permitted
[18:11:36] Can't kekulize mol.  Unkekulized atoms: 16 17 18 19 20 21 22 23 24
[18:11:36] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 17 23
[18:11:36] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6
[18:11:36] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5
[18:11:36] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8
[18:11:36] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
[18:11:36] Can't kekulize mol.  Unkekulized atoms: 0 1 2 

8- Count the number of aromatic rings

In [12]:
def count_aromatic_rings(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        return len(Chem.GetSSSR(mol))
    return 0

df_train['Aromatic Rings Count'] = df_train['smiles'].apply(count_aromatic_rings)


[18:11:38] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 22 23 24
[18:11:38] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 7 8 10 11 12 13 15 16 17 19
[18:11:38] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 6 7 8 9 11 12 13 15 16 17 18
[18:11:38] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 21 22 23
[18:11:38] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 20 21 22
[18:11:38] Explicit valence for atom # 20 C, 5, is greater than permitted
[18:11:39] Can't kekulize mol.  Unkekulized atoms: 16 17 18 19 20 21 22 23 24
[18:11:39] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 17 23
[18:11:39] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6
[18:11:39] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5
[18:11:39] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8
[18:11:39] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
[18:11:39] Can't kekulize mol.  Unkekulized atoms: 0 1 2 

9- Count the length of the main chain

In [13]:
def calculate_main_chain_length(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        # Find carbons
        carbon_atoms = [atom.GetIdx() for atom in mol.GetAtoms() if atom.GetSymbol() == 'C']
        if carbon_atoms:
            start_atom = max(carbon_atoms, key=lambda x: len(mol.GetAtomWithIdx(x).GetNeighbors()))
            visited_atoms = set()
            max_length = 0
            stack = [(start_atom, 0)]
            while stack:
                atom_idx, length = stack.pop()
                visited_atoms.add(atom_idx)
                max_length = max(max_length, length)

                for neighbor in mol.GetAtomWithIdx(atom_idx).GetNeighbors():
                    neighbor_idx = neighbor.GetIdx()
                    if neighbor_idx not in visited_atoms:
                        stack.append((neighbor_idx, length + 1))

            return max_length
    return 0

df_train['Main Chain Length'] = df_train['smiles'].apply(calculate_main_chain_length)


[18:11:42] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 22 23 24
[18:11:42] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 7 8 10 11 12 13 15 16 17 19
[18:11:42] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 6 7 8 9 11 12 13 15 16 17 18
[18:11:42] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 21 22 23
[18:11:42] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 20 21 22
[18:11:42] Explicit valence for atom # 20 C, 5, is greater than permitted
[18:11:43] Can't kekulize mol.  Unkekulized atoms: 16 17 18 19 20 21 22 23 24
[18:11:43] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 17 23
[18:11:43] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6
[18:11:43] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5
[18:11:43] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8
[18:11:43] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
[18:11:43] Can't kekulize mol.  Unkekulized atoms: 0 1 2 

10- Count the number of different elements

In [14]:
def count_unique_elements(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        return len(set(atom.GetSymbol() for atom in mol.GetAtoms()))
    return 0

df_train['Nombre d\'éléments différents'] = df_train['smiles'].apply(count_unique_elements)


[18:11:47] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 22 23 24
[18:11:47] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 7 8 10 11 12 13 15 16 17 19
[18:11:47] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 6 7 8 9 11 12 13 15 16 17 18
[18:11:47] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 21 22 23
[18:11:47] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 20 21 22
[18:11:47] Explicit valence for atom # 20 C, 5, is greater than permitted
[18:11:47] Can't kekulize mol.  Unkekulized atoms: 16 17 18 19 20 21 22 23 24
[18:11:47] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 17 23
[18:11:47] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6
[18:11:47] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5
[18:11:47] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8
[18:11:47] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
[18:11:47] Can't kekulize mol.  Unkekulized atoms: 0 1 2 

11- Count the number of double bond

In [15]:
df_train['Nombre de doubles liaisons'] = df_train['smiles'].apply(lambda x: Chem.MolFromSmiles(x).GetNumBonds(Chem.BondType.DOUBLE) if Chem.MolFromSmiles(x) is not None else 0)


[18:11:50] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 22 23 24
[18:11:50] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 7 8 10 11 12 13 15 16 17 19
[18:11:50] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 6 7 8 9 11 12 13 15 16 17 18
[18:11:50] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 21 22 23
[18:11:50] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 20 21 22
[18:11:50] Explicit valence for atom # 20 C, 5, is greater than permitted
[18:11:51] Can't kekulize mol.  Unkekulized atoms: 16 17 18 19 20 21 22 23 24
[18:11:51] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 17 23
[18:11:51] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6
[18:11:51] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5
[18:11:51] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8
[18:11:51] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
[18:11:51] Can't kekulize mol.  Unkekulized atoms: 0 1 2 

12- Add a colonne XlogP

In [16]:
from rdkit.Chem import Crippen

df_train['XLogP'] = df_train['smiles'].apply(lambda x: Crippen.MolLogP(Chem.MolFromSmiles(x)) if Chem.MolFromSmiles(x) is not None else 0)


[18:12:11] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 22 23 24
[18:12:11] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 7 8 10 11 12 13 15 16 17 19
[18:12:11] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 6 7 8 9 11 12 13 15 16 17 18
[18:12:11] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 21 22 23
[18:12:11] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 15 16 17 18 19 20 21 22
[18:12:11] Explicit valence for atom # 20 C, 5, is greater than permitted
[18:12:12] Can't kekulize mol.  Unkekulized atoms: 16 17 18 19 20 21 22 23 24
[18:12:12] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 17 23
[18:12:12] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6
[18:12:12] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5
[18:12:12] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8
[18:12:12] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
[18:12:12] Can't kekulize mol.  Unkekulized atoms: 0 1 2 

In [None]:
# TODO : Add scaling as a new step


In [61]:
pd.set_option("display.max_columns", None)
df_train.head(3)


Unnamed: 0,name,smiles,mpC,smiles_to_delete,Number of B,Number of C,Number of N,Number of O,Number of F,Number of Si,Number of P,Number of S,Number of Cl,Number of Br,Number of I,Bond Counts,I-O (1.0),C-S (2.0),O-S (1.0),Si-Br (1.0),N-P (2.0),S-N (1.5),N-I (1.0),C-Si (1.0),N-N (1.0),S-O (2.0),Cl-N (1.0),C-N (1.5),B-C (1.0),O-N (1.0),I-C (1.0),B-Br (1.0),N-S (1.0),N-H (1.0),Si-O (1.0),Si-P (1.0),S-I (1.0),S-P (2.0),N-Br (1.0),P-C (1.0),C-S (1.5),N-Cl (1.0),H-C (1.0),Br-Si (1.0),P-N (2.0),N-C (1.0),Cl-O (1.0),C-O (1.0),C-N (3.0),N-O (1.5),I-I (1.0),O-C (1.0),O-C (2.0),P-S (2.0),P-Cl (1.0),O-C (1.5),P-Br (1.0),N-N (2.0),Cl-Si (1.0),S-C (1.0),Cl-P (1.0),P-S (1.0),Cl-B (1.0),C-N (2.0),C-P (2.0),O-O (1.0),Br-Br (1.0),N-P (1.0),C-N (1.0),C-O (3.0),Cl-C (1.0),F-S (1.0),S-P (1.0),O-B (1.0),C-C (1.0),P-F (1.0),C-O (1.5),S-S (1.0),P-N (1.0),S-Cl (1.0),N-O (2.0),O-N (2.0),F-C (1.0),S-N (1.0),O-I (1.0),S-O (1.0),F-Si (1.0),S-Br (1.0),C-O (2.0),B-B (1.0),F-P (1.0),N-C (3.0),S-C (2.0),C-B (1.0),B-N (1.0),N-C (2.0),Br-C (1.0),F-B (1.0),B-F (1.0),S-F (1.0),N-O (1.0),Si-C (1.0),Si-Cl (1.0),O-P (1.0),C-H (1.0),N-N (3.0),B-S (1.0),N-C (1.5),H-N (1.0),C-S (1.0),Si-Si (1.0),S-C (1.5),Br-P (1.0),O-C (3.0),O-H (1.0),C-Cl (1.0),C-C (1.5),N-Si (1.0),P-O (2.0),N-B (1.0),H-O (1.0),P-O (1.0),Si-N (1.0),S-S (1.5),P-Si (1.0),C-F (1.0),O-S (2.0),C-C (3.0),N-N (1.5),Br-B (1.0),B-Cl (1.0),C-C (2.0),C-Br (1.0),O-Si (1.0),O-N (1.5),Si-F (1.0),N-S (1.5),B-O (1.0),O-Cl (1.0),O-P (2.0),Cl-S (1.0),C-I (1.0),C-P (1.0),Functional Groups with H,OH,CH,CH2,NH2,SiH3,CH3,SiH,PH,Bh3,SiH4,SiH2,NH,Bh2,NH3,PH3,PH2,BH,SH,Molecular_weight,Aromatic Rings Count,Main Chain Length,Nombre d'éléments différents,Nombre de doubles liaisons,XLogP
0,3-i-pr-5-mephenyl-n-me carbamate,O=C(Oc1cc(C)cc(c1)C(C)C)NC,87.0,,0,12,1,2,0,0,0,0,0,0,0,"{'I-O (1.0)': 0, 'C-S (2.0)': 0, 'O-S (1.0)': ...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"{'OH': 0, 'CH': 4, 'CH2': 0, 'NH2': 0, 'SiH3':...",0,4,0,0,0,4,0,0,0,0,0,1,0,0,0,0,0,0,207.273,1,7,3,15,2.83662
1,b-2-furylacrylic acid,O=C(O)C=Cc1occc1,133.0,,0,7,0,3,0,0,0,0,0,0,0,"{'I-O (1.0)': 0, 'C-S (2.0)': 0, 'O-S (1.0)': ...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,"{'OH': 1, 'CH': 5, 'CH2': 0, 'NH2': 0, 'SiH3':...",1,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,138.122,1,7,2,10,1.3774
2,cyclacillin,OC(=O)C2N3C(=O)C(NC(=O)C1(N)CCCCC1)C3SC2(C)C,182.5,,0,15,3,4,0,0,0,1,0,0,0,"{'I-O (1.0)': 0, 'C-S (2.0)': 0, 'O-S (1.0)': ...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,4,0,0,0,0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"{'OH': 1, 'CH': 3, 'CH2': 5, 'NH2': 1, 'SiH3':...",1,3,5,1,0,2,0,0,0,0,0,1,0,0,0,0,0,0,341.433,3,8,4,25,0.2797


13- Let's remove the useless columns

In [63]:
df_train = df_train.drop(columns=["smiles_to_delete", "Bond Counts", "Functional Groups with H"
                                  ])


NameError: name 'df_train' is not defined

14- Last step: export as csv

In [64]:
df_train.to_csv("final_dataframe_train.csv", index=False)
#Come back later for the test set


In [57]:
#A faire si je veux aller plus loin dans le projet :
# Il y a un moyen d'avoir les groupes fonctionnels avec rdkit (taper functional grope rdkit)
# Maintenant que j'ai tout les groupes avec H, faire une soustraction pour calculer les atomes sans H
# Maintenant que j'ai tout, calculer le nombre d'insaturation


In [None]:
# For now, we will keep it like this and see how far we can go


Feature engineering is not finish, but lets see some models:

***Model 1***

In [64]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

X = df_train.drop(columns=["name", "smiles", "mpC", "Functional Groups with H", "Bond Counts"])
y = df_train["mpC"]

model = LinearRegression()
model.fit(X, y)

scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
# Métrique de performance : Erreur quadratique moyenne négative

# Le score sera négatif, donc prenez l'opposé pour obtenir l'erreur quadratique moyenne positive
rmse_scores = (-scores) ** 0.5

# Calculez la moyenne des scores RMSE
mean_rmse_score = rmse_scores.mean()
print("Moyenne des scores RMSE :", mean_rmse_score)


ValueError: could not convert string to float: ''

In [None]:
model.score(X, y)


***Model 2***

In [64]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Créez un modèle de régression polynomiale de degré 2
degree = 2
model = make_pipeline(PolynomialFeatures(degree), LinearRegression())

# Entraînez le modèle
model.fit(X, y)


In [65]:
scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')  # Métrique de performance : Erreur quadratique moyenne négative

# Le score sera négatif, donc prenez l'opposé pour obtenir l'erreur quadratique moyenne positive
rmse_scores = (-scores) ** 0.5

# Calculez la moyenne des scores RMSE
mean_rmse_score = rmse_scores.mean()
print("Moyenne des scores RMSE :", mean_rmse_score)


Moyenne des scores RMSE : 613591899.9077444


***Model 3***

In [66]:
from sklearn.ensemble import RandomForestRegressor

# Créez un modèle de forêt aléatoire
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Entraînez le modèle
model.fit(X, y)


In [67]:
scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')  # Métrique de performance : Erreur quadratique moyenne négative

# Le score sera négatif, donc prenez l'opposé pour obtenir l'erreur quadratique moyenne positive
rmse_scores = (-scores) ** 0.5

# Calculez la moyenne des scores RMSE
mean_rmse_score = rmse_scores.mean()
print("Moyenne des scores RMSE :", mean_rmse_score)


Moyenne des scores RMSE : 48.51759681715769


***Model 4***

In [68]:
from sklearn.neural_network import MLPRegressor

# Créez un modèle de réseau de neurones
model = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000)

# Entraînez le modèle
model.fit(X, y)


In [69]:
scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')  # Métrique de performance : Erreur quadratique moyenne négative

# Le score sera négatif, donc prenez l'opposé pour obtenir l'erreur quadratique moyenne positive
rmse_scores = (-scores) ** 0.5

# Calculez la moyenne des scores RMSE
mean_rmse_score = rmse_scores.mean()
print("Moyenne des scores RMSE :", mean_rmse_score)


Moyenne des scores RMSE : 48.96389922649845


***Model 5***

In [70]:
from sklearn.svm import SVR

# Créez un modèle SVR
model = SVR(kernel='linear', C=1.0)

# Entraînez le modèle
model.fit(X, y)


In [71]:
scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')  # Métrique de performance : Erreur quadratique moyenne négative

# Le score sera négatif, donc prenez l'opposé pour obtenir l'erreur quadratique moyenne positive
rmse_scores = (-scores) ** 0.5

# Calculez la moyenne des scores RMSE
mean_rmse_score = rmse_scores.mean()
print("Moyenne des scores RMSE :", mean_rmse_score)


KeyboardInterrupt: 

In [None]:
import xgboost as xgb

# Créez un modèle XGBoost
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)

# Entraînez le modèle
model.fit(X, y)

scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')  # Métrique de performance : Erreur quadratique moyenne négative

# Le score sera négatif, donc prenez l'opposé pour obtenir l'erreur quadratique moyenne positive
rmse_scores = (-scores) ** 0.5

# Calculez la moyenne des scores RMSE
mean_rmse_score = rmse_scores.mean()
print("Moyenne des scores RMSE :", mean_rmse_score)


In [36]:
mae


48.9046834393692