Merge pull request #12 from Jonas-Verhellen/development

Jonas-Verhellen · Aug 5, 2020 · 5777d8a · 5777d8a
2 parents dfb2f2a + 975351b
commit 5777d8a
Show file tree

Hide file tree

Showing 108 changed files with 292 additions and 8,165 deletions.
diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@
 
 ## Description
 
-Argenomic is an open-source implementation of an illumination algorithm for optimization of small organic molecules. Argenomic provides a holistic overview of how high-performing molecules are distributed throughout a search space. This novel approach produces potent but qualitatively different molecules, illuminates the distribution of optimal solutions, and improves search efficiency compared to both machine learning and traditional genetic algorithm approaches. This implementation is based on an open-source, [graph-based genetic algorithm](https://github.com/jensengroup/GB-GA) for molecular optimisation, and influenced by state-of-the-art concepts from [soft robot design](https://github.com/resibots/pymap_elites). For more information, see the accompanying [blog post](https://jonas-verhellen.github.io/posts/2020/07/argenomic/). 
+Argenomic is an open-source implementation of an illumination algorithm for optimization of small organic molecules. Argenomic provides a holistic overview of how high-performing molecules are distributed throughout a search space. This novel approach produces potent but qualitatively different molecules, illuminates the distribution of optimal solutions, and improves search efficiency compared to both machine learning and traditional genetic algorithm approaches. This implementation is based on an open-source [graph-based genetic algorithm](https://github.com/jensengroup/GB-GA) for molecular optimization, and influenced by state-of-the-art concepts from [soft robot design](https://github.com/resibots/pymap_elites). For more information, see the accompanying [blog post](https://jonas-verhellen.github.io/posts/2020/07/argenomic/).
 
 <p align="center">
   <img src="https://github.com/Jonas-Verhellen/jonas-verhellen.github.io/blob/master/images/video.gif" />
@@ -17,7 +17,7 @@ Argenomic is an open-source implementation of an illumination algorithm for opti
 
 After installing the software and running the tests, a basic usage example of argenomic (i.e. the rediscovery of Thiotixene) can be run as follows:
 ```
-python3 illuminate.py configuration_file=./configuration/config.yaml generations=100
+python3 illuminate.py generations=100
 ```
 
 ### Installing

diff --git a/argenomic/infrastructure.py b/argenomic/infrastructure.py
@@ -1,5 +1,6 @@
 import os
 import csv
+import hydra
 import random
 import itertools
 
@@ -34,18 +35,20 @@ def update(self, fitness, molecule, descriptor):
 
 class archive:
     def __init__(self, archive_config, descriptor_config) -> None:
-        self.archive_name = archive_config.name
         self.archive_size = archive_config.size
-        kmeans = KMeans(n_clusters=self.archive_size)
-        kmeans = kmeans.fit(np.random.rand(archive_config.accuracy, len(descriptor_config.properties)))
-        self.cvt_centers = kmeans.cluster_centers_
+        self.archive_accuracy = archive_config.accuracy
+        self.archive_dimensions = len(descriptor_config.properties)
+        self.cache_string = "cache_{}_{}.csv".format(self.archive_dimensions, self.archive_accuracy)
+        self.cvt_location = hydra.utils.to_absolute_path("data/cvt/" + self.cache_string)
+        if os.path.isfile(self.cvt_location):
+            self.cvt_centers = np.loadtxt(self.cvt_location)
+        else:
+            kmeans = KMeans(n_clusters=self.archive_size)
+            kmeans = kmeans.fit(np.random.rand(archive_config.accuracy, self.archive_dimensions))
+            self.cvt_centers = kmeans.cluster_centers_
+            np.savetxt(self.cvt_location, self.cvt_centers)
         self.cvt = KDTree(self.cvt_centers, metric='euclidean')
         self.elites = [elite(index, cvt_center) for index, cvt_center in enumerate(self.cvt_centers, start=0)]
-        if not os.path.isdir(self.archive_name):
-            os.mkdir(self.archive_name)
-        with open('{}/statistics.csv'.format(self.archive_name), 'w') as file:
-            file.write("## Argenomic Statistics File: {} \n".format(datetime.now()))
-            file.close()
         return None
 
     def cvt_index(self, descriptor: List[float]) -> int:
@@ -72,15 +75,20 @@ def sample_pairs(self, size: int) -> List[Tuple[Chem.Mol, Chem.Mol]]:
     def store_archive(self, generation: float) -> None:
         elites_smiles, elites_descriptors, elites_fitnesses = self.elites_data()
         data = {'elites': elites_smiles, 'descriptors': elites_descriptors, 'fitnesses': elites_fitnesses}
-        pd.DataFrame(data=data).to_csv("{}/archive_{}.csv".format(self.archive_name, generation), index=False)
+        pd.DataFrame(data=data).to_csv("archive_{}.csv".format(generation), index=False)
         return None
 
     def store_statistics(self, generation: float) -> None:
         elites_smiles, elites_descriptors, elites_fitnesses = self.elites_data()
         fractional_size = len(elites_smiles)/self.archive_size
         statistics = [generation, np.max(elites_fitnesses), np.mean(elites_fitnesses), np.std(elites_fitnesses), fractional_size]
-        with open('{}/statistics.csv'.format(self.archive_name), 'a') as file:
-            csv.writer(file).writerow(statistics)
+        if os.path.isfile('statistics.csv'):
+            with open('statistics.csv', 'a') as file:
+                csv.writer(file).writerow(statistics)
+                file.close()
+        else:
+            with open('statistics.csv', 'w') as file:
+                file.close()
         print('Generation: {}, Size: {:.2f}'.format(statistics[0], statistics[4]))
         print('Fitness Max: {:.7f}, Mean: {:.7f}, Std: {:.7f}'.format(statistics[1], statistics[2], statistics[3]))
         return None
@@ -99,71 +107,71 @@ class arbiter:
     Includes the option to run the structural filters from ChEMBL.
     """
     def __init__(self, arbiter_config) -> None:
-      self.rules_dict = pd.read_csv("./data/smarts/alert_collection.csv")
-      self.rules_dict= self.rules_dict[self.rules_dict.rule_set_name.isin(arbiter_config.rules)]
-      self.rules_list = self.rules_dict["smarts"].values.tolist()
-      self.tolerance_list = pd.to_numeric(self.rules_dict["max"]).values.tolist()
-      self.pattern_list = [Chem.MolFromSmarts(smarts) for smarts in self.rules_list]
+        self.rules_dict = pd.read_csv(hydra.utils.to_absolute_path("data/smarts/alert_collection.csv"))
+        self.rules_dict= self.rules_dict[self.rules_dict.rule_set_name.isin(arbiter_config.rules)]
+        self.rules_list = self.rules_dict["smarts"].values.tolist()
+        self.tolerance_list = pd.to_numeric(self.rules_dict["max"]).values.tolist()
+        self.pattern_list = [Chem.MolFromSmarts(smarts) for smarts in self.rules_list]
 
     def __call__(self, molecules:List[Chem.Mol]) -> List[Chem.Mol]:
-      """
-      Applies the chosen filters (hologenicity, veber_infractions,
-      ChEMBL structural alerts, ...) to a list of molecules.
-      """
-      filtered_molecules = []
-      for molecule in molecules:
-        if self.molecule_validity(molecule):
-          filtered_molecules.append(molecule)
-      return filtered_molecules
+        """
+        Applies the chosen filters (hologenicity, veber_infractions,
+        ChEMBL structural alerts, ...) to a list of molecules.
+        """
+        filtered_molecules = []
+        for molecule in molecules:
+            if self.molecule_validity(molecule):
+                filtered_molecules.append(molecule)
+        return filtered_molecules
 
     def molecule_validity(self, molecule: Chem.Mol) -> bool:
-      """
-      Checks if a given molecule passes through the chosen filters (hologenicity,
-      veber_infractions, ChEMBL structural alerts, ...).
-      """
-      toxicity = self.toxicity(molecule)
-      hologenicity = self.hologenicity(molecule)
-      veber_infraction = self.veber_infraction(molecule)
-      validity = not (toxicity or hologenicity or veber_infraction)
-      if molecule.HasSubstructMatch(Chem.MolFromSmarts('[R]')):
-        ring_infraction = self.ring_infraction(molecule)
-        validity = validity and not (ring_infraction)
-      return validity
+        """
+        Checks if a given molecule passes through the chosen filters (hologenicity,
+        veber_infractions, ChEMBL structural alerts, ...).
+        """
+        toxicity = self.toxicity(molecule)
+        hologenicity = self.hologenicity(molecule)
+        veber_infraction = self.veber_infraction(molecule)
+        validity = not (toxicity or hologenicity or veber_infraction)
+        if molecule.HasSubstructMatch(Chem.MolFromSmarts('[R]')):
+            ring_infraction = self.ring_infraction(molecule)
+            validity = validity and not (ring_infraction)
+        return validity
 
     def toxicity(self, molecule: Chem.Mol) -> bool:
-      """
-      Checks if a given molecule fails the structural filters.
-      """
-      for (pattern, tolerance) in zip(self.pattern_list, self.tolerance_list):
+        """
+        Checks if a given molecule fails the structural filters.
+        """
+        for (pattern, tolerance) in zip(self.pattern_list, self.tolerance_list):
             if len(molecule.GetSubstructMatches(pattern)) > tolerance:
-              return True
-      return False
+                return True
+        return False
 
     @staticmethod
     def hologenicity(molecule: Chem.Mol) -> bool:
-      """
-      Checks if a given molecule fails the hologenicity filters.
-      """
-      fluorine_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[F]'))) > 6
-      bromide_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[Br]'))) > 3
-      chlorine_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[Cl]'))) > 3
-      return chlorine_saturation or bromide_saturation or fluorine_saturation
+        """
+        Checks if a given molecule fails the hologenicity filters.
+        """
+        fluorine_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[F]'))) > 6
+        bromide_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[Br]'))) > 3
+        chlorine_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[Cl]'))) > 3
+        return chlorine_saturation or bromide_saturation or fluorine_saturation
 
     @staticmethod
     def ring_infraction(molecule: Chem.Mol) -> bool:
-      """
-      Checks if a given molecule fails the ring infraction filters.
-      """
-      ring_allene = molecule.HasSubstructMatch(Chem.MolFromSmarts('[R]=[R]=[R]'))
-      macro_cycle = max([len(j) for j in molecule.GetRingInfo().AtomRings()]) > 6
-      double_bond_in_small_ring = molecule.HasSubstructMatch(Chem.MolFromSmarts('[r3,r4]=[r3,r4]'))
-      return ring_allene or macro_cycle or double_bond_in_small_ring
+        """
+        Checks if a given molecule fails the ring infraction filters.
+        """
+        ring_allene = molecule.HasSubstructMatch(Chem.MolFromSmarts('[R]=[R]=[R]'))
+        macro_cycle = max([len(j) for j in molecule.GetRingInfo().AtomRings()]) > 6
+        double_bond_in_small_ring = molecule.HasSubstructMatch(Chem.MolFromSmarts('[r3,r4]=[r3,r4]'))
+        return ring_allene or macro_cycle or double_bond_in_small_ring
 
     @staticmethod
     def veber_infraction(molecule: Chem.Mol) -> bool:
-      """
-      Checks if a given molecule fails the veber infraction filters.
-      """
-      rotatable_bond_saturation = Lipinski.NumRotatableBonds(molecule) > 10
-      hydrogen_bond_saturation = Lipinski.NumHAcceptors(molecule) + Lipinski.NumHDonors(molecule) > 10
-      return rotatable_bond_saturation or hydrogen_bond_saturation
+        """
+        Checks if a given molecule fails the veber infraction filters.
+        """
+        rotatable_bond_saturation = Lipinski.NumRotatableBonds(molecule) > 10
+        hydrogen_bond_saturation = Lipinski.NumHAcceptors(molecule) + Lipinski.NumHDonors(molecule) > 10
+        return rotatable_bond_saturation or hydrogen_bond_saturation
diff --git a/argenomic/operations.py b/argenomic/operations.py
@@ -1,3 +1,4 @@
+import hydra
 import random
 import logging
 import numpy as np
@@ -17,7 +18,7 @@ class mutator:
     according to the principles of positional analogue scanning.
     """
     def __init__(self) -> None:
-        self.mutation_data = pd.read_csv("./data/smarts/mutation_collection.tsv", sep='\t')
+        self.mutation_data = pd.read_csv(hydra.utils.to_absolute_path("data/smarts/mutation_collection.tsv"), sep='\t')
 
     def __call__(self, molecule:Chem.Mol) -> List[Chem.Mol]:
         sampled_mutation = self.mutation_data.sample(n=1, weights='probability').iloc[0]

diff --git a/configuration/config.yaml b/configuration/config.yaml
@@ -1,9 +1,12 @@
 ---
-data_file: ./data/smiles/guacamol_initial_rediscovery_thiotixene.smi
+data_file: data/smiles/guacamol_initial_rediscovery_thiotixene.smi
 batch_size: 40
 initial_size: 100
+workers: 1
+threads: 2
+generations: 75
 archive:
-  name: ./results/thiotixene
+  name: thiotixene
   size: 150
   accuracy: 25000
 descriptor: