In [3]:
import numpy as np
import rdkit
import rdkit.Chem
import rdkit.Chem.AllChem
import rdkit.Chem.Draw
import pandas as pd
import sys
import json
import torch
import gpytorch
from torch.optim.lr_scheduler import StepLR
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.gaussian_process.kernels import (
    ConstantKernel, RBF, DotProduct, WhiteKernel, Matern, Exponentiation, ExpSineSquared, RationalQuadratic)
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import r2_score
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from reaction import ReactionAB
# from experiment import ExperimentalDataset
from features import MolFeatures
from theoretical import TheoSimulation

In [5]:
import re

In [7]:
# Switch read by later cells to decide whether simulated descriptors are appended.
use_simulation_data = True

# Target columns pulled from the dataset; `targets` keeps this column order.
label_names = ["PCE", "Jsc", "Voc", "FF"]

# For the model selection only the first initial data is available,
# so the frame is cut down to its first 100 rows.
new_data = pd.read_csv("data/dataset.csv").iloc[:100, :]

# Every "AB" entry has parentheses, single quotes and spaces removed and is then
# split on commas (presumably a stringified tuple such as "('A1', 'B2')" -- TODO confirm).
samples = [
    re.sub(r"[()' ]", "", ab_entry).split(",")
    for ab_entry in new_data["AB"].values
]

# Array of target values, one row per sample, one column per entry of `label_names`.
targets = new_data[label_names].values

In [12]:
# Descriptor names requested from the molecular feature generator.
# Entries mentioned in "disabled" comments are kept for reference but are not active.
feat_cols = [
    # Single-element descriptors (disabled: "Cl")
    "C", "N", "O", "H", "S", "F",
    # Atom- and bond-related descriptors
    "NumAtoms", "AtomIsInRing", "AtomIsAromatic",
    "NumBonds", "BondIsConjugated", "BondIsAromatic",
    "NumRotatableBonds",
    # Disabled fragment descriptors: "fr_C_O_noCOO", "fr_Al_COO", "fr_Ar_COO",
    # "fr_Al_OH", "fr_Ar_OH", "fr_NH2", "fr_SH", "fr_sulfide", "fr_alkyl_halide"
    # Whole-molecule descriptors
    "ExactMolWt", "FpDensityMorgan3", "MolLogP", "MolMR", "FractionCSP3",
    # Substructure flags (disabled: "has_cnc", "has_C=NC", "has_Thiophene",
    # "has_Pyrrole", "has_Benzimidazole", "has_Benzothiophene", "has_Naphthalene")
    "has_CN(C)C",
    "has_Biphenyl",
]

# Descriptor names requested from the theoretical-simulation table.
theo_cols = ["dipole", "homo", "lumo", "gap", "energy", "a", "b", "c"]

In [8]:
# Set up the A+B reaction engine from the two group workbooks stored under ./data.
reaction_engine = ReactionAB(
    file_name_a="Mol_Group_A.xlsx",
    file_name_b="Mol_Group_B.xlsx",
    data_dir_path="./data",
)

# Run the engine over all sample combinations; the result is kept in `mols`
# (later passed to the feature generator and to RDKit).
mols = reaction_engine.run_combos(samples)

INFO: Reading excel files.
INFO: Reading structures.


100%|██████████| 100/100 [00:00<00:00, 755.66it/s]


In [13]:
# Feature generator configured with the descriptor names listed in `feat_cols`.
feature_generator = MolFeatures(
    descriptor_list=feat_cols,
)

# Calling the generator on the molecules yields the feature table; later cells
# index it as features[:, i] with i following the order of `feat_cols`.
features = feature_generator(mols)

INFO: Making features:


100%|██████████| 100/100 [00:00<00:00, 378.86it/s]


In [15]:
# Optionally extend the molecular features with simulated descriptors.
# NOTE: `features` is overwritten in place here, so re-running this cell without
# re-running the feature-generation cell appends `theos` a second time.
if use_simulation_data:
    theo_generator = TheoSimulation(
        file_name="Theo_simu.xlsx",
        descriptor_list=theo_cols,
        data_dir_path="./data",
    )
    # The first return value is not used.
    _, theos = theo_generator.labels_for_combos(samples)
    features = np.concatenate((features, theos), axis=-1)

# Column-wise table: sample ID, molecular features, (optional) simulated
# descriptors, targets and the molecular formula -- in exactly that key order.
columns = {
    "ID": ["".join(parts) for parts in samples],
    **{name: features[:, idx] for idx, name in enumerate(feat_cols)},
    **(
        {name: theos[:, idx] for idx, name in enumerate(theo_cols)}
        if use_simulation_data
        else {}
    ),
    **{name: targets[:, idx] for idx, name in enumerate(label_names)},
    "MolFormular": [
        rdkit.Chem.rdMolDescriptors.CalcMolFormula(mol) for mol in mols
    ],
}



In [14]:
# Boolean mask with one entry per sample: True where the "Voc" target exceeds 0.2.
# The column is looked up by name instead of the hard-coded position 2, so the
# mask keeps selecting "Voc" even if `label_names` is reordered or extended
# (the columns of `targets` follow the order of `label_names`).
# NOTE(review): the name says "ignore" but True marks samples ABOVE the threshold --
# confirm at the point of use whether True means "drop" or "keep".
ignore_samples_mask = targets[:, label_names.index("Voc")] > 0.2