This notebook displays the results produced by running the following command:
```
python reactiont5.py
```


In [1]:
# Notebook setup: imports plus output-silencing configuration.
# numpy is imported unaliased here and again as `np` further down.
import numpy
import warnings
#import tensorflow as tf
from transformers import logging
# Restrict the transformers logger to error-level messages.
logging.set_verbosity_error()
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
#tf.get_logger().setLevel('ERROR')

import pandas as pd
import numpy as np
import os
# Environment flag, presumably read by the Hugging Face tokenizers library to
# enable parallel tokenization (confirm); note it is set after `transformers`
# has already been imported above.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, T5ForConditionalGeneration, AutoConfig, PreTrainedModel
from transformers import get_linear_schedule_with_warmup
from rdkit import Chem




In [3]:
class ReactionT5Yield(PreTrainedModel):
    """T5 encoder/decoder backbone followed by a five-layer linear regression head.

    The head combines the first-position hidden state of the decoder with the
    first-position hidden state of the encoder and maps the result to a single
    value, which is multiplied by 100 before being returned (presumably a
    yield expressed in percent -- confirm against the training script).
    """

    config_class = AutoConfig

    def __init__(self, config):
        """Load the backbone named by ``config._name_or_path`` and build the head.

        Args:
            config: Model configuration; ``_name_or_path``, ``vocab_size``,
                ``hidden_size`` and ``decoder_start_token_id`` are read from it.
        """
        super().__init__(config)
        self.config = config

        # Backbone: pretrained T5, with embeddings resized to the configured vocabulary.
        self.model = T5ForConditionalGeneration.from_pretrained(self.config._name_or_path)
        self.model.resize_token_embeddings(self.config.vocab_size)

        hidden = self.config.hidden_size
        half = hidden // 2

        # Head: two half-width projections (decoder / encoder), then three
        # full-width layers ending in a single output unit.
        self.fc1 = nn.Linear(hidden, half)
        self.fc2 = nn.Linear(hidden, half)
        self.fc3 = nn.Linear(half * 2, hidden)
        self.fc4 = nn.Linear(hidden, hidden)
        self.fc5 = nn.Linear(hidden, 1)

        # Only the head layers are re-initialised; the backbone keeps its loaded weights.
        for head_layer in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5):
            self._init_weights(head_layer)

    def _init_weights(self, module):
        """Initialise ``module`` in place according to its layer type.

        Linear and Embedding weights are drawn from N(0, 0.01); Linear biases
        and the Embedding padding row are zeroed; LayerNorm is reset to
        weight 1 / bias 0. Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.01)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.01)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, attention_mask):
        """Run the backbone and the regression head.

        Args:
            input_ids: Token ids fed to the encoder; its first dimension is
                used as the batch size.
            attention_mask: Mask passed to the encoder together with ``input_ids``.

        Returns:
            The output of the final linear layer multiplied by 100.
        """
        encoder_hidden_states = self.model.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )[0]

        # The decoder is fed a single start token per sample.
        # NOTE(review): no encoder attention mask is forwarded to the decoder
        # here; this looks like it lets cross-attention see padded positions.
        # Left unchanged because altering it would change model outputs.
        decoder_start = torch.full(
            (input_ids.size(0), 1),
            self.config.decoder_start_token_id,
            dtype=torch.long,
            device=input_ids.device,
        )
        decoder_hidden_states = self.model.decoder(
            input_ids=decoder_start,
            encoder_hidden_states=encoder_hidden_states,
        )[0]

        # Take position 0 along the sequence axis of each stream and project it.
        decoder_feature = self.fc1(decoder_hidden_states[:, 0, :])
        encoder_feature = self.fc2(encoder_hidden_states[:, 0, :])

        merged = torch.hstack((decoder_feature, encoder_feature))
        prediction = self.fc5(self.fc4(self.fc3(merged)))
        return prediction * 100

In [4]:
def custom_collate(batch):
    """Collate ``(feature_dict, target)`` samples into batched tensors.

    Args:
        batch: Sequence of ``(dict of tensors, target tensor)`` pairs. The keys
            of the first sample's dict decide which fields are stacked.

    Returns:
        A ``(dict, tensor)`` pair: each dict value is the per-sample tensors for
        that key stacked along a new leading dimension, and the tensor is the
        stacked targets.
    """
    features, targets = zip(*batch)

    stacked_features = {}
    for field in features[0]:
        stacked_features[field] = torch.stack([sample[field] for sample in features])

    stacked_targets = torch.stack(targets)
    return stacked_features, stacked_targets

class ReactionT5Dataset(Dataset):
    """Index-aligned dataset of token ids, attention masks and regression targets.

    Args:
        input_ids: Indexable collection holding one token-id sequence per sample.
        attention_masks: Indexable collection holding one mask per sample.
        targets: Indexable collection holding one target value per sample; its
            length defines the dataset length.
    """

    def __init__(self, input_ids, attention_masks, targets):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.targets = targets

    def __len__(self):
        """Return the number of samples (the number of targets)."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return ``(features, target)`` for sample ``idx``.

        ``features`` is a dict with int64 tensors under ``'input_ids'`` and
        ``'attention_mask'``; ``target`` is a float32 tensor.
        """
        # torch.tensor() already copies its input and returns a tensor with no
        # autograd history, so an extra .clone().detach() would only add a
        # second copy per field on every sample fetch.
        features = {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),
        }
        target = torch.tensor(self.targets[idx], dtype=torch.float32)
        return features, target

def canonicalize(smiles):
    """Return the canonical SMILES for ``smiles``, or ``None`` if it cannot be produced.

    Args:
        smiles: SMILES string to canonicalize.

    Returns:
        The canonical SMILES string from RDKit, or ``None`` when the input
        cannot be parsed or RDKit raises while processing it.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        # A None result is handled explicitly instead of relying on
        # MolToSmiles(None) raising.
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Deliberately best-effort for bad input (e.g. non-string values), but
        # no longer swallows KeyboardInterrupt / SystemExit as a bare
        # ``except:`` would.
        return None

def make_input_list(react, prod, smiles_list):
    """Build one model input string per reagent SMILES.

    Each entry has the form
    ``'REACTANT:' + react + 'REAGENT:' + <reagent> + 'PRODUCT:' + prod``.

    Args:
        react: Reactant SMILES, inserted verbatim.
        prod: Product SMILES, inserted verbatim.
        smiles_list: Iterable of reagent SMILES; each one is canonicalized
            before being inserted.

    Returns:
        A list with exactly one input string per element of ``smiles_list``,
        in the same order.
    """
    input_list = []
    for ops in smiles_list:
        canonicalize_ops = canonicalize(ops)
        if canonicalize_ops is None:
            print(f'{ops} is not canonicalized')
            # Fall back to the raw SMILES so the output stays index-aligned
            # with smiles_list; concatenating None would raise a TypeError
            # right after the warning above.
            canonicalize_ops = ops
        input_list.append('REACTANT:' + react + 'REAGENT:' + canonicalize_ops + 'PRODUCT:' + prod)
    return input_list

def tokenize_smiles(smiles_list):
    """Tokenize input strings with the notebook-level ``tokenizer``.

    Inputs are padded, truncated to at most 300 tokens, and requested as
    PyTorch tensors before being converted to plain lists.

    Args:
        smiles_list: List of input strings to encode.

    Returns:
        A ``(input_ids, attention_mask)`` tuple of nested Python lists.
    """
    # `tokenizer` must already exist as a global when this is called.
    encoded = tokenizer(
        smiles_list,
        padding=True,
        truncation=True,
        max_length=300,
        return_tensors="pt",
    )
    token_ids = encoded['input_ids'].tolist()
    masks = encoded['attention_mask'].tolist()
    return token_ids, masks

def calculate_statistics(group):
    """Summarise the ``'r2_test'`` values of one group.

    Args:
        group: Mapping-like object (e.g. a DataFrame group) whose ``'r2_test'``
            entry holds one score per run.

    Returns:
        A ``pd.Series`` with one ``run<i>`` entry per score (in iteration
        order) followed by ``r2_test_mean``, ``r2_test_max``, ``r2_test_min``
        and ``r2_test_std`` (population standard deviation, ``ddof=0``).
    """
    scores = group['r2_test']

    summary = {}
    for run_idx, score in enumerate(scores):
        summary[f'run{run_idx}'] = score

    for label, reducer in (('mean', np.mean), ('max', np.max), ('min', np.min)):
        summary[f'r2_test_{label}'] = reducer(scores)
    summary['r2_test_std'] = np.std(scores, ddof=0)

    return pd.Series(summary)

In [5]:
import pickle

# Load the pickled results from results.pkl in the working directory.
# NOTE: unpickling can execute arbitrary code -- only load a results.pkl that
# you generated yourself.
# The context manager closes the file handle as soon as loading finishes.
with open("results.pkl", "rb") as results_file:
    o = pickle.load(results_file)

In [6]:
import pandas as pd

# Tabulate the loaded results, then compute per-target run statistics.
results_df = pd.DataFrame(o)
gen_results = (
    results_df
    .groupby(['target'])
    .apply(calculate_statistics)
    .reset_index()
)
# Transposed so that each target becomes a column and each statistic a row.
gen_results.T

Unnamed: 0,0,1,2
target,Yield_CO_cl,Yield_CO_l,Yield_CO_s
run0,0.504806,0.604748,0.299739
run1,0.461216,0.53812,0.207225
run2,0.608692,0.58839,0.243142
run3,0.60022,0.671486,0.719229
run4,0.603094,0.510813,0.367751
run5,0.40864,0.38099,0.502642
run6,0.629278,0.538695,0.241459
run7,0.483289,0.586654,0.485936
run8,0.292545,0.457376,0.541093
