## Imports

In [1]:
import random
import Levenshtein
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch.utils.data import Dataset

from tokenizers import Tokenizer, Regex
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Split
from transformers import PreTrainedTokenizerFast
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator

## 1. Data Preprocessing

### 1.1 Load and Preprocess Data

In [2]:
file_path_vae_train = r'C:\Users\mokht\Desktop\CBRC_ED_Project\data\composed_dataset_100k.csv'
df_vae_train = pd.read_csv(file_path_vae_train, names=['SMILES'])

def canonicalize_smiles(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            return Chem.MolToSmiles(mol, canonical=True)
        else:
            print(f"Unable to parse SMILES: {smiles}")
            return None
    except Exception as e:
        return None

df_vae_train['SMILES'] = df_vae_train['SMILES'].apply(canonicalize_smiles)
df_vae_train.dropna(subset=['SMILES'], inplace=True)
df_vae_train.drop_duplicates(subset=['SMILES'], inplace=True)
print("Dataset shape after preprocessing:", df_vae_train.shape)

df_vae_train['length'] = df_vae_train['SMILES'].apply(len)
max_length = df_vae_train['length'].max()
longest_smiles = df_vae_train[df_vae_train['length'] == max_length]
print("Maximum SMILES length in dataset:", max_length)
print("SMILES strings with the maximum length:")
print(longest_smiles['SMILES'])

preprocessed_file_path = r'C:\Users\mokht\Desktop\CBRC_ED_Project\data\preprocessed_composed_dataset_100k.csv'
df_vae_train.to_csv(preprocessed_file_path, index=False)

[01:54:49] Explicit valence for atom # 12 N, 4, is greater than permitted


Unable to parse SMILES: c1c(c(ncc1)CSCCN\C(=[NH]\C#N)NCC)Br


[01:54:50] Explicit valence for atom # 6 N, 4, is greater than permitted


Unable to parse SMILES: Cc1nc(sc1)\[NH]=C(\N)N


[01:54:51] Explicit valence for atom # 5 N, 4, is greater than permitted


Unable to parse SMILES: c1(cc(N\C(=[NH]\c2cccc(c2)CC)C)ccc1)CC


[01:54:53] Explicit valence for atom # 5 N, 4, is greater than permitted


Unable to parse SMILES: n1c(csc1\[NH]=C(\N)N)c1cccc(c1)NC(C)=O


[01:54:59] Explicit valence for atom # 1 N, 4, is greater than permitted


Unable to parse SMILES: O=N([O-])C1=C(CN=C1NCCSCc2ncccc2)Cc3ccccc3


[01:55:05] Explicit valence for atom # 6 N, 4, is greater than permitted
[01:55:05] Explicit valence for atom # 11 N, 4, is greater than permitted


Unable to parse SMILES: c1(nc(NC(N)=[NH2])sc1)CSCCNC(=[NH]C#N)NC
Unable to parse SMILES: s1cc(CSCCN\C(NC)=[NH]\C#N)nc1\[NH]=C(\N)N


[01:55:07] Explicit valence for atom # 5 N, 4, is greater than permitted


Unable to parse SMILES: n1c(csc1\[NH]=C(\N)N)c1ccccc1


[01:55:10] Explicit valence for atom # 5 N, 4, is greater than permitted


Unable to parse SMILES: s1cc(nc1\[NH]=C(\N)N)C


[01:55:15] Explicit valence for atom # 5 N, 4, is greater than permitted


Unable to parse SMILES: n1c(csc1\[NH]=C(\N)N)c1cccc(c1)N


[01:55:20] Explicit valence for atom # 5 N, 4, is greater than permitted


Unable to parse SMILES: n1c(csc1\[NH]=C(\N)N)c1cccc(c1)N\C(NC)=[NH]\C#N




Dataset shape after preprocessing: (99849, 1)
Maximum SMILES length in dataset: 100
SMILES strings with the maximum length:
59012    CC1(C)C(=O)CC[C@@]2(C)[C@H]3C(=O)C=C4[C@H]5C[C...
69959    CC1(C)[C@@H](O)CC[C@@]2(C)[C@H]3CC=C4[C@H]5C[C...
Name: SMILES, dtype: object


### 1.2 Reload Preprocessed Data

In [3]:
df_vae_train = pd.read_csv(preprocessed_file_path)
df_vae_train.head()

Unnamed: 0,SMILES,length
0,Cc1cc(NC(=O)COC(=O)COc2ccc3c4c(c(=O)oc3c2)CCC4...,50
1,Cn1c2c([N+](=O)[O-])cccc2c(=O)c2c(O)cc3c(c21)C...,57
2,C=C[C@]1(C)CC[C@H]2C(=C[C@@H]3OC(=O)[C@]4(C)[C...,73
3,COc1cc(CC[C@H](C[C@@H](OC(C)=O)[C@@H]2CCCCC[C@...,69
4,Cn1ccoc1=Nc1ccc(Cl)c(Cl)c1,26


## 2. Tokenizer Preparation

### 2.1 Load and Configure Custom Tokenizer

In [4]:
custom_tokenizer = Tokenizer(
    WordLevel.from_file(
        '/smiles-Word/vocab.json',
        unk_token='[UNK]'
    )
)
custom_tokenizer.pre_tokenizer = Split(
    pattern=Regex(r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|\+|\\|\/|:|@|\?|>|>>|\*|\$|\%[0-9A-Fa-f]{2}|[0-9])"),
    behavior='isolated'
)

hf_tokenizer = PreTrainedTokenizerFast(tokenizer_object=custom_tokenizer)
hf_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
hf_tokenizer.save_pretrained('/smiles-Word')

tokenizer = PreTrainedTokenizerFast(tokenizer_file="/smiles-Word/tokenizer.json")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

0

## 3. Dataset Preparation

### 3.1 Define Dataset Class

In [5]:
class SMILESDataset(Dataset):
    def __init__(self, smiles_list, tokenizer, max_length=512):
        self.smiles_list = smiles_list
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, idx):
        smiles = self.smiles_list[idx]
        input_encodings = self.tokenizer(smiles, truncation=True, padding='max_length', max_length=self.max_length)
        target_encodings = self.tokenizer(smiles, truncation=True, padding='max_length', max_length=self.max_length)
        return {
            'input_ids': torch.tensor(input_encodings['input_ids']),
            'attention_mask': torch.tensor(input_encodings['attention_mask']),
            'labels': torch.tensor(target_encodings['input_ids'])
        }

smiles_data = df_vae_train['SMILES'].tolist()
dataset = SMILESDataset(smiles_data, tokenizer, max_length=max_length)

### 3.2 Verify Tokenization

In [11]:
for i in range(2):
    smiles = smiles_data[i]
    tokenized_output = tokenizer(smiles, truncation=True, padding='max_length', max_length=max_length)
    print(f"Original SMILES: {smiles}")
    print(f"Tokenized input_ids: {tokenized_output['input_ids']}")
    print(f"Tokenized tokens   : {tokenizer.convert_ids_to_tokens(tokenized_output['input_ids'])}")
    print("-" * 80)

Original SMILES: Cc1cc(NC(=O)COC(=O)COc2ccc3c4c(c(=O)oc3c2)CCC4)no1
Tokenized input_ids: [16, 15, 20, 15, 15, 17, 23, 16, 17, 22, 19, 18, 16, 19, 16, 17, 22, 19, 18, 16, 19, 15, 21, 15, 15, 15, 26, 15, 32, 15, 17, 15, 17, 22, 19, 18, 44, 15, 26, 15, 21, 18, 16, 16, 16, 32, 18, 25, 44, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Tokenized tokens   : ['C', 'c', '1', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'C', 'O', 'C', '(', '=', 'O', ')', 'C', 'O', 'c', '2', 'c', 'c', 'c', '3', 'c', '4', 'c', '(', 'c', '(', '=', 'O', ')', 'o', 'c', '3', 'c', '2', ')', 'C', 'C', 'C', '4', ')', 'n', 'o', '1', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', 

## 4. Model Training

### 4.1 Load and Train T5 Model

In [12]:
model = T5ForConditionalGeneration.from_pretrained('t5-small')

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/24964 [00:00<?, ?it/s]

{'loss': 0.514, 'grad_norm': 0.631125807762146, 'learning_rate': 4.9018193475995835e-05, 'epoch': 0.04}
{'loss': 0.0499, 'grad_norm': 1.0503255128860474, 'learning_rate': 4.801635008415485e-05, 'epoch': 0.08}
{'loss': 0.0305, 'grad_norm': 0.3795585036277771, 'learning_rate': 4.701450669231386e-05, 'epoch': 0.12}
{'loss': 0.0209, 'grad_norm': 0.489401638507843, 'learning_rate': 4.6012663300472876e-05, 'epoch': 0.16}
{'loss': 0.017, 'grad_norm': 0.8533644676208496, 'learning_rate': 4.5010819908631885e-05, 'epoch': 0.2}
{'loss': 0.0136, 'grad_norm': 0.2048753798007965, 'learning_rate': 4.4008976516790894e-05, 'epoch': 0.24}
{'loss': 0.0118, 'grad_norm': 0.3870348632335663, 'learning_rate': 4.300713312494991e-05, 'epoch': 0.28}
{'loss': 0.0108, 'grad_norm': 0.3495531380176544, 'learning_rate': 4.200528973310892e-05, 'epoch': 0.32}
{'loss': 0.0096, 'grad_norm': 0.2877950668334961, 'learning_rate': 4.1003446341267935e-05, 'epoch': 0.36}
{'loss': 0.0107, 'grad_norm': 0.5881220698356628, 'lear

TrainOutput(global_step=24964, training_loss=0.016849695425674598, metrics={'train_runtime': 4357.5372, 'train_samples_per_second': 45.828, 'train_steps_per_second': 5.729, 'train_loss': 0.016849695425674598, 'epoch': 2.0})

### 4.2 Save the Fine-tuned Model and Tokenizer

In [13]:
model_save_path = r'C:\Users\mokht\Desktop\CBRC_ED_Project\saved_models\t5_smiles_model'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

('C:\\Users\\mokht\\Desktop\\CBRC_ED_Project\\saved_models\\t5_smiles_model\\tokenizer_config.json',
 'C:\\Users\\mokht\\Desktop\\CBRC_ED_Project\\saved_models\\t5_smiles_model\\special_tokens_map.json',
 'C:\\Users\\mokht\\Desktop\\CBRC_ED_Project\\saved_models\\t5_smiles_model\\tokenizer.json')

## 5. Model Evaluation

### 5.1 Load Fine-tuned Model and Tokenizer

In [15]:
# Load the fine-tuned model and tokenizer
model_save_path = r'C:\Users\mokht\Desktop\CBRC_ED_Project\saved_models\t5_smiles_model'
model = T5ForConditionalGeneration.from_pretrained(model_save_path)
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_save_path)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

### 5.2 Encoder Embeddings and Token Count

In [16]:
def get_encoder_embeddings_and_count_tokens(model, tokenizer, smiles):
    model.eval()
    inputs = tokenizer(smiles, return_tensors="pt", padding=True, truncation=True, max_length=110)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    token_count = inputs['input_ids'].shape[1]
    with torch.no_grad():
        encoder_outputs = model.encoder(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
    return encoder_outputs.last_hidden_state, token_count

smiles_string = "O=Cc1ccc2occ(-c3nnn[nH]3)c(=O)c2c1"
embeddings, token_count = get_encoder_embeddings_and_count_tokens(model, tokenizer, smiles_string)
print(f"String's Length: {len(smiles_string)}")
print(f"Number of tokens: {token_count}")
print(f"Embedding: {embeddings}")
print(f"Embedding's Shape: {embeddings.shape}")

String's Length: 34
Number of tokens: 31
Embedding: tensor([[[-0.0794, -0.0196, -0.1027,  ...,  0.1986, -0.0344,  0.1155],
         [-0.2519, -0.1162, -0.0213,  ...,  0.0628,  0.0564,  0.0233],
         [-0.0664,  0.0070, -0.0521,  ..., -0.1870,  0.1103,  0.0472],
         ...,
         [-0.1752,  0.0316, -0.0411,  ...,  0.1669, -0.0772,  0.0534],
         [ 0.0467, -0.2008,  0.1495,  ...,  0.1188, -0.0050,  0.0150],
         [-0.2719,  0.0716,  0.0740,  ..., -0.3742,  0.2752, -0.1003]]],
       device='cuda:0')
Embedding's Shape: torch.Size([1, 31, 512])


### 5.3 SMILES Encoding and Decoding

In [17]:
def encode_decode_smiles(model, tokenizer, smiles, max_length=110):
    model.eval()
    inputs = tokenizer(smiles, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=max_length,
            num_beams=5,
            early_stopping=True
        )
    decoded_smiles = "".join(tokenizer.convert_ids_to_tokens(generated_ids[0], skip_special_tokens=True))
    return decoded_smiles

smiles_string = "O=Cc1ccc2occ(-c3nnn[nH]3)c(=O)c2c1"
reconstructed_smiles = encode_decode_smiles(model, tokenizer, smiles_string, max_length=110)
print(f"Original: {smiles_string}, Reconstructed: {reconstructed_smiles}")

Original: O=Cc1ccc2occ(-c3nnn[nH]3)c(=O)c2c1, Reconstructed: O=Cc1ccc2occ(-c3nnn[nH]3)c(=O)c2c1


## 6. Model Testing

### 6.1 Load and Preprocess Test Data

In [18]:
file_path_vae_test = r'C:\Users\mokht\Desktop\CBRC_ED_Project\data\composed_dataset_500k.csv'
df_vae_test = pd.read_csv(file_path_vae_test, names=['SMILES'])
df_vae_test['SMILES'] = df_vae_test['SMILES'].apply(canonicalize_smiles)
df_vae_test.dropna(subset=['SMILES'], inplace=True)
df_vae_test.drop_duplicates(subset=['SMILES'], inplace=True)
print("Dataset shape after preprocessing:", df_vae_test.shape)

df_vae_test['length'] = df_vae_test['SMILES'].apply(len)
max_length = df_vae_test['length'].max()
longest_smiles = df_vae_test[df_vae_test['length'] == max_length]
print("Maximum SMILES length in dataset:", max_length)
print("SMILES strings with the maximum length:")
print(longest_smiles['SMILES'])
df_vae_test.head()

preprocessed_file_path = r'C:\Users\mokht\Desktop\CBRC_ED_Project\data\preprocessed_composed_dataset_500k.csv'
df_vae_test.to_csv(preprocessed_file_path, index=False)

[11:23:11] Explicit valence for atom # 6 N, 4, is greater than permitted


Unable to parse SMILES: Cc1nc(sc1)\[NH]=C(\N)N


[11:23:12] Explicit valence for atom # 5 N, 4, is greater than permitted


Unable to parse SMILES: c1(cc(N\C(=[NH]\c2cccc(c2)CC)C)ccc1)CC


[11:23:23] Explicit valence for atom # 1 N, 4, is greater than permitted


Unable to parse SMILES: O=N([O-])C1=C(CN=C1NCCSCc2ncccc2)Cc3ccccc3


[11:23:51] Explicit valence for atom # 5 N, 4, is greater than permitted


Unable to parse SMILES: n1c(csc1\[NH]=C(\N)N)c1cccc(c1)N\C(NC)=[NH]\C#N


[11:24:14] Explicit valence for atom # 5 N, 4, is greater than permitted


Unable to parse SMILES: n1c(csc1\[NH]=C(\N)N)c1cccc(c1)NC(C)=O


[11:24:30] Explicit valence for atom # 11 N, 4, is greater than permitted


Unable to parse SMILES: s1cc(CSCCN\C(NC)=[NH]\C#N)nc1\[NH]=C(\N)N


[11:25:29] Explicit valence for atom # 6 N, 4, is greater than permitted


Unable to parse SMILES: c1(nc(NC(N)=[NH2])sc1)CSCCNC(=[NH]C#N)NC


[11:25:32] Explicit valence for atom # 5 N, 4, is greater than permitted


Unable to parse SMILES: n1c(csc1\[NH]=C(\N)N)c1ccccc1


[11:25:42] Explicit valence for atom # 5 N, 4, is greater than permitted


Unable to parse SMILES: n1c(csc1\[NH]=C(\N)N)c1cccc(c1)N


[11:26:08] Explicit valence for atom # 12 N, 4, is greater than permitted


Unable to parse SMILES: c1c(c(ncc1)CSCCN\C(=[NH]\C#N)NCC)Br


[11:26:37] Explicit valence for atom # 5 N, 4, is greater than permitted


Unable to parse SMILES: s1cc(nc1\[NH]=C(\N)N)C




Dataset shape after preprocessing: (499593, 1)
Maximum SMILES length in dataset: 100
SMILES strings with the maximum length:
74862     CC1(C)C[C@@H]2C3=CC[C@@H]4[C@]5(C)CC[C@H](O)C(...
351945    CC1(C)[C@@H](O)CC[C@@]2(C)[C@H]3CC=C4[C@@H]5C[...
Name: SMILES, dtype: object


Unnamed: 0,SMILES,length
0,Cn1c(SCc2noc(-c3ccsc3)n2)nnc1C1CCS(=O)(=O)C1,44
1,C=C1CCC2C(C)(CO)C(O)CCC2(C)C1CC(OC(=O)CC)C1=CC...,51
2,Brc1ccc(-c2cn3nc(Cc4noc5ccccc45)sc3n2)cc1,41
3,CCOC(=O)CC(=O)CSc1nc(-c2ccccc2)cc(-c2ccc(Cl)cc...,53
4,CCCCCCOC(=O)[C@]12O[C@@]1(C/C(CO)=C(\C)CC1(c3c...,79


### 6.2 Reload Preprocessed Test Data

In [19]:
preprocessed_file_path = r'C:\Users\mokht\Desktop\CBRC_ED_Project\data\preprocessed_composed_dataset_500k.csv'
df_vae_test = pd.read_csv(preprocessed_file_path)
df_vae_test.head()

Unnamed: 0,SMILES,length
0,Cn1c(SCc2noc(-c3ccsc3)n2)nnc1C1CCS(=O)(=O)C1,44
1,C=C1CCC2C(C)(CO)C(O)CCC2(C)C1CC(OC(=O)CC)C1=CC...,51
2,Brc1ccc(-c2cn3nc(Cc4noc5ccccc45)sc3n2)cc1,41
3,CCOC(=O)CC(=O)CSc1nc(-c2ccccc2)cc(-c2ccc(Cl)cc...,53
4,CCCCCCOC(=O)[C@]12O[C@@]1(C/C(CO)=C(\C)CC1(c3c...,79


### 6.3 Evaluate Model Performance


In [20]:
random.seed(42)
sampled_df = df_vae_test.sample(n=500, random_state=42)
sampled_smiles_list = sampled_df['SMILES'].tolist()

def exact_match_accuracy(original_smiles_list, reconstructed_smiles_list):
    correct = 0
    mismatches = []
    for orig, recon in zip(original_smiles_list, reconstructed_smiles_list):
        if orig == recon:
            correct += 1
        else:
            mismatches.append((orig, recon))
    total = len(original_smiles_list)
    accuracy = correct / total
    return accuracy, mismatches

def average_levenshtein_distance(original_smiles_list, reconstructed_smiles_list):
    total_distance = sum([Levenshtein.distance(orig, recon) for orig, recon in zip(original_smiles_list, reconstructed_smiles_list)])
    total = len(original_smiles_list)
    return total_distance / total

def validity_of_generated_smiles(reconstructed_smiles_list):
    valid_count = sum([1 for s in reconstructed_smiles_list if Chem.MolFromSmiles(s) is not None])
    total = len(reconstructed_smiles_list)
    return valid_count / total

def average_tanimoto_similarity(original_smiles_list, reconstructed_smiles_list):
    similarities = []
    morgan_gen = GetMorganGenerator(radius=2, fpSize=2048)
    for orig, recon in tqdm(zip(original_smiles_list, reconstructed_smiles_list), total=len(original_smiles_list), desc="Calculating Tanimoto Similarity"):
        mol_orig = Chem.MolFromSmiles(orig)
        mol_recon = Chem.MolFromSmiles(recon)
        if mol_orig and mol_recon:
            fp_orig = morgan_gen.GetFingerprint(mol_orig)
            fp_recon = morgan_gen.GetFingerprint(mol_recon)
            similarity = DataStructs.TanimotoSimilarity(fp_orig, fp_recon)
            similarities.append(similarity)
    return sum(similarities) / len(similarities)

original_smiles_list = sampled_smiles_list
reconstructed_smiles_list = []

for s in tqdm(original_smiles_list, desc="Evaluating"):
    reconstructed_smiles_list.append(encode_decode_smiles(model, tokenizer, s, max_length=110))

accuracy, mismatches = exact_match_accuracy(original_smiles_list, reconstructed_smiles_list)
print(f"Exact Match Accuracy: {accuracy}")

print("Mismatched SMILES strings:")
for orig, recon in mismatches[:10]:
    print(f"Original     : {orig}")
    print(f"Reconstructed: {recon}")

average_distance = average_levenshtein_distance(original_smiles_list, reconstructed_smiles_list)
print(f"Average Levenshtein Distance: {average_distance}")

validity = validity_of_generated_smiles(reconstructed_smiles_list)
print(f"Validity of Generated SMILES: {validity}")

average_similarity = average_tanimoto_similarity(original_smiles_list, reconstructed_smiles_list)
print(f"Average Tanimoto Similarity: {average_similarity}")

Evaluating: 100%|██████████| 500/500 [14:42<00:00,  1.76s/it]


Exact Match Accuracy: 0.994
Mismatched SMILES strings:
Original     : CCCCCCCC/C=C\CCCCCCCCCCCCCC(=O)OC[C@H](COC(=O)CCCCCCCCCCCCCC)OCCCCCCCCCCCCCCCCCC
Reconstructed: CCCCCCCC/C=C\CCCCCCCCCCCCCC(=O)OC[C@H](COC(=O)CCCCCCCCCCCCCC)OCCCCCCCCCCCCCCCCCCC
Original     : CCCCCCCCCCCCCC[C@H]1O[C@H]1CC/C=C\CCCCCCCCCCC1=C[C@@H](C)OC1=O
Reconstructed: CCCCCCCCCCCCCCC[C@H]1O[C@H]1CC/C=C\CCCCCCCCCCC1=C[C@@H](C)OC1=O
Original     : CC(C)=CCC/C(C)=C/C=C/C(C)=C/C=C/C(C)=C/C=C/C=C(C)/C=C/C=C(C)/C=C/C=C(\C)CCC=C(C)C
Reconstructed: CC(C)=CCC/C(C)=C/C=C/C(C)=C/C=C/C(C)=C/C=C/C=C(C)/C=C/C=C(C)/C=C/C=C(C)/C=C/C=C(\C)CCC=C(C)C
Average Levenshtein Distance: 0.026
Validity of Generated SMILES: 1.0


Calculating Tanimoto Similarity: 100%|██████████| 500/500 [00:00<00:00, 1221.59it/s]

Average Tanimoto Similarity: 1.0



