In [4]:
import torch
from torch.utils.data import Dataset
import random
from rdkit import Chem
import pickle

from calc_property import calculate_property

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
class SMILESDataset(Dataset):
    def __init__(self, data_path, data_length=None, shuffle=False):
        with open(data_path,'r') as f:
            lines = f.readlines()
        self.data = [l.strip() for l in lines]

        with open('./normalize.pkl', 'rb') as w:
            norm = pickle.load(w)
        self.property_mean, self.property_std = norm

        if shuffle:
            random.shuffle(self.data)
        
        ## Why need this line? ##
        if data_length is not None:
            self.data = self.data[data_length[0]: data_length[1]]
            
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        smiles = 'Q' + self.data[idx]
        properties = (calculate_property(smiles[1:])-self.property_std) / self.property_mean
        
        return smiles, properties

In [6]:
sampleDataset = SMILESDataset(data_path='./data/pubchem-1m-simple.txt', data_length=None, shuffle=False)
sample = sampleDataset.__getitem__(0)

sample

('QCN(c1ccccc1)c1ccccc1C(=O)NCC1(O)CCOCC1',
 tensor([  0.5682,   0.4030,   0.6777,   0.6901,   0.6560,   0.6904,   0.6890,
           0.6158,   0.6544,   0.3731,   0.6623,  -1.0214,   0.5386,  -6.4296,
           0.6370,   0.7405,   0.7696,   0.8429,   0.2686,   1.6701,   0.6852,
           0.6222,   0.6510,   0.5670,   0.3475,   0.6824,   0.8768,   0.8768,
          -0.1836,   2.4966,   0.1489,   0.7014,   0.6364,   0.2755,   0.4956,
          -2.3909,   0.4995,   0.0625,   0.7835,  -1.1521,   0.3690,   0.4610,
           0.6192,   0.3638, -29.1460,   0.2716,  -2.7021,   0.9223,   0.2647,
           0.6883,   0.5710,   0.4035,   1.0784]))

In [7]:
with open('./data/pubchem-1m-simple.txt', 'r') as f:
    lines = f.readlines()
    
with open('./normalize.pkl', 'rb') as w:
    norm = pickle.load(w)
    print(norm)

    

(tensor([ 2.0210e+00,  7.5875e+02,  1.7544e+01,  1.4052e+01,  1.4723e+01,
         1.1664e+01,  8.1625e+00,  8.8090e+00,  6.2020e+00,  6.9684e+00,
         4.2730e+00,  4.9467e+00,  2.9353e+00,  3.5438e+00,  3.5288e+02,
         1.1951e+00,  1.9034e+00,  2.5342e+00,  4.1357e-01, -2.1272e+00,
         2.4343e+01,  3.3127e+02,  1.7561e+01,  7.6679e+00,  4.3503e+00,
         1.4605e+02,  1.1184e+01,  1.1184e+01,  1.9147e-01, -9.6040e-01,
         2.6799e+00,  9.4141e+01,  3.5333e+02,  1.7290e+00,  5.1585e+00,
         2.6400e-01,  5.4614e-01,  8.1014e-01,  1.1892e+00,  7.1781e-01,
         1.9070e+00,  4.0690e+00,  1.3547e+00,  6.2828e+00,  4.5800e-03,
         5.4205e+00,  2.0371e-01,  3.9774e-01,  6.0145e-01,  1.2960e+02,
         2.7171e+00,  6.8247e+01,  6.1739e-01]), tensor([5.9750e-01, 4.0613e+02, 5.8108e+00, 4.7454e+00, 4.7843e+00, 4.0368e+00,
        2.9280e+00, 3.1273e+00, 2.3723e+00, 3.8311e+00, 1.8252e+00, 9.7076e+00,
        1.4360e+00, 2.5803e+01, 1.1541e+02, 2.3501e-01, 2.95

#### Define Custom Tokenizer 

In [8]:
import pandas as pd
from transformers import BertTokenizer

tokenizer = BertTokenizer(vocab_file= "./vocab_bpe_300.txt" ,lowercase=False, do_basic_tokenize=False)

In [9]:
df = pd.read_fwf('./vocab_bpe_300.txt', header=None)
df

Unnamed: 0,0
0,[PAD]
1,[UNK]
2,[CLS]
3,[SEP]
4,[MASK]
...,...
295,##c12
296,##[Si
297,##c(C(=O
298,##[nH+]


In [31]:
# from rdkit import Chem
# from transformers import BertTokenizer

# from typing import List
# import re

# ## REGEX_PATTERN ##
# SMI_REGEX_PATTERN =  r"(\%\([0-9]{3}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\||\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"

# class RegexTokenizer:
#     """Run regex tokenization"""

#     def __init__(self, regex_pattern: str=SMI_REGEX_PATTERN) -> None:
#         """Constructs a RegexTokenizer.
#         Args:
#             regex_pattern: regex pattern used for tokenization.
#             suffix: optional suffix for the tokens. Defaults to "".
#         """
#         self.regex_pattern = regex_pattern
#         self.regex = re.compile(self.regex_pattern)

#     def tokenize(self, text: str) -> List[str]:
#         """Regex tokenization.
#         Args:
#             text: text to tokenize.
#         Returns:
#             extracted tokens separated by spaces.
#         """
#         tokens = [token for token in self.regex.findall(text)]
#         return tokens
    
    
# class SMILESTokenizer(BertTokenizer):
#     def __init__(self, 
#         vocab_file: str,
#         unk_token: str = "[UNK]",
#         sep_token: str = "[SEP]",
#         pad_token: str = "[PAD]",
#         cls_token: str = "[CLS]",
#         mask_token: str = "[MASK]",
#         do_lower_case = False,
#         **kwargs,
#         ) -> None:
        
#         super().__init__(
#             vocab_file=vocab_file,
#             unk_token=unk_token,
#             sep_token=sep_token,
#             pad_token=pad_token,
#             cls_token=cls_token,
#             mask_token=mask_token,
#             do_lower_case=do_lower_case,
#             **kwargs,
#         )
        
#         self.tokenizer = RegexTokenizer()

In [1]:
pretrain_config = {
        'embed_dim': 256,#256
        'property_width': 384, #???
        'batch_size': 4,#64
        'temp': 0.07,
        'queue_size': 2048,#65536
        'momentum': 0.995,
        'alpha': 0.4,
        'bert_config': './config_bert.json',    #config file for BERT model. The configuration for ViT can be manually changed in albef.py
        'schedular': {'sched': 'cosine', 'lr': 1e-4, 'epochs': 30, 'min_lr': 1e-5,
                      'decay_rate': 1, 'warmup_lr': 1e-5, 'warmup_epochs': 20, 'cooldown_epochs': 0},
        'optimizer': {'opt': 'adamW', 'lr': 1e-4, 'weight_decay': 0.02}
    }

In [None]:
# SMILES Sequence tokenizer
import torch
import torch.nn.functional as F
from torch import nn 
from xbert import BertConfig 
from transformers import BertTokenizer, BertForMaskedLM


class SPMM(nn.Module):
    def __init__(self,
                 tokenizer=None,
                 config=None,
                 ):
        super().__init__()

        self.tokenizer = BertTokenizer('./vocab_bpe_300.txt', do_lower_case=False,do_basic_tokenize=False)
        embed_dim = config['embed_dim']

        smilesAndFusion_config = BertConfig.from_json_file('./config_bert_smiles_and_fusion_encoder.json')
        property_config = BertConfig.from_json_file('./config_bert_property_encoder.json')
        self.smilesEncoder = BertForMaskedLM(config = smilesAndFusion_config)
        self.propertyEncoder = BertForMaskedLM(config = property_config)

        smilesWidth = self.smilesEncoder.config.hidden_size
        propertyWidth = config['property_width']

        self.smilesProj = nn.Linear(smilesWidth, embed_dim)
        self.propertyProj = nn.Linear(propertyWidth, embed_dim)

        # special tokens & embedding for property input
        self.propertyEmbed = nn.Linear(1, propertyWidth)
        self.property_CLS = nn.parameter(torch.ones([1, 1, propertyWidth]))
        self.property_MASK = nn.parameter(torch.ones([1, 1, propertyWidth]))
        

        self.temp = nn.Parameter(torch.ones([]) * config['temp'])
        self.queue_size = config['queue_size']
        self.momentum = config['momentum']
        
        self.itm_head_smiles = nn.Linear(smilesWidth, 2)
        self.itm_head_properties = nn.Linear(propertyWidth, 2)

        # Momentum Model

        self.smilesEncoder_m = BertForMaskedLM(config = smilesAndFusion_config)
        self.propertyEncoder_m = BertForMaskedLM(config = property_config)
        self.smilesProj_m = nn.Linear(smilesWidth, embed_dim)
        self.propertyProj_m = nn.Linear(propertyWidth, embed_dim)

        self.model_pairs = [[self.smilesEncoder, self.smilesEncoder_m],
                            [self.smilesProj, self.smilesProj_m],
                            [self.propertyEncoder, self.propertyEncoder_m],
                            [self.propertyProj, self.propertyProj_m]]
        
        self.copy_params()

        # Create the queue
        self.register_buffer("smiles_queue", torch.randn(embed_dim, self.queue_size))
        self.register_buffer("text_queue", torch.randn(embed_dim, self.queue_size))
        self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long))

        self.image_queue = nn.functional.normalize(self.image_queue, dim=0)
        self.text_queue = nn.functional.normalize(self.text_queue, dim=0)



    def forward(self, smiles, property, alpha=0):
        
        with torch.no_grad():
            self.temp.clamp_(0.001, 0.5)

        #1. property tokenizing & embedding
        embedProperty = self.propertyEmbed(property.unsqueeze(2))
        
        property_MASK = self.property_MASK.expand(property.size(0), property.size(1), -1)
        mask50 = torch.bernoulli(torch.ones_like(property)*0.5)
        mask50Expand = mask50.unsqueeze(2).repeat(1,1,property_MASK.size(2))
        maskedProperty = embedProperty*(1-mask50Expand) + property_MASK * mask50Expand
        inputProperty = torch.cat([self.property_CLS.expand(property.size(0), -1,-1), maskedProperty], dim=-1)
         
        encodedProperty = self.propertyEncoder(inputs_embeds=property, return_dict=True).last_hidden_state

        #2. input throug encoders
        smilesEmbeds = self.smilesEncoder.bert(smiles, attention_mask = )
        

        #3-1. Contrastive Loss between the different modalities

        #3-2. Contrastive Loss within the same modalities

        #4. X-attention

        #5. Next property prediction

        #6. Next word prediction

        #7. SMILES-property matching 
    
    @torch.no_grad()
    def copy_params(self):
        for model_pair in self.model_pairs:
            for param, param_m in zip(model_pair[0].parameters(), model_pair[1].parameters()):
                param_m.data.copy_(param.data)  # initialize
                param_m.requires_grad = False

In [None]:
from xbert import BertConfig 
from transformers import BertTokenizer, BertForMaskedLM

smilesAndFusion_config = BertConfig.from_json_file('./config_bert_smiles_and_fusion_encoder.json')
smilesEncoder = BertForMaskedLM(config = smilesAndFusion_config)


In [None]:
sample[0]
sampleEmbed = tokenizer.encode(sample[0], padding='longest', truncation=True, max_length=100, return_tensors='pt')

In [None]:
propertyOriginal = torch.ones([32,53])
torch.ones_like(propertyOriginal)*0.5

mask50 = torch.bernoulli(torch.ones_like(propertyOriginal)*0.5)
mask50Exp = mask50.unsqueeze(2).repeat(1,1,10)


In [None]:
mask50Exp.shape

torch.Size([32, 53, 10])

In [None]:
sample[1].size(0)

53

In [None]:
from transformers import BertTokenizer

tokenizer_sample = BertTokenizer.from_pretrained("bert-base-cased")

sequence_a = "This is a short sequence."
sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."

encoded_sequence_a = tokenizer_sample(sequence_a)["input_ids"]
encoded_sequence_b = tokenizer_sample(sequence_b)["input_ids"]

Downloading: 100%|██████████| 213k/213k [00:00<00:00, 376kB/s]  
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading: 100%|██████████| 29.0/29.0 [00:00<00:00, 9.68kB/s]
Downloading: 100%|██████████| 570/570 [00:00<00:00, 189kB/s]


In [None]:
a = tokenizer_sample(sequence_b)
a

{'input_ids': [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}