In [1]:
import logging

# Root logger at INFO level, formatted as "timestamp — level — logger — message";
# the aitextgen log lines shown in later cell outputs use this layout.
logging.basicConfig(
    format="%(asctime)s — %(levelname)s — %(name)s — %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)

# aitextgen entry points used by this notebook (plus Colab / dataset helpers).
from aitextgen import aitextgen
from aitextgen.colab import mount_gdrive, copy_file_from_gdrive
from aitextgen.TokenDataset import TokenDataset, merge_datasets
from aitextgen.utils import build_gpt2_config
from aitextgen.tokenizers import train_tokenizer

## Train the Tokenizer

Let's train a Byte-Pair Encoding (BPE) tokenizer on the ZINC250K dataset. The `train_tokenizer()` function in aitextgen wraps the training method of the `tokenizers` package from Hugging Face.

In [2]:
# Text corpus used to fit the tokenizer; the same file is used for model training later on.
data_file = "zinc_valid.txt"
# Per the log output below, this saves aitextgen-vocab.json and aitextgen-merges.txt
# to the current directory; both files are needed to build the tokenizer.
train_tokenizer(data_file)

02/19/2021 00:30:20 — INFO — aitextgen.tokenizers — Saving aitextgen-vocab.json and aitextgen-merges.txt to the current directory. You will need both files to build the GPT2Tokenizer.


## Specify a Model Configuration

Let's specify the model config parameters and build a small GPT-2 model (roughly 86 million parameters with the configuration below).

In [3]:
# GPT-2 architecture settings, one per line so each is easy to tune.
config = build_gpt2_config(
    # NOTE(review): presumably the vocabulary size produced by train_tokenizer() —
    # confirm against aitextgen-vocab.json.
    vocab_size=868,
    max_length=512,  # reported as n_ctx / n_positions in the config shown below
    dropout=0.0,     # reported as attn_pdrop / embd_pdrop / resid_pdrop below
    n_embd=768,
    n_layer=12,
    n_head=12,
)
# Display the resulting GPT2Config as the cell output.
config

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.0,
  "bos_token_id": 0,
  "embd_pdrop": 0.0,
  "eos_token_id": 0,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 512,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 512,
  "resid_pdrop": 0.0,
  "summary_activation": null,
  "summary_first_dropout": 0.0,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.2.2",
  "use_cache": true,
  "vocab_size": 868
}

## Instantiate a custom GPT-2 model

Instantiate the GPT-2 model using the specified config and custom tokenizer we trained.

In [4]:
# Construct a GPT-2 model from `config` and pair it with the custom tokenizer
# files saved by train_tokenizer().
ai = aitextgen(
    config=config,
    vocab_file="aitextgen-vocab.json",
    merges_file="aitextgen-merges.txt",
    to_gpu=True,  # NOTE(review): presumably requires an available CUDA device — confirm
)

02/19/2021 00:30:24 — INFO — aitextgen — Constructing GPT-2 model from provided config.
02/19/2021 00:30:26 — INFO — aitextgen — Using a custom tokenizer.


Let's try randomly generating a molecule now. We should get junk, since we haven't trained the model yet.

In [5]:
# Sanity check: sample once from the still-untrained model; the output is expected to be noise.
ai.generate(1)

[1m[0mK53BrCcccnncCCCCCnOCCOCCNCCOCCOCcSCCCS&ns+])/OnSCCNOCCSnoc8�OCCOCCNCCCNOCCOCCNNCCCOCcNOCCCCOCCCNcNCCCNSSCCCSCCOCCCNcSCCScCCCCOOCnCCCCCOCNCCCCCCNCCCcCSCCCcSCCCS+SCCScCCCCCOCCsnc�	OCNCCCCCCsncCCCCCOCC45CCCcnoc�CCOCCOCcBrCCOCCCNcCCCCCOCCCCCCCNCCCCCNONCCCOCCCNcCCCCCnCOCCOCCNCCCNS(=NCCCNS	�CCCCCOCCCCCCCOCC+](/COCCOCC�NCCCCCCNCCCCCCCSCCOCCOCCNCCOCCCSocncOCCOCCNCSCCNCBrCcsncCCOcncccn�31�cncONC�COCCOCC�53@@](/NCCCcT�CCCc�(=�SCCCS�sncconc�CSCCNCsncCCCCCN�BrCcSCCCSCCCCCOCCNCCCCCCCCCSCOSsncNNNOCnCCCCCOCCCCCCCOCCOCCOCCNNCCOCCCCCCOCCnoc�coSCCSc+ClCc&ClCCcncnc�NCCCc@](=+])/�COCCOCCCSCCSCCOCCOCCOCCCCCCCN��SCCCS45COCCOCC�NS�NCCCcCCCSCcSCCCS��NCCCCCCCSCCSNCCCcCCOCCNC+])(31�@@](/NCCCcCOCCOCTNCCCcCSCCCc�TNCCCcCCOCCNCCOSOCCcOCCcCSCCNCCSCCOOCCOCCN45BrCcx	SSN(-45CCCCCOCCNNCcCCSCSCCCnNCCOncncBrCcNCCOCOCCOCCOCCOCCOCOCCOC+](/	SCCCS��OC@@](/cocOCCS31NCCCCCCNCnOCCS�NCCCNSccccCCCcOCCSNCCO&NCCCCCCCCSNCCCcSCCScCOCCOCCCCCCCNOCCOCCOCCOCCOCSCCOCCCCCN�CSCCO�NCCCcSCCScCOCCOCCCCSCCOCCOC��NCCCNS�SCCCN@

## Train the GPT-2 model on ZINC250K

In [25]:
# Train on the same corpus the tokenizer was fitted on. `data_file` is defined in
# the tokenizer cell; reusing it (instead of repeating the literal path) keeps the
# tokenizer and the model pointed at one file.
ai.train(data_file,
         line_by_line=True,    # NOTE(review): presumably one training sample per line of the file — confirm
         num_steps=5000,       # total number of training steps
         generate_every=1000,  # sample texts are printed every 1,000 steps (see output)
         save_every=1000,      # the model is saved every 1,000 steps (see output)
         save_gdrive=False,
         batch_size=8,
         n_gpu=1
         )

  0%|          | 0/240133 [00:00<?, ?it/s]

02/19/2021 00:47:07 — INFO — aitextgen.TokenDataset — Encoding 240,133 sets of tokens from zinc_valid.txt.
GPU available: True, used: True
02/19/2021 00:47:12 — INFO — lightning — GPU available: True, used: True
TPU available: None, using: 0 TPU cores
02/19/2021 00:47:12 — INFO — lightning — TPU available: None, using: 0 TPU cores
02/19/2021 00:47:12 — INFO — pytorch_lightning.accelerators.gpu — LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


  0%|          | 0/5000 [00:00<?, ?it/s]

[1m1,000 steps reached: saving model to /trained_model[0m
[1m1,000 steps reached: generating sample texts.[0m
[)C)cc1
<|endoftext|>O=C[C@@H]1C[C)N2ccccc2)n1
<|endoftext|>CN[C@H](c1ccccc1
<|endoftext|>O)c1ccccc1ccc(NC[C)C@@H]2
<|endoftext|>O=C)[C@H](c1nnc(F)c3)[nH]c1
<|endoftext|>O=C[C@@H](C/C@H]1(CC(C/NC(C)]2)c1ccc(F)cc1
<|endoftext|>C(C(=O)Cc1ccc(OC)c1ccc1ccc(Cl)C)CC(=O))
<|endoftext|>O=C[C(c1cc(ccc(-c2ccc(F)c(OC
[1m2,000 steps reached: saving model to /trained_model[0m
[1m2,000 steps reached: generating sample texts.[0m
O)1C@H](C[C@@H]1CC(C)C)C)c2)c1
<|endoftetext|>COc)C(C)NCc1ccccc1F
<<|endoftext|>CCc1ccc(C)c2cc(Cl)c(C)c(Cl)cc2)nc(F)c(C)ccc2)cc1
<|endoftext|>CC[NH+](C@H](H](C[C@H](c1cccc(N=C)C)C)c1
<|endoftext|>CC[C@@@H]3+])3[NH2+]Cc3ccccc2+]C3)cc1
<|endoftext|>C[C@@H](C)C[C@H](NC(=O)Cn1ccc(F)F)cc1
<|endoftext|>O=C1C(NC(=O)Nc2
[1m3,000 steps reached: saving model to /trained_model[0m
[1m3,000 steps reached: generating sample texts.[0m
]1C[C@@H]1CCCN1C[C@@H]2c1
<|endoftex

02/19/2021 01:03:18 — INFO — aitextgen — Saving trained model pytorch_model.bin to /trained_model


## Generate molecules from the trained GPT-2 model

In [26]:
# Request 5 generations from the model trained above.
ai.generate(5)

[1m[0mtext|>COc1ccc(NC(=O)c2ccc(NC(=O)c3ccccc3)o2)cc1
<|endoftext|>CC(=O)N1CCC(C(=O)NCC(Cc2ccccc2)C[C@@H](C)O1
<|endoftext|>Cc1ccc(NC(=O)[C@@H](C)c2ccc3c(c3ccccc3)o2)cc1=O
<|endoftext|>C[C@H]1C[C@H]([NH2+][C@@H]1C[C@H]1c1ccc(Cl)cc1
<|endoftext|>C[C@H](NC(=O)c1nccn1Cl)c1ccccc1)c1ccccc1
<|endoftext|>Cc1cccc(N2C(=O)c3ccc(C)c(C)c4)c(C)C)c3
[1m[0m1
(C)cc2cc(NC(=O)N(C)C2)C2)[C@@H](C)O[C@@H]1C(=O)[O-]
<|endoftext|>NC(=O)c1cccc(C[NH+](C)Cc2ccccc2)c1
<|endoftext|>C[C@@H](C(=O)NCC(=O)c1ccc(S(=O)(=O)c(Cl)c1)N1CCc2ccccc2CN1
<|endoftext|>COc1ccc(C(=O)N2C[C@@C@@H]3c4ccccc4C2=O)cc1
<|endoftext|>CC(C)Oc1ccc(F)cc1
<|endoftext|>O=C(NC[C@@H]1CCOC(c2ccc(Cl)cc2)C1
<|endoftext|>C[C
[1m[0mc3ccc(Cl)cc3)cc2)cc1
<|endoftext|>CC(=O)Nc1ccc(C)c(CCNC(=O)c2ccc(C)cc2)c(C)c(N)c1
<|endoftext|>CO[C@@H]1CCC[C@@H]([NH2+][C@@H]1C[C@H](C)C(=O)OCC1
<|endoftext|>C[C@H]1CC1)NC(=O)C[C@H]1CCC[C@@C@@H]1C
<|endoftext|>Cc1cccc(C(=O)N(C)Cc1
<|endoftext|>Cc1ccccc1-c1ccc(NC(=O)c2c1
<|endoftext|>C[C@H](NC(=O)N[C@H]1CCCC[C@@H]1C


Not all of these molecules are valid SMILES strings, but the results are impressive given that we trained on only ~240K molecules (the 240,133 lines of `zinc_valid.txt`) for 5,000 steps.