In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data paths used by later cells live under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd

In [None]:
# Tab-separated file read without a header row; the three columns are named here.
train_srp53 = pd.read_csv(
    '/content/drive/MyDrive/Molecular Exploration/Data/sr-p53.smiles',
    sep='\t',
    header=None,
    names=['smiles', 'id', 'target'],
)

In [None]:
# Preview the first rows to check that the three named columns were parsed.
train_srp53.head()

Unnamed: 0,smiles,id,target
0,[I-].CCN1C(SC2=CC=CC=C12)=CC=CC3=[N+](CC)C4=CC...,NCGC00166288-01,1
1,[H][C@@]12[C@H](OC(=O)[C@@](O)(CCCC(C)(C)O)CC(...,NCGC00185752-01,1
2,Cl.CC(N)COC1=C(C)C=CC=C1C,NCGC00094121-01,0
3,CO.COC1=C(Cl)C=C(Cl)C(NC2=C(C=NC3=CC(OCCCN4CCN...,NCGC00241107-01,1
4,[H][C@]12SC(C)(C)[C@@H](N1C(=O)[C@@]2([H])NC(=...,NCGC00094586-01,0


In [None]:
# Number of rows (one SMILES string per row) in the training set.
len(train_srp53)

8634

In [None]:
# Sum the label column with the vectorised pandas reduction instead of a
# Python-level loop; with 0/1 labels (as in the preview) this is the number of
# positive samples. int() gives a plain Python integer for display.
int(train_srp53['target'].sum())

537

In [None]:
!pip install -q SmilesPE

### Tokenization of string compounds with SmilesPE (Byte pair encoding library with built-in tokenizers)

In [None]:
from SmilesPE.pretokenizer import atomwise_tokenizer

# Example: split one SMILES string into atom-level tokens. In the printed
# result the bracket atom '[N+]' and the two-letter element 'Br' each stay a
# single token.
smi = 'CC[N+](C)(C)Cc1ccccc1Br'
toks = atomwise_tokenizer(smi)
print(toks)

['C', 'C', '[N+]', '(', 'C', ')', '(', 'C', ')', 'C', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1', 'Br']


***example of pretrained SMILES byte-pair encoding***

In [None]:
import requests
file_url = 'https://raw.githubusercontent.com/XinhaoLi74/SmilesPE/master/SPE_ChEMBL.txt'

# Stream the pretrained SPE codes file to Drive in 1 KiB chunks.
# The context manager releases the connection when done, and the timeout stops
# the cell from hanging forever on a stalled connection.
with requests.get(file_url, stream=True, timeout=30) as r:
    # Fail loudly on an HTTP error so an error page is never saved as the codes file.
    r.raise_for_status()

    with open('/content/drive/MyDrive/Molecular Exploration/Data/BPE_codes.txt', 'wb') as file:
        for block in r.iter_content(chunk_size=1024):
            if block:  # skip empty keep-alive chunks
                file.write(block)

In [None]:
import codecs
# NOTE(review): star import kept so every name it exposed stays available to
# other cells; only SPE_Tokenizer is used in this cell.
from SmilesPE.tokenizer import *

# Open the codes file at the location the download cell writes it to. The
# previous path had an extra 'DATA_2040/' segment and raised FileNotFoundError.
# The handle is deliberately left open: a later cell passes `spe_vob` to
# SPE_Tokenizer again.
spe_vob= codecs.open('/content/drive/MyDrive/Molecular Exploration/Data/BPE_codes.txt')
spe = SPE_Tokenizer(spe_vob)

smi = 'CC[N+](C)(C)Cc1ccccc1Br'
bpe_encoding = spe.tokenize(smi)

# should get >>> 'CC [N+](C) (C)C c1ccccc1 Br'

FileNotFoundError: ignored

*The output of the byte-pair encoding is a space-separated string of tokens, each token being a string. The example output below would be the input sequence to a model.*

In [None]:
# Split the space-separated BPE string into a list of token strings.
bpe_encoding.split(' ')

NameError: ignored

### Looking at the byte-pair encoding alphabet across the whole dataset (~8,600 molecules)

In [None]:
# initialize the pretrained BP encoder
spe = SPE_Tokenizer(spe_vob)

# initialize empty vocabulary set
alphabet = set()

# traverse through data adding byte-pair tokens to vocabulary
for smi in train_srp53.smiles:
    bpe_encoding = spe.tokenize(smi)
    tkns = bpe_encoding.split(' ')
    # update() grows the set in place; union() built a brand-new copy of the
    # whole alphabet on every iteration.
    alphabet.update(tkns)

***The alphabet observed in this training set has 1096 tokens, whereas the full vocabulary of the pretrained BPE encoder is ~3000 tokens. Open question: how do we prepare for test samples that contain tokens unseen in the training set?***

In [None]:
# Number of distinct byte-pair tokens collected from the training SMILES.
len(alphabet)

NameError: ignored

In [None]:
from matplotlib import pyplot as plt

In [None]:
def smiles_to_token(row):
    """Return the atom-wise token list for the SMILES string in ``row['smiles']``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a 'smiles' entry.

    Returns:
        Whatever ``atomwise_tokenizer`` returns for that string (a list of
        token strings in the example earlier in this notebook).
    """
    return atomwise_tokenizer(row['smiles'])

# Only the 'smiles' column is needed, so map the tokenizer over that column
# directly instead of building a full row object per sample with
# DataFrame.apply(axis=1) and a pass-through lambda.
train_srp53['tokens'] = train_srp53['smiles'].map(atomwise_tokenizer)

In [None]:
# Preview again to confirm the new 'tokens' column was added.
train_srp53.head()

Unnamed: 0,smiles,id,target,tokens
0,[I-].CCN1C(SC2=CC=CC=C12)=CC=CC3=[N+](CC)C4=CC...,NCGC00166288-01,1,"[[I-], ., C, C, N, 1, C, (, S, C, 2, =, C, C, ..."
1,[H][C@@]12[C@H](OC(=O)[C@@](O)(CCCC(C)(C)O)CC(...,NCGC00185752-01,1,"[[H], [C@@], 1, 2, [C@H], (, O, C, (, =, O, ),..."
2,Cl.CC(N)COC1=C(C)C=CC=C1C,NCGC00094121-01,0,"[Cl, ., C, C, (, N, ), C, O, C, 1, =, C, (, C,..."
3,CO.COC1=C(Cl)C=C(Cl)C(NC2=C(C=NC3=CC(OCCCN4CCN...,NCGC00241107-01,1,"[C, O, ., C, O, C, 1, =, C, (, Cl, ), C, =, C,..."
4,[H][C@]12SC(C)(C)[C@@H](N1C(=O)[C@@]2([H])NC(=...,NCGC00094586-01,0,"[[H], [C@], 1, 2, S, C, (, C, ), (, C, ), [C@@..."


In [None]:
# Collect every distinct atom-wise token that occurs in the training SMILES.
vocab = set()
for smi in train_srp53.smiles:
  tok = atomwise_tokenizer(smi)
  # update() adds the tokens in place; union() copied the whole vocabulary
  # on every iteration.
  vocab.update(tok)

def CountFrequency(my_list):
    """Count how many times each item occurs in ``my_list``.

    Args:
        my_list: Iterable of hashable items (here, the tokens of one molecule).

    Returns:
        A plain ``dict`` mapping each distinct item to its number of
        occurrences, keyed in first-seen order. Empty input gives ``{}``.
    """
    # Local import keeps this cell runnable on its own.
    from collections import Counter

    # Counter does the tallying; convert back to a plain dict so callers get
    # the same return type as before.
    return dict(Counter(my_list))

# Per-token statistics gathered in a single pass over the tokenized rows.
token_appears_once = {}               # token -> number of rows containing it at least once
token_freq = {}                       # token -> total occurrences over all rows
token_prop = {k:[] for k in vocab}    # token -> per-row share of that row's tokens
smile_lengths = []                    # number of tokens in each row

# Only the 'tokens' column is needed, so iterate over it directly rather than
# materialising a full row object per sample with iterrows().
for row_tokens in train_srp53['tokens']:

    token_dict = CountFrequency(row_tokens)
    n_tokens = len(row_tokens)

    smile_lengths.append(n_tokens)

    for token, count in token_dict.items():
        # token_dict has one entry per distinct token, so this adds 1 per row.
        token_appears_once[token] = token_appears_once.get(token, 0) + 1
        token_freq[token] = token_freq.get(token, 0) + count

    for tok, props in token_prop.items():
        # Reuse the counts already computed above instead of rescanning the
        # token list once per vocabulary entry with list.count().
        # An empty token list records 0.0 rather than dividing by zero.
        props.append(token_dict.get(tok, 0) / n_tokens if n_tokens else 0.0)
    

In [None]:
# Sanity checks on the collected statistics, using the token 'N' as an example.
print(token_appears_once['N'])  # number of rows whose token list contains 'N'
print(token_freq['N'])  # total occurrences of 'N' over all rows
print(len(token_prop['N']))  # one proportion entry is appended per row
print(len(smile_lengths))  # one length entry is appended per row

4486
10031
8634
8634


## EDA Plots

In [None]:
import plotly.express as px
from heapq import nlargest
  
def dict_to_df(d, N):
    """Build a two-column table of the ``N`` keys of ``d`` with the largest values.

    Args:
        d: Dictionary whose values can be compared with each other.
        N: Maximum number of entries to keep.

    Returns:
        DataFrame with a 'Token' column (the selected keys, largest value
        first) and a 'Count' column (the matching values from ``d``).
    """
    # Keys ranked by their value, largest first
    top_tokens = nlargest(N, d, key=d.get)

    return pd.DataFrame({
        'Token': top_tokens,
        'Count': [d[name] for name in top_tokens],
    })

In [None]:
# 30 tokens that occur in the largest number of rows (counted once per row).
token_appears_df = dict_to_df(token_appears_once, 30)
# The title lets the figure stand alone when the notebook is skimmed.
fig = px.bar(token_appears_df, x='Token', y='Count',
             title='Top 30 tokens by number of SMILES strings containing them')
fig.show()

In [None]:
# 30 tokens with the highest total occurrence count over all rows.
token_freq_df = dict_to_df(token_freq, 30)
# The title lets the figure stand alone when the notebook is skimmed.
fig = px.bar(token_freq_df, x='Token', y='Count', log_y=True,
             title='Top 30 tokens by total occurrences (log scale)')
fig.show()

In [None]:
# Histogram of the number of atom-wise tokens per SMILES string.
fig = px.histogram(pd.DataFrame(smile_lengths, columns=['Lengths']), x = 'Lengths',
                   title='Distribution of SMILES lengths (atom-wise tokens per string)')
fig.show()

In [None]:
# One row per vocabulary token; 'Count' holds that token's list of per-row proportions.
prop_df = dict_to_df(token_prop, len(vocab))
# Mean proportion of each token across all rows
prop_df['Average'] = [np.mean(props) for props in prop_df['Count']]
prop_df_sorted = prop_df.sort_values(by='Average', ascending=False)

# fig = px.box(prop_df_sorted.head(10), x='Token', y='Count')
# fig.show()

In [None]:
# prop_dict = {'C': token_prop['C'], '(':token_prop['(']}
# # prop_df = pd.DataFrame([token_prop['C'], token_prop['('], token_prop[')']], columns=['Carbon', '(', ')'])
# prop_df = pd.DataFrame(prop_dict)

# fig = px.box(prop_df, y=)
# fig.show()