# This file prepares data for `GCN` by creating `data.cites` & `data.content` files from `.txt` data files.

## Author : Manas Mahale <<manas.mahale@bcp.edu.in>>

- [x] Enumerate `SMILES`.
- [x] Create `Vocab`.
- [x] Create `.content` file.
- [x] Create `.cites` file with `n` clustered references.

In [1]:
# IPython setting: tell the tab completer not to use Jedi.
%config Completer.use_jedi = False

In [2]:
import random

import numpy as np
import pandas as pd

# Seed both random number generators so the sampled citations and the final
# shuffle are reproducible.  The seed functions must be *called*: assigning
# to them (`random.seed = 42`) only replaces the function with an integer
# and leaves the generators unseeded.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# Read the three headerless text files (one molecule per row) into DataFrames.
drug, non_drug, drug_like = (
    pd.read_csv(path, header=None)
    for path in (
        './data/drug/drug.txt',
        './data/drug/non_drug.txt',
        './data/drug/drug_like.txt',
    )
)

In [4]:
# Tag every row with the class it was loaded from.
drug = drug.assign(label='drug')
drug_like = drug_like.assign(label='drug_like')
non_drug = non_drug.assign(label='non_drug')

In [5]:
# Stack the three labelled frames and give every row a running integer id.
frames = [drug, drug_like, non_drug]
result = pd.concat(frames).assign(id=lambda merged: range(len(merged)))
result = result.set_axis(['smiles', 'label', 'id'], axis=1)
# Display only: `id` stays a regular column because later cells read result['id'].
result.set_index('id')

Unnamed: 0_level_0,smiles,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,N[C@@H](CSSC[C@H](N)C(=O)O)C(=O)O,drug
1,O=C(O)c1cc(O)nc(O)n1,drug
2,NC(=O)CCCC[C@H]1CCSS1,drug
3,CCCCC[C@](C)(O)/C=C/[C@H]1[C@H](O)C[C@H](O)[C@...,drug
4,Cn1nnnc1SCC1=C(C(=O)O)N2C(=O)[C@@H](NC(=O)[C@H...,drug
...,...,...
2995,Cc1ccc(NCc2cc3cc4c(cc3n(CC(=O)Nc3ccc(F)cc3F)c2...,non_drug
2996,O=C(Nc1ncc2c(n1)-c1ccccc1OC2)c1ccc(Cl)cc1Cl,non_drug
2997,O=C1CCCN1CCCNc1ncnc2c1[nH]c1ccccc12,non_drug
2998,CCN(CC)CCCNC(=O)CSc1nc(-c2ccc(OC)cc2)cc(C(F)(F...,non_drug


In [6]:
# Raw SMILES strings, in the same row order as `result`.
smiles = result.loc[:, 'smiles'].values

In [7]:
# Vocabulary = every distinct character that occurs in any SMILES string.
# It is sorted so the feature-column order written to `data.content` is the
# same on every run; the iteration order of a set of strings depends on
# per-process hash randomisation and is not stable across kernel restarts.
# set().union(*...) also copes with an empty `smiles` (yields an empty vocab).
vocab = sorted(set().union(*smiles))
print("Vocab length : ", str(len(vocab)))

Vocab length :  34


In [8]:
# Per-row class labels and integer ids, as plain arrays aligned with `smiles`.
l, index = (result[column].values for column in ('label', 'id'))

In [9]:
# Write one line per molecule:
#     <id> \t <0/1 per vocab character> ... \t <label>
# A feature is 1 when that vocab character occurs anywhere in the SMILES.
#
# The node id is taken from `index` (i.e. result['id']) rather than a
# 1-based counter, so the ids here are the same ids that the `.cites`
# edge list is built from; with `n + 1` the two files were off by one.
with open('./data/drug/data.content', 'w') as f:
    for node_id, smi, label in zip(index, smiles, l):
        bits = ['1' if ch in smi else '0' for ch in vocab]
        f.write('\t'.join([str(node_id), *bits, label]) + '\n')

In [10]:
# Distinct class names, in order of first appearance in `result`.
labels = result.loc[:, 'label'].unique()

In [11]:
# Rebind the three class names to the arrays of row ids belonging to each class.
drug, drug_like, non_drug = (
    result.loc[result['label'] == class_name, 'id'].values
    for class_name in ('drug', 'drug_like', 'non_drug')
)

In [12]:
def pick(filename, data, n, frame=None):
    """Write a tab-separated edge list linking each row to same-label rows.

    For every label in ``data`` and every id carrying that label, ``n`` lines
    of the form ``<id>\\t<other_id>\\n`` are written, where ``other_id`` is
    drawn uniformly (via ``random.randint``) from the ids with the same label.
    The draw is with replacement, so an id may be paired with itself or with
    the same partner more than once.

    Parameters
    ----------
    filename : str
        Path of the file to (over)write.
    data : iterable
        Label values to process, in output order.
    n : int
        Number of edges written per id.
    frame : pandas.DataFrame, optional
        Table with ``'label'`` and ``'id'`` columns.  Defaults to the
        notebook-level ``result`` frame, which was the only source before
        this parameter existed.
    """
    if frame is None:
        frame = result
    with open(filename, 'w') as f:
        for label in data:
            # Filter once per label; it was previously recomputed three times
            # for every single line written.
            ids = frame.loc[frame['label'] == label, 'id'].values
            last = len(ids) - 1
            for source in ids:
                for _ in range(n):
                    target = ids[random.randint(0, last)]
                    f.write(str(source) + '\t' + str(target) + '\n')

In [13]:
# Generate the edge list: 5 same-class references per molecule.
pick(filename='./data/drug/data.cites', data=labels, n=5)

In [14]:
# Re-read the edge list, shuffle its rows, and write it back to the same path.
cites_path = './data/drug/data.cites'
a = pd.read_csv(cites_path, sep='\t', header=None)
df = a.loc[np.random.permutation(a.index)]
# Both columns are written as data, with no header row and no index column.
df.to_csv(cites_path, sep='\t', header=None, index=False)