In [1]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Load the entry/mention table; later cells read its `entries` and `csd_id` columns.
crystals = pd.read_csv('./data_c/entry_mention.csv')

## Name Ambiguity

In [49]:
# Show the loaded table.
crystals

Unnamed: 0,entries,mentions,csd_id
0,acetanilide,[2941],ACANIL
1,"2,6-Dimethylphenol",[157226],DMEPOL
2,UPC-33,[508171],METPAF
3,"1-Chloro-2,4-dinitrobenzene","[55124, 55125, 55126, 55127, 55128]",BENCLN
4,tetrahydrate,"[579884, 741350]",OBORUU
...,...,...,...
1441,benzo[e]pyrene,[99355],CEQGEL
1442,3-aminoflavone,[525087],MOVBOP
1443,azetidine,[946358],XATMOV
1444,MCPA,[112522],CMPHAA


In [50]:
# CSD identifier of every row in `crystals`, as an array.
target_csd = crystals['csd_id'].values

# The same CSD ID can sit on several rows, so keep the distinct IDs together
# with the number of rows that carry each of them.
ucsd, csd_cnts = np.unique(target_csd, return_counts=True)

In [54]:
# Work on a sorted *copy*: `uc = ucsd` would only alias the array, so an in-place
# `uc.sort()` would also reorder `ucsd` and could break its alignment with `csd_cnts`.
uc = np.sort(ucsd)
list(uc)

['ABEDUJ',
 'ABIREM',
 'ABUMIT',
 'ABUMOZ',
 'ACAFLR',
 'ACANIL',
 'ACEMID',
 'ACITOY',
 'ACNAQU',
 'ACRDIN',
 'ACTOLD',
 'ADRENL',
 'ADUWIH',
 'AFASEI',
 'AFASUY',
 'AFOVUP',
 'AFUHIT',
 'AFUYIJ',
 'AHEMAB',
 'AHIMAF',
 'AKUVUA',
 'ALOXAN',
 'AMBZAM',
 'AMIPYR',
 'AMITEW',
 'AMNNPQ',
 'AMNPHA',
 'AMPHOL',
 'AMPHOM',
 'AMPYRD',
 'AMPYRE',
 'AMPYRM',
 'ANDSEO',
 'ANEDEF',
 'ANEDIJ',
 'ANTCEN',
 'ANTHAL',
 'ANTPYR',
 'ANTQUO',
 'APAZEY',
 'APOQOO',
 'AQAXUM',
 'ARABOL',
 'ARCLAM',
 'AREBUX',
 'ASAZIG',
 'AVEHOA',
 'AVENAT',
 'AWIZOW',
 'AWULIO',
 'AXOSIQ',
 'AZARIF',
 'AZAROL',
 'AZLENE',
 'AZOBEN',
 'AZOXOF',
 'AZURAC',
 'AZURID',
 'BAGFIY',
 'BAJBIA',
 'BAJCIY',
 'BAKPAE',
 'BANJEH',
 'BANYEV',
 'BARLEO',
 'BARMAL',
 'BARMEP',
 'BASCIH',
 'BASKUE',
 'BAWHEM',
 'BDTOLE',
 'BEDMIG',
 'BEFNIL',
 'BEFNOR',
 'BEFNUX',
 'BEJYIY',
 'BEKFEF',
 'BENCLN',
 'BENZIL',
 'BEOXAZ',
 'BEVHIT',
 'BEYFAM',
 'BEZHUL',
 'BIDJIH',
 'BIDVOA',
 'BIFERO',
 'BIHJIO',
 'BIMKUD',
 'BINAPH',
 'BINMEQ',
 'BIOTIN',

Some of these repeated CSD IDs have crystal names that differ only in letter case (e.g., '1,2-dichlorobenzene' and '1,2-Dichlorobenzene' for CSD = 'ABUMIT'), and some have names that differ more substantially (e.g., '2,6-Diaminopyridine' and 'pyridine-2,6-diamine' for CSD = 'FOYLEK'). We'll check how many cases fall into each of these categories:

In [53]:
# Inspect the row(s) of `crystals` recorded for the CSD ID "ABEDUJ".
crystals[crystals.csd_id == "ABEDUJ"]

Unnamed: 0,entries,mentions,csd_id
761,bosutinib,[518],ABEDUJ


In [14]:
# CSD IDs that occur on more than one row of `crystals`.
reps = ucsd[csd_cnts > 1]

# Collect the repeated IDs whose entry names still differ once letter case is ignored.
csd_with_diff_names = []
for csd in reps:
    entries_for_csd = crystals.loc[crystals['csd_id'] == csd, 'entries']
    names = [entry.lower() for entry in entries_for_csd]
    if len(set(names)) <= 1:
        # every row uses the same name (up to case): nothing to report
        continue
    print(csd, names)
    csd_with_diff_names.append(csd)

ANTQUO ['anthracene-9,10-dione', '9,10-anthraquinone', '9,10-anthraquinone']
BINAPH ["1,1'-binaphthalene", "1,1'-binaphthyl"]
CLANIC ['4-chloroaniline', 'p-chloroaniline']
COTZAN ['acetaminophen', 'paracetamol']
COUMAR ['chromen-2-one', '2h-1-benzopyran-2-one']
COWHUR ['(-)-epicatechin', 'epicatechin']
DANTEN ['bianthrone', 'dianthraquinone']
DNBENZ ['m-dinitrobenzene', '1,3-dinitrobenzene']
DPUREA ['1,3-diphenylurea', "n,n'-diphenylurea"]
DUCKOB ['n-butane', 'butane']
DXYLEN ['[2,2]-paracyclophane', '[2.2]paracyclophane']
ETDIAM ['1,2-diaminoethane', '1,2-ethanediamine']
FLURON ['9-fluorenone', '9h-fluoren-9-one']
FOYLEK ['pyridine-2,6-diamine', '2,6-diaminopyridine']
HIQWEJ ["4,4'-bipyridine", "4,4'-bipyridine", "4,4'-bipyridyl"]
HXQUIN ['quinolin-8-ol', '8-hydroxyquinoline', '8-hydroxyquinoline']
HYQUIN ['benzene-1,4-diol', 'hydroquinone']
JEMPEZ ['v11', 'v101']
NANILI ['4-nitroaniline', 'p-nitroaniline', '4-nitroaniline']
NOZKES ['ethane-1,2-diol', '1,2-ethanediol']
OCHTET ['1,3,5,

In [12]:
# Load the CSD table; later cells use its 'CSD ID', 'chemical name' and 'synonyms' columns.
D = pd.read_csv('./data_c/crystals_wdoi.csv')

In [None]:
# How many CSD IDs were collected in `csd_with_diff_names`.
len(csd_with_diff_names)

For those CSDs with different chemical names, we break up all the corresponding CSDs accordingly. For example, for CSD "DUCKOB" we have the following names:

In [15]:
# Entry names that `crystals` records for the CSD ID 'DUCKOB'.
crystals.loc[crystals['csd_id'] == 'DUCKOB', 'entries']

909     n-Butane
1433      butane
Name: entries, dtype: object

whereas the full set of CSD/names for this ID is the following set:

In [16]:
# Every row of `D` whose CSD ID contains 'DUCKOB', reduced to the ID and name columns.
D.loc[D['CSD ID'].str.contains('DUCKOB'), ['CSD ID', 'chemical name']]

Unnamed: 0,CSD ID,chemical name
167061,DUCKOB,Decadeutero-n-butane
167062,DUCKOB01,Decadeutero-n-butane
167063,DUCKOB02,Decadeutero-n-butane
167064,DUCKOB03,Decadeutero-n-butane
167065,DUCKOB04,n-Butane
167066,DUCKOB05,butane
167067,DUCKOB06,butane
167068,DUCKOB07,butane
167069,DUCKOB08,butane
167070,DUCKOB09,butane


We note that there are three different names associated with this CSD: "Decadeutero-n-butane", "n-Butane" and "butane". Among these, only the last two names exist in our word2vec model. There are also combinations of these cases:

In [17]:
# Rows of `crystals` for the first CSD ID collected in `csd_with_diff_names`.
crystals.loc[crystals['csd_id'] == csd_with_diff_names[0]]

Unnamed: 0,entries,mentions,csd_id
234,"anthracene-9,10-dione","[23036, 23037, 23038, 23039, 23040, 23041, 230...",ANTQUO
458,"9,10-anthraquinone",[23035],ANTQUO
847,"9,10-Anthraquinone",[23034],ANTQUO


## Identify CSDs for Names More Accurately
Previously, we identified the CSD IDs of a given name in such a way that all variations of its CSD ID were also considered. For example, for "DUCKOB", considering all variations of the CSD ID also gives us "Decadeutero-n-butane", which is slightly different from the "butane" that is included in our word2vec vocabulary.

Hence, here we take each given chemical name, and only consider those CSD variations whose names exactly (up to uppercase letters) match our candidate.

In [18]:
# Candidate names: the `entries` column of `crystals` as an array.
target_names = crystals['entries'].values

In [19]:
# Lower-case both name columns of `D` so they can be compared case-insensitively.
for name_col in ('chemical name', 'synonyms'):
    D[name_col] = D[name_col].str.lower()

In [20]:
# Letter case is ignored: lower-case every name, then keep the distinct ones.
lowered_names = [raw_name.lower() for raw_name in target_names]
target_names = np.unique(lowered_names)

In [22]:
%%time
name2csd = {}
for name in tqdm(target_names):
    csds = D[(D['chemical name'] == name) | (D['synonyms'] == name)]['CSD ID'].values
    name2csd[name] = csds.tolist()

100%|██████████| 1421/1421 [02:08<00:00, 11.04it/s]

CPU times: user 2min 6s, sys: 1.66 s, total: 2min 8s
Wall time: 2min 8s





*Minor Issue:* It seems that some terms are equivalent to each other (we found that some CSDs are associated with two names after exact matching). Here is the manually compiled list:
* "acetaminophen"   & "paracetamol"
* "bianthrone" & "dianthraquinone"
* "hydroquinone" & "benzene-1,4-diol"
* "2-methylphenol" & "o-cresol"
* "s-triazine" & "1,3,5-triazine"
* "m-dinitrobenzene" & "1,3-dinitrobenzene"

In [185]:
# Names to drop from `name2csd` because an equivalent name is kept instead.
to_remove = ['paracetamol', 'dianthraquinone', 'benzene-1,4-diol', 'o-cresol', '1,3,5-triazine', '1,3-dinitrobenzene']

# A plain loop (not a throw-away list comprehension) performs the removal, and
# pop() with a default keeps the cell safe to re-run: a name that is already
# gone is skipped instead of raising KeyError.
for redundant_name in to_remove:
    name2csd.pop(redundant_name, None)

In [225]:
# Serialize the name -> CSD-ID dictionary first, then write the text to disk.
name2csd_json = json.dumps(name2csd, indent=True)
with open("name_to_csd.json", "w") as f:
    f.write(name2csd_json)

## Extracting the Relevant Sub-matrix

In [25]:
# all the CSDs that are involved
# (flattened in a single pass; sum(lists, []) would re-copy the growing list for every name)
target_csds = [csd_id for csd_ids in name2csd.values() for csd_id in csd_ids]

In [29]:
# Open read-only: the matrix is only read from here, and the default append mode
# could create or modify the file (e.g. silently creating an empty store on a mistyped path).
store = pd.HDFStore('./data_c/entry_mention_vs_CSD_AMD100_dm.h5', mode='r')

In [30]:
# Read the leading rows once just to learn the column labels, then build a
# boolean mask marking the columns whose label is one of the target CSD IDs.
chunk = store.select(key='df', start=0, stop=100)
cols = chunk.columns
cols_locator = cols.isin(target_csds)

In [217]:
%%time

submat = pd.DataFrame([])

# I already checked it out and saw that the number of rows in the larger matrix is 3926,
# hence we need 39+1=40 iterations (assuming the chunk size is 100)
for i in tqdm(range(40)):
    chunk = store.select('df', start=i * 100, stop=(i + 1) * 100)
    row_locator = chunk.index.isin(target_csds)
    # double check if the order of columns does not change across the chunks
    assert np.all(chunk.columns == cols), "Order of the columns suddenly changed in the {}-th chunk.".format(i)
    # get the submatrix in the chunk and append it to the previous submatrices
    sub_chunk = chunk.iloc[row_locator, cols_locator]
    submat = pd.concat((submat, sub_chunk), axis=0)

100%|███████████████████████████████████████████████████████████| 40/40 [01:23<00:00,  2.09s/it]

CPU times: user 1min 4s, sys: 18.3 s, total: 1min 23s
Wall time: 1min 23s





In [218]:
# (rows, columns) of the extracted sub-matrix
submat.shape

(2658, 3064)

In [222]:
# Save the extracted sub-matrix as CSV.
# NOTE(review): this writes to the working directory, whereas the reload cell reads
# './data_c/mst_dists_submatrix.csv' — confirm the file is moved there in between.
submat.to_csv('mst_dists_submatrix.csv')

In [23]:
# Reload the sub-matrix from its CSV copy.
# NOTE(review): the CSV is written by `submat.to_csv(...)` with default settings, so the
# row labels presumably come back here as an extra leading column unless `index_col=0`
# is passed — confirm what the cells using `submat` expect.
# NOTE(review): this path is under ./data_c/, while the save cell writes to the working
# directory — confirm both refer to the same file.
submat=pd.read_csv('./data_c/mst_dists_submatrix.csv')