In [None]:
import cirpy
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
from gensim.models import Word2Vec
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from scipy.stats import pearsonr, spearmanr
from tqdm import tqdm

# Get the SMILES and IUPAC Format

In [None]:
# Load the name -> CSD-identifier mapping. The context manager closes the file
# handle, which a bare open() passed to json.load would leave open.
with open('./data_c/name_to_csd.json') as f:
    name2csd = json.load(f)

# name -> (SMILES, CSD ids) for every name the resolver returns something for.
name2smile_csd = dict()
# Kept as a list of (position, name) pairs because the next cell slices it
# (l[21:]) to continue from a given position.
l = list(enumerate(name2csd.keys()))
for idx, nm in l:
    smile = cirpy.resolve(nm, 'smiles')
    if smile:  # a falsy result means nothing usable came back; skip the name
        name2smile_csd[nm] = (smile, name2csd[nm])
        print(smile)
    time.sleep(1)  # wait one second between resolver calls

In [None]:
# Run the same resolver loop again, but only over the entries of `l` from
# position 21 onwards, adding to the existing `name2smile_csd`.
# NOTE(review): 21 looks like a manual resume point after an interrupted run;
# on a fresh Restart & Run All this simply re-resolves those names — confirm.
for idx, nm in l[21:]:
    smile = cirpy.resolve(nm, 'smiles')
    if smile:
        name2smile_csd[nm] = (smile, name2csd[nm])
        print(smile)
    time.sleep(1)

# Persist the name -> (SMILES, CSD ids) mapping as JSON in the working directory.
with open("name_to_smile.json", "w") as f:
    json.dump(name2smile_csd, f, indent=True)

In [None]:
# Reload the persisted name -> [SMILES, CSD ids] mapping so the following cells
# can run from the cached file. The context manager closes the file handle.
with open('./name_to_smile.json') as f:
    name2smile = json.load(f)
# Show how many names were loaded. The original measured `name2smile_csd`,
# which only exists if the resolver cells were run in this kernel session.
len(name2smile)

In [None]:
# Build name -> molecular formula from the SMILES stored as the first element
# of each `name2smile` entry.
name2mol = dict()
for nm, entry in name2smile.items():
    sm = entry[0]
    mol = Chem.MolFromSmiles(sm)
    if mol is None:
        # RDKit yields None for SMILES it cannot parse; skip the entry instead
        # of letting CalcMolFormula fail and abort the whole loop.
        print('Skipping {}: could not parse SMILES {}'.format(nm, sm))
        continue
    mc = rdMolDescriptors.CalcMolFormula(mol)
    name2mol[nm] = mc

# Persist the name -> formula mapping as JSON in the working directory.
with open("name_to_mol.json", "w") as f:
    json.dump(name2mol, f, indent=True)

# Relocate the Chemistries

In [None]:
# Load the entry-mention table; its 'entries' column supplies the target names
# used further down. The bare name on the last line displays the frame.
crystals = pd.read_csv('./data_c/entry_mention.csv')
crystals

In [None]:
# Load the crystal table. Later cells read its 'chemical name', 'synonyms' and
# 'CSD ID' columns. The bare name on the last line displays the frame.
D = pd.read_csv('./data_c/crystals_wdoi.csv')
D

In [None]:
# Load the saved Word2Vec model; its `.wv` vectors and vocabulary counts are
# used by the similarity cells below.
w2v_md = Word2Vec.load('./models/pubmed.model')

In [None]:
# np.unique returns the distinct entry names in sorted order.
target_names = np.unique(crystals['entries'].values)

# Add a lower-cased copy of each text column (suffix ' L') so that the lookup
# cell can compare names without regard to case.
for col in ('chemical name', 'synonyms'):
    D[col + ' L'] = D[col].str.lower()

In [None]:
% % time
name2csd = dict()
for name in tqdm(target_names):
    csds = D[(D['chemical name L'] == name.lower()) | (D['synonyms L'] == name.lower())]['CSD ID'].values
    name2csd[name] = csds.tolist()

In [None]:
# Persist the name -> CSD-id-list mapping as JSON in the working directory.
# NOTE(review): the first resolver cell reads './data_c/name_to_csd.json',
# which is a different path from the one written here — confirm which is meant.
with open("name_to_csd.json", "w") as f:
    json.dump(name2csd, f, indent=True)

In [None]:
# Number of names in the mapping (each value is a list of CSD IDs, possibly empty).
len(name2csd)

In [None]:
# Same resolver pass as at the top of the notebook, this time over the
# `name2csd` mapping that is currently in memory.
name2smile_csd = {}
# (position, name) pairs; the following cell slices this list.
l = list(enumerate(name2csd))
for idx, nm in l:
    smile = cirpy.resolve(nm, 'smiles')
    if smile:
        # Store the SMILES together with the CSD ids recorded for the name.
        name2smile_csd[nm] = (smile, name2csd[nm])
        print(smile)
    time.sleep(1)  # one-second pause between resolver calls

In [None]:
# Repeat the resolver loop for the entries of `l` from position 21 onwards,
# adding to the existing `name2smile_csd`.
# NOTE(review): duplicates an earlier cell, and 21 looks like a manual resume
# offset — confirm whether this cell is still needed.
for idx, nm in l[21:]:
    smile = cirpy.resolve(nm, 'smiles')
    if smile:
        name2smile_csd[nm] = (smile, name2csd[nm])
        print(smile)
    time.sleep(1)

# Persist the name -> (SMILES, CSD ids) mapping as JSON in the working directory.
with open("name_to_smile.json", "w") as f:
    json.dump(name2smile_csd, f, indent=True)

In [None]:
def get_euler_dist(v1, v2):
    """Return the Euclidean (L2) distance between two vectors.

    Both arguments are converted to NumPy arrays and flattened to 1-D before
    the element-wise difference is taken, so nested inputs are accepted as
    long as the flattened lengths are compatible.
    """
    diff = np.array(v1).reshape([-1]) - np.array(v2).reshape([-1])
    squared = np.square(diff)
    return np.sqrt(np.sum(squared))


def get_cos_similar(v1, v2):
    """Return the cosine similarity of two vectors.

    Both arguments are converted to NumPy arrays and flattened to 1-D.
    When either vector has zero magnitude the function returns 0 instead of
    dividing by zero.
    """
    a = np.array(v1).reshape([-1])
    b = np.array(v2).reshape([-1])

    dot = float(np.dot(a, b))  # dot product of the two vectors
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)  # product of the magnitudes
    if norm_product == 0:
        return 0
    return dot / norm_product

In [None]:
# Load the embedding table. Later cells read its 'iupac' (name) and 'embd'
# (embedding stored as text) columns. The bare name displays the frame.
smile_embd = pd.read_csv('./smile_embedding.csv')
smile_embd

In [None]:

# Remove rows whose IUPAC name repeats an earlier row's name (case-insensitive),
# keeping the first occurrence.
#
# The previous version dropped rows in place while iterating and sliced with
# `smile_embd[idx + 1:]`, which is positional although `idx` is an index label;
# once a row had been dropped the two no longer lined up and later duplicate
# pairs could be missed. A single pass with a dictionary avoids both problems.
first_seen = {}   # lower-cased name -> (index label, original name) of first row
dup_labels = []   # index labels of the rows to discard
for label, iupac in smile_embd['iupac'].items():
    key = iupac.lower()
    if key in first_seen:
        kept_label, kept_name = first_seen[key]
        # Same report as before: kept label, dropped label, both names.
        print(kept_label, label, kept_name, iupac)
        dup_labels.append(label)
    else:
        first_seen[key] = (label, iupac)

# Rebind instead of mutating in place so re-running the cell is harmless.
smile_embd = smile_embd.drop(index=dup_labels)
smile_embd

In [None]:
%%time
nm = list(smile_embd['iupac'])

str_df = pd.DataFrame(columns=nm, index=nm)
sem_df = pd.DataFrame(columns=nm, index=nm)

for i, name in tqdm(enumerate(nm)):
    semvec1 = w2v_md.wv[name]
    strvec1 = [float(i) for i in smile_embd['embd'].iloc[i][1:-1].split()]
    for j in range(i):
        strvec2 = [float(i) for i in smile_embd['embd'].iloc[j][1:-1].split()]
        sem_cos = get_cos_similar(semvec1, w2v_md.wv[nm[j]])
        str_euler = get_euler_dist(strvec1, strvec2)

        str_df.iloc[i, j] = str_euler
        str_df.iloc[j, i] = str_euler
        sem_df.iloc[i, j] = sem_cos
        sem_df.iloc[j, i] = sem_cos
sem_df

In [None]:
# Display the structural-distance matrix.
str_df

In [None]:
# Persist both matrices as CSV in the working directory. The file name is
# spelled 'sementic'; the reload cell below reads the same spelling, so the
# two must be changed together if it is ever corrected.
sem_df.to_csv('./sementic_df.csv')
str_df.to_csv('./structural_df.csv')

In [None]:
# Reload the structural-distance matrix from disk. The row labels were written
# as the first CSV column, which pandas reads back under the header
# 'Unnamed: 0'; move that column into the index and remove it from the data.
structural_raw = pd.read_csv('./structural_df.csv')
str_df = structural_raw.drop(columns='Unnamed: 0')
str_df.index = list(structural_raw['Unnamed: 0'])
str_df

In [None]:
# Reload the semantic-similarity matrix from disk. The row labels were written
# as the first CSV column, which pandas reads back under the header
# 'Unnamed: 0'; move that column into the index and remove it from the data.
semantic_raw = pd.read_csv('./sementic_df.csv')
sem_df = semantic_raw.drop(columns='Unnamed: 0')
sem_df.index = list(semantic_raw['Unnamed: 0'])
sem_df

In [None]:
def get_scatter(sem_df, str_df, threshod=10, getplt=False):
    """Correlate structural distance with semantic similarity over name pairs.

    Only names whose vocabulary count in the global `w2v_md` model is strictly
    greater than `threshod` are kept. Every unordered pair of kept names
    (the strict lower triangle of the matrices) contributes one point.

    Parameters
    ----------
    sem_df : DataFrame
        Square matrix of semantic cosine similarities, labelled by name on
        both axes.
    str_df : DataFrame
        Square matrix of structural distances with the same labels.
    threshod : int
        Minimum vocabulary count (exclusive) for a name to be included.
    getplt : bool
        When True, draw a scatter plot and print both correlation results.

    Returns
    -------
    (sp, pr, n_names)
        Spearman result, Pearson result and the number of names kept. Each
        result is indexable as [0] (coefficient) and [1] (p-value). When fewer
        than two pairs remain, both are (nan, nan) instead of raising.
    """
    wv = w2v_md.wv
    if hasattr(wv, 'get_vecattr'):
        # gensim >= 4 exposes per-word counts through get_vecattr.
        def mention_count(name):
            return wv.get_vecattr(name, 'count')
    else:
        # gensim < 4 keeps them on the vocab entries.
        def mention_count(name):
            return wv.vocab[name].count

    nm_thre = [name for name in sem_df.columns if mention_count(name) > threshod]
    op_sem = sem_df.loc[nm_thre, nm_thre].values.astype(float)
    op_str = str_df.loc[nm_thre, nm_thre].values.astype(float)

    # Strict lower triangle: every pair (i, j) with j < i. The previous
    # `range(i - 1)` loop stopped one short and silently dropped all (i, i-1)
    # pairs. tril_indices yields the pairs in the same row-major order as the
    # nested loops did.
    rows, cols = np.tril_indices(len(nm_thre), k=-1)
    X = op_str[rows, cols]
    y = op_sem[rows, cols]

    if len(X) < 2:
        # Too few pairs for a correlation; pearsonr would raise here and stop
        # a threshold sweep, so report NaN instead.
        sp = (np.nan, np.nan)
        pr = (np.nan, np.nan)
    else:
        sp = spearmanr(X, y)
        pr = pearsonr(X, y)

    if getplt:
        plt.title("Threshod {} distances\nCount of Samples: {}".format(threshod, len(y)))
        plt.ylabel("Semantic Cosine Similarity")
        plt.xlabel("Structural Distance")
        plt.scatter(X, y)
        plt.show()
        print(sp)
        print(pr)

    return sp, pr, len(nm_thre)

In [None]:
# Scatter plot and printed correlations for names with vocabulary count > 10.
res = get_scatter(sem_df, str_df, threshod=10, getplt=True)

In [None]:
# Scatter plot and printed correlations for names with vocabulary count > 100.
res = get_scatter(sem_df, str_df, threshod=100, getplt=True)

In [None]:
# Scatter plot and printed correlations for names with vocabulary count > 1000.
res = get_scatter(sem_df, str_df, threshod=1000, getplt=True)

In [None]:
%%time
thresholds = np.arange(0, 10000, 10)
sp_coefs = np.zeros(len(thresholds))
sp_pvals = np.zeros(len(thresholds))
pr_coefs = np.zeros(len(thresholds))
pr_pvals = np.zeros(len(thresholds))
nrows = np.zeros(len(thresholds))
for i, thr in tqdm(list(enumerate(thresholds))):
    res = get_scatter(sem_df, str_df, threshod=thr)
    sp_coefs[i] = res[0][0]
    sp_pvals[i] = np.log(res[0][1]) if res[0][1] > 0 else np.nan
    pr_coefs[i] = res[1][0]
    pr_pvals[i] = np.log(res[1][1]) if res[1][1] > 0 else np.nan
    nrows[i] = res[2]

In [None]:
# Three side-by-side panels over the threshold sweep:
# correlation coefficients, log p-values, and number of names kept.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
ax_corr, ax_pval, ax_count = axes

# Panel 1: correlation coefficients (Pearson drawn first, then Spearman).
for series, label in ((pr_coefs, 'Pearson'), (sp_coefs, 'Spearman')):
    ax_corr.plot(thresholds, series, lw=2, label=label)
ax_corr.set_xlim([0, 10000])
ax_corr.set_ylabel('Correlation Coefficients', fontsize=15)
ax_corr.set_xlabel('Threshold on Mentions', fontsize=15)
ax_corr.legend(fontsize=15, loc=1)

# Panel 2: log p-values with a dashed reference line at log(0.01).
for series, label in ((pr_pvals, 'Pearson'), (sp_pvals, 'Spearman')):
    ax_pval.plot(thresholds, series, lw=2, label=label)
ax_pval.axhline(y=np.log(0.01), color='r', ls='--', label='log(0.01)')
ax_pval.set_xlim([-300, 10000])
ax_pval.set_ylabel('Log(p-value)', fontsize=15)
ax_pval.set_xlabel('Threshold on Mentions', fontsize=15)
ax_pval.legend(fontsize=15, loc=4)

# Panel 3: how many names remain at each threshold.
ax_count.plot(thresholds, nrows, lw=2, color='indianred', label='No. Columns')
ax_count.set_xlim([-300, 10000])
ax_count.set_ylabel('Number of Crystals', fontsize=15, labelpad=-5)
ax_count.set_xlabel('Threshold on Mentions', fontsize=15)

# Widen the figure area beyond the default right edge.
plt.subplots_adjust(right=1.3)

In [None]:
# A bare dict literal: the cell only displays it, nothing is assigned or used.
# NOTE(review): the keys look like the hyper-parameters used to train the
# Word2Vec model loaded above — confirm, and consider moving to markdown.
{
    "depth": 2,
    "phrase_count": 10,
    "phrase_threshold": 15,
    "size": 200,
    "window": 8,
    "min_count": 5,
    "sg": True,
    "hs": True,
    "workers": 20,
    "negative": 15,
    "alpha": 0.01,
    "subsample": 0.0001,
    "batch": 5000,
    "epochs": 30
}

In [None]:
# Re-read the embedding CSV from disk and display it; the result is not
# assigned, so `smile_embd` in memory is unaffected.
pd.read_csv('smile_embedding.csv')