# Nearest atoms from KDTree
## New hyperparameter for tuning - radius
### radius = max distance for atoms to be neighbors

In [237]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from scipy.spatial import cKDTree as KDTree
from tqdm import tqdm_notebook as tqdm

In [238]:
train= pd.read_csv('champs-scalar-coupling/train.csv') # Load only first 20 rows
# test= pd.read_csv('champs-scalar-coupling/test.csv')
structures= pd.read_csv('champs-scalar-coupling/structures.csv')

In [239]:
# molecules with F atom
mol_F = structures[structures['atom']=='F']['molecule_name'].unique()
len(mol_F)

1907

In [240]:
structures[structures.molecule_name==mol_F[0]].to_csv('mol_f.csv')

In [241]:
# molecules with N atom
mol_N = structures[structures['atom']=='N']['molecule_name'].unique()
len(mol_N)

79827

In [242]:
# molecules with O atom
mol_O = structures[structures['atom']=='O']['molecule_name'].unique()
len(mol_O)

111279

In [243]:
structures.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397


In [244]:
struct = structures.groupby(['molecule_name','atom'])['atom_index'].count().unstack().fillna(0).reset_index()
struct['total']=structures.groupby('molecule_name')['atom'].count().values
struct.head(10)

atom,molecule_name,C,F,H,N,O,total
0,dsgdb9nsd_000001,1.0,0.0,4.0,0.0,0.0,5
1,dsgdb9nsd_000002,0.0,0.0,3.0,1.0,0.0,4
2,dsgdb9nsd_000003,0.0,0.0,2.0,0.0,1.0,3
3,dsgdb9nsd_000004,2.0,0.0,2.0,0.0,0.0,4
4,dsgdb9nsd_000005,1.0,0.0,1.0,1.0,0.0,3
5,dsgdb9nsd_000007,2.0,0.0,6.0,0.0,0.0,8
6,dsgdb9nsd_000008,1.0,0.0,4.0,0.0,1.0,6
7,dsgdb9nsd_000009,3.0,0.0,4.0,0.0,0.0,7
8,dsgdb9nsd_000010,2.0,0.0,3.0,1.0,0.0,6
9,dsgdb9nsd_000011,2.0,0.0,4.0,0.0,1.0,7


In [245]:
print("Max number of F:", struct.F.max())
print("")
print("Max number of N:", struct.N.max())
print("")
print("Max number of O:", struct.O.max())

Max number of F: 6.0

Max number of N: 7.0

Max number of O: 5.0


In [333]:
def add_col_atom(df, atom, number_atoms):
    """Add new columns with the distances to heteroatoms.
       F1 corresponds to the nearest atom F.
       If F is absent in the molecule, we set large distance."""
    
    for i in range(1, number_atoms+1):
            df[str(atom)+str(i)]=1000
    return df

def find_neigb_atom(structures, molecules, atom_to_find, radius):
    """Radius for finding neighbors"""
    
    atom_dict = {'F': 6, 'N': 7, "O": 5}
    
    #Set empty dataframe to store results
    df_nieghb = add_col_atom(pd.DataFrame(columns=structures.columns), atom_to_find, atom_dict[atom_to_find])
    
    for imol,name in tqdm(list(enumerate(molecules))):
        
        molecule = structures.set_index('molecule_name').loc[name ]
        atoms    = molecule.atom.values
        atoms_idx= molecule.atom_index.values
        
        #Set new columns and rewrite some values
        res= add_col_atom(molecule.reset_index(),atom_to_find, atom_dict[atom_to_find])
        
        coords   = molecule[['x', 'y', 'z']].values
        kdt = KDTree(coords)
        
        for a0 in atoms_idx:
            
            distances, inds = kdt.query(coords[a0], len(atoms))
            
            distances = distances[1:sum(distances<radius)]; # remove a0 from list
           
            inds = inds[1:sum(distances<radius)+1]

            dist_atom = distances[atoms[inds] == atom_to_find]
            ind_atom = inds[atoms[inds] == atom_to_find]
           
            for i, d in enumerate(dist_atom,1): # print(i,  d) -> 0 (0, 3.381815264756334), 1 (2, 3.382439208351072
                res.loc[a0, str(atom_to_find)+str(i)]=d
        
        df_nieghb = df_nieghb.append(res, ignore_index=True)
        
        print(df_nieghb.shape)
    
    return df_nieghb#, out_dist, out_atoms

In [334]:
#Set hyperparameters
#atom_dict = {'F': int(struct.F.max()), 'N': int(struct.N.max()), "O": int(struct.O.max())}
atom_dict = {'F': 6, 'N': 7, "O": 5}
radius_dist=3

In [335]:
%%time
struct_dist_F= find_neigb_atom(structures, mol_F[:10], 'F', radius_dist)

(7, 12)
(14, 12)
(24, 12)
(33, 12)
(44, 12)
(56, 12)
(67, 12)
(78, 12)
(88, 12)
(98, 12)
CPU times: user 3.41 s, sys: 1.08 s, total: 4.49 s
Wall time: 5.68 s


In [223]:
def find_neighbors(structures, molecules, radius):
    """Radius for finding neighbors"""
    out_atom_index = [] ; out_atom_name = []; 
    out_name =[] ; out_neigb= []; out_dists= [];
    out_neigb_number =[]; out_all_atoms = []
    
    for imol,name in tqdm(list(enumerate(molecules))):
          
        molecule = structures.loc[name ]
        atoms    = molecule.atom.values
        atoms_idx= molecule.atom_index.values
        
        coords   = molecule[['x', 'y', 'z']].values
        kdt = KDTree(coords)
        
        for a0 in atoms_idx:
            out_name.append(name)
            #out_all_atoms.append(molecule.shape[0])
            
            distances, inds = kdt.query(coords[a0], len(atoms))
            distances = distances[1:sum(distances<radius)]; # remove a0 from list
            inds = inds[1:sum(distances<radius)+1]
            
            out_all_atoms.append(len(atoms_idx))
            out_neigb_number.append(sum(distances<radius))
            out_neigb.append(atoms[inds])
            out_dists.append(distances)
            out_atom_index.append(a0)
            out_atom_name.append(atoms[a0])
    
    neighbors = pd.DataFrame({'molecule_name':out_name, 'atom_index': out_atom_index,'atom': out_atom_name, 
                              'all_atoms':out_all_atoms,'number_neihgbors': out_neigb_number,'neighbors':out_neigb, 
                              'L2dist':out_dists})
    #return len(out_name), len(out_neigb), len(out_dists)
    return neighbors
 

In [224]:
#Set hyperparameter
radius=1.8

In [225]:
%%time
train_neigb = find_neighbors(structures.set_index('molecule_name'), train.molecule_name.unique(), radius)

CPU times: user 5min 59s, sys: 13.5 s, total: 6min 12s
Wall time: 6min 12s


In [331]:
train_neigb.head()

Unnamed: 0,molecule_name,atom_index,atom,all_atoms,number_neihgbors,neighbors,L2dist
0,dsgdb9nsd_000001,0,C,5,4,"[H, H, H, H]","[1.0919463791331034, 1.0919475411120265, 1.091..."
1,dsgdb9nsd_000001,1,H,5,4,"[C, H, H, H]","[1.0919530596119005, 1.7831197560388008, 1.783..."
2,dsgdb9nsd_000001,2,H,5,4,"[C, H, H, H]","[1.0919516185813627, 1.7831197560388008, 1.783..."
3,dsgdb9nsd_000001,3,H,5,4,"[C, H, H, H]","[1.0919463791331034, 1.783147496403011, 1.7831..."
4,dsgdb9nsd_000001,4,H,5,4,"[C, H, H, H]","[1.0919475411120265, 1.7831478722297796, 1.783..."


In [328]:
train_neigb.shape

(1533537, 7)

In [301]:
train_neigb[train_neigb['atom']=='F']['molecule_name'].unique()

array(['dsgdb9nsd_000826', 'dsgdb9nsd_000828', 'dsgdb9nsd_000830', ...,
       'dsgdb9nsd_133805', 'dsgdb9nsd_133808', 'dsgdb9nsd_133810'],
      dtype=object)

In [317]:
qq = train_neigb[train_neigb['molecule_name']=='dsgdb9nsd_000828'].copy()
qq

Unnamed: 0,molecule_name,atom_index,atom,all_atoms,number_neihgbors,neighbors,L2dist
5942,dsgdb9nsd_000828,0,F,7,1,[C],[1.3227281340979686]
5943,dsgdb9nsd_000828,1,C,7,4,"[F, F, F, C]","[1.3227281340979686, 1.3394346231006724, 1.339..."
5944,dsgdb9nsd_000828,2,F,7,1,[C],[1.3394364965282812]
5945,dsgdb9nsd_000828,3,F,7,1,[C],[1.3394346231006724]
5946,dsgdb9nsd_000828,4,C,7,3,"[H, O, C]","[1.1085379205303283, 1.1949593807724774, 1.542..."
5947,dsgdb9nsd_000828,5,O,7,1,[C],[1.1949593807724774]
5948,dsgdb9nsd_000828,6,H,7,1,[C],[1.1085379205303283]


In [318]:
ww = struct_dist_F[struct_dist_F['molecule_name']=='dsgdb9nsd_000828'].loc[:, :"F3"].copy()
ww

Unnamed: 0,molecule_name,atom_index,atom,x,y,z,F1,F2,F3
7,dsgdb9nsd_000828,0,F,0.036904,0.054835,-0.061209,2.170204,2.170205,1000.0
8,dsgdb9nsd_000828,1,C,-0.017329,1.374488,0.010798,1.322728,1.339435,1.339436
9,dsgdb9nsd_000828,2,F,1.227817,1.868042,0.000406,2.166995,2.170204,1000.0
10,dsgdb9nsd_000828,3,F,-0.659056,1.848638,-1.065052,2.166995,2.170205,1000.0
11,dsgdb9nsd_000828,4,C,-0.741532,1.857302,1.284527,2.351043,2.351046,2.380305
12,dsgdb9nsd_000828,5,O,-1.192351,1.10581,2.096901,2.69686,1000.0,1000.0
13,dsgdb9nsd_000828,6,H,-0.794344,2.962136,1.358063,2.670096,2.670144,1000.0


In [326]:
zz=struct_dist_F[struct_dist_F['molecule_name']=='dsgdb9nsd_000828'].copy()
zz

Unnamed: 0,molecule_name,atom_index,atom,x,y,z,F1,F2,F3,F4,F5,F6
7,dsgdb9nsd_000828,0,F,0.036904,0.054835,-0.061209,2.170204,2.170205,1000.0,1000,1000,1000
8,dsgdb9nsd_000828,1,C,-0.017329,1.374488,0.010798,1.322728,1.339435,1.339436,1000,1000,1000
9,dsgdb9nsd_000828,2,F,1.227817,1.868042,0.000406,2.166995,2.170204,1000.0,1000,1000,1000
10,dsgdb9nsd_000828,3,F,-0.659056,1.848638,-1.065052,2.166995,2.170205,1000.0,1000,1000,1000
11,dsgdb9nsd_000828,4,C,-0.741532,1.857302,1.284527,2.351043,2.351046,2.380305,1000,1000,1000
12,dsgdb9nsd_000828,5,O,-1.192351,1.10581,2.096901,2.69686,1000.0,1000.0,1000,1000,1000
13,dsgdb9nsd_000828,6,H,-0.794344,2.962136,1.358063,2.670096,2.670144,1000.0,1000,1000,1000


## Checking the results

In [336]:
print(train_neigb.shape, train.shape)

(1533537, 7) (4658147, 6)


In [340]:
# Отличается число строк!

In [338]:
train_neigb.molecule_name.nunique()

85003

In [339]:
train.molecule_name.nunique()

85003

In [None]:
# А число молекул одинаковое.
Причина различий - train_neigb построен на основании таблицы structures, 
           в которую включены атомы F -  а  train атомов F нет 

### To prepare final features

In [9]:
#train= pd.read_csv('champs-scalar-coupling/train.csv')
#test= pd.read_csv('champs-scalar-coupling/test.csv')

In [10]:
%%time
#train_neigb = find_neighbors(structures.set_index('molecule_name'), train.molecule_name.unique(), 1.8)
#test_neigb = find_neighbors(structures.set_index('molecule_name'), test.molecule_name.unique(), 1.8)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 10 µs


# Write outputs



In [332]:
%%time
train_neigb.to_csv('train_neigb_'+str(radius)+'.csv')
#test_neigb.to_csv('test_neigb'+str(radius)+'.csv')

CPU times: user 3min 45s, sys: 4.9 s, total: 3min 50s
Wall time: 3min 57s


In [231]:
struct_dist.to_csv('struct_dist.csv')

In [None]:
def add_col(df,**atom_dict):
    """Add new columns with the distances to heteroatoms.
       F1 corresponds to the nearest atom F.
       If F is absent in the molecule, we set large distance."""
    
    for atom, value in atom_dict.items():
        for i in range(1, value+1):
            df[str(atom)+str(i)]=1000
    return df

def find_neigb_atom(structures, molecules, atom_to_find, radius):
    """Radius for finding neighbors"""
    
    atom_dict = {'F': 6, 'N': 7, "O": 5}
    df_0 = add_col(pd.DataFrame(data=None, columns=structures.columns),**atom_dict)
    #df_0 = add_col(structures.iloc[:2].copy(),**atom_dict)
    
    for imol,name in tqdm(list(enumerate(molecules))):
        
        molecule = structures.set_index('molecule_name').loc[name ]
        atoms    = molecule.atom.values
        atoms_idx= molecule.atom_index.values
        
        #Set new columns and rewrite some values
        res= add_col(molecule.reset_index(),**atom_dict)
        
        coords   = molecule[['x', 'y', 'z']].values
        kdt = KDTree(coords)
        
        for a0 in atoms_idx:
            
            distances, inds = kdt.query(coords[a0], len(atoms))
            
            distances = distances[1:sum(distances<radius)]; # remove a0 from list
            print('===1===')
            print(distances)
            inds = inds[1:sum(distances<radius)+1]
            print('===2===')
            print(atoms[inds])
            dist_atom = distances[atoms[inds] == atom_to_find]
            print('===3===')
            print(dist_atom)
            ind_atom = inds[atoms[inds] == atom_to_find]
            print('===4===')
            print(ind_atom)
           
            for i, d in enumerate(dist_atom,1): # print(i,  d) -> 0 (0, 3.381815264756334), 1 (2, 3.382439208351072
                #print(str(atom_to_find)+str(i))
                res.loc[a0, str(atom_to_find)+str(i)]=d 
                print('===5===')
                print(res)
                #print(molecule)
        print('===6===')
        print(res)
        print(res.shape)
        df_nieghb =pd.concat([df_0, res], axis=0)
        print('====FINAL====')
        print(df_nieghb.shape)
        print(df_nieghb)
    
    return df_nieghb