# Nearest atoms from KDTree
## New hyperparameter for tuning - radius
### radius = max distance for atoms to be neighbors

In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from scipy.spatial import cKDTree as KDTree
from tqdm import tqdm_notebook as tqdm

In [14]:
train= pd.read_csv('champs-scalar-coupling/train.csv') # Loads the whole training table: no nrows limit is passed
# test= pd.read_csv('champs-scalar-coupling/test.csv')
structures= pd.read_csv('champs-scalar-coupling/structures.csv') # one row per atom: molecule_name, atom_index, atom, x, y, z

In [15]:
# molecules with F atom
mol_F = structures[structures['atom']=='F']['molecule_name'].unique()
len(mol_F)

1907

In [16]:
structures[structures.molecule_name==mol_F[0]].to_csv('mol_f.csv')

In [17]:
# molecules with N atom
mol_N = structures[structures['atom']=='N']['molecule_name'].unique()
len(mol_N)

79827

In [18]:
# molecules with O atom
mol_O = structures[structures['atom']=='O']['molecule_name'].unique()
len(mol_O)

111279

In [67]:
het= np.append(np.append(mol_F[:5],mol_N[:5]), mol_O[:5])
het

In [19]:
structures.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397


In [20]:
# Per-molecule element counts: one row per molecule, one column per element
# symbol (0 where the element does not occur), plus the total atom count.
struct = (
    structures
    .groupby(['molecule_name', 'atom'])['atom_index']
    .count()
    .unstack()
    .fillna(0)
    .reset_index()
)
# Both group-bys sort by molecule_name, so the positional assignment lines up.
struct['total'] = structures.groupby('molecule_name')['atom'].count().values
struct.head(10)

atom,molecule_name,C,F,H,N,O,total
0,dsgdb9nsd_000001,1.0,0.0,4.0,0.0,0.0,5
1,dsgdb9nsd_000002,0.0,0.0,3.0,1.0,0.0,4
2,dsgdb9nsd_000003,0.0,0.0,2.0,0.0,1.0,3
3,dsgdb9nsd_000004,2.0,0.0,2.0,0.0,0.0,4
4,dsgdb9nsd_000005,1.0,0.0,1.0,1.0,0.0,3
5,dsgdb9nsd_000007,2.0,0.0,6.0,0.0,0.0,8
6,dsgdb9nsd_000008,1.0,0.0,4.0,0.0,1.0,6
7,dsgdb9nsd_000009,3.0,0.0,4.0,0.0,0.0,7
8,dsgdb9nsd_000010,2.0,0.0,3.0,1.0,0.0,6
9,dsgdb9nsd_000011,2.0,0.0,4.0,0.0,1.0,7


In [21]:
# Report the largest per-molecule count of each heteroatom, with a blank
# line between consecutive reports.
max_reports = (
    ("Max number of F:", "F"),
    ("Max number of N:", "N"),
    ("Max number of O:", "O"),
)
for report_pos, (label, element) in enumerate(max_reports):
    if report_pos:
        print("")
    print(label, struct[element].max())

Max number of F: 6.0

Max number of N: 7.0

Max number of O: 5.0


In [77]:
def add_col(df,**atom_dict):
    """Add placeholder distance columns for the requested elements.

    For every ``element=count`` keyword, the columns ``<element>1`` ...
    ``<element><count>`` are created (or overwritten) and filled with the
    placeholder distance 1000.0. ``F1`` is meant to hold the distance to the
    nearest F atom, ``F2`` to the second nearest, and so on; a column that
    keeps the placeholder means no such atom was found.

    Args:
        df: DataFrame to extend. It is modified in place.
        **atom_dict: Element symbol mapped to the number of columns to
            create for that element.

    Returns:
        The same ``df`` object, for call chaining.
    """
    for atom, value in atom_dict.items():
        for i in range(1, value + 1):
            # Float placeholder: real distances written later are floats, and
            # assigning a float into an integer column is an incompatible-
            # dtype assignment in current pandas.
            df[f"{atom}{i}"] = 1000.0
    return df
#def add_col(df, atom, number_atoms)

def find_neigb_atom(structures, molecules, radius, **atom_dict):
    """Tabulate, for every atom, the distances to nearby atoms of given elements.

    For each molecule in ``molecules`` the rows of ``structures`` belonging to
    it are copied and extended with the columns created by ``add_col``
    (``F1``, ``F2``, ... for ``F=<count>``). For every atom, the other atoms
    of the same molecule closer than ``radius`` are looked up with a KD-tree;
    the distances to the neighbours of each requested element are written,
    nearest first, into that element's columns. Columns for which no
    neighbour exists keep the placeholder value 1000.

    Args:
        structures: DataFrame with the columns ``molecule_name``, ``atom``,
            ``x``, ``y`` and ``z`` (one row per atom).
        molecules: Iterable of molecule names to process.
        radius: Neighbour cut-off; only atoms at a distance strictly smaller
            than ``radius`` are considered.
        **atom_dict: Element symbol mapped to the number of distance columns
            to fill for it. Neighbours beyond that count are ignored.

    Returns:
        DataFrame with the columns of ``structures`` followed by the distance
        columns, one row per atom of the processed molecules. An empty frame
        with the same columns is returned when ``molecules`` is empty.
    """
    # Index the table once; rebuilding the index for every molecule dominated
    # the running time of the loop.
    by_molecule = structures.set_index('molecule_name')
    template = add_col(pd.DataFrame(columns=structures.columns), **atom_dict)
    # Distance columns are forced to float so real distances can be stored
    # next to the placeholder without an incompatible-dtype assignment.
    dist_cols = {
        f"{atom}{i}": float
        for atom, count in atom_dict.items()
        for i in range(1, count + 1)
    }

    frames = []
    n_rows = 0
    for name in tqdm(list(molecules)):
        # A list selector always yields a DataFrame, even for one-atom molecules.
        molecule = by_molecule.loc[[name]]
        atoms = molecule.atom.values
        coords = molecule[['x', 'y', 'z']].values

        # Per-molecule result: original columns plus placeholder distances.
        res = add_col(molecule.reset_index(), **atom_dict).astype(dist_cols)
        kdt = KDTree(coords)

        # Atoms are addressed by their position inside the molecule, which is
        # also the row label of ``res`` after reset_index().
        for pos in range(len(atoms)):
            distances, inds = kdt.query(coords[pos], len(atoms))
            # query() returns scalars when k == 1 (single-atom molecule).
            distances = np.atleast_1d(distances)
            inds = np.atleast_1d(inds)

            # Keep atoms inside the radius, excluding the query atom itself.
            keep = (distances < radius) & (inds != pos)
            distances = distances[keep]
            neighbor_atoms = atoms[inds[keep]]

            for atom_to_find, n_cols in atom_dict.items():
                # query() returns hits sorted by distance, so column i holds
                # the i-th nearest atom of this element.
                dist_atom = distances[neighbor_atoms == atom_to_find][:n_cols]
                for i, d in enumerate(dist_atom, 1):
                    res.loc[pos, f"{atom_to_find}{i}"] = d

        frames.append(res)
        n_rows += len(res)
        # Running shape of the accumulated result (progress trace).
        print((n_rows, res.shape[1]))

    if not frames:
        return template
    # One concat at the end instead of DataFrame.append inside the loop
    # (removed in pandas 2.0 and quadratic in the number of molecules).
    return pd.concat(frames, ignore_index=True)[list(template.columns)]

In [78]:
# Hyperparameters for find_neigb_atom.
# atom_dict: number of distance columns per element. The literals equal the
# per-molecule maxima printed above (F: 6, N: 7, O: 5); the commented line
# derives the same values from `struct`.
#atom_dict = {'F': int(struct.F.max()), 'N': int(struct.N.max()), "O": int(struct.O.max())}
atom_dict = {'F': 6, 'N': 7, "O": 5}
# radius_dist: neighbour cut-off passed as `radius`, in the units of the
# x/y/z coordinates (presumably Angstrom -- TODO confirm).
radius_dist=3

In [97]:
%%time
struct_dist= find_neigb_atom(structures, het, radius_dist, **atom_dict)

(7, 24)
(14, 24)
(24, 24)
(33, 24)
(44, 24)
(48, 24)
(51, 24)
(57, 24)
(63, 24)
(72, 24)
(75, 24)
(81, 24)
(88, 24)
(94, 24)
(103, 24)

CPU times: user 4.91 s, sys: 1.16 s, total: 6.07 s
Wall time: 6.49 s


In [98]:
struct_dist.shape

(103, 24)

In [76]:
structures.set_index('molecule_name').loc[het].shape

(103, 5)

In [99]:
struct_dist

Unnamed: 0,molecule_name,atom_index,atom,x,y,z,F1,F2,F3,F4,...,N3,N4,N5,N6,N7,O1,O2,O3,O4,O5
0,dsgdb9nsd_000826,0,F,0.015866,0.060812,-0.023692,2.161847,2.161852,1000.000000,1000,...,1000,1000,1000,1000,1000,1000.000000,1000,1000,1000,1000
1,dsgdb9nsd_000826,1,C,-0.021207,1.397890,0.017264,1.338219,1.338241,1.338252,1000,...,1000,1000,1000,1000,1000,1000.000000,1000,1000,1000,1000
2,dsgdb9nsd_000826,2,F,1.240270,1.842505,-0.026129,2.161852,2.162061,1000.000000,1000,...,1000,1000,1000,1000,1000,1000.000000,1000,1000,1000,1000
3,dsgdb9nsd_000826,3,F,-0.642161,1.822862,-1.089413,2.161847,2.162061,1000.000000,1000,...,1000,1000,1000,1000,1000,1000.000000,1000,1000,1000,1000
4,dsgdb9nsd_000826,4,C,-0.706010,1.869850,1.220992,2.311485,2.311724,2.311765,1000,...,1000,1000,1000,1000,1000,1000.000000,1000,1000,1000,1000
5,dsgdb9nsd_000826,5,C,-1.266763,2.255680,2.206575,1000.000000,1000.000000,1000.000000,1000,...,1000,1000,1000,1000,1000,1000.000000,1000,1000,1000,1000
6,dsgdb9nsd_000826,6,H,-1.764462,2.598332,3.081448,1000.000000,1000.000000,1000.000000,1000,...,1000,1000,1000,1000,1000,1000.000000,1000,1000,1000,1000
7,dsgdb9nsd_000828,0,F,0.036904,0.054835,-0.061209,2.170204,2.170205,1000.000000,1000,...,1000,1000,1000,1000,1000,2.696860,1000,1000,1000,1000
8,dsgdb9nsd_000828,1,C,-0.017329,1.374488,0.010798,1.322728,1.339435,1.339436,1000,...,1000,1000,1000,1000,1000,2.409293,1000,1000,1000,1000
9,dsgdb9nsd_000828,2,F,1.227817,1.868042,0.000406,2.166995,2.170204,1000.000000,1000,...,1000,1000,1000,1000,1000,1000.000000,1000,1000,1000,1000


In [223]:
def find_neighbors(structures, molecules, radius):
    """List, for every atom, its neighbours within ``radius`` in the same molecule.

    Args:
        structures: Atom table indexed by ``molecule_name`` (the caller passes
            ``structures.set_index('molecule_name')``) with the columns
            ``atom_index``, ``atom``, ``x``, ``y`` and ``z``.
        molecules: Iterable of molecule names to process.
        radius: Neighbour cut-off; only atoms at a distance strictly smaller
            than ``radius`` count as neighbours.

    Returns:
        DataFrame with one row per atom and the columns ``molecule_name``,
        ``atom_index``, ``atom``, ``all_atoms`` (atoms in the molecule),
        ``number_neihgbors`` (neighbour count), ``neighbors`` (array of
        neighbour element symbols, nearest first) and ``L2dist`` (array of
        the matching distances).
    """
    out_name = []
    out_atom_index = []
    out_atom_name = []
    out_all_atoms = []
    out_neigb_number = []
    out_neigb = []
    out_dists = []

    for name in tqdm(list(molecules)):
        # A list selector always yields a DataFrame, even for one-atom molecules.
        molecule = structures.loc[[name]]
        atoms = molecule.atom.values
        atoms_idx = molecule.atom_index.values
        coords = molecule[['x', 'y', 'z']].values
        n_atoms = len(atoms)
        kdt = KDTree(coords)

        # Iterate by position so coordinates and symbols are looked up
        # correctly even if atom_index is not a 0..n-1 range.
        for pos, a0 in enumerate(atoms_idx):
            distances, inds = kdt.query(coords[pos], n_atoms)
            # query() returns scalars when k == 1 (single-atom molecule).
            distances = np.atleast_1d(distances)
            inds = np.atleast_1d(inds)

            # Keep atoms inside the radius, excluding the query atom itself.
            keep = (distances < radius) & (inds != pos)
            distances = distances[keep]

            out_name.append(name)
            out_atom_index.append(a0)
            out_atom_name.append(atoms[pos])
            out_all_atoms.append(n_atoms)
            out_neigb_number.append(len(distances))
            out_neigb.append(atoms[inds[keep]])
            out_dists.append(distances)

    neighbors = pd.DataFrame({'molecule_name': out_name,
                              'atom_index': out_atom_index,
                              'atom': out_atom_name,
                              'all_atoms': out_all_atoms,
                              'number_neihgbors': out_neigb_number,
                              'neighbors': out_neigb,
                              'L2dist': out_dists})
    return neighbors
 

In [224]:
# Hyperparameter: neighbour cut-off for find_neighbors (atoms closer than this
# count as neighbours). The value is also embedded in the CSV file names.
radius=1.8

In [225]:
#%%time
#Running time = 6min 12s
#train_neigb = find_neighbors(structures.set_index('molecule_name'), train.molecule_name.unique(), radius)

CPU times: user 5min 59s, sys: 13.5 s, total: 6min 12s
Wall time: 6min 12s


In [None]:
# Writing time = 3min 57s
# train_neigb.to_csv('train_neigb_'+str(radius)+'.csv')

## Checking the results

In [95]:
# Reload the neighbour table saved earlier for radius 1.8. The file was
# written with its index, which comes back as the extra 'Unnamed: 0' column.
# NOTE(review): 'neighbors' and 'L2dist' presumably come back as strings, not arrays -- confirm before using them.
train_neigb=pd.read_csv('train_neigb_1.8.csv')
train_neigb.head()

Unnamed: 0.1,Unnamed: 0,molecule_name,atom_index,atom,all_atoms,number_neihgbors,neighbors,L2dist
0,0,dsgdb9nsd_000001,0,C,5,4,['H' 'H' 'H' 'H'],[1.09194638 1.09194754 1.09195162 1.09195306]
1,1,dsgdb9nsd_000001,1,H,5,4,['C' 'H' 'H' 'H'],[1.09195306 1.78311976 1.7831475 1.78315669]
2,2,dsgdb9nsd_000001,2,H,5,4,['C' 'H' 'H' 'H'],[1.09195162 1.78311976 1.78314839 1.78315766]
3,3,dsgdb9nsd_000001,3,H,5,4,['C' 'H' 'H' 'H'],[1.09194638 1.7831475 1.78314787 1.78315766]
4,4,dsgdb9nsd_000001,4,H,5,4,['C' 'H' 'H' 'H'],[1.09194754 1.78314787 1.78314839 1.78315669]


In [102]:
print(train_neigb.shape, train.shape)
print(struct_dist.shape, structures.shape)

(1533537, 8) (4658147, 6)
(103, 24) (2358657, 6)


In [104]:
train_neigb.molecule_name.nunique() -  train.molecule_name.nunique()

0

## Checking what gets written

In [90]:
train_neigb[train_neigb['molecule_name']=='dsgdb9nsd_000828']

Unnamed: 0.1,Unnamed: 0,molecule_name,atom_index,atom,all_atoms,number_neihgbors,neighbors,L2dist
5942,5942,dsgdb9nsd_000828,0,F,7,1,['C'],[1.32272813]
5943,5943,dsgdb9nsd_000828,1,C,7,4,['F' 'F' 'F' 'C'],[1.32272813 1.33943462 1.3394365 1.54271368]
5944,5944,dsgdb9nsd_000828,2,F,7,1,['C'],[1.3394365]
5945,5945,dsgdb9nsd_000828,3,F,7,1,['C'],[1.33943462]
5946,5946,dsgdb9nsd_000828,4,C,7,3,['H' 'O' 'C'],[1.10853792 1.19495938 1.54271368]
5947,5947,dsgdb9nsd_000828,5,O,7,1,['C'],[1.19495938]
5948,5948,dsgdb9nsd_000828,6,H,7,1,['C'],[1.10853792]


In [100]:
columns_to_show=['molecule_name', 'atom_index', 'atom', 'F1', 'F2','F3','O1', 'O2',"O3"]
struct_dist[struct_dist['molecule_name']=='dsgdb9nsd_000828'].loc[:, columns_to_show]

Unnamed: 0,molecule_name,atom_index,atom,F1,F2,F3,O1,O2,O3
7,dsgdb9nsd_000828,0,F,2.170204,2.170205,1000.0,2.69686,1000,1000
8,dsgdb9nsd_000828,1,C,1.322728,1.339435,1.339436,2.409293,1000,1000
9,dsgdb9nsd_000828,2,F,2.166995,2.170204,1000.0,1000.0,1000,1000
10,dsgdb9nsd_000828,3,F,2.166995,2.170205,1000.0,1000.0,1000,1000
11,dsgdb9nsd_000828,4,C,2.351043,2.351046,2.380305,1.194959,1000,1000
12,dsgdb9nsd_000828,5,O,2.69686,1000.0,1000.0,1000.0,1000,1000
13,dsgdb9nsd_000828,6,H,2.670096,2.670144,1000.0,2.037213,1000,1000


Looking at molecule dsgdb9nsd_000828:
<br> in train_neigb, the nearest neighbours of atom 4 (i.e. those at a distance < 1.8) are:
<br>                   ['H' 'O' 'C'] at distances [1.10853792 1.19495938 1.54271368]
<br>  <br> in struct_dist, for atom 4:
<br>     distances to the nearest F atoms:  F1 = 2.351043, F2 = 2.351046, F3 = 2.380305
 <br>    distances to the nearest O atoms:  O1 = 1.194959 (O2 = 1000 and O3 = 1000, i.e. there is no second or third O within the search radius; 
                                      the atom column confirms that this molecule contains only one O)

Conclusion: everything works as expected. The O1 distance in struct_dist (1.194959) matches the O neighbour distance in train_neigb (1.19495938).

### To prepare final features

In [9]:
#train= pd.read_csv('champs-scalar-coupling/train.csv')
#test= pd.read_csv('champs-scalar-coupling/test.csv')

In [10]:
%%time
#train_neigb = find_neighbors(structures.set_index('molecule_name'), train.molecule_name.unique(), 1.8)
#test_neigb = find_neighbors(structures.set_index('molecule_name'), test.molecule_name.unique(), 1.8)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 10 µs


# Write outputs



In [11]:
# Use the same 'train_neigb_<radius>.csv' name that the loading cell reads
# ('train_neigb_1.8.csv'), and skip the index so repeated save/load cycles do
# not add another 'Unnamed: 0' column each time.
train_neigb.to_csv('train_neigb_'+str(radius)+'.csv', index=False)
#test_neigb.to_csv('test_neigb_'+str(radius)+'.csv', index=False)

In [231]:
struct_dist.to_csv('struct_dist.csv')

In [None]:
def add_col(df,**atom_dict):
    """Add placeholder distance columns for the requested elements.

    For every ``element=count`` keyword, the columns ``<element>1`` ...
    ``<element><count>`` are created (or overwritten) and filled with the
    placeholder distance 1000.0. ``F1`` is meant to hold the distance to the
    nearest F atom, ``F2`` to the second nearest, and so on; a column that
    keeps the placeholder means no such atom was found.

    Args:
        df: DataFrame to extend. It is modified in place.
        **atom_dict: Element symbol mapped to the number of columns to
            create for that element.

    Returns:
        The same ``df`` object, for call chaining.
    """
    for atom, value in atom_dict.items():
        for i in range(1, value + 1):
            # Float placeholder: real distances written later are floats, and
            # assigning a float into an integer column is an incompatible-
            # dtype assignment in current pandas.
            df[f"{atom}{i}"] = 1000.0
    return df

def find_neigb_atom(structures, molecules, atom_to_find, radius):
    """Debug variant: trace the neighbour search for a single element.

    Works like the multi-element version but fills only the columns of
    ``atom_to_find`` and prints every intermediate result (markers
    ``===1===`` to ``===6===`` and ``====FINAL====``).

    NOTE: this definition reuses the name of the multi-element function with
    a different signature; running this cell rebinds ``find_neigb_atom``.

    Args:
        structures: DataFrame with the columns ``molecule_name``, ``atom``,
            ``x``, ``y`` and ``z`` (one row per atom).
        molecules: Iterable of molecule names to process.
        atom_to_find: Element symbol whose neighbour distances are recorded.
        radius: Neighbour cut-off; only atoms at a distance strictly smaller
            than ``radius`` are considered.

    Returns:
        DataFrame with one row per atom of all processed molecules; an empty
        frame with the expected columns when ``molecules`` is empty.
    """
    atom_dict = {'F': 6, 'N': 7, "O": 5}
    df_0 = add_col(pd.DataFrame(data=None, columns=structures.columns),**atom_dict)
    # Distance columns are forced to float so real distances can be stored
    # next to the placeholder without an incompatible-dtype assignment.
    dist_cols = {
        f"{atom}{i}": float
        for atom, count in atom_dict.items()
        for i in range(1, count + 1)
    }

    # Index the table once instead of once per molecule.
    by_molecule = structures.set_index('molecule_name')

    # Defined up front so an empty ``molecules`` returns the empty template
    # instead of raising UnboundLocalError.
    df_nieghb = df_0
    frames = []

    for name in tqdm(list(molecules)):
        # A list selector always yields a DataFrame, even for one-atom molecules.
        molecule = by_molecule.loc[[name]]
        atoms = molecule.atom.values
        coords = molecule[['x', 'y', 'z']].values

        # Per-molecule result: original columns plus placeholder distances.
        res = add_col(molecule.reset_index(),**atom_dict).astype(dist_cols)
        kdt = KDTree(coords)

        # Atoms are addressed by their position inside the molecule, which is
        # also the row label of ``res`` after reset_index().
        for pos in range(len(atoms)):
            distances, inds = kdt.query(coords[pos], len(atoms))
            # query() returns scalars when k == 1 (single-atom molecule).
            distances = np.atleast_1d(distances)
            inds = np.atleast_1d(inds)

            # Keep atoms inside the radius, excluding the query atom itself.
            keep = (distances < radius) & (inds != pos)
            distances = distances[keep]
            print('===1===')
            print(distances)
            inds = inds[keep]
            print('===2===')
            print(atoms[inds])
            is_target = atoms[inds] == atom_to_find
            dist_atom = distances[is_target]
            print('===3===')
            print(dist_atom)
            ind_atom = inds[is_target]
            print('===4===')
            print(ind_atom)

            # Hits are sorted by distance, so column i is the i-th nearest.
            for i, d in enumerate(dist_atom, 1):
                res.loc[pos, str(atom_to_find)+str(i)] = d
                print('===5===')
                print(res)
        print('===6===')
        print(res)
        print(res.shape)
        # Accumulate every molecule; the previous code overwrote the result
        # and returned only the last molecule. Re-concatenating per iteration
        # is acceptable here because this variant exists only for tracing.
        frames.append(res)
        df_nieghb = pd.concat(frames, axis=0, ignore_index=True)
        print('====FINAL====')
        print(df_nieghb.shape)
        print(df_nieghb)

    return df_nieghb