In [1]:
import sys
import numpy as np
import pandas as pd
import os

# import matplotlib.pyplot as plt
# %matplotlib inline
from tqdm import tqdm_notebook
pd.options.display.precision = 15

import time
import datetime
import openbabel
import gc
#import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from IPython.display import HTML
import json
# import altair as alt
# alt.renderers.enable('notebook')

import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

os.environ["TZ"]="Europe/Paris"
time.tzset()

## Part 1

In [3]:
train = pd.read_csv('champs-scalar-coupling/train.csv')
test = pd.read_csv('champs-scalar-coupling/test.csv')

In [5]:
train.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4658147 entries, 0 to 4658146
Data columns (total 6 columns):
id                          int64
molecule_name               object
atom_index_0                int64
atom_index_1                int64
type                        object
scalar_coupling_constant    float64
dtypes: float64(1), int64(3), object(2)
memory usage: 737.4 MB


In [6]:
test.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2505542 entries, 0 to 2505541
Data columns (total 5 columns):
id               int64
molecule_name    object
atom_index_0     int64
atom_index_1     int64
type             object
dtypes: int64(3), object(2)
memory usage: 377.5 MB


In [7]:
structures = pd.read_csv('champs-scalar-coupling/structures.csv')
structures.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2358657 entries, 0 to 2358656
Data columns (total 6 columns):
molecule_name    object
atom_index       int64
atom             object
x                float64
y                float64
z                float64
dtypes: float64(3), int64(1), object(2)
memory usage: 384.6 MB


In [8]:
def map_atom_info(df, atom_idx):
    df = pd.merge(df, structures, how = 'left',
                  left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                  right_on = ['molecule_name',  'atom_index'])
    
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'atom': f'atom_{atom_idx}',
                            'x': f'x_{atom_idx}',
                            'y': f'y_{atom_idx}',
                            'z': f'z_{atom_idx}'})
    return df

train = map_atom_info(train, 0)
train = map_atom_info(train, 1)

test = map_atom_info(test, 0)
test = map_atom_info(test, 1)

del structures
#atom_count=structures.groupby(['molecule_name','atom']).size().unstack(fill_value=0)
#train=pd.merge(train,atom_count, how = 'left', left_on  = 'molecule_name', right_on = 'molecule_name')
#test=pd.merge(test,atom_count, how = 'left', left_on  = 'molecule_name', right_on = 'molecule_name')

In [9]:
train.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4658147 entries, 0 to 4658146
Data columns (total 14 columns):
id                          int64
molecule_name               object
atom_index_0                int64
atom_index_1                int64
type                        object
scalar_coupling_constant    float64
atom_0                      object
x_0                         float64
y_0                         float64
z_0                         float64
atom_1                      object
x_1                         float64
y_1                         float64
z_1                         float64
dtypes: float64(7), int64(3), object(4)
memory usage: 1.5 GB


In [10]:
%%time
train['bonds']=train['type'].str[0].astype(int)
test['bonds']=test['type'].str[0].astype(int)

train_p_0 = train[['x_0', 'y_0', 'z_0']].values
train_p_1 = train[['x_1', 'y_1', 'z_1']].values
test_p_0 = test[['x_0', 'y_0', 'z_0']].values
test_p_1 = test[['x_1', 'y_1', 'z_1']].values

train['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
test['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)
train['abs_dist']=np.linalg.norm(train_p_0-train_p_1,axis=1,ord=1)
test['abs_dist']=np.linalg.norm(test_p_0-test_p_1,axis=1,ord=1)

CPU times: user 5.53 s, sys: 1.71 s, total: 7.24 s
Wall time: 7.99 s


In [11]:
def dist12(name='xy',a='x',b='y'):
    train_p_0=train[[a+'_0',b+'_0']].values
    train_p_1=train[[a+'_1',b+'_1']].values
    test_p_0=test[[a+'_0',b+'_0']].values
    test_p_1=test[[a+'_1',b+'_1']].values
    
    train[name] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
    test[name] = np.linalg.norm(test_p_0 - test_p_1, axis=1)
    train['abs_'+name]= np.linalg.norm(train_p_0-train_p_1,axis=1,ord=1)
    test['abs_'+name]= np.linalg.norm(test_p_0-test_p_1,axis=1,ord=1)

In [12]:
dist12('dist_xy','x','y')
dist12('dist_xz','x','z')
dist12('dist_yz','y','z')

In [13]:
%%time
train['dist_to_type_mean'] = train['dist'] / train.groupby('type')['dist'].transform('mean')
test['dist_to_type_mean'] = test['dist'] / test.groupby('type')['dist'].transform('mean')

train['dist_to_type_std'] = train['dist'] / train.groupby('type')['dist'].transform('std')
test['dist_to_type_std'] = test['dist'] / test.groupby('type')['dist'].transform('std')

train['dist_to_type_mean_xy'] = train['dist_xy'] / train.groupby('type')['dist_xy'].transform('mean')
test['dist_to_type_mean_xy'] = test['dist_xy'] / test.groupby('type')['dist_xy'].transform('mean')

train['dist_to_type_mean_xz'] = train['dist_xz'] / train.groupby('type')['dist_xz'].transform('mean')
test['dist_to_type_mean_xz'] = test['dist_xz'] / test.groupby('type')['dist_xz'].transform('mean')

train['dist_to_type_mean_yz'] = train['dist_yz'] / train.groupby('type')['dist_yz'].transform('mean')
test['dist_to_type_mean_yz'] = test['dist_yz'] / test.groupby('type')['dist_yz'].transform('mean')

CPU times: user 8.75 s, sys: 12.9 s, total: 21.7 s
Wall time: 28.8 s


In [16]:
%%time
train.to_csv('train_babel_0.csv')
del train

CPU times: user 3min 33s, sys: 6.74 s, total: 3min 39s
Wall time: 3min 45s


In [17]:
%%time
test.to_csv('test_babel_0.csv')
del test

CPU times: user 1min 48s, sys: 3.57 s, total: 1min 52s
Wall time: 1min 54s


## Part 2

In [3]:
def create_features(df):
    #1
    df['molecule_couples'] = df.groupby('molecule_name')['id'].transform('count')
    df['molecule_dist_mean'] = df.groupby('molecule_name')['dist'].transform('mean')
    df['molecule_dist_min'] = df.groupby('molecule_name')['dist'].transform('min')
    df['molecule_dist_max'] = df.groupby('molecule_name')['dist'].transform('max')
    #2
    df['atom_0_couples_count'] = df.groupby(['molecule_name', 'atom_index_0'])['id'].transform('count')
    df['atom_1_couples_count'] = df.groupby(['molecule_name', 'atom_index_1'])['id'].transform('count')
    df[f'molecule_atom_index_0_x_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['x_1'].transform('std')
    df[f'molecule_atom_index_0_y_1_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('mean')
    
    #3
    df[f'molecule_atom_index_0_y_1_mean_diff'] = df[f'molecule_atom_index_0_y_1_mean'] - df['y_1']
    df[f'molecule_atom_index_0_y_1_mean_div'] = df[f'molecule_atom_index_0_y_1_mean'] / df['y_1']
    df[f'molecule_atom_index_0_y_1_max'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('max')
    
    df[f'molecule_atom_index_0_y_1_max_diff'] = df[f'molecule_atom_index_0_y_1_max'] - df['y_1']
    df[f'molecule_atom_index_0_y_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('std')
    df[f'molecule_atom_index_0_z_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['z_1'].transform('std')
    df[f'molecule_atom_index_0_dist_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('mean')
    
    #4
    df[f'molecule_atom_index_0_dist_mean_diff'] = df[f'molecule_atom_index_0_dist_mean'] - df['dist']
    df[f'molecule_atom_index_0_dist_mean_div'] = df[f'molecule_atom_index_0_dist_mean'] / df['dist']
    df[f'molecule_atom_index_0_dist_max'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('max')
    df[f'molecule_atom_index_0_dist_max_diff'] = df[f'molecule_atom_index_0_dist_max'] - df['dist']
    
    df[f'molecule_atom_index_0_dist_max_div'] = df[f'molecule_atom_index_0_dist_max'] / df['dist']
    df[f'molecule_atom_index_0_dist_min'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('min')
    df[f'molecule_atom_index_0_dist_min_diff'] = df[f'molecule_atom_index_0_dist_min'] - df['dist']
    df[f'molecule_atom_index_0_dist_min_div'] = df[f'molecule_atom_index_0_dist_min'] / df['dist']
    
    #5
    df[f'molecule_atom_index_0_dist_std'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('std')
    df[f'molecule_atom_index_0_dist_std_diff'] = df[f'molecule_atom_index_0_dist_std'] - df['dist']
    df[f'molecule_atom_index_0_dist_std_div'] = df[f'molecule_atom_index_0_dist_std'] / df['dist']
    
    df[f'molecule_atom_index_1_dist_mean'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('mean')
    df[f'molecule_atom_index_1_dist_mean_diff'] = df[f'molecule_atom_index_1_dist_mean'] - df['dist']
    df[f'molecule_atom_index_1_dist_mean_div'] = df[f'molecule_atom_index_1_dist_mean'] / df['dist']
    
    #6
    df[f'molecule_atom_index_1_dist_max'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('max')
    df[f'molecule_atom_index_1_dist_max_diff'] = df[f'molecule_atom_index_1_dist_max'] - df['dist']
    df[f'molecule_atom_index_1_dist_max_div'] = df[f'molecule_atom_index_1_dist_max'] / df['dist']
    
    df[f'molecule_atom_index_1_dist_min'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('min')
    df[f'molecule_atom_index_1_dist_min_diff'] = df[f'molecule_atom_index_1_dist_min'] - df['dist']
    df[f'molecule_atom_index_1_dist_min_div'] = df[f'molecule_atom_index_1_dist_min'] / df['dist']
    
    #7
    df[f'molecule_atom_index_1_dist_std'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('std')
    df[f'molecule_atom_index_1_dist_std_diff'] = df[f'molecule_atom_index_1_dist_std'] - df['dist']
    df[f'molecule_atom_index_1_dist_std_div'] = df[f'molecule_atom_index_1_dist_std'] / df['dist']
    
    df[f'molecule_atom_1_dist_mean'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('mean')
    df[f'molecule_atom_1_dist_min'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('min')
    df[f'molecule_atom_1_dist_min_diff'] = df[f'molecule_atom_1_dist_min'] - df['dist']
    df[f'molecule_atom_1_dist_min_div'] = df[f'molecule_atom_1_dist_min'] / df['dist']
    
    #8
    df[f'molecule_atom_1_dist_std'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('std')
    df[f'molecule_atom_1_dist_std_diff'] = df[f'molecule_atom_1_dist_std'] - df['dist']
    df[f'molecule_bonds_dist_std'] = df.groupby(['molecule_name', 'bonds'])['dist'].transform('std')
    df[f'molecule_bonds_dist_std_diff'] = df[f'molecule_bonds_dist_std'] - df['dist']
    
    df[f'molecule_type_dist_mean'] = df.groupby(['molecule_name', 'type'])['dist'].transform('mean')
    df[f'molecule_type_dist_mean_diff'] = df[f'molecule_type_dist_mean'] - df['dist']
    df[f'molecule_type_dist_mean_div'] = df[f'molecule_type_dist_mean'] / df['dist']
    
    #9
    df[f'molecule_type_dist_max'] = df.groupby(['molecule_name', 'type'])['dist'].transform('max')
    df[f'molecule_type_dist_min'] = df.groupby(['molecule_name', 'type'])['dist'].transform('min')
    df[f'molecule_type_dist_std'] = df.groupby(['molecule_name', 'type'])['dist'].transform('std')
    df[f'molecule_type_dist_std_diff'] = df[f'molecule_type_dist_std'] - df['dist']

    return df

In [17]:
%%time
train2 = pd.read_csv('train_babel_0.csv')

CPU times: user 35.7 s, sys: 6.59 s, total: 42.3 s
Wall time: 46.8 s


In [18]:
%%time
#1
train2['molecule_couples'] = train2.groupby('molecule_name')['id'].transform('count')
train2['molecule_dist_mean'] = train2.groupby('molecule_name')['dist'].transform('mean')
train2['molecule_dist_min'] = train2.groupby('molecule_name')['dist'].transform('min')
train2['molecule_dist_max'] = train2.groupby('molecule_name')['dist'].transform('max')

CPU times: user 4.35 s, sys: 5.95 s, total: 10.3 s
Wall time: 13 s


In [19]:
%%time
#2
train2['atom_0_couples_count'] = train2.groupby(['molecule_name', 'atom_index_0'])['id'].transform('count')
train2['atom_1_couples_count'] = train2.groupby(['molecule_name', 'atom_index_1'])['id'].transform('count')
train2[f'molecule_atom_index_0_x_1_std'] = train2.groupby(['molecule_name', 'atom_index_0'])['x_1'].transform('std')
train2[f'molecule_atom_index_0_y_1_mean'] = train2.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('mean')

CPU times: user 5.97 s, sys: 6.62 s, total: 12.6 s
Wall time: 17.8 s


In [20]:
%%time
#3
train2[f'molecule_atom_index_0_y_1_mean_diff'] = train2[f'molecule_atom_index_0_y_1_mean'] - train2['y_1']
train2[f'molecule_atom_index_0_y_1_mean_div'] = train2[f'molecule_atom_index_0_y_1_mean'] / train2['y_1']
train2[f'molecule_atom_index_0_y_1_max'] = train2.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('max')
    
train2[f'molecule_atom_index_0_y_1_max_diff'] = train2[f'molecule_atom_index_0_y_1_max'] - train2['y_1']
train2[f'molecule_atom_index_0_y_1_std'] = train2.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('std')
train2[f'molecule_atom_index_0_z_1_std'] = train2.groupby(['molecule_name', 'atom_index_0'])['z_1'].transform('std')
train2[f'molecule_atom_index_0_dist_mean'] = train2.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('mean')
    

CPU times: user 10.5 s, sys: 20.8 s, total: 31.3 s
Wall time: 1min 5s


In [21]:
%%time
#4
train2[f'molecule_atom_index_0_dist_mean_diff'] = train2[f'molecule_atom_index_0_dist_mean'] - train2['dist']
train2[f'molecule_atom_index_0_dist_mean_div'] = train2[f'molecule_atom_index_0_dist_mean'] / train2['dist']
train2[f'molecule_atom_index_0_dist_max'] = train2.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('max')
train2[f'molecule_atom_index_0_dist_max_diff'] = train2[f'molecule_atom_index_0_dist_max'] - train2['dist']
    
train2[f'molecule_atom_index_0_dist_max_div'] = train2[f'molecule_atom_index_0_dist_max'] / train2['dist']
train2[f'molecule_atom_index_0_dist_min'] = train2.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('min')
train2[f'molecule_atom_index_0_dist_min_diff'] = train2[f'molecule_atom_index_0_dist_min'] - train2['dist']
train2[f'molecule_atom_index_0_dist_min_div'] = train2[f'molecule_atom_index_0_dist_min'] / train2['dist']
    

CPU times: user 6.91 s, sys: 16.2 s, total: 23.2 s
Wall time: 59.9 s


In [22]:
%%time
#5
train2[f'molecule_atom_index_0_dist_std'] = train2.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('std')
train2[f'molecule_atom_index_0_dist_std_diff'] = train2[f'molecule_atom_index_0_dist_std'] - train2['dist']
train2[f'molecule_atom_index_0_dist_std_div'] = train2[f'molecule_atom_index_0_dist_std'] / train2['dist']
    
train2[f'molecule_atom_index_1_dist_mean'] = train2.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('mean')
train2[f'molecule_atom_index_1_dist_mean_diff'] = train2[f'molecule_atom_index_1_dist_mean'] - train2['dist']
train2[f'molecule_atom_index_1_dist_mean_div'] = train2[f'molecule_atom_index_1_dist_mean'] / train2['dist']
    

CPU times: user 8.02 s, sys: 21.5 s, total: 29.5 s
Wall time: 1min 26s


In [23]:
%%time
#6
train2[f'molecule_atom_index_1_dist_max'] = train2.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('max')
train2[f'molecule_atom_index_1_dist_max_diff'] = train2[f'molecule_atom_index_1_dist_max'] - train2['dist']
train2[f'molecule_atom_index_1_dist_max_div'] = train2[f'molecule_atom_index_1_dist_max'] / train2['dist']
    
train2[f'molecule_atom_index_1_dist_min'] = train2.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('min')
train2[f'molecule_atom_index_1_dist_min_diff'] = train2[f'molecule_atom_index_1_dist_min'] - train2['dist']
train2[f'molecule_atom_index_1_dist_min_div'] = train2[f'molecule_atom_index_1_dist_min'] / train2['dist']
    
    

CPU times: user 9.66 s, sys: 28 s, total: 37.7 s
Wall time: 2min 33s


In [24]:
%%time
#7
train2[f'molecule_atom_index_1_dist_std'] = train2.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('std')
train2[f'molecule_atom_index_1_dist_std_diff'] = train2[f'molecule_atom_index_1_dist_std'] - train2['dist']
train2[f'molecule_atom_index_1_dist_std_div'] = train2[f'molecule_atom_index_1_dist_std'] / train2['dist']
    
train2[f'molecule_atom_1_dist_mean'] = train2.groupby(['molecule_name', 'atom_1'])['dist'].transform('mean')
train2[f'molecule_atom_1_dist_min'] = train2.groupby(['molecule_name', 'atom_1'])['dist'].transform('min')
train2[f'molecule_atom_1_dist_min_diff'] = train2[f'molecule_atom_1_dist_min'] - train2['dist']
train2[f'molecule_atom_1_dist_min_div'] = train2[f'molecule_atom_1_dist_min'] / train2['dist']  

CPU times: user 16.3 s, sys: 51.1 s, total: 1min 7s
Wall time: 5min 11s


In [None]:
%%time
#8
train2[f'molecule_atom_1_dist_std'] = train2.groupby(['molecule_name', 'atom_1'])['dist'].transform('std')
train2[f'molecule_atom_1_dist_std_diff'] = train2[f'molecule_atom_1_dist_std'] - train2['dist']
train2[f'molecule_bonds_dist_std'] = train2.groupby(['molecule_name', 'bonds'])['dist'].transform('std')
train2[f'molecule_bonds_dist_std_diff'] = train2[f'molecule_bonds_dist_std'] - train2['dist']
    
train2[f'molecule_type_dist_mean'] = train2.groupby(['molecule_name', 'type'])['dist'].transform('mean')
train2[f'molecule_type_dist_mean_diff'] = train2[f'molecule_type_dist_mean'] - train2['dist']
train2[f'molecule_type_dist_mean_div'] = train2[f'molecule_type_dist_mean'] / train2['dist']
    

In [None]:
%%time
#9
train2[f'molecule_type_dist_max'] = train2.groupby(['molecule_name', 'type'])['dist'].transform('max')
train2[f'molecule_type_dist_min'] = train2.groupby(['molecule_name', 'type'])['dist'].transform('min')
train2[f'molecule_type_dist_std'] = train2.groupby(['molecule_name', 'type'])['dist'].transform('std')
train2[f'molecule_type_dist_std_diff'] = train2[f'molecule_type_dist_std'] - train2['dist']

In [None]:
train2.head().T

In [None]:
train2.info(memory_usage='deep')

In [37]:
train2.columns

Index(['Unnamed: 0', 'id', 'molecule_name', 'atom_index_0', 'atom_index_1',
       'type', 'scalar_coupling_constant', 'atom_0', 'x_0', 'y_0', 'z_0',
       'atom_1', 'x_1', 'y_1', 'z_1', 'bonds', 'dist', 'abs_dist', 'dist_xy',
       'abs_dist_xy', 'dist_xz', 'abs_dist_xz', 'dist_yz', 'abs_dist_yz',
       'dist_to_type_mean', 'dist_to_type_std', 'dist_to_type_mean_xy',
       'dist_to_type_mean_xz', 'dist_to_type_mean_yz', 'molecule_couples',
       'molecule_dist_mean', 'molecule_dist_min', 'molecule_dist_max',
       'atom_0_couples_count', 'atom_1_couples_count',
       'molecule_atom_index_0_x_1_std', 'molecule_atom_index_0_y_1_mean',
       'molecule_atom_index_0_y_1_mean_diff',
       'molecule_atom_index_0_y_1_mean_div', 'molecule_atom_index_0_y_1_max',
       'molecule_atom_index_0_y_1_max_diff', 'molecule_atom_index_0_y_1_std',
       'molecule_atom_index_0_z_1_std', 'molecule_atom_index_0_dist_mean',
       'molecule_atom_index_0_dist_mean_diff',
       'molecule_atom_index

In [39]:
%%time
train2.loc[:,:'molecule_atom_index_1_dist_mean_diff'].to_csv('train_babel_part1.csv')

SystemError: <built-in method item of numpy.ndarray object at 0x151aae4170> returned a result with an error set

In [None]:
%%time
train2 = create_features(train2)

train2 = reduce_mem_usage(train2)
train2.to_csv('train_babel_1.csv')
del train2

In [None]:
%%time
test2 = pd.read_csv('test_babel_0.csv')

In [None]:
%%time
test2 = create_features(test2)
test2 = reduce_mem_usage(test2)
test2.to_csv('test_babel_1.csv')
del test2

## Part 3

## Features from open babel

In [None]:
%%time
obConversion = openbabel.OBConversion()
obConversion.SetInFormat("xyz")

structdir='structures/'
mols=[]
mols_files=os.listdir(structdir)
mols_index=dict(map(reversed,enumerate(mols_files)))
for f in mols_index.keys():
    mol = openbabel.OBMol()
    obConversion.ReadFile(mol, structdir+f) 
    mols.append(mol)

In [None]:
def Atoms(molname,AtomId1,AtomId2):
    mol=mols[mols_index[molname+'.xyz']]
    return mol, mol.GetAtomById(AtomId1), mol.GetAtomById(AtomId2)

def SecondAtom(bond,FirstAtom):
    if FirstAtom.GetId()==bond.GetBeginAtom().GetId(): return bond.GetEndAtom()
    else: return bond.GetBeginAtom()

def Angle2J(molname,AtomId1,AtomId2,debug=False):
    mol,firstAtom,lastAtom=Atoms(molname,AtomId1,AtomId2)
    if debug: print (mol.GetFormula())
    if debug: print(firstAtom.GetType(),firstAtom.GetId(),':',lastAtom.GetType(),lastAtom.GetId())
    for b in openbabel.OBAtomBondIter(firstAtom): # all bonds for first atom
      secondAtom=SecondAtom(b,firstAtom)
      lastBond=secondAtom.GetBond(lastAtom)
      if lastBond: # found!
        if debug: print('middle',secondAtom.GetId(),secondAtom.GetType())
        return firstAtom.GetAngle(secondAtom,lastAtom)

#Angle2J('dsgdb9nsd_000003',1,2,debug=True) #water

In [None]:
def Torsion3J(molname,AtomId1,AtomId2,debug=False):
    mol,firstAtom,lastAtom=Atoms(molname,AtomId1,AtomId2)
    if debug: print (molname, mol.GetFormula())
    if debug: print(firstAtom.GetType(),firstAtom.GetId(),':',lastAtom.GetType(),lastAtom.GetId())
    for b in openbabel.OBAtomBondIter(firstAtom): # all bonds for first atom
      secondAtom=SecondAtom(b,firstAtom)
      for b2 in openbabel.OBAtomBondIter(secondAtom): # all bonds for second atom 
        thirdAtom=SecondAtom(b2,secondAtom)
        lastBond=thirdAtom.GetBond(lastAtom)
        if lastBond: # found!
          if debug: print(secondAtom.GetType(),secondAtom.GetId(),'<->',thirdAtom.GetType(),thirdAtom.GetId())
          return mol.GetTorsion(firstAtom,secondAtom,thirdAtom,lastAtom)
          
#Torsion3J('dsgdb9nsd_000007',2,5,debug=True) #methanol

In [None]:
for t in train['type'].unique():
    print(f'Training of type {t}')
    b=int(t[0]) # current bond for this type
    print('Predicting J=',b)
    X=train[train.type==t]
    if (b==1):
        X['sp']=X.apply(lambda row: Atoms(row.molecule_name, row.atom_index_0, row.atom_index_1)[2].GetHyb(),axis=1) # second atom is C or N for bond 1
    if (b==2):
        X['Angle']=X.apply(lambda row: Angle2J(row.molecule_name , row.atom_index_0, row.atom_index_1),axis=1) 
    if (b==3):
        X['Torsion']=X.apply(lambda row: Torsion3J(row.molecule_name , row.atom_index_0, row.atom_index_1),axis=1) 
        X['cosT']=np.cos(np.deg2rad(X['Torsion']))
        X['cos2T']=np.cos(2*np.deg2rad(X['Torsion']))
    y = X['scalar_coupling_constant']
    ids_train=X['id']
    X = X.drop(['id','type', 'molecule_name', 'scalar_coupling_constant','bonds'], axis=1)
    
    X_test = test[test.type==t]
    if (b==1): 
        X_test['sp']=X_test.apply(lambda row: Atoms(row.molecule_name, row.atom_index_0, row.atom_index_1)[2].GetHyb(),axis=1) # second atom is C or N for bond 1
    if (b==2):
        X_test['Angle']=X_test.apply(lambda row: Angle2J(row.molecule_name , row.atom_index_0, row.atom_index_1),axis=1) 
    if (b==3):
        X_test['Torsion']=X_test.apply(lambda row: Torsion3J(row.molecule_name , row.atom_index_0, row.atom_index_1),axis=1)  
        X_test['cosT']=np.cos(np.deg2rad(X_test['Torsion']))
        X_test['cos2T']=np.cos(2*np.deg2rad(X_test['Torsion']))
    ids_test=X_test['id']    
    X_test=X_test.drop(['id', 'type', 'molecule_name','bonds'], axis=1)
    
    