In [2]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
pd.options.display.precision = 15

import lightgbm as lgb
import xgboost as xgb
import time
import datetime
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn import metrics
from sklearn import linear_model
import gc
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from IPython.display import HTML
import json

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Directory holding the competition CSV files.
# Written as a raw string because the backslashes are Windows path separators:
# in a normal literal '\E' and '\m' are invalid escape sequences, which Python
# warns about. The resulting value is unchanged.
file_folder = r'D:\Ellunium\molekuls'
os.listdir(file_folder)

['dipole_moments.csv',
 'magnetic_shielding_tensors.csv',
 'mulliken_charges.csv',
 'potential_energy.csv',
 'sample_submission.csv',
 'scalar_coupling_contributions.csv',
 'structures.csv',
 'structures.zip',
 'test.csv',
 'train.csv']

In [4]:
# Load the four CSV tables used in this notebook from `file_folder`:
# the train/test tables, the sample submission and the structures table.
train, test, sub, structures = map(
    pd.read_csv,
    (
        f'{file_folder}/train.csv',
        f'{file_folder}/test.csv',
        f'{file_folder}/sample_submission.csv',
        f'{file_folder}/structures.csv',
    ),
)

In [5]:
# Preview the first five rows of the training table.
train.head(n=5)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.80759999999998
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074


In [6]:
# Preview the first five rows of the structures table.
structures.head(n=5)

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.0126981359,1.085804158,0.0080009958
1,dsgdb9nsd_000001,1,H,0.002150416,-0.0060313176,0.0019761204
2,dsgdb9nsd_000001,2,H,1.011730843,1.463751162,0.0002765748
3,dsgdb9nsd_000001,3,H,-0.540815069,1.447526614,-0.8766437152
4,dsgdb9nsd_000001,4,H,-0.5238136345,1.437932644,0.9063972942


In [7]:
# Basic size summary: row counts, distinct molecules, distinct atom symbols
# and distinct coupling types. One message per output line.
print(
    f'There are {train.shape[0]} rows in train data.',
    f'There are {test.shape[0]} rows in test data.',
    f"There are {train['molecule_name'].nunique()} distinct molecules in train data.",
    f"There are {test['molecule_name'].nunique()} distinct molecules in test data.",
    f"There are {structures['atom'].nunique()} unique atoms.",
    f"There are {train['type'].nunique()} unique types.",
    sep='\n',
)

There are 4658147 rows in train data.
There are 2505542 rows in test data.
There are 85003 distinct molecules in train data.
There are 45772 distinct molecules in test data.
There are 5 unique atoms.
There are 8 unique types.


In [8]:
def map_atom_info(df, atom_idx, structures_df=None):
    """Attach the element symbol and coordinates of one atom of each row to ``df``.

    Left-joins the structures table onto ``df`` by matching
    ``(molecule_name, atom_index_{atom_idx})`` in ``df`` against
    ``(molecule_name, atom_index)`` in the structures table. The joined
    ``atom``, ``x``, ``y`` and ``z`` columns are renamed with an
    ``_{atom_idx}`` suffix and the helper ``atom_index`` column is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``molecule_name`` and ``atom_index_{atom_idx}``.
    atom_idx : int or str
        Suffix selecting which atom-index column of ``df`` to join on
        (this notebook uses 0 and 1).
    structures_df : pandas.DataFrame, optional
        Table with ``molecule_name``, ``atom_index``, ``atom``, ``x``, ``y``
        and ``z`` columns. When omitted, the notebook-level ``structures``
        frame is used, which is what the function always did before this
        parameter existed.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified. Because the join is a
        left join, rows without a matching structure keep their place and
        get missing values in the added columns.
    """
    if structures_df is None:
        # Fall back to the notebook global so existing two-argument calls keep working.
        structures_df = structures

    merged = pd.merge(df, structures_df, how='left',
                      left_on=['molecule_name', f'atom_index_{atom_idx}'],
                      right_on=['molecule_name', 'atom_index'])

    # 'atom_index' duplicates atom_index_{atom_idx} after the join.
    merged = merged.drop(columns='atom_index')
    return merged.rename(columns={'atom': f'atom_{atom_idx}',
                                  'x': f'x_{atom_idx}',
                                  'y': f'y_{atom_idx}',
                                  'z': f'z_{atom_idx}'})

# Join atom symbol and coordinates for both atoms (index 0 and index 1)
# of every row, in train and in test.
for atom_idx in (0, 1):
    train = map_atom_info(train, atom_idx)
    test = map_atom_info(test, atom_idx)

In [9]:
# Preview the first five rows of train, which now carries the atom_0 / atom_1 columns.
train.head(n=5)

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1
0,0,dsgdb9nsd_000001,1,0,1JHC,84.80759999999998,H,0.002150416,-0.0060313176,0.0019761204,C,-0.0126981359,1.085804158,0.0080009958
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257,H,0.002150416,-0.0060313176,0.0019761204,H,1.011730843,1.463751162,0.0002765748
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548,H,0.002150416,-0.0060313176,0.0019761204,H,-0.540815069,1.447526614,-0.8766437152
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543,H,0.002150416,-0.0060313176,0.0019761204,H,-0.5238136345,1.437932644,0.9063972942
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074,H,1.011730843,1.463751162,0.0002765748,C,-0.0126981359,1.085804158,0.0080009958


In [10]:
# Coordinates of atom 0 and atom 1 of every row, as arrays with three columns.
coords_0 = ['x_0', 'y_0', 'z_0']
coords_1 = ['x_1', 'y_1', 'z_1']
train_p_0, train_p_1 = train[coords_0].values, train[coords_1].values
test_p_0, test_p_1 = test[coords_0].values, test[coords_1].values

for frame, p_0, p_1 in ((train, train_p_0, train_p_1),
                        (test, test_p_0, test_p_1)):
    # Euclidean distance between the two atoms.
    frame['dist'] = np.linalg.norm(p_0 - p_1, axis=1)
    # Per-axis components; note these are *squared* coordinate differences.
    frame['dist_x'] = (frame['x_0'] - frame['x_1']) ** 2
    frame['dist_y'] = (frame['y_0'] - frame['y_1']) ** 2
    frame['dist_z'] = (frame['z_0'] - frame['z_1']) ** 2

In [11]:
# Split the coupling type string into its first character and the remainder
# (e.g. '1JHC' -> '1' and 'JHC'). The vectorised .str accessor replaces the
# row-by-row apply(lambda ...) calls.
for frame in (train, test):
    frame['type_0'] = frame['type'].str[0]
    frame['type_1'] = frame['type'].str[1:]

In [12]:
# Distance relative to the mean distance of its group, for three groupings:
# full type, first character of the type, and the rest of the type.
# The group means are computed separately within train and within test.
for frame in (train, test):
    dist = frame['dist']
    frame['dist_to_type_mean'] = dist / frame.groupby('type')['dist'].transform('mean')
    frame['dist_to_type_0_mean'] = dist / frame.groupby('type_0')['dist'].transform('mean')
    frame['dist_to_type_1_mean'] = dist / frame.groupby('type_1')['dist'].transform('mean')

In [13]:
# Mean distance over all rows sharing the same molecule and coupling type,
# broadcast back to every row of that group.
for frame in (train, test):
    per_group = frame.groupby(['molecule_name', 'type'])['dist']
    frame['molecule_type_dist_mean'] = per_group.transform('mean')

In [14]:
# Integer-encode the categorical columns. Each encoder is fitted on the train
# and test values together, so both frames share one mapping per column.
for col in ('atom_0', 'atom_1', 'type_0', 'type_1', 'type'):
    train_values = list(train[col].values)
    test_values = list(test[col].values)
    lbl = LabelEncoder().fit(train_values + test_values)
    train[col] = lbl.transform(train_values)
    test[col] = lbl.transform(test_values)

In [15]:
# Target, then the feature matrices: every remaining column except the
# identifiers (and, for train, the target itself).
y = train['scalar_coupling_constant']
X = train.drop(columns=['id', 'molecule_name', 'scalar_coupling_constant'])
X_test = test.drop(columns=['id', 'molecule_name'])

In [None]:
# LightGBM: K-fold cross-validated regression on every column of X.
# Produces out-of-fold predictions for train (oof_l), fold-averaged
# predictions for test (predictions_lgb_new) and per-fold feature importances.
features = list(X.columns)

param = {'num_leaves': 128,
          'min_child_samples': 79,
          'objective': 'regression',
          'max_depth': 13,
          'learning_rate': 0.2,
          "boosting_type": "gbdt",
          "subsample_freq": 1,
          "subsample": 0.9,
          "bagging_seed": 11,
          "metric": 'mae',
          "verbosity": -1,
          'reg_alpha': 0.1,
          'reg_lambda': 0.3,
          'colsample_bytree': 1.0
         }

folds = KFold(n_splits=11, shuffle=True, random_state=15)
oof_l = np.zeros(len(X))                      # out-of-fold predictions, one per train row
predictions_lgb_new = np.zeros(len(X_test))   # test predictions, averaged over the folds
start = time.time()                           # wall-clock start; not read within this cell
feature_importance_df = pd.DataFrame()
fold_importances = []                         # one small frame per fold, concatenated once below
num_round = 10000                             # upper bound on boosting rounds per fold

# KFold derives the split only from the number of rows, so X is passed as-is;
# calling .values here would copy the whole feature matrix for nothing.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X)):
    print("fold n {}".format(fold_))
    trn_data = lgb.Dataset(X.iloc[trn_idx][features], label=y.iloc[trn_idx])
    val_data = lgb.Dataset(X.iloc[val_idx][features], label=y.iloc[val_idx])

    # NOTE(review): verbose_eval / early_stopping_rounds are passed as keyword
    # arguments, as in the original run; newer LightGBM releases may expect
    # callbacks instead. Confirm against the installed version.
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
    oof_l[val_idx] = clf.predict(X.iloc[val_idx][features], num_iteration=clf.best_iteration)

    fold_importances.append(pd.DataFrame({
        "feature": features,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    }))

    predictions_lgb_new += clf.predict(X_test[features], num_iteration=clf.best_iteration) / folds.n_splits

feature_importance_df = pd.concat(fold_importances, axis=0)

# RMSE of the out-of-fold predictions (the per-round log above reports l1/MAE).
# `mean_squared_error` was never imported by name, so the original line raised
# NameError after all folds had trained; use the imported `metrics` module.
print("CV score: {:<8.5f}".format(metrics.mean_squared_error(y, oof_l)**0.5))

fold n 0
Training until validation scores don't improve for 200 rounds.
[100]	training's l1: 2.11468	valid_1's l1: 2.13547
[200]	training's l1: 2.02631	valid_1's l1: 2.05926
[300]	training's l1: 1.96785	valid_1's l1: 2.01253
[400]	training's l1: 1.92709	valid_1's l1: 1.98224
[500]	training's l1: 1.89085	valid_1's l1: 1.9558
[600]	training's l1: 1.85978	valid_1's l1: 1.93444
[700]	training's l1: 1.83363	valid_1's l1: 1.91679
[800]	training's l1: 1.80955	valid_1's l1: 1.90152
[900]	training's l1: 1.78768	valid_1's l1: 1.88846
[1000]	training's l1: 1.76726	valid_1's l1: 1.87643
[1100]	training's l1: 1.74808	valid_1's l1: 1.86494
[1200]	training's l1: 1.73007	valid_1's l1: 1.85481
[1300]	training's l1: 1.71379	valid_1's l1: 1.84596
[1400]	training's l1: 1.69869	valid_1's l1: 1.83825
[1500]	training's l1: 1.68428	valid_1's l1: 1.83181
[1600]	training's l1: 1.67016	valid_1's l1: 1.82533
[1700]	training's l1: 1.65651	valid_1's l1: 1.81856
[1800]	training's l1: 1.64267	valid_1's l1: 1.81147
[1

[5400]	training's l1: 1.34014	valid_1's l1: 1.70201
[5500]	training's l1: 1.334	valid_1's l1: 1.70007
[5600]	training's l1: 1.32788	valid_1's l1: 1.69841
[5700]	training's l1: 1.32199	valid_1's l1: 1.69679
[5800]	training's l1: 1.316	valid_1's l1: 1.69505
[5900]	training's l1: 1.31042	valid_1's l1: 1.69373
[6000]	training's l1: 1.3047	valid_1's l1: 1.69221
[6100]	training's l1: 1.29879	valid_1's l1: 1.69072
[6200]	training's l1: 1.29297	valid_1's l1: 1.68926
[6300]	training's l1: 1.28746	valid_1's l1: 1.68792
[6400]	training's l1: 1.28213	valid_1's l1: 1.68661
[6500]	training's l1: 1.27663	valid_1's l1: 1.68515
[6600]	training's l1: 1.27113	valid_1's l1: 1.68357
[6700]	training's l1: 1.26592	valid_1's l1: 1.68244
[6800]	training's l1: 1.26063	valid_1's l1: 1.68104
[6900]	training's l1: 1.25553	valid_1's l1: 1.68006
[7000]	training's l1: 1.25018	valid_1's l1: 1.67873
[7100]	training's l1: 1.24513	valid_1's l1: 1.67757
[7200]	training's l1: 1.24023	valid_1's l1: 1.67651
[7300]	training's

[500]	training's l1: 1.89049	valid_1's l1: 1.95207
[600]	training's l1: 1.86108	valid_1's l1: 1.93205
[700]	training's l1: 1.83483	valid_1's l1: 1.9152
[800]	training's l1: 1.81041	valid_1's l1: 1.89981
[900]	training's l1: 1.78768	valid_1's l1: 1.88567
[1000]	training's l1: 1.76723	valid_1's l1: 1.87334
[1100]	training's l1: 1.74881	valid_1's l1: 1.86347
[1200]	training's l1: 1.7313	valid_1's l1: 1.85444
[1300]	training's l1: 1.71489	valid_1's l1: 1.84613
[1400]	training's l1: 1.69898	valid_1's l1: 1.8378
[1500]	training's l1: 1.68502	valid_1's l1: 1.83106
[1600]	training's l1: 1.6716	valid_1's l1: 1.82522
[1700]	training's l1: 1.65729	valid_1's l1: 1.81862
[1800]	training's l1: 1.64381	valid_1's l1: 1.81217
[1900]	training's l1: 1.63112	valid_1's l1: 1.80625
[2000]	training's l1: 1.61921	valid_1's l1: 1.80109
[2100]	training's l1: 1.60813	valid_1's l1: 1.79689
[2200]	training's l1: 1.59667	valid_1's l1: 1.79265
[2300]	training's l1: 1.58542	valid_1's l1: 1.78773
[2400]	training's l1:

[6000]	training's l1: 1.30372	valid_1's l1: 1.69004
[6100]	training's l1: 1.29823	valid_1's l1: 1.6886
[6200]	training's l1: 1.29254	valid_1's l1: 1.68694
[6300]	training's l1: 1.28688	valid_1's l1: 1.68575
[6400]	training's l1: 1.28138	valid_1's l1: 1.68426
[6500]	training's l1: 1.27597	valid_1's l1: 1.68286
[6600]	training's l1: 1.27074	valid_1's l1: 1.6818
[6700]	training's l1: 1.26539	valid_1's l1: 1.68088
[6800]	training's l1: 1.2602	valid_1's l1: 1.6795
[6900]	training's l1: 1.25492	valid_1's l1: 1.6782
[7000]	training's l1: 1.24989	valid_1's l1: 1.67702
[7100]	training's l1: 1.24478	valid_1's l1: 1.67582
[7200]	training's l1: 1.23986	valid_1's l1: 1.6747
[7300]	training's l1: 1.2349	valid_1's l1: 1.67377
[7400]	training's l1: 1.22981	valid_1's l1: 1.67256
[7500]	training's l1: 1.22495	valid_1's l1: 1.67133
[7600]	training's l1: 1.22003	valid_1's l1: 1.67015
[7700]	training's l1: 1.21518	valid_1's l1: 1.66916
[7800]	training's l1: 1.2105	valid_1's l1: 1.66825
[7900]	training's l1

[1100]	training's l1: 1.74682	valid_1's l1: 1.86287
[1200]	training's l1: 1.72993	valid_1's l1: 1.85362
[1300]	training's l1: 1.7133	valid_1's l1: 1.84476
[1400]	training's l1: 1.69833	valid_1's l1: 1.83716
[1500]	training's l1: 1.68385	valid_1's l1: 1.82967
[1600]	training's l1: 1.66979	valid_1's l1: 1.82271
[1700]	training's l1: 1.65597	valid_1's l1: 1.81601
[1800]	training's l1: 1.64253	valid_1's l1: 1.80949
[1900]	training's l1: 1.63034	valid_1's l1: 1.80405
[2000]	training's l1: 1.61781	valid_1's l1: 1.79825
[2100]	training's l1: 1.60672	valid_1's l1: 1.79372
[2200]	training's l1: 1.59509	valid_1's l1: 1.78918
[2300]	training's l1: 1.58407	valid_1's l1: 1.78422
[2400]	training's l1: 1.57301	valid_1's l1: 1.77955
[2500]	training's l1: 1.56286	valid_1's l1: 1.77536
[2600]	training's l1: 1.55312	valid_1's l1: 1.7715
[2700]	training's l1: 1.54315	valid_1's l1: 1.76797
[2800]	training's l1: 1.53355	valid_1's l1: 1.76414
[2900]	training's l1: 1.52387	valid_1's l1: 1.76051
[3000]	trainin

[6500]	training's l1: 1.27574	valid_1's l1: 1.68394
[6600]	training's l1: 1.27041	valid_1's l1: 1.68266
[6700]	training's l1: 1.26525	valid_1's l1: 1.6813
[6800]	training's l1: 1.25998	valid_1's l1: 1.68003
[6900]	training's l1: 1.25485	valid_1's l1: 1.67916
[7000]	training's l1: 1.24969	valid_1's l1: 1.67796
[7100]	training's l1: 1.24474	valid_1's l1: 1.67709
[7200]	training's l1: 1.23961	valid_1's l1: 1.67597
[7300]	training's l1: 1.23481	valid_1's l1: 1.67517
[7400]	training's l1: 1.22997	valid_1's l1: 1.67427
[7500]	training's l1: 1.22503	valid_1's l1: 1.67332
[7600]	training's l1: 1.22008	valid_1's l1: 1.67221
[7700]	training's l1: 1.21529	valid_1's l1: 1.67133
[7800]	training's l1: 1.21053	valid_1's l1: 1.67021
[7900]	training's l1: 1.20586	valid_1's l1: 1.66919
[8000]	training's l1: 1.20117	valid_1's l1: 1.66825
[8100]	training's l1: 1.1965	valid_1's l1: 1.66721
[8200]	training's l1: 1.19196	valid_1's l1: 1.6664
[8300]	training's l1: 1.18742	valid_1's l1: 1.66532
[8400]	training