In [22]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, r2_score

In [2]:
features = pd.read_csv("ruslan_nn/features16k.csv", index_col="_id")
properties = pd.read_csv("ruslan_nn/properties16k.csv", index_col='_id')
descriptors = pd.read_csv("ruslan_nn/descriptors16k.csv")

In [9]:
properties['HL_gap'] = properties['lumo'] - properties['homo']
property_ = 'energy_per_atom'

In [12]:
x = features
y = properties[property_]
y_pred = pd.DataFrame({property_: [np.nan] * len(y)}, index=y.index)

In [15]:
kf = KFold(n_splits=5, shuffle=True)
scores = []

for train_index, test_index in kf.split(x):
    X_train, X_test, y_train, y_test =\
        x.iloc[train_index], x.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    train_pool = Pool(X_train, y_train)
    val_pool = Pool(X_test, y_test)

    model = CatBoostRegressor(iterations=5000, loss_function='MAE',
                              verbose=1000, random_seed=0, use_best_model=True)
    model.fit(train_pool, eval_set=val_pool)
    y_pred[property_].iloc[test_index] = model.predict(X_test)

0:	learn: 0.0396732	test: 0.0397686	best: 0.0397686 (0)	total: 232ms	remaining: 19m 22s
1000:	learn: 0.0003250	test: 0.0003866	best: 0.0003866 (1000)	total: 3m 41s	remaining: 14m 44s
2000:	learn: 0.0001909	test: 0.0002633	best: 0.0002633 (2000)	total: 7m 20s	remaining: 10m 59s
3000:	learn: 0.0001470	test: 0.0002326	best: 0.0002326 (3000)	total: 10m 48s	remaining: 7m 12s
4000:	learn: 0.0001227	test: 0.0002175	best: 0.0002175 (4000)	total: 14m 25s	remaining: 3m 36s
4999:	learn: 0.0001066	test: 0.0002089	best: 0.0002089 (4999)	total: 17m 46s	remaining: 0us

bestTest = 0.0002089151828
bestIteration = 4999

0:	learn: 0.0395749	test: 0.0396575	best: 0.0396575 (0)	total: 299ms	remaining: 24m 54s
1000:	learn: 0.0003721	test: 0.0004194	best: 0.0004194 (1000)	total: 3m 28s	remaining: 13m 52s
2000:	learn: 0.0002189	test: 0.0002992	best: 0.0002992 (2000)	total: 6m 54s	remaining: 10m 21s
3000:	learn: 0.0001661	test: 0.0002639	best: 0.0002639 (3000)	total: 10m 31s	remaining: 7m
4000:	learn: 0.000141

In [20]:
print(mean_absolute_error(y_pred[property_], y))

0.0002158534755350317


In [17]:
y_pred2 = pd.DataFrame({property_: [np.nan] * len(y)}, index=y.index)

In [None]:
kf = KFold(n_splits=5, shuffle=True)
scores = []

for train_index, test_index in kf.split(x):
    X_train, X_test, y_train, y_test =\
        x.iloc[train_index], x.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
    train_pool = Pool(X_train, y_train)
    val_pool = Pool(X_test, y_test)

    model = CatBoostRegressor(iterations=5000, loss_function='RMSE',
                              verbose=1000, random_seed=0, use_best_model=True)
    model.fit(train_pool, eval_set=val_pool)
    y_pred2[property_].iloc[test_index] = model.predict(X_test)

Learning rate set to 0.026942
0:	learn: 0.0516787	test: 0.0509054	best: 0.0509054 (0)	total: 211ms	remaining: 17m 35s
1000:	learn: 0.0004873	test: 0.0006717	best: 0.0006717 (999)	total: 3m 40s	remaining: 14m 42s
2000:	learn: 0.0002726	test: 0.0005853	best: 0.0005853 (2000)	total: 7m 12s	remaining: 10m 47s
3000:	learn: 0.0001928	test: 0.0005600	best: 0.0005600 (3000)	total: 10m 49s	remaining: 7m 12s
4000:	learn: 0.0001519	test: 0.0005476	best: 0.0005476 (4000)	total: 14m 16s	remaining: 3m 33s
4999:	learn: 0.0001259	test: 0.0005412	best: 0.0005412 (4994)	total: 17m 44s	remaining: 0us

bestTest = 0.0005411918795
bestIteration = 4994

Shrink model to first 4995 iterations.
Learning rate set to 0.026942
0:	learn: 0.0514030	test: 0.0519667	best: 0.0519667 (0)	total: 82.5ms	remaining: 6m 52s
1000:	learn: 0.0005021	test: 0.0007227	best: 0.0007227 (1000)	total: 3m 30s	remaining: 13m 59s
2000:	learn: 0.0002852	test: 0.0006048	best: 0.0006048 (2000)	total: 6m 54s	remaining: 10m 21s
3000:	learn: 0

In [23]:
print(r2_score(y_pred2[property_], y))

0.999779717842571
