In [None]:
!pip install ase

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install autogluon

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting autogluon
  Downloading autogluon-0.5.0-py3-none-any.whl (9.5 kB)
Collecting autogluon.vision==0.5.0
  Downloading autogluon.vision-0.5.0-py3-none-any.whl (48 kB)
[K     |████████████████████████████████| 48 kB 2.8 MB/s 
[?25hCollecting autogluon.text==0.5.0
  Downloading autogluon.text-0.5.0-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 289 kB/s 
[?25hCollecting autogluon.core[all]==0.5.0
  Downloading autogluon.core-0.5.0-py3-none-any.whl (203 kB)
[K     |████████████████████████████████| 203 kB 35.9 MB/s 
[?25hCollecting autogluon.multimodal==0.5.0
  Downloading autogluon.multimodal-0.5.0-py3-none-any.whl (141 kB)
[K     |████████████████████████████████| 141 kB 55.9 MB/s 
[?25hCollecting autogluon.tabular[all]==0.5.0
  Downloading autogluon.tabular-0.5.0-py3-none-any.whl (272 kB)
[K     |████████████████████████████████| 272 kB 52.1 MB/s 

In [None]:
import os
import pandas as pd
import numpy as np
from ase.db import connect
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_absolute_error

from autogluon.tabular import TabularDataset, TabularPredictor

In [None]:
def data_preparation(train_db_path='/airi/train.db', test_db_path='/airi/test.db',
                     test_size=0.25, random_state=42):
    """Build composition-count features from the ASE databases and split them.

    Every molecule is described by how many times each distinct value occurs in
    ``row.numbers`` (one column per value seen in the training database). The
    labelled training database is then split into a fitting part and a
    hold-out part; the second database is only vectorized.

    Parameters
    ----------
    train_db_path : str
        Path of the labelled ASE database (rows carry ``data['energy']``).
    test_db_path : str
        Path of the ASE database to predict on (no label is read from it).
    test_size : float or int
        Forwarded to ``train_test_split``; 0.25 matches scikit-learn's default,
        i.e. the split proportion used before this parameter existed.
    random_state : int or None
        Seed forwarded to ``train_test_split`` so the hold-out split (and every
        metric computed on it) is reproducible. Pass ``None`` for a random split.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, y_train, X_test, y_test, X_val)`` where ``X_train``/``y_train``
        and ``X_test``/``y_test`` are the two parts of the labelled database, and
        ``X_val`` holds the features of ``test_db_path`` in row order.
    """
    db = connect(train_db_path)
    db_test = connect(test_db_path)

    # Per-molecule occurrence counts of each entry in row.numbers, plus the label.
    # (row.natoms is not collected: it was never used as a feature.)
    train_counts = []
    target = []
    for row in db.select():
        train_counts.append(Counter(list(row.numbers)))
        target.append(row.data['energy'])

    test_counts = [Counter(list(row.numbers)) for row in db_test.select()]

    # Features vectorization: the column set is learned from the training database
    # only; values that appear solely in the test database are dropped by transform().
    d = DictVectorizer()
    features_onehot = d.fit_transform(train_counts)
    features_onehot_test = d.transform(test_counts)

    # Dataset splits preparation (seeded, so repeated runs give the same hold-out set).
    X_train, X_test, y_train, y_test = train_test_split(
        features_onehot, target, test_size=test_size, random_state=random_state)
    X_val = features_onehot_test

    return X_train.toarray(), np.array(y_train), X_test.toarray(), np.array(y_test), X_val.toarray()

In [None]:
# Naming note: X_test / y_test are the labelled hold-out part of train.db,
# while X_val holds the (unlabelled) features of test.db that we finally predict on.
X_train, y_train, X_test, y_test, X_val = data_preparation()

In [None]:
# Sanity check: (rows, feature columns) of the training part of the split.
X_train.shape

(150174, 8)

In [None]:
# Sanity check: the label vector must have one entry per row of X_train.
y_train.shape

(150174,)

In [None]:
# Assemble the AutoGluon training table: the feature matrix (integer-named columns)
# side by side with the label in a column called 'target'.
train_frame = pd.DataFrame(X_train)
train_labels = pd.DataFrame(y_train, columns=['target'])
features_train = TabularDataset(pd.concat([train_frame, train_labels], axis=1))
features_train

Unnamed: 0,0,1,2,3,4,5,6,7,target
0,9.0,12.0,6.0,1.0,0.0,1.0,1.0,0.0,-1723.502283
1,12.0,11.0,4.0,1.0,0.0,2.0,0.0,0.0,-1515.910193
2,12.0,11.0,4.0,2.0,0.0,1.0,0.0,0.0,-1192.998059
3,10.0,11.0,3.0,2.0,0.0,1.0,0.0,1.0,-3711.020473
4,9.0,15.0,3.0,0.0,2.0,0.0,0.0,0.0,-939.767578
...,...,...,...,...,...,...,...,...,...
150169,7.0,11.0,2.0,1.0,5.0,0.0,0.0,0.0,-1106.079749
150170,13.0,10.0,5.0,2.0,0.0,1.0,0.0,0.0,-1210.268325
150171,12.0,13.0,4.0,0.0,0.0,2.0,0.0,0.0,-1516.847077
150172,12.0,11.0,5.0,1.0,0.0,1.0,1.0,0.0,-1632.629307


In [None]:
# Same layout as the training table, built from the labelled hold-out split;
# it is used later to score the fitted predictor.
holdout_frame = pd.DataFrame(X_test)
holdout_labels = pd.DataFrame(y_test, columns=['target'])
features_test = TabularDataset(pd.concat([holdout_frame, holdout_labels], axis=1))
features_test

Unnamed: 0,0,1,2,3,4,5,6,7,target
0,13.0,12.0,1.0,3.0,0.0,1.0,0.0,0.0,-1142.722943
1,13.0,12.0,1.0,1.0,0.0,1.0,2.0,0.0,-1912.585287
2,11.0,12.0,2.0,2.0,0.0,0.0,0.0,1.0,-3296.934234
3,13.0,12.0,2.0,2.0,0.0,1.0,1.0,0.0,-1582.348705
4,13.0,12.0,1.0,1.0,0.0,2.0,0.0,0.0,-1390.493188
...,...,...,...,...,...,...,...,...,...
50053,11.0,14.0,2.0,2.0,0.0,0.0,1.0,0.0,-1259.278791
50054,10.0,14.0,2.0,1.0,3.0,0.0,1.0,0.0,-1482.758831
50055,11.0,11.0,2.0,1.0,0.0,1.0,0.0,1.0,-3581.805054
50056,12.0,13.0,1.0,4.0,0.0,0.0,0.0,1.0,-3431.192551


In [None]:
# Training configuration.
save_path = 'AutoGluon120min_best'  # directory AutoGluon writes the trained models to
label = 'target'                    # name of the label column in features_train
metric = 'mean_absolute_error'      # metric AutoGluon optimises and reports
minutes = 120
time_limit = minutes * 60           # fit() expects its time budget in seconds

# Fit a stacked ensemble with the 'best_quality' preset within the time budget.
predictor = TabularPredictor(
    label=label,
    eval_metric=metric,
    path=save_path,
).fit(
    features_train,
    auto_stack=True,
    time_limit=time_limit,
    presets='best_quality',
)

Presets specified: ['best_quality']
Beginning AutoGluon training ... Time limit = 7200s
AutoGluon will save models to "AutoGluon120min_best/"
AutoGluon Version:  0.5.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    150174
Train Data Columns: 8
Label Column: target
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (-817.4629541219781, -8570.473006232318, -1745.61875, 891.1372)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    10798.2 MB
	Train Data (Original)  Memory Usage: 9.61 MB (0.1% of available memory)
	Inferring data type of ea

In [None]:
# Leaderboard of every model trained by fit(), with validation score and timing per model.
results = predictor.fit_summary(show_plot=True)

*** Summary of fit() ***
Estimated performance of each model:
                     model   score_val  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   RandomForestMSE_BAG_L2   -0.202852    1105.598955  4845.305744                5.355027         218.531173            2       True         13
1      WeightedEnsemble_L3   -0.202852    1105.602311  4847.438853                0.003356           2.133108            3       True         15
2      WeightedEnsemble_L2   -0.599033     483.066835  3065.640325                0.003546           2.523673            2       True         10
3          CatBoost_BAG_L1   -0.643936       2.780691  2274.339244                2.780691        2274.339244            1       True          6
4     ExtraTreesMSE_BAG_L1   -0.667169       6.322812    44.660590                6.322812          44.660590            1       True          7
5          LightGBM_BAG_L2   -0.784553    1504.605105  5492.997964  

In [None]:
# Score the predictor on the labelled hold-out split. AutoGluon reports scores as
# higher-is-better, so the MAE comes back negated (multiply by -1 for the metric value).
perf = predictor.evaluate(features_test, auxiliary_metrics=False)

Evaluation: mean_absolute_error on test data: -0.21284704166903287
	Note: Scores are always higher_is_better. This metric score can be multiplied by -1 to get the metric value.
Evaluations on test data:
{
    "mean_absolute_error": -0.21284704166903287
}


In [None]:
# Archive the trained model directory for download. The absolute path matches the relative
# save_path only because the notebook's working directory is /content.
!zip -r /content/AutoGluon120min_best.zip /content/AutoGluon120min_best

  adding: content/AutoGluon120min_best/ (stored 0%)
  adding: content/AutoGluon120min_best/models/ (stored 0%)
  adding: content/AutoGluon120min_best/models/XGBoost_BAG_L1/ (stored 0%)
  adding: content/AutoGluon120min_best/models/XGBoost_BAG_L1/S1F4/ (stored 0%)
  adding: content/AutoGluon120min_best/models/XGBoost_BAG_L1/S1F4/xgb.ubj (deflated 54%)
  adding: content/AutoGluon120min_best/models/XGBoost_BAG_L1/S1F4/model.pkl (deflated 46%)
  adding: content/AutoGluon120min_best/models/XGBoost_BAG_L1/S1F1/ (stored 0%)
  adding: content/AutoGluon120min_best/models/XGBoost_BAG_L1/S1F1/xgb.ubj (deflated 54%)
  adding: content/AutoGluon120min_best/models/XGBoost_BAG_L1/S1F1/model.pkl (deflated 46%)
  adding: content/AutoGluon120min_best/models/XGBoost_BAG_L1/S1F3/ (stored 0%)
  adding: content/AutoGluon120min_best/models/XGBoost_BAG_L1/S1F3/xgb.ubj (deflated 54%)
  adding: content/AutoGluon120min_best/models/XGBoost_BAG_L1/S1F3/model.pkl (deflated 46%)
  adding: content/AutoGluon120min_best

In [None]:
# Predict energies for the test.db features; X_val has the same integer-named feature
# columns as the training table, just without the 'target' column.
pred = predictor.predict(TabularDataset(X_val))

In [None]:
# Inspect the predicted energies (one value per row of X_val).
pred

0       -5622.790039
1        -924.377747
2        -924.377747
3        -924.377747
4       -1524.821655
            ...     
32879   -1422.557007
32880   -1422.557007
32881   -1422.557007
32882   -1422.557007
32883   -1422.557007
Name: target, Length: 32884, dtype: float32

In [None]:
# Reference energies published for the test set; only the 'energy' column is needed.
data_val = pd.read_csv('/airi/test_energy_public.csv')['energy']
data_val

0       -5622.210393
1        -924.409784
2        -924.413598
3        -924.408473
4       -1524.804832
            ...     
32879   -1422.570432
32880   -1422.568762
32881   -1422.270341
32882   -1422.312388
32883   -1422.564844
Name: energy, Length: 32884, dtype: float64

In [None]:
# MAE of the predictions against the published test energies.
# NOTE(review): this assumes the CSV rows are in the same order as the rows of test.db — confirm.
mean_absolute_error(data_val, pred)

0.26241632055430547