In [6]:
import os
import re
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
from collections import OrderedDict
from time import time
from sklearn.model_selection import train_test_split, KFold

import utils
import plotter

from sklearn.feature_selection import RFECV

# Use the inline matplotlib backend so figures appear in the notebook output.
%matplotlib inline
# Set the inline backend's figure format to 'retina'.
%config InlineBackend.figure_format='retina'

# Shared configuration for the cells below: a fixed seed for NumPy's global
# random state (also passed as random_state further down) and the directory
# the notebook was started from, used as the root for data paths.
RNG_SEED = 42
PATH = os.getcwd()
np.random.seed(RNG_SEED)

In [3]:
# Descriptor sets to load. The label list is the single source of truth: each
# label maps to '<label>.bin' inside the descriptor directory, so labels and
# paths cannot drift out of sync.
x_label = ['jarvis', 'magpie', 'mat2vec', 'oliynyk', 'onehot', 'random_200']

descriptor_dir = os.path.join(PATH, 'data', 'descriptors')
data_path = [os.path.join(descriptor_dir, f'{label}.bin') for label in x_label]

# Keep the individual path names defined, in the same order as x_label, in
# case other cells refer to them directly.
(data_path_jarvis, data_path_magpie, data_path_mat2vec,
 data_path_oliynyk, data_path_onehot, data_path_random_200) = data_path

# Maps descriptor label -> unpickled object. The RFECV loop reads element [0]
# as the feature table and element [1] as the target.
x_value_raw = {}
for path, label in zip(data_path, x_label):
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserialising; only load descriptor files from a trusted source.
    with open(path, 'rb') as f:
        x_value_raw[label] = pickle.load(f)

In [7]:
# Empty results table; the RFECV loop adds one row per descriptor set.
# NOTE(review): 'gird_score' looks like a typo for 'grid_score'. It is kept
# unchanged here because the result dict built in the RFECV loop uses the
# same key; rename both together if it is ever corrected.
result_columns = ['label', 'n_features', 'model', 'r2_val', 'mae_val', 'rmse_val', 'gird_score']
df_classics = pd.DataFrame(columns=result_columns)

In [None]:
# Recursive feature elimination with 5-fold cross-validation (RFECV) around a
# random-forest regressor, run once per descriptor set. For each set the loop
# fits on an 85% training split, evaluates on the held-out 15%, records one
# summary row in df_classics and draws the diagnostic plots.
for label, data in x_value_raw.items():

    # data[0] is used as the feature table and data[1] as the target.
    X, y = data[0], data[1]
    length = len(X.columns)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=RNG_SEED)

    # Remove roughly 1% of the features per elimination round. RFECV requires
    # step > 0, and int(length*0.01) is 0 for fewer than 100 features, so
    # clamp the step to at least one feature.
    rfe_step = max(1, int(length*0.01))

    visualizer = RFECV(RandomForestRegressor(n_jobs=-1, random_state=RNG_SEED, verbose=0),
                       step=rfe_step, n_jobs=-1, cv=5, scoring='neg_root_mean_squared_error')
    visualizer = visualizer.fit(X_train, y_train)

    # `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2;
    # the mean CV score per feature subset now lives in `cv_results_`.
    # Prefer the new attribute and fall back to the legacy one on old versions.
    cv_results = getattr(visualizer, 'cv_results_', None)
    if cv_results is not None:
        grid_score = cv_results['mean_test_score']
    else:
        grid_score = visualizer.grid_scores_

    # Unpacked as (r2, mae, rmse); ordering assumed from the variable names --
    # confirm against utils.evaluate_model.
    r2_val, mae_val, rmse_val = utils.evaluate_model(visualizer, X_test, y_test)
    result_dict = {
                    'label': label,
                    'n_features': visualizer.n_features_,
                    'model': visualizer,
                    'r2_val': r2_val,
                    'mae_val': mae_val,
                    'rmse_val': rmse_val,
                    'gird_score': grid_score
                    }

    print(f"Optimal number of features for {label}: {visualizer.n_features_}")
    plotter.plot_RFECV(label, visualizer)

    # NOTE(review): append_result_df is neither defined nor imported in the
    # visible cells -- confirm it is defined in another cell (or whether
    # utils.append_result_df was intended) so the notebook survives a fresh
    # Restart & Run All.
    df_classics = append_result_df(df_classics, result_dict)

    y_pred = visualizer.predict(X_test)
    plot = plotter.plot_pred_act(y_test, y_pred, label, visualizer, label='defect formation energy (eV)')