In [1]:
import os
import pickle
import re
from collections import OrderedDict
from time import time

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from xgboost import XGBRegressor

import plotter
import utils

%matplotlib inline
%config InlineBackend.figure_format='retina'

from sklearn.model_selection import train_test_split

# Directory the kernel was started in; the data file paths are built from it.
PATH = os.getcwd()

# One seed for the whole notebook: it seeds NumPy's global RNG here and is
# reused as the random_state of the train/test split in the training loop.
RNG_SEED = 42
np.random.seed(RNG_SEED)

In [3]:
# Pickled descriptor sets, one file per featurizer, under <PATH>/data/descriptors.
data_path_jarvis = os.path.join(PATH, './data/descriptors/jarvis.bin')
data_path_magpie = os.path.join(PATH, './data/descriptors/magpie.bin')

# Parallel lists: data_path[i] is the file that belongs to x_label[i].
data_path = [data_path_jarvis, data_path_magpie]
x_label = ['jarvis', 'magpie']

# label -> unpickled object. Later cells index each entry with [0] (a table
# with `.columns`) and [1] (used as the regression target).
# NOTE: pickle.load can execute arbitrary code while deserialising; only point
# this at descriptor files you produced yourself or otherwise trust.
x_value_raw = {}
for label, path in zip(x_label, data_path):
    with open(path, 'rb') as descriptor_file:
        x_value_raw[label] = pickle.load(descriptor_file)

In [4]:
# Report how many columns the first element of each loaded descriptor set has.
for label, data in x_value_raw.items():
    n_columns = len(data[0].columns)
    print(f'length of {label} is {n_columns}')

length of jarvis is 3066
length of magpie is 154


In [5]:
# Put the JARVIS and Magpie descriptor tables side by side (column-wise concat;
# pandas aligns the rows on the index).
descriptor_tables = [x_value_raw[name][0] for name in ('jarvis', 'magpie')]
X_merge = pd.concat(descriptor_tables, axis=1)

In [None]:
# Absolute pairwise correlation between the columns of the merged feature table
# (pandas' default correlation method).
corr = X_merge.corr().abs()

# NOTE(review): utils.corr_reduction presumably removes features whose absolute
# correlation with another feature exceeds the threshold -- confirm in utils.
# NOTE(review): X_data is not referenced by the later cells visible here; the
# training loop fits on X_merge. Confirm that is intended.
corr_threshold = 0.8
X_data = utils.corr_reduction(corr, corr_threshold, X_merge)

In [None]:
# Heatmap of the absolute feature-correlation matrix held in `corr`.
# seaborn (`sns`) is imported with the rest of the dependencies in the first
# cell, so the notebook's requirements are visible in one place.
ax = sns.heatmap(corr)

# Hide both axes: there would be one tick label per feature column.
ax.yaxis.set_visible(False)
ax.xaxis.set_visible(False)

In [None]:
# Empty results table; the training loop grows it through utils.append_result_df.
result_columns = (
    # which feature scheme / model the row belongs to, plus the fit time
    ['feature', 'model_name', 'model', 'fit_time']
    # metric columns (names suggest R2, MAE and RMSE on the training data)
    + ['r2_train', 'mae_train', 'rmse_train']
    # the same metrics on the validation data
    + ['r2_val', 'mae_val', 'rmse_val']
    # actual and predicted targets, read back by the plotting cell
    + ['y_test', 'predict']
)
df_classics = pd.DataFrame(columns=result_columns)

In [None]:
# Regression target: the second element of the loaded JARVIS entry (its first
# element is the descriptor table merged into X_merge).
y = x_value_raw['jarvis'][1]

# Feature schemes to evaluate, keyed by the label recorded with each result.
scheme_label = dict(X_merge=X_merge)

# Fitted models, filled in by utils.append_model_dict during training.
classic_models = OrderedDict()

# Hyper-parameter search grids, keyed by the same short labels ('rfr', 'gbr')
# that the training loop uses for its estimators.
param_grid = {
    'rfr': {
        'n_estimators': [1500],  # number of trees in the random forest
        # Number of features considered at every split. 1.0 means "all
        # features", which is what the former 'auto' option meant for
        # RandomForestRegressor; 'auto' itself was deprecated in
        # scikit-learn 1.1 and is rejected from 1.3 onwards.
        'max_features': [1.0, 'sqrt'],
        # Maximum number of levels allowed in each decision tree: 10, 20, ..., 120.
        'max_depth': list(range(10, 121, 10)),
        'min_samples_split': [2, 6, 10],  # minimum sample number to split a node
        'min_samples_leaf': [1, 3, 4],  # minimum sample number allowed in a leaf
    },
    'gbr': {
        'n_estimators': [1500],
        'learning_rate': [.001, 0.01, .1],
        'max_depth': [1, 2, 4],
        'subsample': [.5, .75, 1],
        'random_state': [1],  # fixed so the boosted model is repeatable
    },
}

# Tune, fit and score every estimator on every feature scheme.
for label, data in scheme_label.items():
    X = data
    # Fresh, unfitted estimators per scheme; the keys select the matching grid
    # in `param_grid`.
    models = {
        'rfr': RandomForestRegressor(),
        'gbr': GradientBoostingRegressor(),
    }
    for model_label in models:
        estimator = models[model_label]
        # Fixed random_state: every estimator sees the same 85/15 split.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.15, random_state=RNG_SEED
        )
        model, result_dict = utils.fit_evaluate_model(
            estimator,
            X_train,
            y_train,
            X_test,
            y_test,
            label,
            param_grid[model_label],
        )
        df_classics = utils.append_result_df(df_classics, result_dict)
        # NOTE(review): both estimators are stored under the scheme `label`,
        # not `model_label`. If append_model_dict keys on that argument, the
        # second model replaces the first -- confirm against utils.
        classic_models = utils.append_model_dict(classic_models, model, label)

In [None]:
# One predicted-vs-actual plot per result row.
# Rows are read from `df_classics`, the table the training loop fills; the
# `table` name used here before is not defined in any cell shown, so the loop
# raised NameError on a fresh kernel. Iterating the rows directly is positional,
# so it does not depend on the table having a 0..n-1 index, and it no longer
# overwrites the global `model` with a model-name string.
for _, result in df_classics.iterrows():
    utils.plot_pred_act(
        result['y_test'],      # actual values
        result['predict'],     # predicted values
        result['feature'],     # feature-scheme label
        result['model_name'],
        reg_line=True,
        label='',
    )