In [5]:
from matminer.featurizers.composition import alloy
from matminer.featurizers.conversions import StrToComposition
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
import sklearn
from sklearn.ensemble import RandomForestRegressor


import matplotlib.pyplot as plt
import matplotlib
import joblib
import pandas as pd
import sys

In [6]:
def parse_n_jobs(argv, default=1):
    """Return the worker count given as the first command-line argument.

    Parameters
    ----------
    argv : sequence of str
        Argument vector in ``sys.argv`` layout (program name first).
    default : int
        Value returned when ``argv[1]`` is missing, is not an integer
        literal, or is ``0`` (which joblib rejects as a worker count).

    Returns
    -------
    int
        The parsed worker count, or ``default``.
    """
    try:
        n_jobs = int(argv[1])
    except (IndexError, ValueError):
        # Inside a Jupyter kernel argv[1] is usually the launcher's "-f" flag,
        # not a number, so fall back instead of failing later in joblib.
        return default
    return n_jobs if n_jobs != 0 else default


# Number of parallel workers for the grid search below; always an int.
ppn = parse_n_jobs(sys.argv)

In [2]:
# Read the raw table; its 'formula' column is parsed in the next step.
data = pd.read_csv('data.csv')

# Convert each formula string into a composition.
formula_parser = StrToComposition()
data = formula_parser.featurize_dataframe(data, 'formula')

# Then compute the WenAlloys features from the 'composition' column.
alloy_featurizer = alloy.WenAlloys()
data = alloy_featurizer.featurize_dataframe(data, 'composition')

StrToComposition:   0%|          | 0/799 [00:00<?, ?it/s]

WenAlloys:   0%|          | 0/799 [00:00<?, ?it/s]

In [3]:
# Positional hold-out split: the first 600 rows are used for fitting and
# cross-validated model selection; every row from position 600 onwards is
# kept aside as the test set.
feature_columns = [
    'APE mean',
    'Electronegativity local mismatch',
    'VEC mean',
    'Shear modulus mean',
    'Shear modulus delta',
    'Shear modulus strength model',
]
target_column = 'SFE'

data_fit = data.iloc[slice(None, 600)]
data_test = data.iloc[slice(600, None)]

# Feature matrices and target vectors for each split.
data_fit_X, data_fit_y = data_fit[feature_columns], data_fit[target_column]
data_test_X, data_test_y = data_test[feature_columns], data_test[target_column]

In [None]:
# Exhaustive hyper-parameter search for a random-forest regressor.
# The grid below spans 9 * 8 * 5 * 9 * 8 * 4 = 103,680 candidates, i.e.
# 518,400 fits with 5-fold CV, so this cell is expensive to run.

# Fixed seed so the forests (and therefore the selected parameters) are
# reproducible from one run to the next.
model_rf = RandomForestRegressor(random_state=0)
param_grid = {'n_estimators': list(range(100, 1000, 100)),
              'max_depth': list(range(5, 20, 2)),
              'max_features': list(range(1, 6, 1)),
              'min_samples_leaf': list(range(1, 10, 1)),
              'min_samples_split': list(range(2, 10, 1)),
              # NOTE(review): 'poisson' requires non-negative targets; confirm
              # that SFE is never negative, otherwise those candidates fail to fit.
              'criterion': ['squared_error', 'absolute_error', 'poisson', 'friedman_mse']}
# n_jobs must be an integer; ppn is derived from sys.argv and may still be a
# string, so coerce it explicitly here.
search_rf = GridSearchCV(model_rf, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=int(ppn))
search_rf.fit(data_fit_X, data_fit_y)

# cv_results_ holds one entry per candidate; show only the 10 best-ranked
# rows instead of dumping the full record for every candidate.
cv_summary = (
    pd.DataFrame(search_rf.cv_results_)
    .sort_values('rank_test_score')
    [['rank_test_score', 'mean_test_score', 'std_test_score', 'params']]
    .head(10)
)
print('网格搜索-度量记录：')  # per-candidate cross-validation record (top 10)
print(cv_summary.to_string(index=False))
print('网格搜索-最佳度量值:',search_rf.best_score_)  # best mean cross-validated score
print('网格搜索-最佳参数：',search_rf.best_params_)  # parameter dict that achieved the best score
print('网格搜索-最佳模型：',search_rf.best_estimator_)  # regressor refit with the best parameters

In [None]:
# Persist the fitted grid-search object to disk.
import joblib

joblib.dump(value=search_rf, filename='model_RF.pkl')

In [None]:
# Reload the grid-search object written by the save cell ('model_RF.pkl').
# joblib files are pickles: only load files you produced yourself.
search_rf = joblib.load('model_RF.pkl')

In [None]:
# Visual separator before the evaluation report.
print("\n")
print("-" * 50)

# Evaluate with 10-fold cross-validated predictions on the held-out rows.
# NOTE(review): cross_val_predict refits clones of the best estimator on folds
# of the test split, so the model fitted during the grid search is not what
# gets scored here. Confirm this is intended rather than
# search_rf.predict(data_test_X).
# NOTE(review): the variable name is historical; the estimator is a random
# forest, not XGBoost. Kept unchanged in case later cells reference it.
xgb_pridict = cross_val_predict(
    search_rf.best_estimator_,
    data_test_X,
    data_test_y,
    cv=10,
)

# Report each metric by looking it up by name in sklearn.metrics.
for scorer in ('r2_score', 'mean_absolute_error', 'mean_squared_error'):
    metric_fn = getattr(sklearn.metrics, scorer)
    score = metric_fn(data_test_y, xgb_pridict)
    print(scorer, score)