# 测试不同模型

1.	multiple linear regression (MLR)
2.	backpropagation artificial neural network (BPNN)
3.	gradient boosting regression (GBR)
4.	extreme gradient boosting(XGBoost)
5.	RF
6.	support vector regression with radial basis kernel function (SVR-rbf)
7.	support vector regression with linear kernel function (SVR-lin)
8.	support vector regression with polynomial kernel function (SVR-poly)

## 处理数据

In [4]:
from matminer.featurizers.composition import alloy
from matminer.featurizers.conversions import StrToComposition

import pandas as pd
import numpy as np

In [5]:
data = pd.read_csv('data-全组分-500.csv')

# Convert formula to composition
data = StrToComposition().featurize_dataframe(data, 'formula')
# 然后基于composition计算特征
data = alloy.WenAlloys().featurize_dataframe(data, 'composition')

StrToComposition:   0%|          | 0/500 [00:00<?, ?it/s]

WenAlloys:   0%|          | 0/500 [00:00<?, ?it/s]

In [6]:
# 选择前400条数据作为训练集和验证集；后500条数据作为验证集。
data_fit = data.iloc[:400]
data_test = data.iloc[400:]

data_fit_X = data_fit[['Nb', 'APE mean', 'Radii gamma', 'Electronegativity local mismatch', 'VEC mean', 'Shear modulus strength model']]
data_fit_y = data_fit['Pugh']
data_test_X = data_test[['Nb', 'APE mean', 'Radii gamma', 'Electronegativity local mismatch', 'VEC mean', 'Shear modulus strength model']]
data_test_y = data_test['Pugh']

### 1. multiple linear regression (MLR)

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold

model_lr = LinearRegression()
param_grid_simple = {'fit_intercept': [True, False], 'normalize': [True, False]}
search_lr = GridSearchCV(estimator=model_lr, param_grid=param_grid_simple, cv=5, scoring='neg_mean_squared_error')
search_lr.fit(data_fit_X, data_fit_y)

print('网格搜索-度量记录：',search_lr.cv_results_)  # 包含每次训练的相关信息
print('网格搜索-最佳度量值:',search_lr.best_score_)  # 获取最佳度量值
print('网格搜索-最佳参数：',search_lr.best_params_)  # 获取最佳度量值时的代定参数的值。是一个字典
print('网格搜索-最佳模型：',search_lr.best_estimator_)  # 获取最佳度量时的分类器模型

网格搜索-度量记录： {'mean_fit_time': array([0.00740666, 0.00220208, 0.00200133, 0.00180178]), 'std_fit_time': array([9.31667206e-03, 4.00400176e-04, 4.62310777e-07, 7.49105703e-04]), 'mean_score_time': array([0.00120111, 0.00100126, 0.00100112, 0.00100074]), 'std_score_time': array([4.00329016e-04, 3.56832255e-07, 3.37174788e-07, 5.95569420e-07]), 'param_fit_intercept': masked_array(data=[True, True, False, False],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'param_normalize': masked_array(data=[True, False, True, False],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'fit_intercept': True, 'normalize': True}, {'fit_intercept': True, 'normalize': False}, {'fit_intercept': False, 'normalize': True}, {'fit_intercept': False, 'normalize': False}], 'split0_test_score': array([-0.04157347, -0.04157347, -0.03562766, -0.03562766]), 'split1_test_score': array([-0.04843757, -0.04843757, 

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

### 2. gradient boosting regression (GBR)

In [5]:
from sklearn.ensemble import GradientBoostingRegressor

model_gbr = GradientBoostingRegressor()
param_grid_simple = {'n_estimators': list(range(100, 1000, 100)), 'max_depth': list(range(1, 10, 1)), 'learning_rate': [0.1, 0.01, 0.001], 'loss': ['squared_error', 'absolute_error', 'huber', 'quantile']}
search_gbr = GridSearchCV(estimator=model_gbr, param_grid=param_grid_simple, cv=5, scoring='neg_mean_squared_error')
search_gbr.fit(data_fit_X, data_fit_y)

print('网格搜索-度量记录：',search_gbr.cv_results_)  # 包含每次训练的相关信息
print('网格搜索-最佳度量值:',search_gbr.best_score_)  # 获取最佳度量值
print('网格搜索-最佳参数：',search_gbr.best_params_)  # 获取最佳度量值时的代定参数的值。是一个字典
print('网格搜索-最佳模型：',search_gbr.best_estimator_)  # 获取最佳度量时的分类器模型

网格搜索-度量记录： {'mean_fit_time': array([2.41284847e-02, 4.59720135e-02, 6.93703651e-02, 9.53991413e-02,
       1.13102865e-01, 1.35229397e-01, 1.63977003e-01, 1.81273413e-01,
       2.03399754e-01, 3.36304665e-02, 7.04638958e-02, 9.84012604e-02,
       1.28624392e-01, 1.65863228e-01, 1.92193794e-01, 2.26713800e-01,
       2.54752731e-01, 2.84764814e-01, 4.33478832e-02, 8.51866245e-02,
       1.28516817e-01, 1.68073797e-01, 2.11103153e-01, 2.51426506e-01,
       2.87422514e-01, 3.30140400e-01, 3.68555641e-01, 5.11530876e-02,
       1.01201868e-01, 1.51645947e-01, 2.00292587e-01, 2.50849056e-01,
       3.00533390e-01, 3.62558937e-01, 4.05148315e-01, 4.45736456e-01,
       5.90534210e-02, 1.15414238e-01, 1.74379826e-01, 2.30768251e-01,
       2.94684410e-01, 3.55054808e-01, 4.14303207e-01, 4.72268057e-01,
       5.31521225e-01, 6.73712254e-02, 1.33721542e-01, 2.01204491e-01,
       2.70060110e-01, 3.36521673e-01, 4.02100229e-01, 4.74467421e-01,
       5.41643429e-01, 6.04342699e-01, 7.5268030

In [6]:
# 输出训练集精度
print('训练集精度：', search_gbr.score(data_fit_X, data_fit_y))

训练集精度： -3.538102297128259e-05


In [8]:
# 交叉验证评价性能
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import GradientBoostingRegressor
import sklearn

search_gbr = GradientBoostingRegressor(learning_rate=0.01, loss='squared_error', max_depth=5, n_estimators=900)
gbr_pridict = cross_val_predict(search_gbr, data_test_X, data_test_y, cv=10)

for scorer in ['r2_score', 'mean_absolute_error', 'mean_squared_error']:
    score = getattr(sklearn.metrics, scorer)(data_test_y, gbr_pridict)
    print(scorer, score)

r2_score 0.9197179743478785
mean_absolute_error 0.07367843379026702
mean_squared_error 0.024849866400815813
