In [20]:
from matminer.featurizers.composition import alloy
from matminer.featurizers.conversions import StrToComposition

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score

from figrecipes import PlotlyFig
import pandas as pd
import numpy as np

In [21]:
data = pd.read_csv('data.csv')

# Convert formula to composition
data = StrToComposition().featurize_dataframe(data, 'formula')
# 然后基于composition计算特征
data = alloy.WenAlloys().featurize_dataframe(data, 'composition')

StrToComposition:   0%|          | 0/2000 [00:00<?, ?it/s]

WenAlloys:   0%|          | 0/2000 [00:00<?, ?it/s]

In [22]:
# 去除数据中的'formula', 'C11', 'C12', 'C44', 'a', 'b', 'c', 'G', 'B', 'E', 'v', 'Zener', 'composition', 'Weight Fraction', 'Atomic Fraction'
data.drop(['formula', 'C11', 'C12', 'C44', 'a', 'b', 'c', 'G', 'B', 'E', 'v', 'Zener', 'composition', 'Weight Fraction', 'Atomic Fraction'], axis=1, inplace=True)
data.drop(['Interant electrons', 'Interant s electrons', 'Interant p electrons', 'Interant d electrons', 'Interant f electrons'], axis=1, inplace=True)

data.dropna(axis=1, how='any', inplace=True)

# 选择前1500条数据作为训练集和验证集；后500条数据作为验证集。
data_fit = data.iloc[:1500]
data_test = data.iloc[1500:]

In [24]:
data_fit_X = data_fit.drop('Pugh', axis=1)
data_fit_y = data_fit['Pugh']
data_fit.dtypes

Nb                                    int64
Mo                                    int64
Ta                                    int64
W                                     int64
Pugh                                float64
Yang delta                          float64
Yang omega                          float64
APE mean                            float64
Radii local mismatch                float64
Radii gamma                         float64
Configuration entropy               float64
Atomic weight mean                  float64
Total weight                        float64
Lambda entropy                      float64
Electronegativity delta             float64
Electronegativity local mismatch    float64
VEC mean                            float64
Mixing enthalpy                     float64
Mean cohesive energy                float64
Shear modulus mean                  float64
Shear modulus delta                 float64
Shear modulus local mismatch        float64
Shear modulus strength model    

In [27]:
data_fit.corr()

Unnamed: 0,Nb,Mo,Ta,W,Pugh,Yang delta,Yang omega,APE mean,Radii local mismatch,Radii gamma,...,Lambda entropy,Electronegativity delta,Electronegativity local mismatch,VEC mean,Mixing enthalpy,Mean cohesive energy,Shear modulus mean,Shear modulus delta,Shear modulus local mismatch,Shear modulus strength model
Nb,1.0,-0.318892,-0.333438,-0.374204,0.867078,-0.465959,-0.564729,-0.25095,-0.516958,-0.415125,...,-0.47027,0.156575,-0.272526,-0.604774,0.283227,-0.24308,-0.761098,0.91695,0.433206,-0.828498
Mo,-0.318892,1.0,-0.340382,-0.343044,-0.414675,-0.586266,0.50588,0.575089,-0.495778,0.218043,...,-0.552003,-0.833155,-0.260401,0.570214,-0.59764,-0.777568,0.273248,-0.647357,-0.554282,0.719668
Ta,-0.333438,-0.340382,1.0,-0.289237,0.158796,0.228781,0.380938,-0.187563,0.18122,-0.643328,...,0.167768,0.177393,-0.225631,-0.549167,-0.27221,0.215326,-0.290368,-0.108566,-0.5147,0.206509
W,-0.374204,-0.343044,-0.289237,1.0,-0.619616,0.834666,-0.29892,-0.136712,0.842257,0.82519,...,0.863874,0.498977,0.753272,0.57604,0.570529,0.81211,0.780007,-0.180392,0.609591,-0.075816
Pugh,0.867078,-0.414675,0.158796,-0.619616,1.0,-0.474657,-0.300321,-0.257702,-0.553879,-0.793358,...,-0.504574,0.117998,-0.521675,-0.902756,0.021734,-0.244735,-0.965499,0.850657,0.077569,-0.678362
Yang delta,-0.465959,-0.586266,0.228781,0.834666,-0.474657,1.0,-0.226466,-0.411039,0.990344,0.446563,...,0.989976,0.743746,0.758013,0.219785,0.587486,0.940203,0.561353,-0.116392,0.460448,-0.091846
Yang omega,-0.564729,0.50588,0.380938,-0.29892,-0.300321,-0.226466,1.0,0.506716,-0.20004,-0.126819,...,-0.22787,-0.674886,-0.50125,0.178806,-0.840045,-0.327614,0.159504,-0.670994,-0.802131,0.770792
APE mean,-0.25095,0.575089,-0.187563,-0.136712,-0.257702,-0.411039,0.506716,1.0,-0.391685,0.180557,...,-0.380172,-0.737441,-0.435591,0.380895,-0.633108,-0.399307,0.227707,-0.505425,-0.495735,0.55948
Radii local mismatch,-0.516958,-0.495778,0.18122,0.842257,-0.553879,0.990344,-0.20004,-0.391685,1.0,0.505317,...,0.977467,0.715949,0.817591,0.30517,0.595456,0.894015,0.624634,-0.185363,0.457005,-0.032381
Radii gamma,-0.415125,0.218043,-0.643328,0.82519,-0.793358,0.446563,-0.126819,0.180557,0.505317,1.0,...,0.502678,0.03643,0.620969,0.911429,0.309572,0.34194,0.905323,-0.43771,0.419703,0.213957


筛选出与因变量之间的相关性

In [31]:
cor = data_fit.corr()
cor_target = abs(cor["Pugh"])
# 挑选出大于0.5的相关系数
relevant_features = cor_target[cor_target>0.5]
relevant_features

Nb                                  0.867078
W                                   0.619616
Pugh                                1.000000
Radii local mismatch                0.553879
Radii gamma                         0.793358
Lambda entropy                      0.504574
Electronegativity local mismatch    0.521675
VEC mean                            0.902756
Shear modulus mean                  0.965499
Shear modulus delta                 0.850657
Shear modulus strength model        0.678362
Name: Pugh, dtype: float64

筛选出3个相关性比较大的自变量来，然后我们来看一下自变量之间的相关性如何，要是自变量之间的相关性非常强的话，我们也只需要保留其中的一个就行

In [32]:
print(data_fit[['Shear modulus mean', 'VEC mean']].corr())
print("=" * 50)
print(data_fit[['Shear modulus mean', 'Nb']].corr())
print("=" * 50)
print(data_fit[['VEC mean', 'Nb']].corr())
print("=" * 50)

                    Shear modulus mean  VEC mean
Shear modulus mean            1.000000  0.919955
VEC mean                      0.919955  1.000000
                    Shear modulus mean        Nb
Shear modulus mean            1.000000 -0.761098
Nb                           -0.761098  1.000000
          VEC mean        Nb
VEC mean  1.000000 -0.604774
Nb       -0.604774  1.000000


确定多少个变量能使模型的性能达到最优

In [37]:
from sklearn.feature_selection import RFE

features_num_list = np.arange(1, 22)            # 特征数目
high_score = 0                                  # 
num_feature = 0                                 # 最优特征数目
score_list = []                                 # 交叉验证得分
for n in range(len(features_num_list)):
    model = RandomForestRegressor()
    X_train, X_test, y_train, y_test = train_test_split(data_fit_X,data_fit_y, test_size = 0.3, random_state = 0)
    rfe_model = RFE(model, n_features_to_select=features_num_list[n])
    X_train_rfe = rfe_model.fit_transform(X_train, y_train)
    X_test_rfe = rfe_model.transform(X_test)
    model.fit(X_train_rfe, y_train)
    score = model.score(X_test_rfe, y_test)
    score_list.append(score)
    if(score > high_score):
        high_score = score
        num_feature = features_num_list[n]
print("Optimum number of features: %d" % num_feature)
print("Score with %d features: %f" % (num_feature, high_score))


Optimum number of features: 4
Score with 4 features: 0.998705


从上面可以看出4个变量对于整个模型来说是最优的，下面找出这四个特征

In [41]:
cols = list(data_fit_X.columns)
model = RandomForestRegressor()
#Initializing RFE model
rfe = RFE(model, n_features_to_select=4)
X_rfe = rfe.fit_transform(data_fit_X, data_fit_y) # Fit the data to model
model.fit(X_rfe, data_fit_y)
df = pd.Series(rfe.support_, index=cols)
selected_features_rfe = df[df==True].index
print(selected_features_rfe)

Index(['Nb', 'VEC mean', 'Shear modulus mean', 'Shear modulus delta'], dtype='object')
