In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("Youngs_Data_Refractory.csv")
data = data.iloc[:,1:data.shape[1]]
data.head()

Unnamed: 0,Alloy,Diff. Lattice Constants,Diff. Melting Point,Mixing Enthalpy,Lattice Constants,Lambda,Diff. in atomic radii,Omega,Melting Temp.,Diff. Electronegativity,Mixing Entropy,Valence electron,Young's Mod (GPa)
0,TaNbHfZrTi,0.129156,476.414945,2.644,3.19618,0.536462,4.994417,12769.215378,2523.0,0.08681,13.381611,4.4,49.88
1,TiTa,0.17525,674.5,1.388,3.12605,4.707131,1.106501,10859.876948,2615.5,0.02,5.763146,4.5,110.7408
2,MoZr,0.0425,384.0,-5.988,3.1895,0.08802,8.091706,2417.672605,2512.0,0.075,5.763146,5.0,158.3784
3,MoW,0.0091,399.5,-0.221,3.1561,268.450958,0.14652,85938.681914,3295.5,0.0,5.763146,6.0,285.8335
4,WTa,0.06805,202.5,-7.298,3.23325,1.135961,2.252413,2757.986918,3492.5,0.065,5.763146,5.5,218.0652


In [3]:
# Preparing training data
train_features = data.iloc[:,1:data.shape[1]-1]
feature_names = train_features.columns
print(feature_names)
Y_train = data.iloc[:,-1]
train_features.head()

Index(['Diff. Lattice Constants', 'Diff. Melting Point', 'Mixing Enthalpy',
       'Lattice Constants', 'Lambda', 'Diff. in atomic radii', 'Omega',
       'Melting Temp.', 'Diff. Electronegativity', 'Mixing Entropy',
       'Valence electron'],
      dtype='object')


Unnamed: 0,Diff. Lattice Constants,Diff. Melting Point,Mixing Enthalpy,Lattice Constants,Lambda,Diff. in atomic radii,Omega,Melting Temp.,Diff. Electronegativity,Mixing Entropy,Valence electron
0,0.129156,476.414945,2.644,3.19618,0.536462,4.994417,12769.215378,2523.0,0.08681,13.381611,4.4
1,0.17525,674.5,1.388,3.12605,4.707131,1.106501,10859.876948,2615.5,0.02,5.763146,4.5
2,0.0425,384.0,-5.988,3.1895,0.08802,8.091706,2417.672605,2512.0,0.075,5.763146,5.0
3,0.0091,399.5,-0.221,3.1561,268.450958,0.14652,85938.681914,3295.5,0.0,5.763146,6.0
4,0.06805,202.5,-7.298,3.23325,1.135961,2.252413,2757.986918,3492.5,0.065,5.763146,5.5


In [5]:
correlation = train_features.corr()
correlation.to_csv('smaller_correlation.csv')

In [6]:
#scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_train_features = pd.DataFrame(scaler.fit_transform(train_features))
scaled_train_features.columns = feature_names
scaled_train_features.head()

Unnamed: 0,Diff. Lattice Constants,Diff. Melting Point,Mixing Enthalpy,Lattice Constants,Lambda,Diff. in atomic radii,Omega,Melting Temp.,Diff. Electronegativity,Mixing Entropy,Valence electron
0,-0.375363,-0.315878,1.215738,0.248822,-0.107501,0.029526,-0.151763,-0.064638,0.451324,0.684527,-0.916637
1,0.076551,0.923781,1.021209,-0.838763,0.045866,-2.369301,-0.192286,0.236073,-2.126662,-2.393009,-0.674114
2,-1.224958,-0.89423,-0.121189,0.145228,-0.123991,1.94054,-0.371461,-0.100398,-0.00439,-2.393009,0.5385
3,-1.552419,-0.797228,0.772006,-0.372744,9.744409,-2.961604,1.401162,2.446705,-2.898397,-2.393009,2.96373
4,-0.974461,-2.030096,-0.324082,0.823709,-0.085456,-1.662278,-0.364238,3.087138,-0.390258,-2.393009,1.751115


In [7]:
# Gradient boosting
from sklearn.ensemble import GradientBoostingRegressor

# parameter after hyper parameter tuning 
params = {'learning_rate': 0.39, 'max_depth': 3, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 200}

gbr = GradientBoostingRegressor(**params, random_state = 1)

In [9]:
# cross validation
from sklearn.model_selection import cross_validate
cv_result = cross_validate(gbr, scaled_train_features, Y_train, cv = 5, return_train_score = True, scoring = 'neg_mean_absolute_error')

In [10]:
print(cv_result)
print('train error:', np.mean(cv_result['train_score']) )
print('test error:', np.mean(cv_result['test_score']) )

{'fit_time': array([0.18251801, 0.1584506 , 0.12094116, 0.14564252, 0.19700313]), 'score_time': array([0.00146484, 0.00134254, 0.00334978, 0.00289059, 0.00132442]), 'test_score': array([-17.97457097,  -9.95949727, -10.16039227, -11.14135467,
       -13.85035262]), 'train_score': array([-2.47646405e-04, -7.88703962e-01, -7.88669502e-01, -2.24092520e-04,
       -7.88704504e-01])}
train error: -0.4733099415426761
test error: -12.6172335597428


In [12]:
gbr.fit(scaled_train_features,Y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.39, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=3, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=200,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=1, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [14]:
imp = pd.DataFrame(gbr.feature_importances_)
result = pd.concat([pd.DataFrame(feature_names),imp],axis = 1)
result.columns = ['Feature Names','Importance']
result.to_csv("Feature_importance_gbr_refractory.csv")

In [15]:
# Ada Boost
from sklearn.ensemble import AdaBoostRegressor

# parameter after hyper parameter tuning 
params = {'learning_rate': 0.42, 'n_estimators': 250}

ada = AdaBoostRegressor(**params, random_state = 1)

In [16]:
# cross validation
from sklearn.model_selection import cross_validate
cv_result = cross_validate(ada, scaled_train_features, Y_train, cv = 5, return_train_score = True, scoring = 'neg_mean_absolute_error')

In [17]:
print(cv_result)
print('train error:', np.mean(cv_result['train_score']) )
print('test error:', np.mean(cv_result['test_score']) )

{'fit_time': array([0.66777134, 0.12350249, 0.10863352, 0.646837  , 0.09967208]), 'score_time': array([0.04459596, 0.00587654, 0.00737357, 0.02669191, 0.00934076]), 'test_score': array([-20.35847607,  -9.64682899, -11.18473801, -12.36549639,
       -18.48808468]), 'train_score': array([-4.79439771, -7.97486876, -7.73425811, -6.51816392, -7.54733972])}
train error: -6.913805643757041
test error: -14.408724830262113


In [18]:
ada.fit(scaled_train_features,Y_train)

AdaBoostRegressor(base_estimator=None, learning_rate=0.42, loss='linear',
                  n_estimators=250, random_state=1)

In [19]:
imp = pd.DataFrame(ada.feature_importances_)
result = pd.concat([pd.DataFrame(feature_names),imp],axis = 1)
result.columns = ['Feature Names','Importance']
result.to_csv("Feature_importance_ada_refractory.csv")
result

Unnamed: 0,Feature Names,Importance
0,Diff. Lattice Constants,0.016173
1,Diff. Melting Point,0.007944
2,Mixing Enthalpy,0.011198
3,Lattice Constants,0.002313
4,Lambda,0.060285
5,Diff. in atomic radii,0.046152
6,Omega,0.007691
7,Melting Temp.,0.07539
8,Diff. Electronegativity,0.001203
9,Mixing Entropy,0.002986


In [20]:
# XGBoost

# parameter after hyper parameter tuning 
params = {'learning_rate': 0.21, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 100}

import xgboost as xgb
xgb_reg = xgb.XGBRegressor(**params, random_state=1)

In [21]:
# cross validation
from sklearn.model_selection import cross_validate
cv_result = cross_validate(xgb_reg, scaled_train_features, Y_train, cv = 5, return_train_score = True, scoring = 'neg_mean_absolute_error')

In [22]:
print(cv_result)
print('train error:', np.mean(cv_result['train_score']) )
print('test error:', np.mean(cv_result['test_score']) )

{'fit_time': array([0.21562886, 0.07600951, 0.05091858, 0.09587336, 0.06838942]), 'score_time': array([0.00323534, 0.02217174, 0.00674248, 0.01072764, 0.00879264]), 'test_score': array([-19.58677123,  -9.3255882 ,  -9.82201458, -12.30493562,
       -11.21929683]), 'train_score': array([-0.07340881, -0.84846103, -0.87681016, -0.08431747, -0.86800381])}
train error: -0.5502002568663092
test error: -12.451721289621455


In [23]:
xgb_reg.fit(scaled_train_features,Y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.21, max_delta_step=0, max_depth=4,
             min_child_weight=2, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=1, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [25]:
imp = pd.DataFrame(xgb_reg.feature_importances_)
result = pd.concat([pd.DataFrame(feature_names),imp],axis = 1)
result.columns = ['Feature Names','Importance']
result.to_csv("Feature_importance_xgb_refractory.csv")
result

Unnamed: 0,Feature Names,Importance
0,Diff. Lattice Constants,0.007768
1,Diff. Melting Point,0.016856
2,Mixing Enthalpy,0.023201
3,Lattice Constants,0.004717
4,Lambda,0.047388
5,Diff. in atomic radii,0.016712
6,Omega,0.002098
7,Melting Temp.,0.087393
8,Diff. Electronegativity,0.004496
9,Mixing Entropy,0.00663
