In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from datetime import datetime
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from pandas.plotting import scatter_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score, make_scorer, r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error as MSE
import warnings
warnings.filterwarnings('ignore')

# Load the surface-tension dataset from CSV and build the feature matrix / target.

# NOTE(review): absolute, machine-specific path; the notebook only runs where this
# directory exists. Consider a path relative to a configurable data directory.
input_dir = 'C:/Users/MMohan/Videos/Scikit-Learn_ML/Models_MM/ST-RF_ILs/'
df_train = pd.read_csv(input_dir + 'SurfaceTension-ILs.csv')

# Every column except the two excluded names is used as a model feature.
# NOTE(review): 'IonicLiuid' looks like a misspelling of "IonicLiquid"; it is kept
# as-is because it has to match the CSV header exactly -- confirm against the file.
var_columns = [c for c in df_train.columns if c not in ('IonicLiuid', 'ST-exp')]
X = df_train.loc[:, var_columns]
y = df_train.loc[:, 'ST-exp']  # target column

# Split into training and testing subsets (45% held out, fixed seed for a
# repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.45, random_state=35)

# A bare expression is only echoed by Jupyter when it is the LAST statement of a
# cell, so the shapes are printed explicitly here; otherwise they are discarded.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Summary statistics of the held-out features (shown as the cell's output).
X_test.describe()


Unnamed: 0,T (K),S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11,S12,S13,Area (Å2),Mol-Wt (g/mol)
count,1136.0,1136.0,1136.0,1136.0,1136.0,1136.0,1136.0,1136.0,1136.0,1136.0,1136.0,1136.0,1136.0,1136.0,1136.0,1136.0
mean,321.66184,0.326594,2.349716,7.135597,10.340003,4.037372,0.347007,0.38164,1.924509,4.439504,2.672744,2.729236,1.65849,0.679044,382.513661,327.002597
std,28.704375,0.806864,1.228081,1.747528,7.066092,4.501617,0.925237,0.717734,3.998672,5.04748,2.940511,2.30949,1.736213,1.139722,137.773551,141.349585
min,263.32,0.0,0.11535,2.4469,0.9099,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,171.0561,105.1361
25%,303.15,0.01495,2.1006,6.6862,6.46095,1.757737,0.0004,0.0,0.21825,1.0804,0.67825,0.5505,0.031,0.0,300.4046,226.0242
50%,318.15,0.1385,2.4982,6.69785,8.5982,2.6474,0.03675,0.12905,1.6157,2.61255,1.77865,1.7428,1.6392,0.0,351.122,282.1309
75%,333.225,0.1514,2.54575,7.54175,10.59105,4.234,0.32165,0.32425,2.39025,9.532,2.92055,5.2297,2.9574,1.409,419.0689,422.4358
max,532.6,6.741,16.289,21.695,36.7545,21.26135,8.835,5.466,50.957,39.61,12.2205,7.2472,8.993,8.993,1061.3956,928.8688


In [14]:
# Fit a gradient-boosted tree regressor (GradientBoostingRegressor) and report
# training / testing error metrics together with the wall-clock execution time.
import time

# Seed for the booster. With subsample < 1.0 every boosting stage is fitted on a
# random subset of the training rows, so without a seed the metrics below change
# on each run. The value mirrors the seed used for the train/test split.
GBT_SEED = 35


def _report_metrics(label, y_true, y_pred):
    """Compute and print regression metrics for one data split.

    Parameters
    ----------
    label : str
        Suffix used in the printed metric names (e.g. 'Training', 'Testing').
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Model predictions aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, mape, r2)`` as computed by the scikit-learn metric
        functions; ``rmse`` is the square root of ``mse``.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = mse ** 0.5
    mape = mean_absolute_percentage_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f'R2_{label}:', round(r2, 3))
    print(f'MAPE_{label}:', "{:.2%}".format(mape))
    print(f'MAE_{label}:', round(mae, 2))
    print(f'RMSE_{label}:', round(rmse, 2))
    return mae, mse, rmse, mape, r2


start = time.time()

# NOTE(review): `alpha` is only consulted by the 'huber' and 'quantile' losses, so
# it has no effect with loss='squared_error'. max_depth=None puts no explicit cap
# on tree depth; confirm that this is intended.
GBTModel = GradientBoostingRegressor(loss='squared_error', learning_rate=0.03, n_estimators=100,
                          subsample=0.6, criterion='friedman_mse', min_samples_split=3,
                          min_samples_leaf=1, min_weight_fraction_leaf=0.003, max_depth=None,
                          min_impurity_decrease=0.0, init=None, random_state=GBT_SEED,
                          max_features=None, alpha=0.4, verbose=2, max_leaf_nodes=None,
                          warm_start=False, validation_fraction=0.1, n_iter_no_change=None,
                          tol=0.0001, ccp_alpha=0.0)

GBTModel.fit(X_train, y_train)

# Predictions and metrics on the training split
pred_train = GBTModel.predict(X_train)
mae_train, mse_train, rmse_train, mape_train, r2_train = _report_metrics(
    'Training', y_train.values.ravel(), pred_train)

# Predictions and metrics on the testing split
pred_test = GBTModel.predict(X_test)
print("")
mae_test, mse_test, rmse_test, mape_test, r2_test = _report_metrics(
    'Testing', y_test.values.ravel(), pred_test)

# Elapsed time covers model construction, fitting, prediction and metric reporting.
end = time.time()
diff = end - start
print("")
print('Execution_time:', diff)


      Iter       Train Loss      OOB Improve   Remaining Time 
         1          74.7992           4.1406            0.40s
         2          71.1497           3.9890            0.39s
         3          67.1366           3.7581            0.39s
         4          66.1168           3.3896            0.38s
         5          58.7972           3.5340            0.38s
         6          54.9199           3.4911            0.36s
         7          54.5955           2.8728            0.37s
         8          52.3655           2.5452            0.37s
         9          49.4403           2.5554            0.35s
        10          44.1541           2.6003            0.35s
        11          40.7817           2.5345            0.35s
        12          41.1850           2.2269            0.34s
        13          38.7079           2.0654            0.34s
        14          37.9146           1.9497            0.34s
        15          34.4362           1.8615            0.33s
       