In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import time

In [2]:
df=pd.read_excel(r"./dataset/data.xlsx",sheet_name="Sheet1")
df

Unnamed: 0,Number of Storeys(NS),Height of Structure(HS),Number of Spans(HSP),Length of Spans (LS),Opening percentage (OP),Masonry wall Stiffeness Et (MS),Period (P)
0,1,3,2,3.0,0,2.25,0.092970
1,1,3,2,3.0,100,2.25,0.159750
2,1,3,2,4.5,0,2.25,0.094770
3,1,3,2,4.5,100,2.25,0.156589
4,1,3,2,6.0,0,2.25,0.106350
...,...,...,...,...,...,...,...
4021,22,66,6,6.0,50,25.00,1.628000
4022,22,66,6,6.0,75,25.00,2.714000
4023,22,66,6,6.0,100,25.00,2.879000
4024,22,66,6,7.5,0,25.00,0.602680


In [3]:
df.shape

(4026, 7)

In [4]:
df.head()

Unnamed: 0,Number of Storeys(NS),Height of Structure(HS),Number of Spans(HSP),Length of Spans (LS),Opening percentage (OP),Masonry wall Stiffeness Et (MS),Period (P)
0,1,3,2,3.0,0,2.25,0.09297
1,1,3,2,3.0,100,2.25,0.15975
2,1,3,2,4.5,0,2.25,0.09477
3,1,3,2,4.5,100,2.25,0.156589
4,1,3,2,6.0,0,2.25,0.10635


In [5]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(df)

normalized_df = pd.DataFrame(normalized_data, columns=df.columns)

print(normalized_df)

      Number of Storeys(NS)  Height of Structure(HS)  Number of Spans(HSP)  \
0                       0.0                      0.0                   0.0   
1                       0.0                      0.0                   0.0   
2                       0.0                      0.0                   0.0   
3                       0.0                      0.0                   0.0   
4                       0.0                      0.0                   0.0   
...                     ...                      ...                   ...   
4021                    1.0                      1.0                   1.0   
4022                    1.0                      1.0                   1.0   
4023                    1.0                      1.0                   1.0   
4024                    1.0                      1.0                   1.0   
4025                    1.0                      1.0                   1.0   

      Length of Spans (LS)  Opening percentage (OP)  \
0       

In [6]:
normalized_df.keys()

Index(['Number of Storeys(NS)', 'Height of Structure(HS)',
       'Number of Spans(HSP)', 'Length of Spans (LS)',
       'Opening percentage (OP)', 'Masonry wall Stiffeness Et (MS)',
       'Period (P)'],
      dtype='object')

In [7]:
X=normalized_df.drop("Period (P)" ,axis=1)

In [8]:
y=normalized_df["Period (P)"] 

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=99)

# RandomForestRegressor

In [10]:
from bayes_opt import BayesianOptimization
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import KFold,cross_validate


In [11]:
def bayesopt_objective(n_estimators,max_depth,min_samples_leaf):
    model=RFR(n_estimators=int(n_estimators)
              ,max_depth=int(max_depth)
              ,min_samples_leaf=int(min_samples_leaf)
              ,random_state=99
              ,n_jobs=3)
    cv=KFold(n_splits=10,shuffle=True,random_state=99)
    validate_loss=cross_validate(model,X_train,y_train
                                   ,cv=cv
                                   ,scoring='neg_root_mean_squared_error'
                                   ,n_jobs=-1
                                   ,error_score='raise')
    
    return np.mean(abs(validate_loss['test_score']))

In [12]:
def param_bayes_opt(init_points,n_iter):
    opt=BayesianOptimization(bayesopt_objective
                             ,param_grid_simple
                             ,random_state=7)
 
    opt.maximize(init_points=init_points
                 ,n_iter=n_iter 
                )
    
    # 返回优化结果
    params_best=opt.max['params'] 
    score_best=opt.max['target'] 
    
    # 打印结果
    print("\n","best params: ", params_best,
          "\n","best cvscore: ", score_best)
    
    return params_best,score_best


In [13]:
def bayes_opt_validation(params_best):
    model=RFR(n_estimators=int(params_best['n_estimators'])
            ,max_depth=int(params_best['max_depth'])
            ,min_samples_leaf=int(params_best['min_samples_leaf'])
            ,random_state=99
            ,n_jobs=-1
           )
    cv=KFold(n_splits=10,shuffle=True,random_state=99)
    validate_loss=cross_validate(model,X_train,y_train
                                   ,cv=cv
                                   ,scoring='neg_root_mean_squared_error'
                                   ,n_jobs=-1
                                  )
    
    return np.mean(abs(validate_loss['test_score']))


In [15]:
# Start time
start_time = time.time()

param_grid_simple={'n_estimators':(800,5000)
                  ,'max_depth':(3,10)
                  ,'min_samples_leaf':(2,20)
                  }

params_best,score_best=param_bayes_opt(20,100)

params_best 
score_best 
validation_score=bayes_opt_validation(params_best)
validation_score 
# End time
end_time = time.time()
# Calculate total time
total_time = end_time - start_time
print("Total time taken: ", total_time)

|   iter    |  target   | max_depth | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m1        [0m | [0m0.07822  [0m | [0m3.534    [0m | [0m16.04    [0m | [0m2.641e+03[0m |
| [0m2        [0m | [0m0.03057  [0m | [0m8.064    [0m | [0m19.6     [0m | [0m3.062e+03[0m |
| [0m3        [0m | [0m0.02566  [0m | [0m6.508    [0m | [0m3.297    [0m | [0m1.927e+03[0m |
| [0m4        [0m | [0m0.02846  [0m | [0m6.499    [0m | [0m14.23    [0m | [0m4.176e+03[0m |
| [0m5        [0m | [0m0.03757  [0m | [0m5.667    [0m | [0m3.187    [0m | [0m2.01e+03 [0m |
| [0m6        [0m | [0m0.01628  [0m | [0m9.367    [0m | [0m5.841    [0m | [0m2.699e+03[0m |
| [0m7        [0m | [0m0.01237  [0m | [0m9.518    [0m | [0m2.448    [0m | [0m3.322e+03[0m |
| [0m8        [0m | [0m0.01752  [0m | [0m9.651    [0m | [0m6.145    [0m | [0m3.104e+03[0m |
| [0m9        [0m | [0m0.015    [0m | [0m9.364    [0m 

| [95m80       [0m | [95m0.07828  [0m | [95m3.026    [0m | [95m2.903    [0m | [95m1.071e+03[0m |
| [0m81       [0m | [0m0.07828  [0m | [0m3.036    [0m | [0m3.674    [0m | [0m1.074e+03[0m |
| [0m82       [0m | [0m0.03758  [0m | [0m5.29     [0m | [0m7.119    [0m | [0m1.08e+03 [0m |
| [0m83       [0m | [0m0.03764  [0m | [0m5.172    [0m | [0m11.29    [0m | [0m2.646e+03[0m |
| [0m84       [0m | [0m0.07822  [0m | [0m3.718    [0m | [0m11.32    [0m | [0m2.635e+03[0m |
| [0m85       [0m | [0m0.05259  [0m | [0m4.515    [0m | [0m14.69    [0m | [0m4.58e+03 [0m |
| [0m86       [0m | [0m0.05258  [0m | [0m4.92     [0m | [0m10.8     [0m | [0m2.637e+03[0m |
| [0m87       [0m | [0m0.05258  [0m | [0m4.962    [0m | [0m8.421    [0m | [0m2.633e+03[0m |
| [0m88       [0m | [0m0.07825  [0m | [0m3.042    [0m | [0m15.55    [0m | [0m4.577e+03[0m |
| [0m89       [0m | [0m0.02807  [0m | [0m6.43     [0m | [0m13.0     [

In [20]:
from sklearn.ensemble import RandomForestRegressor
params_best['n_estimators'] = int(params_best['n_estimators'])
params_best['max_depth'] = int(params_best['max_depth'])
params_best['min_samples_leaf'] = int(params_best['min_samples_leaf'])
rf_reg = RandomForestRegressor(**params_best )
rf_reg.fit(X_train, y_train)
print('train_mes : {:.8f}'.format(mean_squared_error(y_train, rf_reg.predict(X_train))))

train_mes : 0.00596842


In [19]:
from math import sqrt
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [21]:
y_pred= rf_reg.predict(X_test)
y_pred1=rf_reg.predict(X_val)
y_pred2=rf_reg.predict(X_train)

In [22]:
print("mean_absolute_error:", mean_absolute_error(y_val, y_pred1))
print("mean_squared_error:", mean_squared_error(y_val, y_pred1))
print("rmse:", sqrt(mean_squared_error(y_val, y_pred1)))
print("r2 score:", r2_score(y_val, y_pred1))

mean_absolute_error: 0.06253203904692697
mean_squared_error: 0.006962389512528188
rmse: 0.08344093427406112
r2 score: 0.860357888207365


In [23]:
print("mean_absolute_error:", mean_absolute_error(y_train, y_pred2))
print("mean_squared_error:", mean_squared_error(y_train, y_pred2))
print("rmse:", sqrt(mean_squared_error(y_train, y_pred2)))
print("r2 score:", r2_score(y_train, y_pred2))

mean_absolute_error: 0.05901417243052987
mean_squared_error: 0.0059684170515546305
rmse: 0.07725553088002587
r2 score: 0.8779846463653039


In [24]:
print("mean_absolute_error:", mean_absolute_error(y_test, y_pred))
print("mean_squared_error:", mean_squared_error(y_test, y_pred))
print("rmse:", sqrt(mean_squared_error(y_test, y_pred)))
print("r2 score:", r2_score(y_test, y_pred))

mean_absolute_error: 0.061474524472411134
mean_squared_error: 0.006413774133495096
rmse: 0.08008604206411436
r2 score: 0.8748059524986933
