In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import time
from math import sqrt
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [16]:
df=pd.read_excel(r"../dataset/data.xlsx",sheet_name="Sheet1")
df

Unnamed: 0,Number of Storeys(NS),Height of Structure(HS),Number of Spans(HSP),Length of Spans (LS),Opening percentage (OP),Masonry wall Stiffeness Et (MS),Period (P)
0,1,3,2,3.0,0,2.25,0.092970
1,1,3,2,3.0,100,2.25,0.159750
2,1,3,2,4.5,0,2.25,0.094770
3,1,3,2,4.5,100,2.25,0.156589
4,1,3,2,6.0,0,2.25,0.106350
...,...,...,...,...,...,...,...
4021,22,66,6,6.0,50,25.00,1.628000
4022,22,66,6,6.0,75,25.00,2.714000
4023,22,66,6,6.0,100,25.00,2.879000
4024,22,66,6,7.5,0,25.00,0.602680


In [17]:
df.shape

(4026, 7)

In [18]:
df.head()

Unnamed: 0,Number of Storeys(NS),Height of Structure(HS),Number of Spans(HSP),Length of Spans (LS),Opening percentage (OP),Masonry wall Stiffeness Et (MS),Period (P)
0,1,3,2,3.0,0,2.25,0.09297
1,1,3,2,3.0,100,2.25,0.15975
2,1,3,2,4.5,0,2.25,0.09477
3,1,3,2,4.5,100,2.25,0.156589
4,1,3,2,6.0,0,2.25,0.10635


In [19]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(df)

normalized_df = pd.DataFrame(normalized_data, columns=df.columns)

print(normalized_df)

      Number of Storeys(NS)  Height of Structure(HS)  Number of Spans(HSP)  \
0                       0.0                      0.0                   0.0   
1                       0.0                      0.0                   0.0   
2                       0.0                      0.0                   0.0   
3                       0.0                      0.0                   0.0   
4                       0.0                      0.0                   0.0   
...                     ...                      ...                   ...   
4021                    1.0                      1.0                   1.0   
4022                    1.0                      1.0                   1.0   
4023                    1.0                      1.0                   1.0   
4024                    1.0                      1.0                   1.0   
4025                    1.0                      1.0                   1.0   

      Length of Spans (LS)  Opening percentage (OP)  \
0       

In [20]:
normalized_df.keys()

Index(['Number of Storeys(NS)', 'Height of Structure(HS)',
       'Number of Spans(HSP)', 'Length of Spans (LS)',
       'Opening percentage (OP)', 'Masonry wall Stiffeness Et (MS)',
       'Period (P)'],
      dtype='object')

In [21]:
X=normalized_df.drop("Period (P)" ,axis=1)

In [22]:
y=normalized_df["Period (P)"]

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=99)

# LightGBM

In [24]:
import lightgbm as lgb

In [25]:
from sklearn.model_selection import KFold,cross_validate
from bayes_opt import BayesianOptimization
from hyperopt import hp,fmin,tpe,Trials,partial
from hyperopt.early_stop import no_progress_loss
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
import optuna.integration.lightgbm as oplg

In [26]:
def bayesopt_objective(n_estimators,max_depth,num_leaves,learning_rate):
    model= lgb.LGBMRegressor(n_estimators=int(n_estimators)
              ,max_depth=int(max_depth)
              ,num_leaves=int(num_leaves)
              ,learning_rate=learning_rate
              )
    cv=KFold(n_splits=10,shuffle=True,random_state=99)
    validate_loss=cross_validate(model,X_train,y_train
                                   ,cv=cv
                                   ,scoring='neg_root_mean_squared_error'
                                   ,n_jobs=-1
                                   ,error_score='raise')
    
    return np.mean(abs(validate_loss['test_score']))

In [27]:
def param_bayes_opt(init_points,n_iter):
    opt=BayesianOptimization(bayesopt_objective
                             ,param_grid_simple
                             ,random_state=99)
    
        opt.maximize(init_points=init_points 
                 ,n_iter=n_iter 
                )
    
    params_best=opt.max['params'] 
    score_best=opt.max['target']
    
   
    print("\n","best params: ", params_best,
          "\n","best cvscore: ", score_best)
    
    return params_best,score_best

In [28]:
def bayes_opt_validation(params_best):
    model= lgb.LGBMRegressor(n_estimators=int(params_best['n_estimators'])
            ,max_depth=int(params_best['max_depth'])
            ,num_leaves=int(params_best['num_leaves'])
            ,learning_rate=params_best['learning_rate']
           )
    cv=KFold(n_splits=10,shuffle=True,random_state=99)
    validate_loss=cross_validate(model,X_train,y_train
                                   ,cv=cv
                                   ,scoring='neg_root_mean_squared_error'
                                   ,n_jobs=-1
                                  )
    
    return np.mean(abs(validate_loss['test_score']))

In [29]:
# Start time
start_time = time.time()

param_grid_simple={'n_estimators':(800,5000)
                  ,'max_depth':(2,10)
                  ,'num_leaves':(10,50)
                   ,'learning_rate':( 0.001,0.1)
                  }

params_best,score_best=param_bayes_opt(20,100)

params_best 
score_best 
validation_score=bayes_opt_validation(params_best)
validation_score
# End time
end_time = time.time()
# Calculate total time
total_time = end_time - start_time
print("Total time taken: ", total_time)

|   iter    |  target   | learni... | max_depth | n_esti... | num_le... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.003874 [0m | [0m0.06756  [0m | [0m5.905    [0m | [0m4.267e+03[0m | [0m11.26    [0m |
| [95m2        [0m | [95m0.004197 [0m | [95m0.081    [0m | [95m6.525    [0m | [95m2.05e+03 [0m | [95m11.87    [0m |
| [95m3        [0m | [95m0.02343  [0m | [95m0.09907  [0m | [95m2.055    [0m | [95m4.033e+03[0m | [95m39.87    [0m |
| [0m4        [0m | [0m0.004068 [0m | [0m0.03837  [0m | [0m5.953    [0m | [0m4.702e+03[0m | [0m25.82    [0m |
| [0m5        [0m | [0m0.004241 [0m | [0m0.09742  [0m | [0m6.195    [0m | [0m1.193e+03[0m | [0m42.53    [0m |
| [0m6        [0m | [0m0.005329 [0m | [0m0.02196  [0m | [0m6.435    [0m | [0m2.028e+03[0m | [0m42.65    [0m |
| [0m7        [0m | [0m0.006887 [0m | [0m0.08298  [0m | [0m3.773    [0m | [0m3.508e+03[0m | [0m13

| [0m67       [0m | [0m0.003764 [0m | [0m0.07637  [0m | [0m8.446    [0m | [0m3.672e+03[0m | [0m46.9     [0m |
| [0m68       [0m | [0m0.004118 [0m | [0m0.08249  [0m | [0m8.731    [0m | [0m1.435e+03[0m | [0m29.33    [0m |
| [0m69       [0m | [0m0.007099 [0m | [0m0.05025  [0m | [0m3.563    [0m | [0m4.717e+03[0m | [0m33.29    [0m |
| [0m70       [0m | [0m0.02792  [0m | [0m0.03065  [0m | [0m2.624    [0m | [0m1.161e+03[0m | [0m17.99    [0m |
| [0m71       [0m | [0m0.00705  [0m | [0m0.01792  [0m | [0m5.373    [0m | [0m1.16e+03 [0m | [0m17.14    [0m |
| [0m72       [0m | [0m0.008191 [0m | [0m0.05459  [0m | [0m3.848    [0m | [0m1.788e+03[0m | [0m10.62    [0m |
| [0m73       [0m | [0m0.02599  [0m | [0m0.09489  [0m | [0m2.679    [0m | [0m1.162e+03[0m | [0m18.02    [0m |
| [0m74       [0m | [0m0.005775 [0m | [0m0.05029  [0m | [0m4.3      [0m | [0m1.404e+03[0m | [0m18.87    [0m |
| [0m75       [0m | [

In [30]:
params_best['max_depth'] = int(params_best['max_depth'])
params_best['n_estimators'] = int(params_best['n_estimators'])
params_best['num_leaves'] = int(params_best['num_leaves'])
lgb_reg= lgb.LGBMRegressor(**params_best )
lgb_reg.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000179 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 63
[LightGBM] [Info] Number of data points in the train set: 2415, number of used features: 6
[LightGBM] [Info] Start training from score 0.301664


In [31]:
y_pred= lgb_reg.predict(X_test)
y_pred1=lgb_reg.predict(X_val)
y_pred2=lgb_reg.predict(X_train)

In [32]:
print("mean_absolute_error:", mean_absolute_error(y_test, y_pred))
print("mean_squared_error:", mean_squared_error(y_test, y_pred))
print("rmse:", sqrt(mean_squared_error(y_test, y_pred)))
print("r2 score:", r2_score(y_test, y_pred))

mean_absolute_error: 0.06373161430356314
mean_squared_error: 0.006208845138778282
rmse: 0.07879622540945906
r2 score: 0.8788060762581171


In [33]:
print("mean_absolute_error:", mean_absolute_error(y_val, y_pred1))
print("mean_squared_error:", mean_squared_error(y_val, y_pred1))
print("rmse:", sqrt(mean_squared_error(y_val, y_pred1)))
print("r2 score:", r2_score(y_val, y_pred1))

mean_absolute_error: 0.0633796025111148
mean_squared_error: 0.006342511518935127
rmse: 0.07963988648243496
r2 score: 0.8727905554580779


In [34]:
print("mean_absolute_error:", mean_absolute_error(y_train, y_pred2))
print("mean_squared_error:", mean_squared_error(y_train, y_pred2))
print("rmse:", sqrt(mean_squared_error(y_train, y_pred2)))
print("r2 score:", r2_score(y_train, y_pred2))

mean_absolute_error: 0.06130360410020308
mean_squared_error: 0.005719867589499725
rmse: 0.0756298062241318
r2 score: 0.8830658681107654
