In [1]:
import pandas as pd
from pandas.plotting import radviz
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
#import the model we are using
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm, preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn import linear_model
from sklearn.ensemble import BaggingClassifier, GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score

In [2]:
def read(file):
    return pd.read_csv(file)

#features.set_index(['Index'], inplace = True)
#features.drop(columns = ['Index'], inplace = True)

features = read('Pmax_forward_Final.csv')
features.head(5)

Unnamed: 0,α [°],H [mm],b [mm],Ls [mm],γ [N/mm3],Pmax [N],Vroot [mm3],φ [°]
0,15,90.0,19.4114,17.555563,1.6e-05,9.080644,4534.350557,39.33
1,15,90.0,12.941,41.703709,1.6e-05,8.758713,3535.876732,39.33
2,15,90.0,19.4114,17.555563,1.6e-05,11.753734,8267.564868,39.33
3,15,90.0,12.941,41.703709,1.6e-05,10.797683,5796.469487,39.33
4,15,90.0,19.4114,17.555563,1.6e-05,14.095079,23281.011643,39.33


In [3]:
#summary statistics
features.describe()
features.columns.values

array(['α [°]', 'H [mm]', 'b [mm]', 'Ls [mm]', 'γ [N/mm3]', 'Pmax [N]',
       'Vroot [mm3]', 'φ [°]'], dtype=object)

In [4]:
#pd.options.display.max_rows = 181
pd.options.display.max_columns = 30
#features.sort_values(by = "L [mm]", ascending = True, inplace = True)
#display(features)

In [5]:
#one hot encoding
pd.options.display.max_columns = 181
pd.options.display.max_rows = 181

if 'n [count]' in features.columns and 'Material' in features.columns:
    features = pd.get_dummies(features, columns = ['n [count]', 'Material'])
elif 'n [count]' in features.columns:
    features = pd.get_dummies(features, columns = ['n [count]'])
elif 'Material' in features.columns:
    features = pd.get_dummies(features, columns = ['Material'])
#features.head(5)
#display(features)
#print_shape(features)

In [6]:
#target pmax/f
pmax = np.array(features['Pmax [N]']).reshape(1, -1)

#Remove labels from the features
features = features.drop('Pmax [N]', axis = 1)
feature_list = list(features.columns)
features = StandardScaler().fit_transform(features)
pmax = Normalizer().fit_transform(pmax).reshape(-1, 1)
#pca = PCA(n_components = 11)
#principalComponents = pca.fit_transform(features)
#features = pd.DataFrame(data = principalComponents, columns = ['pc1', 'pc2','pc3','pc4','pc5','pc6','pc7','pc8','pc9','pc10','pc11'])
#display(features)
#print(pca.explained_variance_ratio_.cumsum())

#Covert to numpy arrays
features = np.array(features)

In [7]:
#split the data into training set and testing set 
train_features, test_features, train_labels, test_labels = train_test_split(features, pmax, test_size = 0.25, random_state = 42)

In [8]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (132, 7)
Training Labels Shape: (132, 1)
Testing Features Shape: (45, 7)
Testing Labels Shape: (45, 1)


In [9]:
from sklearn.model_selection import GridSearchCV

n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1100, num = 11)]
max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]
max_features = ["auto", "sqrt", 2, 3, 4]
min_samples_leaf = [int(x) for x in np.linspace(start = 1, stop = 5, num = 5)]
min_samples_split = [int(x) for x in np.linspace(start = 1, stop = 5, num = 5)]
print(min_samples_leaf)
print(max_depth)

[1, 2, 3, 4, 5]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [10]:
param_grid = {
    'max_depth': max_depth,
    'max_features': max_features,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'n_estimators': n_estimators,
}

In [11]:
gb = GradientBoostingRegressor()

In [12]:
grid_search = GridSearchCV(estimator = gb, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [13]:
grid_search.fit(train_features, train_labels.ravel())

Fitting 3 folds for each of 13750 candidates, totalling 41250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 1034 tasks      | elapsed:   24.3s
[Parallel(n_jobs=-1)]: Done 2166 tasks      | elapsed:   49.3s
[Parallel(n_jobs=-1)]: Done 3554 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 5365 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 6911 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 8404 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 10441 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 13162 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 15871 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 18753 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 20928 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 25174 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done 29283 task

GridSearchCV(cv=3, estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'max_features': ['auto', 'sqrt', 2, 3, 4],
                         'min_samples_leaf': [1, 2, 3, 4, 5],
                         'min_samples_split': [1, 2, 3, 4, 5],
                         'n_estimators': [100, 200, 300, 400, 500, 600, 700,
                                          800, 900, 1000, 1100]},
             verbose=2)

In [15]:
best_grid = grid_search.best_estimator_
print(best_grid)

GradientBoostingRegressor(max_features='sqrt', min_samples_leaf=3,
                          min_samples_split=5, n_estimators=800)
