In [1]:
import pandas as pd
from pandas.plotting import radviz
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
#import the model we are using
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm, preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn import linear_model
from sklearn.ensemble import BaggingClassifier, GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score

In [2]:
def read(file):
    return pd.read_csv(file)

#features.set_index(['Index'], inplace = True)
#features.drop(columns = ['Index'], inplace = True)

features = read('Pmax_forward_Final.csv')
features.head(5)

Unnamed: 0,α [°],H [mm],b [mm],Ls [mm],γ [N/mm3],Pmax [N],Vroot [mm3],φ [°]
0,15,90.0,19.4114,17.555563,1.6e-05,9.080644,4534.350557,39.33
1,15,90.0,12.941,41.703709,1.6e-05,8.758713,3535.876732,39.33
2,15,90.0,19.4114,17.555563,1.6e-05,11.753734,8267.564868,39.33
3,15,90.0,12.941,41.703709,1.6e-05,10.797683,5796.469487,39.33
4,15,90.0,19.4114,17.555563,1.6e-05,14.095079,23281.011643,39.33


In [3]:
#summary statistics
features.describe()
features.columns.values

array(['α [°]', 'H [mm]', 'b [mm]', 'Ls [mm]', 'γ [N/mm3]', 'Pmax [N]',
       'Vroot [mm3]', 'φ [°]'], dtype=object)

In [4]:
#pd.options.display.max_rows = 181
pd.options.display.max_columns = 30
#features.sort_values(by = "L [mm]", ascending = True, inplace = True)
#display(features)

In [5]:
#one hot encoding
pd.options.display.max_columns = 181
pd.options.display.max_rows = 181

if 'n [count]' in features.columns and 'Material' in features.columns:
    features = pd.get_dummies(features, columns = ['n [count]', 'Material'])
elif 'n [count]' in features.columns:
    features = pd.get_dummies(features, columns = ['n [count]'])
elif 'Material' in features.columns:
    features = pd.get_dummies(features, columns = ['Material'])
#features.head(5)
#display(features)
#print_shape(features)

In [6]:
#target pmax/f
pmax = np.array(features['Pmax [N]']).reshape(1, -1)

#Remove labels from the features
features = features.drop('Pmax [N]', axis = 1)
feature_list = list(features.columns)
features = StandardScaler().fit_transform(features)
pmax = Normalizer().fit_transform(pmax).reshape(-1, 1)
#pca = PCA(n_components = 11)
#principalComponents = pca.fit_transform(features)
#features = pd.DataFrame(data = principalComponents, columns = ['pc1', 'pc2','pc3','pc4','pc5','pc6','pc7','pc8','pc9','pc10','pc11'])
#display(features)
#print(pca.explained_variance_ratio_.cumsum())

#Covert to numpy arrays
features = np.array(features)


In [7]:
#split the data into training set and testing set 
train_features, test_features, train_labels, test_labels = train_test_split(features, pmax, test_size = 0.25, random_state = 42)

In [8]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (132, 7)
Training Labels Shape: (132, 1)
Testing Features Shape: (45, 7)
Testing Labels Shape: (45, 1)


In [9]:
gb = GradientBoostingRegressor(random_state=42, 
                               n_estimators=800, 
                               learning_rate=0.1, 
                               max_depth=3, 
                               max_features='sqrt', 
                               min_samples_leaf=3, 
                               min_samples_split=5)
gb.fit(train_features, train_labels.ravel())

GradientBoostingRegressor(max_features='sqrt', min_samples_leaf=3,
                          min_samples_split=5, n_estimators=800,
                          random_state=42)

In [10]:
gb_predictions = gb.predict(test_features)
print(gb_predictions)

[0.04011444 0.02897014 0.02440353 0.09413022 0.01555164 0.06553584
 0.01932835 0.12295398 0.02033862 0.03120836 0.0429111  0.03831928
 0.04397093 0.01316888 0.03626447 0.03666495 0.02931107 0.05276596
 0.0266971  0.06672165 0.1096849  0.04596796 0.08141044 0.09974266
 0.12511493 0.02686204 0.11906399 0.04740212 0.17309444 0.04861636
 0.05778025 0.02861797 0.04262777 0.01954973 0.09719807 0.04390889
 0.08522092 0.01576956 0.02165884 0.07353378 0.10790331 0.02928587
 0.07855339 0.02245138 0.05919226]


In [11]:
gb_mean_absolute_error = mean_absolute_error(test_labels, gb_predictions)
print('Mean Absolute Error:', round(gb_mean_absolute_error, 5))

Mean Absolute Error: 0.00346


In [12]:
mean_square_error = mean_squared_error(test_labels, gb_predictions, squared=False)

print("Mean Square Error: ", round(mean_square_error, 5))

Mean Square Error:  0.00478


In [13]:
gb_explained_variance_score = explained_variance_score(test_labels, gb_predictions)

print("Explained variance score: ", round(gb_explained_variance_score, 7))

Explained variance score:  0.9827888
