In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import chi2, f_regression
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error,mean_absolute_percentage_error
from sklearn.model_selection import KFold, train_test_split,RandomizedSearchCV, GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
import statsmodels.api as sm

In [2]:
data = pd.read_csv("Corrosion_synthesized_data.csv")
test_data = pd.read_csv("Corrosion_synthesized_Test_data.csv")

In [3]:
pipe1 = data[data['Pipe_type']=="Pipe_1"]
pipe2 = data[data['Pipe_type']=="Pipe_2"]
pipe3 = data[data['Pipe_type']=="Pipe_3"]

In [4]:
def feature_importance(dfx):
    dfx.drop('Pipe_type',axis=1,inplace=True)
    response_col = 'Defect_Depth'
    Scaler = StandardScaler()
    X = dfx.drop(response_col,axis=1)
    X = pd.DataFrame(Scaler.fit_transform(X),columns=X.columns)  
    y = dfx[response_col]
    bestfeatures = SelectKBest(score_func=f_regression, k=10)
    fit = bestfeatures.fit(X,y)
    featureScores = pd.DataFrame({'Features':X.columns,'Importance_Score':fit.scores_}).reset_index().sort_values(by='Importance_Score',ascending=False)
    selected_features = list(featureScores[featureScores['Importance_Score']>50]['Features'].values)
    return selected_features

In [11]:
pipe_types = list(data['Pipe_type'].unique())
for pipe in pipe_types:
    print(pipe)
    pipe_data = data[data['Pipe_type']==pipe]
    pipe_data = pipe_data.drop(['BSW','CL'],axis=1)
    selected_features = feature_importance(pipe_data)
    print(selected_features)

Pipe_3
['CA', 'PS', 'SO', 'TM', 'pH', 'PCO2', 'HCO3', 'FE']
Pipe_2
['CA', 'PS', 'SO', 'TM', 'PCO2', 'pH', 'HCO3', 'FE']
Pipe_1
['CA', 'PS', 'SO', 'TM', 'pH', 'PCO2', 'HCO3', 'FE']


### Prepare data for modelling

In [12]:
# Function to evaluate the model
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    rmse = np.sqrt(mean_squared_error(model.predict(test_features),test_labels))
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.4f}%.'.format(accuracy))   
    print('RMSE = {:0.4f}'.format(rmse))
    return accuracy,rmse

In [15]:
### Pipe 1

In [14]:
##### RandomizedSearchCV #####
model_data = data[data['Pipe_type']=="Pipe_1"]
X = model_data[selected_features]
y = model_data['Defect_Depth']
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                test_size = 0.2, random_state=42)
rf = RandomForestRegressor()

n_estimators = [int(x) for x in np.linspace(start = 100, stop = 500, num = 10)]   
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(3, 30, num = 10)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

RandomizedSearchCV_Model = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
RandomizedSearchCV_Model.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [3, 6, 9, 12, 15, 18, 21,
                                                      24, 27, 30, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 144, 188, 233,
                                                         277, 322, 366, 411,
                                                         455, 500]},
                   random_state=42, verbose=2)

In [18]:
evaluate(RandomizedSearchCV_Model,X_train, y_train)

Model Performance
Average Error: 0.0824 degrees.
Accuracy = 74.5960%.
RMSE = 0.1630


(74.59603049712189, 0.1629774701949919)

In [19]:
evaluate(RandomizedSearchCV_Model,X_test, y_test)

Model Performance
Average Error: 0.0786 degrees.
Accuracy = 74.2579%.
RMSE = 0.1574


(74.2578647516367, 0.15740858108821296)

In [16]:
### Pipe 2

In [20]:
##### RandomizedSearchCV #####
rf = RandomForestRegressor()
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 500, num = 10)]   
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(3, 30, num = 10)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

RandomizedSearchCV_Model2 = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
RandomizedSearchCV_Model2.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [3, 6, 9, 12, 15, 18, 21,
                                                      24, 27, 30, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 144, 188, 233,
                                                         277, 322, 366, 411,
                                                         455, 500]},
                   random_state=42, verbose=2)

In [21]:
evaluate(RandomizedSearchCV_Model2,X_train, y_train)

Model Performance
Average Error: 0.0825 degrees.
Accuracy = 74.5827%.
RMSE = 0.1631


(74.58269846161903, 0.16309103326884605)

In [22]:
evaluate(RandomizedSearchCV_Model2,X_test, y_test)

Model Performance
Average Error: 0.0786 degrees.
Accuracy = 74.2594%.
RMSE = 0.1573


(74.25935616690914, 0.1572776409535794)

In [None]:
### Pipe 3

In [23]:
rf = RandomForestRegressor()
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 500, num = 10)]   
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(3, 30, num = 10)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

RandomizedSearchCV_Model3 = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
RandomizedSearchCV_Model3.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [3, 6, 9, 12, 15, 18, 21,
                                                      24, 27, 30, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 144, 188, 233,
                                                         277, 322, 366, 411,
                                                         455, 500]},
                   random_state=42, verbose=2)

In [24]:
evaluate(RandomizedSearchCV_Model3,X_train, y_train)

Model Performance
Average Error: 0.0827 degrees.
Accuracy = 74.5275%.
RMSE = 0.1634
Model Performance
Average Error: 0.0787 degrees.
Accuracy = 74.2177%.
RMSE = 0.1574


(74.21771005791865, 0.15736643359271182)

In [25]:
import pickle
filename = 'RandomizedCV_Model_Pipe_1_New.sav'
pickle.dump(RandomizedSearchCV_Model, open(filename, 'wb'))

In [26]:
filename = 'RandomizedCV_Model_Pipe_2_New.sav'
pickle.dump(RandomizedSearchCV_Model2, open(filename, 'wb'))

In [27]:
filename = 'RandomizedCV_Model_Pipe_3_New.sav'
pickle.dump(RandomizedSearchCV_Model3, open(filename, 'wb'))

In [None]:
#### Pipe 1
model_data = data[data['Pipe_type']=="Pipe_1"]
X = model_data[selected_features]
y = model_data['Defect_Depth']
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                test_size = 0.2, random_state=42)
##### Base Model #####
Base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
Base_model.fit(X_train, y_train)

##### RandomizedSearchCV #####
rf = RandomForestRegressor()
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 500, num = 10)]   
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(3, 30, num = 10)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

RandomizedSearchCV_Model = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
RandomizedSearchCV_Model.fit(X_train, y_train)

#### GridSearchCV#####
param_grid = {
'bootstrap': [True],
'max_depth': [80, 90, 100, 110],
'max_features': [2,3,5,6,7],
'min_samples_leaf': [3, 4, 5],
'min_samples_split': [8, 10, 12],
'n_estimators': [100, 200, 300]
}

rf = RandomForestRegressor()
Grid_search_Model = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)
Grid_search_Model.fit(X_train, y_train)
Best_grid_Model = Grid_search_Model.best_estimator_


#### Linear Regression Model ####

Linear_model = sm.OLS(y_train, X_train).fit()

In [63]:
models_trained = [Base_model,RandomizedSearchCV_Model,Grid_search_Model,Linear_model]
Model_name = ['Base_Model','RandomizedSearchCV_Model','GridSearchCV_Model','Linear_Model']

Accuracy_Train = []
RMSE_Train = []

Accuracy_Test = []
RMSE_Test = []
Pipe_1_Model_Performance = pd.DataFrame()
for models in models_trained:
    Accuracy_Train,RMSE_Train = evaluate(models,X_train, y_train)
    Accuracy_Test,RMSE_Test = evaluate(models,X_test, y_test)
    temp = pd.DataFrame({'Accuracy_Train':Accuracy_Train,'Accuracy_Test':Accuracy_Test,'RMSE_Train':RMSE_Train,'RMSE_Test':RMSE_Test},index=[0])
    Pipe_1_Model_Performance = Pipe_1_Model_Performance.append(temp)

Pipe_1_Model_Performance.index = list(Model_name)
Pipe_1_Model_Performance

Model Performance
Average Error: 0.0334 degrees.
Accuracy = 89.5302%.
RMSE = 0.0731
Model Performance
Average Error: 0.0852 degrees.
Accuracy = 72.8606%.
RMSE = 0.1754
Model Performance
Average Error: 0.0822 degrees.
Accuracy = 74.6094%.
RMSE = 0.1625
Model Performance
Average Error: 0.0788 degrees.
Accuracy = 74.2371%.
RMSE = 0.1575
Model Performance
Average Error: 0.0624 degrees.
Accuracy = 81.0431%.
RMSE = 0.1262
Model Performance
Average Error: 0.0802 degrees.
Accuracy = 74.1476%.
RMSE = 0.1608
Model Performance
Average Error: 0.1160 degrees.
Accuracy = 53.4063%.
RMSE = 0.1857
Model Performance
Average Error: 0.1118 degrees.
Accuracy = 52.3579%.
RMSE = 0.1721


Unnamed: 0,Accuracy_Train,Accuracy_Test,RMSE_Train,RMSE_Test
Base_Model,89.530218,72.860552,0.073094,0.175378
RandomizedSearchCV_Model,74.609358,74.237121,0.162514,0.157519
GridSearchCV_Model,81.043087,74.147619,0.126207,0.160843
Linear_Model,53.406329,52.357923,0.185731,0.172069


In [78]:
import pickle
filename = 'RandomizedCV_Model_Pipe_1.sav'
pickle.dump(RandomizedSearchCV_Model, open(filename, 'wb'))

In [72]:
#### Pipe 2
model_data = data[data['Pipe_type']=="Pipe_2"]
X = model_data[selected_features]
y = model_data['Defect_Depth']
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                test_size = 0.2, random_state=42)
##### Base Model #####
Base_model2 = RandomForestRegressor(n_estimators = 10, random_state = 42)
Base_model2.fit(X_train, y_train)

##### RandomizedSearchCV #####
rf = RandomForestRegressor()
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 500, num = 10)]   
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(3, 30, num = 10)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

RandomizedSearchCV_Model2 = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
RandomizedSearchCV_Model2.fit(X_train, y_train)

#### GridSearchCV#####
param_grid = {
'bootstrap': [True],
'max_depth': [80, 90, 100, 110],
'max_features': [2,3,5,6,7],
'min_samples_leaf': [3, 4, 5],
'min_samples_split': [8, 10, 12],
'n_estimators': [100, 200, 300]
}

rf = RandomForestRegressor()
Grid_search_Model2 = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)
Grid_search_Model2.fit(X_train, y_train)
Best_grid_Model2 = Grid_search_Model2.best_estimator_


#### Linear Regression Model ####

Linear_model2 = sm.OLS(y_train, X_train).fit()


Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 540 candidates, totalling 1620 fits


In [66]:
########################################################################################
models_trained = [Base_model2,RandomizedSearchCV_Model2,Grid_search_Model2,Linear_model2]
Model_name = ['Base_Model','RandomizedSearchCV_Model','GridSearchCV_Model','Linear_Model']

Accuracy_Train = []
RMSE_Train = []

Accuracy_Test = []
RMSE_Test = []
Pipe_2_Model_Performance = pd.DataFrame()
for models in models_trained:
    Accuracy_Train,RMSE_Train = evaluate(models,X_train, y_train)
    Accuracy_Test,RMSE_Test = evaluate(models,X_test, y_test)
    temp = pd.DataFrame({'Accuracy_Train':Accuracy_Train,'Accuracy_Test':Accuracy_Test,'RMSE_Train':RMSE_Train,'RMSE_Test':RMSE_Test},index=[0])
    Pipe_2_Model_Performance = Pipe_2_Model_Performance.append(temp)

Pipe_2_Model_Performance.index = list(Model_name)
Pipe_2_Model_Performance

Model Performance
Average Error: 0.0340 degrees.
Accuracy = 89.7262%.
RMSE = 0.0718
Model Performance
Average Error: 0.0796 degrees.
Accuracy = 73.9809%.
RMSE = 0.1715
Model Performance
Average Error: 0.0817 degrees.
Accuracy = 74.9862%.
RMSE = 0.1595
Model Performance
Average Error: 0.0726 degrees.
Accuracy = 74.5119%.
RMSE = 0.1512
Model Performance
Average Error: 0.0531 degrees.
Accuracy = 84.0994%.
RMSE = 0.1060
Model Performance
Average Error: 0.0734 degrees.
Accuracy = 74.4389%.
RMSE = 0.1539
Model Performance
Average Error: 0.1174 degrees.
Accuracy = 51.7872%.
RMSE = 0.1837
Model Performance
Average Error: 0.1067 degrees.
Accuracy = 50.9289%.
RMSE = 0.1710


Unnamed: 0,Accuracy_Train,Accuracy_Test,RMSE_Train,RMSE_Test
Base_Model,89.726247,73.980932,0.071755,0.171474
RandomizedSearchCV_Model,74.986234,74.511862,0.159467,0.151248
GridSearchCV_Model,84.099435,74.438938,0.10602,0.153888
Linear_Model,51.787223,50.928916,0.183661,0.171043


In [76]:
filename = 'RandomizedCV_Model_Pipe_2.sav'
pickle.dump(RandomizedSearchCV_Model2, open(filename, 'wb'))

In [74]:
#### Pipe 3
model_data = data[data['Pipe_type']=="Pipe_3"]
X = model_data[selected_features]
y = model_data['Defect_Depth']
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                test_size = 0.2, random_state=42)
##### Base Model #####
Base_model3 = RandomForestRegressor(n_estimators = 10, random_state = 42)
Base_model3.fit(X_train, y_train)

##### RandomizedSearchCV #####
rf = RandomForestRegressor()
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 500, num = 10)]   
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(3, 30, num = 10)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

RandomizedSearchCV_Model3 = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
RandomizedSearchCV_Model3.fit(X_train, y_train)

#### GridSearchCV#####
param_grid = {
'bootstrap': [True],
'max_depth': [80, 90, 100, 110],
'max_features': [2,3,5,6,7],
'min_samples_leaf': [3, 4, 5],
'min_samples_split': [8, 10, 12],
'n_estimators': [100, 200, 300]
}

rf = RandomForestRegressor()
Grid_search_Model3 = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)
Grid_search_Model3.fit(X_train, y_train)
Best_grid_Model3 = Grid_search_Model2.best_estimator_


#### Linear Regression Model ####

Linear_model3 = sm.OLS(y_train, X_train).fit()



Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 540 candidates, totalling 1620 fits


In [69]:
########################################################################################


models_trained = [Base_model3,RandomizedSearchCV_Model3,Grid_search_Model3,Linear_model3]
Model_name = ['Base_Model','RandomizedSearchCV_Model','GridSearchCV_Model','Linear_Model']

Accuracy_Train = []
RMSE_Train = []

Accuracy_Test = []
RMSE_Test = []
Pipe_3_Model_Performance = pd.DataFrame()
for models in models_trained:
    Accuracy_Train,RMSE_Train = evaluate(models,X_train, y_train)
    Accuracy_Test,RMSE_Test = evaluate(models,X_test, y_test)
    temp = pd.DataFrame({'Accuracy_Train':Accuracy_Train,'Accuracy_Test':Accuracy_Test,'RMSE_Train':RMSE_Train,'RMSE_Test':RMSE_Test},index=[0])
    Pipe_3_Model_Performance = Pipe_3_Model_Performance.append(temp)

Pipe_3_Model_Performance.index = list(Model_name)
Pipe_3_Model_Performance

Model Performance
Average Error: 0.0334 degrees.
Accuracy = 89.8333%.
RMSE = 0.0752
Model Performance
Average Error: 0.0943 degrees.
Accuracy = 72.4599%.
RMSE = 0.1850
Model Performance
Average Error: 0.0790 degrees.
Accuracy = 75.2903%.
RMSE = 0.1556
Model Performance
Average Error: 0.0868 degrees.
Accuracy = 74.4943%.
RMSE = 0.1703
Model Performance
Average Error: 0.0544 degrees.
Accuracy = 83.4029%.
RMSE = 0.1099
Model Performance
Average Error: 0.0884 degrees.
Accuracy = 74.1046%.
RMSE = 0.1737
Model Performance
Average Error: 0.1129 degrees.
Accuracy = 53.6066%.
RMSE = 0.1784
Model Performance
Average Error: 0.1130 degrees.
Accuracy = 55.2264%.
RMSE = 0.1841


Unnamed: 0,Accuracy_Train,Accuracy_Test,RMSE_Train,RMSE_Test
Base_Model,89.833297,72.459869,0.075194,0.184961
RandomizedSearchCV_Model,75.290276,74.494294,0.155647,0.170328
GridSearchCV_Model,83.402911,74.104617,0.109885,0.173736
Linear_Model,53.606567,55.226352,0.178352,0.184106


In [77]:
filename = 'RandomizedCV_Model_Pipe_3.sav'
pickle.dump(RandomizedSearchCV_Model3, open(filename, 'wb'))

In [1]:
data

NameError: name 'data' is not defined