In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [3]:
df = pd.read_csv("data/Foam Rheometer Data for ML.csv")
df.head()

Unnamed: 0,Gas,Pressure,Temperature,Surfactant,Concentration,Water Type,Chelating Agent,Corrosion Inhibitor,Shear Rate,Foam Quality,Viscosity
0,CO2,1000,300,Armovis EHS,1%,PW,0,0%,100,80.61,16.320256
1,CO2,1000,300,Armovis EHS,1%,PW,0,0.0%,250,78.395238,10.194167
2,CO2,1000,300,Armovis EHS,1%,PW,0,0.0%,500,76.326667,6.772267
3,CO2,1000,300,Armovis EHS,1%,PW,0,0.0%,750,75.541304,5.325935
4,CO2,1000,300,Armovis EHS,1%,PW,0,0.0%,1000,74.802,5.0888


In [4]:
print(df["Gas"].unique())
print(df["Surfactant"].unique())
print(df["Water Type"].unique())
df = pd.get_dummies(df, columns=["Gas", "Surfactant", "Water Type"], prefix=["Gas", "Surfactant", "Water"], drop_first = True)
print(df.columns)

['CO2' 'N2']
['Armovis EHS' 'Armovis EHS + TTM' 'TTM' 'Armogel O' 'Ethoduomeen T/13'
 'Ethomeen C12' 'CAS 50']
['PW' 'DI' 'SW' 'FW']
Index(['Pressure', 'Temperature', 'Concentration', 'Chelating Agent',
       'Corrosion Inhibitor', 'Shear Rate', 'Foam Quality', 'Viscosity',
       'Gas_N2', 'Surfactant_Armovis EHS', 'Surfactant_Armovis EHS + TTM',
       'Surfactant_CAS 50', 'Surfactant_Ethoduomeen T/13',
       'Surfactant_Ethomeen C12', 'Surfactant_TTM', 'Water_FW', 'Water_PW',
       'Water_SW'],
      dtype='object')


In [5]:
def remove_pc(string):
    if isinstance(string, str):
        string = string.replace('%', '')
        return float(string)

df['Concentration']=df['Concentration'].apply(remove_pc)
df['Corrosion Inhibitor']=df['Corrosion Inhibitor'].apply(remove_pc)
df['Chelating Agent']=df['Chelating Agent'].apply(remove_pc)

In [6]:
df.head(5)

Unnamed: 0,Pressure,Temperature,Concentration,Chelating Agent,Corrosion Inhibitor,Shear Rate,Foam Quality,Viscosity,Gas_N2,Surfactant_Armovis EHS,Surfactant_Armovis EHS + TTM,Surfactant_CAS 50,Surfactant_Ethoduomeen T/13,Surfactant_Ethomeen C12,Surfactant_TTM,Water_FW,Water_PW,Water_SW
0,1000,300,1.0,0.0,0.0,100,80.61,16.320256,0,1,0,0,0,0,0,0,1,0
1,1000,300,1.0,0.0,0.0,250,78.395238,10.194167,0,1,0,0,0,0,0,0,1,0
2,1000,300,1.0,0.0,0.0,500,76.326667,6.772267,0,1,0,0,0,0,0,0,1,0
3,1000,300,1.0,0.0,0.0,750,75.541304,5.325935,0,1,0,0,0,0,0,0,1,0
4,1000,300,1.0,0.0,0.0,1000,74.802,5.0888,0,1,0,0,0,0,0,0,1,0


In [7]:
y = df['Viscosity'].values
X = df.loc[:, df.columns!='Viscosity'].values

In [8]:
# Simple Test
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.25)
pipeline = Pipeline([('clf', RandomForestRegressor(n_estimators = 1000, criterion="mse", max_depth=5))])
model = pipeline.fit(X_train, y_train)

predictions_train = model.predict(X_train)
print(f'R2 Score Train {r2_score(y_train, predictions_train)}')
print(f'MSE Train {mean_squared_error(y_train, predictions_train)}')

predictions_test = model.predict(X_test)
print(f'R2 Score Test {r2_score(y_test, predictions_test)}')
print(f'MSE {mean_squared_error(y_test, predictions_test)}')



R2 Score Train 0.9483573483000322
MSE Train 4.818136619503128
R2 Score Test 0.870207402341304
MSE 8.71359541807984


In [9]:
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.15, shuffle=True)
pipeline = Pipeline([('clf', RandomForestRegressor())])

cross_val_score(pipeline, X_train, y_train, cv=10)

array([0.94884587, 0.87348606, 0.88271245, 0.92586588, 0.87080243,
       0.91183355, 0.91069796, 0.93862824, 0.94861177, 0.84976017])

In [10]:
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'clf', 'clf__bootstrap', 'clf__ccp_alpha', 'clf__criterion', 'clf__max_depth', 'clf__max_features', 'clf__max_leaf_nodes', 'clf__max_samples', 'clf__min_impurity_decrease', 'clf__min_impurity_split', 'clf__min_samples_leaf', 'clf__min_samples_split', 'clf__min_weight_fraction_leaf', 'clf__n_estimators', 'clf__n_jobs', 'clf__oob_score', 'clf__random_state', 'clf__verbose', 'clf__warm_start'])

In [16]:
param_grid = {'clf__n_estimators':[1000], 
              'clf__max_depth': [None, 5, 10, 15], 
              'clf__max_leaf_nodes': [None, 5, 10, 15], 
              'clf__min_samples_leaf': [None, 5, 10, 15]}
grid = GridSearchCV(pipeline, param_grid)
grid.fit(X_train, y_train)


Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py", line 777, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\ProgramData\Anaconda3\lib\site-packages\joblib\_parall

KeyboardInterrupt: 

In [12]:
grid.best_params_

{'clf__max_depth': 15,
 'clf__max_leaf_nodes': 15,
 'clf__min_samples_leaf': 5,
 'clf__n_estimators': 1000}

In [20]:
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.15, shuffle=True)
pipeline = Pipeline([('clf', RandomForestRegressor(n_estimators=100))])
model = pipeline.fit(X_train, y_train)

print(cross_val_score(pipeline, X_train, y_train, scoring='r2', cv=10))

predictions_train = model.predict(X_train)
print(f'R2 Score Train {r2_score(y_train, predictions_train)}')
print(f'MSE Train {mean_squared_error(y_train, predictions_train)}')

predictions_test = model.predict(X_test)
print(f'R2 Score Test {r2_score(y_test, predictions_test)}')
print(f'MSE Test {mean_squared_error(y_test, predictions_test)}')

[0.94148557 0.83197947 0.70974757 0.82337822 0.86097669 0.92924616
 0.9481784  0.95420708 0.90051399 0.93282156]
R2 Score Train 0.9883011272952514
MSE Train 0.9013283547567117
R2 Score Test 0.9217657360125376
MSE Test 10.687936426939048


In [25]:
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.15, shuffle=True)
pipeline = Pipeline([('clf', RandomForestRegressor(n_estimators=500, max_features=5))])
model = pipeline.fit(X_train, y_train)

print(cross_val_score(pipeline, X_train, y_train, scoring='r2', cv=10))

predictions_train = model.predict(X_train)
print(f'R2 Score Train {r2_score(y_train, predictions_train)}')
print(f'MSE Train {mean_squared_error(y_train, predictions_train)}')

predictions_test = model.predict(X_test)
print(f'R2 Score Test {r2_score(y_test, predictions_test)}')
print(f'MSE Test {mean_squared_error(y_test, predictions_test)}')

[0.83661869 0.92011911 0.84916359 0.96122914 0.73040259 0.94803629
 0.86022802 0.96413303 0.90806664 0.87758106]
R2 Score Train 0.9852094172008631
MSE Train 1.375266897476336
R2 Score Test 0.9624361961507425
MSE Test 1.9180793756646328


In [31]:
from sklearn.ensemble import GradientBoostingRegressor
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.15, shuffle=True)
pipeline = Pipeline([('clf', GradientBoostingRegressor())])
model = pipeline.fit(X_train, y_train)

print(cross_val_score(pipeline, X_train, y_train, scoring='r2', cv=10))

predictions_train = model.predict(X_train)
print(f'R2 Score Train {r2_score(y_train, predictions_train)}')
print(f'MSE Train {mean_squared_error(y_train, predictions_train)}')

predictions_test = model.predict(X_test)
print(f'R2 Score Test {r2_score(y_test, predictions_test)}')
print(f'MSE Test {mean_squared_error(y_test, predictions_test)}')

[0.94559978 0.9422593  0.86032648 0.83263832 0.96110617 0.93600759
 0.83552393 0.79755538 0.87460395 0.92123028]
R2 Score Train 0.9823667331261857
MSE Train 1.3892520928103314
R2 Score Test 0.8904428193058854
MSE Test 14.208214250766968
