In [122]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [108]:
# Read the foam rheometer measurements and preview the first rows.
csv_path = "data/Foam Rheometer Data for ML.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Gas,Pressure,Temperature,Surfactant,Concentration,Water Type,Chelating Agent,Corrosion Inhibitor,Shear Rate,Foam Quality,Viscosity
0,CO2,1000,300,Armovis EHS,1%,PW,0,0%,100,80.61,16.320256
1,CO2,1000,300,Armovis EHS,1%,PW,0,0.0%,250,78.395238,10.194167
2,CO2,1000,300,Armovis EHS,1%,PW,0,0.0%,500,76.326667,6.772267
3,CO2,1000,300,Armovis EHS,1%,PW,0,0.0%,750,75.541304,5.325935
4,CO2,1000,300,Armovis EHS,1%,PW,0,0.0%,1000,74.802,5.0888


In [109]:
# Show the distinct levels of each categorical column, then one-hot encode
# them (dropping the first level of each to avoid a redundant indicator).
categorical_cols = ["Gas", "Surfactant", "Water Type"]
for cat_col in categorical_cols:
    print(df[cat_col].unique())

df = pd.get_dummies(
    df,
    columns=categorical_cols,
    prefix=["Gas", "Surfactant", "Water"],
    drop_first=True,
)
print(df.columns)

['CO2' 'N2']
['Armovis EHS' 'Armovis EHS + TTM' 'TTM' 'Armogel O' 'Ethoduomeen T/13'
 'Ethomeen C12' 'CAS 50']
['PW' 'DI' 'SW' 'FW']
Index(['Pressure', 'Temperature', 'Concentration', 'Chelating Agent',
       'Corrosion Inhibitor', 'Shear Rate', 'Foam Quality', 'Viscosity',
       'Gas_N2', 'Surfactant_Armovis EHS', 'Surfactant_Armovis EHS + TTM',
       'Surfactant_CAS 50', 'Surfactant_Ethoduomeen T/13',
       'Surfactant_Ethomeen C12', 'Surfactant_TTM', 'Water_FW', 'Water_PW',
       'Water_SW'],
      dtype='object')


In [110]:
def remove_pc(string):
    """Convert a percentage-like value to a float, dropping any '%' sign.

    Parameters
    ----------
    string : str, int, float or None
        Value to convert, e.g. ``'1%'``, ``'0.0%'``, ``0`` or ``2.5``.

    Returns
    -------
    float or None
        The numeric value without the percent sign. ``None`` is passed
        through unchanged so missing entries stay missing.

    Raises
    ------
    ValueError
        If a string cannot be parsed as a number once '%' is removed.
    """
    if string is None:
        return None
    if isinstance(string, str):
        string = string.replace('%', '')
    # Numeric inputs (e.g. a column pandas already parsed as int) must be
    # converted too; previously they fell through and came back as None.
    return float(string)

# Strip any '%' suffix from these columns and store them as floats.
for pct_col in ('Concentration', 'Corrosion Inhibitor', 'Chelating Agent'):
    df[pct_col] = df[pct_col].apply(remove_pc)

In [111]:
# Preview the frame after encoding and percentage conversion (head() shows 5 rows by default).
df.head()

Unnamed: 0,Pressure,Temperature,Concentration,Chelating Agent,Corrosion Inhibitor,Shear Rate,Foam Quality,Viscosity,Gas_N2,Surfactant_Armovis EHS,Surfactant_Armovis EHS + TTM,Surfactant_CAS 50,Surfactant_Ethoduomeen T/13,Surfactant_Ethomeen C12,Surfactant_TTM,Water_FW,Water_PW,Water_SW
0,1000,300,1.0,0.0,0.0,100,80.61,16.320256,0,1,0,0,0,0,0,0,1,0
1,1000,300,1.0,0.0,0.0,250,78.395238,10.194167,0,1,0,0,0,0,0,0,1,0
2,1000,300,1.0,0.0,0.0,500,76.326667,6.772267,0,1,0,0,0,0,0,0,1,0
3,1000,300,1.0,0.0,0.0,750,75.541304,5.325935,0,1,0,0,0,0,0,0,1,0
4,1000,300,1.0,0.0,0.0,1000,74.802,5.0888,0,1,0,0,0,0,0,0,1,0


In [112]:
# The target is the 'Viscosity' column; every remaining column is a feature.
target_col = 'Viscosity'
y = df[target_col].values
X = df.drop(columns=target_col).values

In [118]:
# Simple Test
# Baseline: a shallow random forest scored on a single 75/25 hold-out split.
# Fixed seeds make the split and the forest (and so the printed metrics)
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# `criterion` is left at its default (mean squared error). The literal "mse"
# was renamed to "squared_error" in newer scikit-learn releases and later
# removed, so relying on the default keeps this cell working on old and new
# versions alike.
pipeline = Pipeline([('clf', RandomForestRegressor(n_estimators=1000, max_depth=5, random_state=42))])
model = pipeline.fit(X_train, y_train)

# Training-set fit quality.
predictions_train = model.predict(X_train)
print(f'R2 Score Train {r2_score(y_train, predictions_train)}')
print(f'MSE Train {mean_squared_error(y_train, predictions_train)}')

# Hold-out performance.
predictions_test = model.predict(X_test)
print(f'R2 Score Test {r2_score(y_test, predictions_test)}')
print(f'MSE {mean_squared_error(y_test, predictions_test)}')



R2 Score Train 0.9446109239307702
MSE Train 5.030929659987036
R2 Score Test 0.9184444461017257
MSE 5.889865281091555


In [130]:
# Hold out 15% of the rows, then estimate how an untuned forest generalises
# using 10-fold cross-validation on the training portion only.
# Seeds are fixed so the split (which the later grid search is fitted on) and
# the scores below are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=True, random_state=42)
pipeline = Pipeline([('clf', RandomForestRegressor(random_state=42))])

cross_val_score(pipeline, X_train, y_train, cv=10)

array([0.9312076 , 0.94397477, 0.83148097, 0.91892148, 0.8618523 ,
       0.95194183, 0.88568313, 0.92826962, 0.85608675, 0.88005492])

In [131]:
# List the tunable parameter names of the pipeline; the forest's settings are
# exposed with a `clf__` prefix, which is the form a parameter grid must use.
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'clf', 'clf__bootstrap', 'clf__ccp_alpha', 'clf__criterion', 'clf__max_depth', 'clf__max_features', 'clf__max_leaf_nodes', 'clf__max_samples', 'clf__min_impurity_decrease', 'clf__min_impurity_split', 'clf__min_samples_leaf', 'clf__min_samples_split', 'clf__min_weight_fraction_leaf', 'clf__n_estimators', 'clf__n_jobs', 'clf__oob_score', 'clf__random_state', 'clf__verbose', 'clf__warm_start'])

In [133]:
# Exhaustive search over tree-shape settings with a fixed forest size.
# Keys use the `clf__` prefix so they are routed to the forest step.
param_grid = dict(
    clf__n_estimators=[1000],
    clf__max_depth=[5, 10, 15],
    clf__max_leaf_nodes=[5, 10, 15],
    clf__min_samples_leaf=[5, 10, 15],
)
grid = GridSearchCV(estimator=pipeline, param_grid=param_grid)
grid.fit(X_train, y_train)


GridSearchCV(estimator=Pipeline(steps=[('clf', RandomForestRegressor())]),
             param_grid={'clf__max_depth': [5, 10, 15],
                         'clf__max_leaf_nodes': [5, 10, 15],
                         'clf__min_samples_leaf': [5, 10, 15],
                         'clf__n_estimators': [1000]})

In [134]:
# Show the hyperparameter combination selected by the grid search.
grid.best_params_

{'clf__max_depth': 5,
 'clf__max_leaf_nodes': 15,
 'clf__min_samples_leaf': 5,
 'clf__n_estimators': 1000}

In [139]:
# Final evaluation: build a forest with the tuned tree-shape settings
# (using 2000 trees here), cross-validate it, then fit and score it on a
# hold-out set. Seeds are fixed so the reported numbers are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=True, random_state=42)
pipeline = Pipeline([('clf', RandomForestRegressor(n_estimators=2000, max_depth=5, max_leaf_nodes=15, min_samples_leaf=5, random_state=42))])

print(cross_val_score(pipeline, X_train, y_train, cv=10))

# cross_val_score only fits clones of the estimator, so the pipeline itself is
# still unfitted here. Fit it on this cell's training split before predicting;
# otherwise `model` would still be the estimator left over from an earlier
# cell, trained with different settings on a different split.
model = pipeline.fit(X_train, y_train)

# Training-set fit quality.
predictions_train = model.predict(X_train)
print(f'R2 Score Train {r2_score(y_train, predictions_train)}')
print(f'MSE Train {mean_squared_error(y_train, predictions_train)}')

# Hold-out performance.
predictions_test = model.predict(X_test)
print(f'R2 Score Test {r2_score(y_test, predictions_test)}')
print(f'MSE {mean_squared_error(y_test, predictions_test)}')