In [1]:
import pandas as pd
import numpy
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, roc_auc_score, classification_report, accuracy_score, roc_curve, confusion_matrix, average_precision_score, precision_recall_curve, r2_score
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
from sklearn.model_selection import GridSearchCV
warnings.filterwarnings("ignore")
from optimize_classification import create_dataset
from find_threshold import get_threshold_list

In [3]:
# Build the working DataFrame with the project helper. Later cells take the
# feature matrix from its leading columns (df.iloc[:, 0:29]) and the labels from
# named columns such as df['Cube'] and df['Amorphous'].
df = create_dataset()

In [7]:
# Decision thresholds used by the optimize_* functions, where the result is
# indexed by particle name as threshholds[particle].
# NOTE(review): X is only assigned further down in this notebook (the
# MinMaxScaler cell). Unless X is defined elsewhere, a fresh top-to-bottom run
# fails here with NameError; that cell has to be executed before this one.
threshholds = get_threshold_list(df, X)

In [6]:
def optimize_classification(particle, threshholds, rs=42):
    """Score a default random forest and a grid-searched one for one label column.

    The notebook-level ``X`` (features) and ``df[particle]`` (labels) are split
    into train and test parts with ``test_size=0.33`` and ``random_state=rs``.
    Both models are fitted on the training part and evaluated on the test part.
    Predictions are obtained by comparing column 1 of ``predict_proba`` with
    ``threshholds[particle]``.

    Parameters
    ----------
    particle : str
        Name of the label column in ``df``; also the key into ``threshholds``.
    threshholds : mapping
        Decision threshold per particle name.
    rs : int, default 42
        Seed used for the split and for both random forests.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1, best_params, accuracy_optimized,
        macro_f1_optimized)``: the first two values belong to the default
        forest, the last two to the best estimator found by the grid search.

    Notes
    -----
    The grid search ranks candidates with ``scoring='f1'`` while the reported
    F1 values use ``average="macro"``.
    """
    labels = df[particle].to_list()
    X_train, X_test, y_train, y_test = train_test_split(
        X, labels, test_size=0.33, random_state=rs, shuffle=True
    )

    def _score_on_test(fitted_model):
        """Return (accuracy, macro F1) of a fitted model on the held-out part."""
        class_one_proba = fitted_model.predict_proba(X_test)[:, 1]
        predicted = (class_one_proba >= threshholds[particle]).astype(int)
        return (
            accuracy_score(y_test, predicted),
            f1_score(y_test, predicted, average="macro"),
        )

    # Reference point: a forest with default hyperparameters.
    baseline_forest = RandomForestClassifier(random_state=rs)
    baseline_forest.fit(X_train, y_train)
    rfr_acc, rfr_f1 = _score_on_test(baseline_forest)

    # Exhaustive search over 6 * 3 * 3 * 3 * 3 = 486 hyperparameter combinations.
    search_space = {
        'n_estimators': [25, 50, 100, 150, 200, 500],
        'max_features': ['sqrt', 'log2', None],
        'max_depth': [3, 6, 9],
        'max_leaf_nodes': [3, 6, 9],
        'min_samples_leaf': [1, 2, 4],
    }
    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=rs),
        param_grid=search_space,
        cv=5,
        n_jobs=-1,
        verbose=2,
        scoring='f1',
    )
    grid_search.fit(X_train, y_train)
    rfr_acc_optimized, rfr_f1_optimized = _score_on_test(grid_search.best_estimator_)

    return (
        rfr_acc,
        rfr_f1,
        grid_search.best_params_,
        rfr_acc_optimized,
        rfr_f1_optimized,
    )

In [38]:
# NOTE(review): this notebook-level rf_def is not read by any later cell shown
# here; both optimize_* functions create their own local rf_def. It looks like a
# leftover scratch cell.
rf_def = xgb.XGBClassifier(random_state=42)

In [36]:
def optimize_classification_xgb(particle, threshholds, rs=42):
    """Score a default XGBoost classifier and a grid-searched one for one label column.

    The notebook-level ``X`` (features) and ``df[particle]`` (labels) are split
    into train and test parts with ``test_size=0.33`` and ``random_state=rs``.
    Both models are fitted on the training part and evaluated on the test part.
    Predictions are obtained by comparing column 1 of ``predict_proba`` with
    ``threshholds[particle]``.

    Parameters
    ----------
    particle : str
        Name of the label column in ``df``; also the key into ``threshholds``.
    threshholds : mapping
        Decision threshold per particle name.
    rs : int, default 42
        Seed used for the split and for both classifiers.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1, best_params, accuracy_optimized,
        macro_f1_optimized)``: the first two values belong to the default
        classifier, the last two to the best estimator found by the grid search.

    Notes
    -----
    The grid search ranks candidates with ``scoring="f1"`` while the reported
    F1 values use ``average="macro"``.
    """
    labels = df[particle].to_list()
    X_train, X_test, y_train, y_test = train_test_split(
        X, labels, test_size=0.33, random_state=rs, shuffle=True
    )

    def _score_on_test(fitted_model):
        """Return (accuracy, macro F1) of a fitted model on the held-out part."""
        class_one_proba = fitted_model.predict_proba(X_test)[:, 1]
        predicted = (class_one_proba >= threshholds[particle]).astype(int)
        return (
            accuracy_score(y_test, predicted),
            f1_score(y_test, predicted, average="macro"),
        )

    # Reference point: XGBoost with default hyperparameters.
    baseline_booster = xgb.XGBClassifier(random_state=rs)
    baseline_booster.fit(X_train, y_train)
    rfr_acc, rfr_f1 = _score_on_test(baseline_booster)

    # Exhaustive search over 5 * 3 * 3 * 6 * 4 = 1080 hyperparameter combinations.
    search_space = {
        "gamma": [0.5, 1, 1.5, 2, 5],
        "colsample_bytree": [0.6, 0.8, 1.0],
        "max_depth": [3, 4, 5],
        'n_estimators': [25, 50, 100, 150, 200, 500],
        'learning_rate': [0.2, 0.1, 0.01, 0.05],
    }
    grid_search = GridSearchCV(
        estimator=xgb.XGBClassifier(random_state=rs),
        param_grid=search_space,
        cv=5,
        n_jobs=-1,
        verbose=2,
        scoring="f1",
    )
    grid_search.fit(X_train, y_train)
    rfr_acc_optimized, rfr_f1_optimized = _score_on_test(grid_search.best_estimator_)

    return (
        rfr_acc,
        rfr_f1,
        grid_search.best_params_,
        rfr_acc_optimized,
        rfr_f1_optimized,
    )

In [4]:
import pickle

In [5]:
from optimize_classification import calculate_metrics

In [6]:
# Feature matrix: the first 29 columns of df. An explicit copy is taken so the
# column assignment below works on an independent frame; without it X is a slice
# of df and the assignment is a chained assignment whose effect on df depends on
# the pandas version.
X = df.iloc[:, 0:29].copy()
scaler = MinMaxScaler()
# Only the first 10 feature columns are rescaled; the remaining columns are
# left unchanged.
num_cols = X.columns[0:10]
# NOTE(review): the scaler is fitted on every row, before any train/test split,
# so held-out rows contribute to the min/max used for scaling.
X[num_cols] = scaler.fit_transform(X[num_cols])

In [15]:
# Show the first 22 columns of df for a visual check of the input data.
df.iloc[:, 0:22]

Unnamed: 0,"HCO3 ion, mM","Polymer Mwt, kDa","Polymer, % wt.","Surfactant, % wt.","Solvent, % vol.","Temperature, C",Synthesis time,Myristyltrimethylammonium bromide,No_surfactant,Sodium dodecylsulfate,...,Isopropyl alcohol,No_solvent,Propylene glycol,tert-Butanol,No_polymer,PAA,PEG,PEI,PSS,PVP
0,0,40.0,0.335,0.425,0.0,58,5340,1,0,0,...,0,1,0,0,0,0,0,0,0,1
1,240,3.0,0.375,0.000,0.0,68,500,0,1,0,...,0,1,0,0,0,0,1,0,0,0
2,0,2.1,0.395,0.000,0.0,76,2090,0,1,0,...,0,1,0,0,0,1,0,0,0,0
3,31,1000.0,0.320,0.000,0.0,58,1729,0,1,0,...,0,1,0,0,0,0,0,0,1,0
4,280,25.0,0.875,0.000,0.0,68,520,0,1,0,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,240,0.0,0.000,0.000,0.0,35,680,0,1,0,...,0,1,0,0,1,0,0,0,0,0
205,31,0.0,0.000,0.000,0.0,35,669,0,1,0,...,0,1,0,0,1,0,0,0,0,0
206,62,0.0,0.000,0.000,0.0,68,2874,0,1,0,...,0,1,0,0,1,0,0,0,0,0
207,0,0.0,0.000,0.000,0.0,68,2865,0,1,0,...,0,1,0,0,1,0,0,0,0,0


In [8]:
# Rebuild the hold-out split for the 'Cube' label (33% test, random_state=0) so
# that the pickled model loaded in the next cell can be evaluated on X_test.
# This overwrites the notebook-level y, X_train, X_test, y_train and y_test.
y = df['Cube'].to_list()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0, shuffle=True
)

In [9]:
# Load a previously saved model from disk.
# SECURITY: pickle.load can execute arbitrary code contained in the file; only
# load pickles that were produced by a trusted run of this project.
# NOTE(review): the "_0" suffix presumably refers to random_state=0 used for the
# split above - confirm against the code that wrote the file.
with open('rf_results_filtered/best_rf_filtered_Cube_0.pickle', 'rb') as f:
    model = pickle.load(f)

In [10]:
# Evaluate the loaded model on the 'Cube' hold-out split.
# NOTE(review): the recorded result is (1.0, 1.0); the meaning of the two values
# is defined in optimize_classification.calculate_metrics (presumably accuracy
# and F1 - confirm). A perfect score is worth double-checking for overlap
# between the rows the pickled model was trained on and X_test.
calculate_metrics(model, X_test, y_test)

(1.0, 1.0)

In [34]:
# Run the XGBoost baseline-vs-grid-search comparison for 'Cube'. This is the
# expensive cell: the recorded log shows 1080 candidates x 5 folds = 5400 fits.
optimize_classification_xgb('Cube', threshholds, rs=42)

Fitting 5 folds for each of 1080 candidates, totalling 5400 fits


(0.8695652173913043,
 0.8438521498617048,
 {'colsample_bytree': 0.6,
  'gamma': 5,
  'learning_rate': 0.1,
  'max_depth': 5,
  'n_estimators': 25},
 0.855072463768116,
 0.8369565217391304)

In [6]:
def calculate_different_rs_results(
    particle, particle_thresholds=None, seeds=(0, 10, 20, 30, 40)
):
    """Repeat ``optimize_classification`` over several seeds and collect the scores.

    Parameters
    ----------
    particle : str
        Label column name, forwarded to ``optimize_classification``.
    particle_thresholds : mapping, optional
        Decision thresholds forwarded as the ``threshholds`` argument of
        ``optimize_classification``. When omitted, the notebook-level
        ``threshholds`` mapping is used.
    seeds : iterable of int, default (0, 10, 20, 30, 40)
        Values passed one by one as ``rs``.

    Returns
    -------
    tuple of five lists
        ``(rfr_accuracies, rfr_f1s, rfr_accuracies_opt, rfr_f1s_opt,
        all_best_params)``, each with one entry per seed, in seed order.
    """
    if particle_thresholds is None:
        # optimize_classification requires the thresholds as its second
        # positional argument; default to the mapping built earlier in the notebook.
        particle_thresholds = threshholds

    rfr_accuracies = []
    rfr_f1s = []
    rfr_accuracies_opt = []
    rfr_f1s_opt = []
    all_best_params = []
    for rs in seeds:
        (
            rfr_acc,
            rfr_f1,
            best_params,
            rfr_acc_optimized,
            rfr_f1_optimized,
        ) = optimize_classification(particle, particle_thresholds, rs=rs)
        rfr_accuracies.append(rfr_acc)
        rfr_f1s.append(rfr_f1)
        rfr_accuracies_opt.append(rfr_acc_optimized)
        rfr_f1s_opt.append(rfr_f1_optimized)
        all_best_params.append(best_params)
    return rfr_accuracies, rfr_f1s, rfr_accuracies_opt, rfr_f1s_opt, all_best_params


In [7]:
# Collect baseline and tuned scores for 'Amorphous' across the seeds.
# NOTE(review): the recorded output below contains "Random forest ... before/after
# optimization" lines, but the matching print calls are commented out in the
# optimize_classification shown in this notebook, so that output was produced by
# an earlier version of the function and is stale.
rfr_accuracies, rfr_f1s, rfr_accuracies_opt, rfr_f1s_opt, all_best_params = calculate_different_rs_results('Amorphous')

Random forest accuracy before optimization 0.77
Random forest F1 before optimization 0.62
Fitting 5 folds for each of 486 candidates, totalling 2430 fits
Random forest accuracy after optimization 0.78
Random forest F1 after optimization 0.58
Random forest accuracy before optimization 0.75
Random forest F1 before optimization 0.61
Fitting 5 folds for each of 486 candidates, totalling 2430 fits
Random forest accuracy after optimization 0.77
Random forest F1 after optimization 0.60
Random forest accuracy before optimization 0.74
Random forest F1 before optimization 0.55
Fitting 5 folds for each of 486 candidates, totalling 2430 fits
Random forest accuracy after optimization 0.74
Random forest F1 after optimization 0.55
Random forest accuracy before optimization 0.77
Random forest F1 before optimization 0.62
Fitting 5 folds for each of 486 candidates, totalling 2430 fits
Random forest accuracy after optimization 0.74
Random forest F1 after optimization 0.57
Random forest accuracy before op

In [8]:
# Test accuracy of the grid-searched model, one value per seed.
rfr_accuracies_opt

[0.782608695652174,
 0.7681159420289855,
 0.7391304347826086,
 0.7391304347826086,
 0.855072463768116]

In [9]:
# Macro-averaged test F1 of the grid-searched model, one value per seed.
rfr_f1s_opt

[0.5787545787545787,
 0.5964912280701754,
 0.5460526315789473,
 0.573489010989011,
 0.7296238244514106]

In [10]:
# Mean macro F1 of the tuned model across the seeds.
np.mean(rfr_f1s_opt)

0.6048822547688246

In [11]:
# Mean accuracy of the tuned model across the seeds.
np.mean(rfr_accuracies_opt)

0.7768115942028986

In [12]:
# Spread of the tuned macro F1 across the seeds. np.std defaults to ddof=0
# (population standard deviation); with only a handful of runs the sample
# estimate (ddof=1) would be noticeably larger.
np.std(rfr_f1s_opt)

0.06443599042386182

In [19]:
# Hyperparameters selected by the grid search, one dict per seed in seed order.
all_best_params

[{'max_depth': 3,
  'max_features': None,
  'max_leaf_nodes': 6,
  'min_samples_leaf': 4,
  'n_estimators': 100},
 {'max_depth': 6,
  'max_features': None,
  'max_leaf_nodes': 9,
  'min_samples_leaf': 1,
  'n_estimators': 150},
 {'max_depth': 6,
  'max_features': None,
  'max_leaf_nodes': 9,
  'min_samples_leaf': 1,
  'n_estimators': 500},
 {'max_depth': 3,
  'max_features': None,
  'max_leaf_nodes': 6,
  'min_samples_leaf': 1,
  'n_estimators': 50},
 {'max_depth': 3,
  'max_features': 'sqrt',
  'max_leaf_nodes': 6,
  'min_samples_leaf': 1,
  'n_estimators': 25}]

In [10]:
# Refit a random forest for 'Amorphous' with the hyperparameters the grid search
# selected for random_state=0, and score it on the matching hold-out split.
particle = 'Amorphous'
rs = 0

# Same values as the first entry of the all_best_params output recorded above.
tuned_params = {
    'max_depth': 3,
    'max_features': None,
    'max_leaf_nodes': 6,
    'min_samples_leaf': 4,
    'n_estimators': 100,
}
# Cut-off applied to column 1 of predict_proba.
# NOTE(review): hard-coded here; presumably the 'Amorphous' entry of the
# threshold list - confirm against threshholds[particle].
decision_threshold = 0.253

y = df[particle].to_list()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=rs, shuffle=True
)
rfr = RandomForestClassifier(**tuned_params, random_state=rs)
rfr.fit(X_train, y_train)
y_pred = (rfr.predict_proba(X_test)[:, 1] >= decision_threshold).astype(int)

# Kept as formatted strings, matching how they are printed below.
rfr_acc = "%.2f" % accuracy_score(y_test, y_pred)
rfr_f1 = "%.2f" % f1_score(y_test, y_pred, average="macro")
# The model above uses the tuned hyperparameters, so the scores are labelled
# "after optimization" (the previous "before optimization" label was wrong).
print('Random forest accuracy after optimization', rfr_acc)
print('Random forest F1 after optimization', rfr_f1)

Random forest accuracy before optimization 0.81
Random forest F1 before optimization 0.60


In [None]:
0.86 0.65