# Random forest regressor - one spacing data

In [1]:
from load_data import load
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

## Load data

In [2]:
(df, diff, dataX, dataY) = load()

0        True
1        True
2        True
3        True
4        True
        ...  
2208    False
2209    False
2210    False
2211    False
2212    False
Name: Pressure (bar), Length: 2213, dtype: bool
2213


## Split data - one spacing

In [4]:
#SETUP 1 -> leave one configuration out; configuration = ppf-speed-pressure
def split_data_one_spacing(ppf, speed, pressure, dataX, dataY):
    cond1 = df["wt% PPF"]==ppf
    cond2 = df["Pressure (bar)"]==pressure
    cond3 = df["Speed (mm/s)"]==speed
    cond4 = df["Spacing (mm)"]==1.2

    cond1=cond1.to_numpy()
    cond2=cond2.to_numpy()
    cond3=cond3.to_numpy()
    cond4=cond4.to_numpy()
    
    c = [c1 and cond2[i] and cond3[i] and cond4[i] for i, c1 in enumerate(cond1)]
    nc = [not (c1 and cond2[i] and cond3[i] and cond4[i]) for i, c1 in enumerate(cond1)]
    X_test = dataX[c]
    X_train = dataX[nc]

    y_test = dataY[c]
    y_train = dataY[nc]
    return (X_test, X_train, y_test, y_train)

In [10]:
dataY["labels"] = (dataY["material accuracy(%)"]>=50)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


## Train and evaluate the model

In [7]:
def calc_accuracy(pred_val, real_val, thr):
    N = len(pred_val)
    return (N + np.sum(np.sign((pred_val-thr)*(real_val-thr))))/(2*N)

In [8]:
def value_to_prob(val, thr):
    #sigmoid
    return 1/(1+np.power(np.e,(thr-val)))

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

combos_l = df[["wt% PPF", "Speed (mm/s)", "Pressure (bar)"]].groupby(["wt% PPF", "Speed (mm/s)", "Pressure (bar)"]).size().reset_index()
combos = combos_l[["wt% PPF", "Speed (mm/s)", "Pressure (bar)"]].values


accus = []
aurocs = []
combos_labels = []

for combo in combos:
    (X_test, X_train, y_test, y_train) = split_data_one_spacing(combo[0],combo[1],combo[2],dataX, dataY)
    if len(X_test)==0 or len(X_train)==0:
        print("no_data")
        continue
    regr = RandomForestRegressor(max_depth=6, random_state=0,n_estimators=100)
    regr.fit(X_train.values, y_train.values[:,1])
    y_pred_material =  regr.predict(X_test.values)
    acc = calc_accuracy(y_pred_material, y_test.values[:, 1], 50)
    y_pred_prob = value_to_prob(y_pred_material, 50)
    y_test = y_test["labels"].astype(int).to_numpy()
    if not 1 in y_test:
        y_test[0]=1
    if not 0 in y_test:
        y_test[0]=0
    aur = roc_auc_score(y_test, y_pred_prob)
    accus.append(acc)
    aurocs.append(aur)
    combos_labels.append(str(int(combo[0]))+";"+str(combo[2])+";"+str(combo[1]))
print(np.average(np.array(aurocs), axis = 0))

0.678883965243376


In [13]:
pd.DataFrame({"configuration": combos_labels, "accuracy": accus, "AUROC": aurocs})

Unnamed: 0,configuration,accuracy,AUROC
0,85;2.0;5.0,0.975,0.961538
1,85;2.5;5.0,1.0,0.858974
2,85;3.0;5.0,1.0,0.858974
3,85;2.0;7.5,1.0,0.961538
4,85;2.5;7.5,1.0,0.552632
5,85;3.0;7.5,0.975,0.551282
6,85;2.0;10.0,0.851852,0.826087
7,85;2.5;10.0,1.0,0.858974
8,85;3.0;10.0,1.0,0.957143
9,85;2.0;15.0,0.882353,0.738095
