# Random Forest classification

In [1]:
from load_data import load
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

In [8]:
(df, diff, dataX, dataY) = load()

0        True
1        True
2        True
3        True
4        True
        ...  
2208    False
2209    False
2210    False
2211    False
2212    False
Name: Pressure (bar), Length: 2213, dtype: bool
2213


## Leave-one-configuration out split

In [9]:
#SETUP 1 -> leave one configuration out; configuration = ppf-speed-pressure
def split_data(ppf, speed, pressure, dataX, dataY):
    cond1 = df["wt% PPF"]==ppf
    cond2 = df["Pressure (bar)"]==pressure
    cond3 = df["Speed (mm/s)"]==speed

    cond1=cond1.to_numpy()
    cond2=cond2.to_numpy()
    cond3=cond3.to_numpy()
    
    c = [c1 and cond2[i] and cond3[i] for i, c1 in enumerate(cond1)]
    nc = [not (c1 and cond2[i] and cond3[i]) for i, c1 in enumerate(cond1)]
    X_test = dataX[c]
    X_train = dataX[nc]

    y_test = dataY[c]
    y_train = dataY[nc]
    return (X_test, X_train, y_test, y_train)

In [10]:
dataY["labels"] = (dataY["machine precision(%)"]>=6)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [13]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

# leave one out, create all combinations
combos_spp = df[["Speed (mm/s)", "Pressure (bar)", "wt% PPF"]].groupby(["Speed (mm/s)", "Pressure (bar)", "wt% PPF"]).size().reset_index()
combos = combos_spp[["Speed (mm/s)", "Pressure (bar)", "wt% PPF"]].values
accus = []
aurocs = []
combos_labels = []
for combo in combos:
    (X_test, X_train, y_test, y_train) = split_data(combo[2],combo[0],combo[1],dataX, dataY)

    if len(X_test)==0 or len(X_train)==0:
        print(combo)
        print("no_data")
        continue
    y_test = y_test["labels"].astype(int).to_numpy()
    if not 1 in y_test:
        y_test[0]=1
    if not 0 in y_test:
        y_test[0]=0
        
    model = RandomForestClassifier(max_depth=6, random_state=0,n_estimators=100)
    model.fit(X_train, y_train["labels"].astype(int))
    y_pred_machine =  model.predict_proba(X_test.values)
    y_pred = model.predict(X_test.values)
    cond = "Prediction on: speed="+str(combo[0])+" pressure="+str(combo[1])+" wt% PPF="+str(combo[2])
    aur = roc_auc_score(y_test, y_pred_machine[:, 1])
    acc = accuracy_score(y_test, y_pred)
    accus.append(acc)
    aurocs.append(aur)
    combos_labels.append(str(int(combo[2]))+";"+str(combo[1])+";"+str(combo[0]))
print(np.average(np.array(aurocs), axis = 0))  
print(np.average(np.array(accus), axis = 0)) 
    #print(cond)
    #print("auroc = "+str(acc))

0.6332301197258194
0.6276641963245884


In [14]:
pd.DataFrame({"configuration": combos_labels, "accuracy": accus, "AUROC": aurocs})

Unnamed: 0,configuration,accuracy,AUROC
0,85;2.0;5.0,0.621849,0.532833
1,85;2.5;5.0,0.55,0.645717
2,90;2.5;5.0,0.634409,0.637662
3,85;3.0;5.0,0.556818,0.53416
4,90;3.0;5.0,0.676768,0.719927
5,90;4.0;5.0,0.48,0.569386
6,85;2.0;7.5,0.608333,0.598339
7,85;2.5;7.5,0.420168,0.476573
8,85;3.0;7.5,0.591667,0.654965
9,90;3.0;7.5,0.703125,0.768223


# Random Forest regression

In [15]:
def calc_accuracy(pred_val, real_val, thr):
    N = len(pred_val)
    return (N + np.sum(np.sign((pred_val-thr)*(real_val-thr))))/(2*N)

In [16]:
def value_to_prob(val, thr):
    #sigmoid
    return 1/(1+np.power(np.e,(thr-val)))

In [18]:
from sklearn.ensemble import RandomForestRegressor

combos_l = df[["wt% PPF", "Speed (mm/s)", "Pressure (bar)"]].groupby(["wt% PPF", "Speed (mm/s)", "Pressure (bar)"]).size().reset_index()
combos = combos_l[["wt% PPF", "Speed (mm/s)", "Pressure (bar)"]].values


accus = []
aurocs = []
combos_labels = []

for combo in combos:
    (X_test, X_train, y_test, y_train) = split_data(combo[0],combo[1],combo[2],dataX, dataY)
    if len(X_test)==0 or len(X_train)==0:
        print("no_data")
        continue
    regr = RandomForestRegressor(max_depth=6, random_state=0,n_estimators=100)
    regr.fit(X_train.values, y_train.values[:,0])
    y_pred_mach =  regr.predict(X_test.values)
    acc = calc_accuracy(y_pred_mach, y_test.values[:, 0], 6)
    y_pred_prob = value_to_prob(y_pred_mach, 6)
    y_test = y_test["labels"].astype(int).to_numpy()
    if not 1 in y_test:
        y_test[0]=1
    if not 0 in y_test:
        y_test[0]=0
    aur = roc_auc_score(y_test, y_pred_prob)
    accus.append(acc)
    aurocs.append(aur)
    combos_labels.append(str(int(combo[0]))+";"+str(combo[2])+";"+str(combo[1]))
print(np.average(np.array(aurocs), axis = 0))
print(np.average(np.array(accus), axis = 0))

0.6104470436800478
0.6320794675763033


In [19]:
pd.DataFrame({"configuration": combos_labels, "accuracy": accus, "AUROC": aurocs})

Unnamed: 0,configuration,accuracy,AUROC
0,85;2.0;5.0,0.563025,0.551907
1,85;2.5;5.0,0.633333,0.62792
2,85;3.0;5.0,0.693182,0.675207
3,85;2.0;7.5,0.591667,0.443341
4,85;2.5;7.5,0.453782,0.474565
5,85;3.0;7.5,0.608333,0.67958
6,85;2.0;10.0,0.818182,0.601111
7,85;2.5;10.0,0.584746,0.642713
8,85;3.0;10.0,0.655172,0.684885
9,85;2.0;15.0,0.675439,0.679199


# Linear model

In [20]:
from sklearn.linear_model import LinearRegression

# leave one out, create all combinations
combos_spp = df[["Speed (mm/s)", "Pressure (bar)", "wt% PPF"]].groupby(["Speed (mm/s)", "Pressure (bar)", "wt% PPF"]).size().reset_index()
combos = combos_spp[["Speed (mm/s)", "Pressure (bar)", "wt% PPF"]].values

#get mean per combo
X = []
y = []
for comb in combos:
    if comb[2]==85:
        cond_p = (dataX["Pressure (bar)"]==comb[1]).to_numpy()
        cond_s = (dataX["Speed (mm/s)"]==comb[0]).to_numpy()
        c = [ e and cond_s[i] for i, e in enumerate(cond_p)]
        X.append([comb[0], comb[1]])
        y.append(np.mean(dataY[c])[0])
        
y_scale=[0.01*i for i in y]

reg = LinearRegression().fit(X, y_scale)

In [21]:
#learnt weights of the linear model
(reg.intercept_, reg.coef_)

(0.024794032116404456, array([-0.00120225,  0.01822487]))

In [22]:
combos_labels = []
accus = []
thr = 50
for i, comb in enumerate(X):
    X_train = X[:i]+X[i+1:]
    y_train = y[:i]+y[i+1:]
    X_test = [X[i]]
    y_test = y[i]
    combos_labels.append("85;"+str(comb[1])+";"+str(comb[0]))
    reg = LinearRegression().fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    acc = 0
    if np.sign(y_pred-thr)*np.sign(y_test-thr) > 0: acc=1
    accus.append(acc) 

In [23]:
pd.DataFrame({"configuration": combos_labels, "accuracy": accus})

Unnamed: 0,configuration,accuracy
0,85;2.0;5.0,1
1,85;2.5;5.0,1
2,85;3.0;5.0,1
3,85;2.0;7.5,1
4,85;2.5;7.5,1
5,85;3.0;7.5,1
6,85;2.0;10.0,1
7,85;2.5;10.0,1
8,85;3.0;10.0,1
9,85;2.0;15.0,1
