In [1]:
import pandas as pd
import os
import numpy as np

import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, RBF



from modAL.models import ActiveLearner, CommitteeRegressor
from modAL.disagreement import max_std_sampling


%matplotlib inline

In [2]:
# cleaned version of the data after removing outliers and negative values
path='/workspace/malathi/steam_turbine_stage_1_POC/notebooks/version_1/df_sorted.csv'
data=pd.read_csv(path)
data.shape

(7077, 6)

In [3]:
INPUT_COLUMNS=[ 'expansion ratio','corrected speed','ImpulseStage_CurrentStroke']
OUTPUT_COLUMNS=['ImpulseStage_Power', 'Impulse_Discharge_Temperature', 'corrected mass flow']

In [4]:
data.sample(5)

Unnamed: 0,expansion ratio,corrected speed,ImpulseStage_CurrentStroke,ImpulseStage_Power,Impulse_Discharge_Temperature,corrected mass flow
5867,1.434247,137.233751,42.07,1741.749512,381.360291,17.851823
3601,2.382937,141.738218,27.76,1562.036987,329.530823,10.549638
208,7.421228,148.789477,10.59,331.522613,302.009766,2.587987
2513,3.196556,136.013304,22.03,1122.359619,384.82843,7.562243
1563,3.736741,122.232705,19.17,851.699036,348.89621,6.403618


In [5]:
# predictions for 'Impulsestage_Power'
X=data.drop(OUTPUT_COLUMNS,axis=1)
y=data['ImpulseStage_Power']
print(X.shape,y.shape)
XX=X.values.reshape(-1,3)
yy=y.values.reshape(-1,1)
print(XX.shape,yy.shape)

(7077, 3) (7077,)
(7077, 3) (7077, 1)


In [6]:
from sklearn.model_selection import train_test_split

n_initial=100
X_train, X_test, y_train, y_test = train_test_split(XX, yy)
print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)
initial_idx = np.random.choice(range(len(X_train)), size=n_initial, replace=False)

#X_initial, y_initial = X_train[initial_idx], y_train[initial_idx]
X_pool, y_pool = np.delete(X_train, initial_idx, axis=0), np.delete(y_train, initial_idx, axis=0)
print(X_pool.shape,y_pool.shape)

(5307, 3) (5307, 1) (1770, 3) (1770, 1)
(5207, 3) (5207, 1)


In [8]:
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.ensemble import RandomForestRegressor

def treebased_estimate_error(model:RegressorMixin, X:np.array, percentile=70) -> tuple:

    '''Estimate UQ based on enseemble'''

    err_down = []
    err_up = []
    for x in range(len(X)):
        preds = []
        for pred in model.estimators_:
            preds.append(pred.predict([X[x]])) # [0]
        err_down.append(np.percentile(preds, (100 - percentile) / 2. ))
        err_up.append(np.percentile(preds, 100 - (100 - percentile) / 2.))
    return err_down, err_up

class MalathiCustomRegressor(BaseEstimator, RegressorMixin):
    
    def __init__(self):
        self.model = RandomForestRegressor(max_depth=2, random_state=0)

    def fit(self, X, y=None):
        return self.model.fit(X,y)
    
    def predict(self, X, y=None, return_std=False, return_cov=False):
        y_pred = self.model.predict(X)
        
        if return_std:
            # our trick
            err_down, err_up = treebased_estimate_error(self.model, X)

            return y_pred, np.abs(np.array(err_up) - np.array(err_down))
        else:
            return y_pred

In [9]:
initial_idx = list()
initial_idx.append(np.random.choice(range(100), size=n_initial, replace=False))
initial_idx.append(np.random.choice(range(100, 200), size=n_initial, replace=False))
learner_list = [ActiveLearner(
                        estimator=MalathiCustomRegressor(),
                        X_training=X_train[idx], y_training=y_train[idx].ravel()
                )
                for idx in initial_idx]

In [10]:
# initializing the Committee
committee = CommitteeRegressor(
    learner_list=learner_list,
    query_strategy=max_std_sampling
)

In [11]:
pred, std = committee.predict(X_pool, return_std=True)
#pred = pred.reshape(-1, )

In [12]:
pred[10:16]

array([ 963.37288107,  482.56440339, 1396.06180503,  963.37288107,
       1726.64702429, 1415.05274278])

In [13]:
std[10:16]

array([ 65.66918053,  47.01812391,  88.11903449,  65.66918053,
         0.26919923, 107.10997225])

In [14]:
y_pool[10:16]

array([[ 878.8761597],
       [ 335.9832764],
       [1246.7298584],
       [ 890.2052002],
       [1679.5843506],
       [1378.883667 ]])

In [15]:
# active regression
n_queries = 10
for idx in range(n_queries):
    query_idx, query_instance = committee.query(X_train)
    committee.teach(X_train[query_idx], y_train[query_idx].ravel())

In [16]:
pred_1, std_1 = committee.predict(X_pool, return_std=True)

In [17]:
pred_1[0:6]

array([1497.1876421 , 1699.60942432, 1699.60942432, 1699.60942432,
       1699.60942432,  969.13871248])

In [20]:
std_1[0:6] # if there is an improvement for some examples, uncertain good examples to the training set as they are less
# should be pushed to the traing set and training set should be extended with these good examples

array([102.27412639,   0.83374791,   0.83374791,   0.83374791,
         0.83374791,  65.50625218])

In [21]:
y_pool[0:6]

array([[1315.9033203],
       [1909.7054443],
       [1776.4014893],
       [1877.9771729],
       [2015.9927979],
       [ 939.3825684]])

In [None]:
# cas1: after getting the more informative examples train the model and query again
# case2: find most ambiguous examples and add to the trainig set to get best performance