In [2]:
### Import packages ###
import inspect
import math as math
import os
import random as random
import time

import numpy as np
import pandas as pd

### Append Path ###
import sys
sys.path.append('..')

### Import functions ###
from utils.Main import *
from utils.Selector import *
from utils.Auxiliary import *
from utils.Prediction import *

# Inputs

In [3]:
### Get Directory ###
# ParentDirectory is the absolute path two levels above the current working directory.
cwd = os.getcwd()
ParentDirectory = os.path.abspath(os.path.join(cwd, os.pardir, os.pardir))

### DataType ###
# Dataset names listed by the author:
#   BankNote  Bar7  BreastCancer  CarEvaluation  COMPAS  FICO  Haberman  Iris  MONK1  MONK3
DataFileInput = "BankNote"

In [5]:
### Simulation configuration ###
# Key order is kept exactly as before, since it is what SimulationConfigInput.keys() shows.
SimulationConfigInput = {
    "DataFileInput": DataFileInput,
    "Seed": 1,
    "TestProportion": 0.2,
    "CandidateProportion": 0.8,
    # Selector names listed by the author: PassiveLearningSelector, BALDSelector, BatchQBCSelector
    "SelectorType": "BatchQBCSelector",
    # Model names listed by the author: RandomForestClassifierPredictor,
    # BayesianNeuralNetworkPredictor, TreeFarmsPredictor, GaussianProcessClassifierPredictor
    "ModelType": "LFRPredictor",
    "UniqueErrorsInput": 0,
    "n_estimators": 100,
    "regularization": 0.01,
    "RashomonThresholdType": "Adder",
    "RashomonThreshold": 0.015,
    "Type": "Classification",
    "DiversityWeight": 0,
    "DensityWeight": 0,
    "BatchSize": 10,
}

# Hand the configuration to OneIterationFunction and keep whatever it returns.
Results = OneIterationFunction(SimulationConfigInput)

Iteration: 0
treefarms reported successful execution
training completed. Number of trees in the Rashomon set: 3
null
Finding Optimal Objective...
{
  "false": {
    "false": {
      "complexity": 0.009999999776482582,
      "loss": 0.004566209856420755,
      "name": "Y",
      "prediction": 0
    },
    "feature": 0,
    "name": "variance_leq_-3",
    "reference": 1,
    "relation": "==",
    "true": {
      "complexity": 0.009999999776482582,
      "loss": 0.0,
      "name": "Y",
      "prediction": 1
    },
    "type": "integral"
  },
  "feature": 5,
  "model_objective": 0.09022830426692963,
  "name": "skewness_leq_5",
  "reference": 1,
  "relation": "==",
  "true": {
    "false": {
      "complexity": 0.009999999776482582,
      "loss": 0.013698630034923553,
      "name": "Y",
      "prediction": 0
    },
    "feature": 3,
    "name": "variance_leq_1",
    "reference": 1,
    "relation": "==",
    "true": {
      "complexity": 0.009999999776482582,
      "loss": 0.03196346759796142

In [7]:
# Display the "ErrorVec" entry of the results returned by OneIterationFunction.
Results["ErrorVec"]

Unnamed: 0,Error
0,0.880000
1,0.909091
2,0.909091
3,0.909091
4,0.909091
...,...
83,0.909091
84,0.909091
85,0.909091
86,0.909091


---
---

In [4]:
### Set Up ###
# Record the start time, then seed both the stdlib and the NumPy generators from the config.
StartTime = time.time()
random.seed(SimulationConfigInput["Seed"])
np.random.seed(SimulationConfigInput["Seed"])

### Load Data ###
df = LoadData(SimulationConfigInput["DataFileInput"])

### Train Test Candidate Split ###
# TODO(review): the author notes that `from utils.Main import *` does not provide this name,
# hence the explicit import here - confirm why.
from utils.Main import TrainTestCandidateSplit
df_Train, df_Test, df_Candidate = TrainTestCandidateSplit(
    df,
    SimulationConfigInput["TestProportion"],
    SimulationConfigInput["CandidateProportion"],
)

### Add Batch Active Learning Metrics ###
df_Candidate = DiversityMetricsFunction(df_Candidate, df_Train, k=10)

### Update SimulationConfig Arguments ###
# Same keys, same insertion order as before: auxiliary column names first, then the three splits.
SimulationConfigInput.update(
    {
        "auxiliary_data_cols": ["DiversityScores", "DensityScores"],
        "df_Train": df_Train,
        "df_Test": df_Test,
        "df_Candidate": df_Candidate,
    }
)

In [6]:
# Display the keys currently stored in the simulation config.
SimulationConfigInput.keys()

dict_keys(['DataFileInput', 'Seed', 'TestProportion', 'CandidateProportion', 'SelectorType', 'ModelType', 'UniqueErrorsInput', 'n_estimators', 'regularization', 'RashomonThresholdType', 'RashomonThreshold', 'Type', 'DiversityWeight', 'DensityWeight', 'BatchSize', 'auxiliary_data_cols', 'df_Train', 'df_Test', 'df_Candidate', 'Model'])

# Learning Procedure

In [6]:
### Input ###
# Work on a shallow copy rather than an alias: the learning procedure adds "Model" and
# rebinds "df_Train" / "df_Candidate" on this dict, and with a plain assignment those
# writes would also change SimulationConfigInput, so re-running from this cell would not
# start from the original configuration.
SimulationConfigInputUpdated = SimulationConfigInput.copy()

In [7]:
### Set Up ###
# Empty containers that the learning procedure appends to.
ErrorVec = []
SelectedObservationHistory = []
# One independent list per tree-count series.
TreeCount = {series_name: [] for series_name in ("AllTreeCount", "UniqueTreeCount")}

In [8]:
### Set Up ###
# Iteration counter; it is printed by the training cell and incremented by the update cell.
i = 0


## Initialize model

In [None]:
# Initialize the model instance *once* before the loop.
# The predictor class is looked up by the name stored under "ModelType".
ModelClass = globals().get(SimulationConfigInputUpdated["ModelType"])
if ModelClass is None:
    # Fail here with a clear message instead of later with "'NoneType' object is not callable".
    raise ValueError(
        f"Unknown ModelType {SimulationConfigInputUpdated['ModelType']!r}: "
        "no object with that name is available in the notebook namespace."
    )

# Read the constructor's parameter names once, rather than once per config entry.
model_init_params = inspect.signature(ModelClass.__init__).parameters

# Extract only the config entries whose key matches a ModelClass.__init__ parameter.
model_init_args = {k: v for k, v in SimulationConfigInputUpdated.items() if k in model_init_params}

# Create the model instance
predictor_model = ModelClass(**model_init_args)

# We will pass this instance around instead of the class itself
SimulationConfigInputUpdated['Model'] = predictor_model


## Train and test model

In [11]:
print(f"Iteration: {i}")

# Get features and target for the current training set, passing along the configured
# auxiliary column names (an empty list when none are configured).
X_train_df, y_train_series = get_features_and_target(
    df=SimulationConfigInputUpdated["df_Train"],
    target_column_name="Y",
    auxiliary_columns=SimulationConfigInputUpdated.get('auxiliary_data_cols', []),
)

# Train Prediction Model
predictor_model.fit(X_train_df=X_train_df, y_train_series=y_train_series)

### Test Error ###
# The fitted instance itself (not the class) is handed to TestErrorFunction.
TestErrorOutput = TestErrorFunction(
    InputModel=predictor_model,
    df_Test=SimulationConfigInputUpdated["df_Test"],
    Type=SimulationConfigInputUpdated["Type"],
    auxiliary_columns=SimulationConfigInputUpdated.get('auxiliary_data_cols', []),
)

# Record this iteration's error value.
CurrentError = TestErrorOutput["ErrorVal"]
ErrorVec.append(CurrentError)

Iteration: 0


## Sampling procedure

In [12]:
### Sampling Procedure ###
# The selector callable is looked up by the name stored under "SelectorType".
SelectorType = globals().get(SimulationConfigInputUpdated["SelectorType"])
if SelectorType is None:
    # Fail here with a clear message instead of passing None on to FilterArguments.
    raise ValueError(
        f"Unknown SelectorType {SimulationConfigInputUpdated['SelectorType']!r}: "
        "no object with that name is available in the notebook namespace."
    )

# Build the selector's keyword arguments from the config (FilterArguments is a project helper),
# then add the configured auxiliary column names explicitly.
SelectorArgsFiltered = FilterArguments(SelectorType, SimulationConfigInputUpdated)
SelectorArgsFiltered['auxiliary_columns'] = SimulationConfigInputUpdated.get('auxiliary_data_cols', [])

# Run the selector and read its recommendation from the "IndexRecommendation" entry.
SelectorFuncOutput = SelectorType(**SelectorArgsFiltered)
QueryObservationIndex = SelectorFuncOutput["IndexRecommendation"]

# Look up the recommended rows in the candidate set and log the selection.
QueryObservation = SimulationConfigInputUpdated["df_Candidate"].loc[QueryObservationIndex]
SelectedObservationHistory.append(QueryObservationIndex)

## Update and store

In [13]:

### Update Train and Candidate Sets ###
# Append the queried rows to the training set and strip the auxiliary columns.
# The column names are read from the config (falling back to the previously hard-coded
# pair), and errors="ignore" keeps the drop from raising when those columns are absent.
SimulationConfigInputUpdated["df_Train"] = pd.concat(
    [SimulationConfigInputUpdated["df_Train"], QueryObservation]
).drop(
    columns=SimulationConfigInputUpdated.get('auxiliary_data_cols', ['DiversityScores', 'DensityScores']),
    errors="ignore",
)
# Remove the queried rows from the candidate set.
SimulationConfigInputUpdated["df_Candidate"] = SimulationConfigInputUpdated["df_Candidate"].drop(QueryObservationIndex)

### Store Number of (Unique) Trees ###
# Only recorded for models that expose get_tree_count; a missing count is stored as 0.
if hasattr(predictor_model, 'get_tree_count'):
    TreeCount["AllTreeCount"].append(TestErrorOutput.get("AllTreeCount", 0))
    TreeCount["UniqueTreeCount"].append(TestErrorOutput.get("UniqueTreeCount", 0))

# Increase iteration #
i += 1

# Save

In [14]:
# ### Return Simulation Parameters ###
# SimulationParameters = {"DataFileInput" : str(SimulationConfigInput["DataFileInput"]),
#                             "Seed" : str(SimulationConfigInput["Seed"]),
#                             "TestProportion" : str(SimulationConfigInput["TestProportion"]),
#                             "CandidateProportion" : str(SimulationConfigInput["CandidateProportion"]),
#                             "SelectorType" :  str(SimulationConfigInput["SelectorType"]),
#                             "ModelType" :  str(SimulationConfigInput["ModelType"]),
#                             'UniqueErrorsInput': str(SimulationConfigInput["UniqueErrorsInput"]),
#                             'n_estimators': str(SimulationConfigInput["n_estimators"]),
#                             'regularization': str(SimulationConfigInput["regularization"]),
#                             'RashomonThreshold': str(SimulationConfigInput["RashomonThreshold"]),
#                             'Type': 'Classification',
#                             }

In [15]:
# ### Return Time ###
# ElapsedTime = time.time() - StartTime

# ### Return Dictionary ###
# SimulationResults = {"ErrorVec" : pd.DataFrame(ErrorVec, columns =["Error"]),
#                             "SelectionHistory" : pd.DataFrame(SelectedObservationHistory, columns = ["ObservationID"]),
#                             "SimulationParameters" : SimulationParameters,
#                             "ElapsedTime" : ElapsedTime}