-
Notifications
You must be signed in to change notification settings - Fork 44
/
sklearnmodel.py
164 lines (142 loc) · 5.67 KB
/
sklearnmodel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
from copy import deepcopy
from typing import List

import numpy as np
import pandas as pd
from sklearn.base import RegressorMixin, BaseEstimator

from bartpy.data import Data
from bartpy.model import Model
from bartpy.samplers.leafnode import LeafNodeSampler
from bartpy.samplers.modelsampler import ModelSampler
from bartpy.samplers.schedule import SampleSchedule
from bartpy.samplers.sigma import SigmaSampler
from bartpy.samplers.treemutation.treemutation import TreeMutationSampler
from bartpy.samplers.treemutation.uniform.likihoodratio import UniformTreeMutationLikihoodRatio
from bartpy.samplers.treemutation.uniform.proposer import UniformMutationProposer
from bartpy.sigma import Sigma
class SklearnModel(BaseEstimator, RegressorMixin):
    """
    The main access point to building BART models in BartPy

    Parameters
    ----------
    n_trees: int
        the number of trees to use, more trees will make a smoother fit, but slow training and fitting
    sigma_a: float
        shape parameter of the prior on sigma
    sigma_b: float
        scale parameter of the prior on sigma
    n_samples: int
        how many recorded samples to take
    n_burn: int
        how many samples to run without recording to reach convergence
    thin: float
        percentage of samples to store.
        use this to save memory when running large models
    p_grow: float
        probability of choosing a grow mutation in tree mutation sampling
    p_prune: float
        probability of choosing a prune mutation in tree mutation sampling
    alpha: float
        prior parameter on tree structure
    beta: float
        prior parameter on tree structure
    store_in_sample_predictions: bool
        whether to store full prediction samples
        set to False if you don't need in sample results - saves a lot of memory
    """

    def __init__(self,
                 n_trees: int=50,
                 sigma_a: float=0.001,
                 sigma_b: float=0.001,
                 n_samples: int=200,
                 n_burn: int=200,
                 thin: float=0.1,
                 p_grow: float=0.5,
                 p_prune: float=0.5,
                 alpha: float=0.95,
                 beta: float=2.,
                 store_in_sample_predictions: bool=True):
        # Constructor parameters are stored verbatim so that sklearn's
        # get_params/set_params (and hence clone/grid-search) work correctly.
        self.n_trees = n_trees
        self.sigma_a = sigma_a
        self.sigma_b = sigma_b
        self.n_burn = n_burn
        self.n_samples = n_samples
        self.p_grow = p_grow
        self.p_prune = p_prune
        self.alpha = alpha
        self.beta = beta
        self.thin = thin
        self.store_in_sample_predictions = store_in_sample_predictions
        # Fit-time state; all populated by fit().  Declared explicitly (rather
        # than via a [None] * n multi-unpack) so each attribute is greppable,
        # and tree_sampler is included so it exists before fit() is called.
        self.data = None
        self.sigma = None
        self.model = None
        self.proposer = None
        self.likihood_ratio = None
        self.tree_sampler = None
        self.schedule = None
        self.sampler = None
        self._prediction_samples = None
        self._model_samples = None

    def fit(self, X: pd.DataFrame, y: np.ndarray) -> 'SklearnModel':
        """
        Learn the model based on training data

        Parameters
        ----------
        X: pd.DataFrame
            training covariates
        y: np.ndarray
            training targets

        Returns
        -------
        SklearnModel
            self with trained parameter values
        """
        if isinstance(X, pd.DataFrame):
            X = X.values
        # Deep copies prevent the sampler from mutating the caller's arrays.
        self.data = Data(deepcopy(X), deepcopy(y), normalize=True)
        self.sigma = Sigma(self.sigma_a, self.sigma_b, self.data.normalizing_scale)
        self.model = Model(self.data, self.sigma, n_trees=self.n_trees, alpha=self.alpha, beta=self.beta)
        self.proposer = UniformMutationProposer([self.p_grow, self.p_prune])
        self.likihood_ratio = UniformTreeMutationLikihoodRatio([self.p_grow, self.p_prune])
        self.tree_sampler = TreeMutationSampler(self.proposer, self.likihood_ratio)
        self.schedule = SampleSchedule(self.tree_sampler, LeafNodeSampler(), SigmaSampler())
        self.sampler = ModelSampler(self.schedule)
        self._model_samples, self._prediction_samples = self.sampler.samples(
            self.model,
            self.n_samples,
            self.n_burn,
            thin=self.thin,
            store_in_sample_predictions=self.store_in_sample_predictions)
        return self

    def predict(self, X: np.ndarray=None):
        """
        Predict the target corresponding to the provided covariate matrix
        If X is None, will predict based on training covariates

        Prediction is based on the mean of all samples

        Parameters
        ----------
        X: pd.DataFrame
            covariates to predict from

        Returns
        -------
        np.ndarray
            predictions for the X covariates
        """
        if X is None:
            # In-sample: average the stored prediction samples and undo the
            # target normalization applied when building self.data.
            return self.data.unnormalize_y(self._prediction_samples.mean(axis=0))
        else:
            return self._out_of_sample_predict(X)

    def _out_of_sample_predict(self, X):
        # Average the per-sample model predictions, then map back to the
        # original target scale.
        return self.data.unnormalize_y(np.mean([x.predict(X) for x in self._model_samples], axis=0))

    def fit_predict(self, X, y):
        """Fit the model to (X, y) and return the in-sample predictions."""
        self.fit(X, y)
        return self.predict()

    @property
    def model_samples(self) -> List[Model]:
        """
        Array of the model as it was after each sample.
        Useful for examining for:

         - examining the state of trees, nodes and sigma throughout the sampling
         - out of sample prediction

        Returns None if the model hasn't been fit

        Returns
        -------
        List[Model]
        """
        return self._model_samples

    @property
    def prediction_samples(self):
        """
        Matrix of prediction samples at each point in sampling
        Useful for assessing convergence, calculating point estimates etc.

        Returns None if the model hasn't been fit

        Returns
        -------
        np.ndarray
            prediction samples with dimensionality n_samples * n_points
        """
        # BUG FIX: the original returned self.prediction_samples, i.e. the
        # property itself, causing infinite recursion on every access.
        return self._prediction_samples