# Locluster predictions

## Generate data

In [1]:
import os
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
os.chdir("../results/")

In [3]:
from simulation import simulation

In [4]:
sim = simulation(coef=2)
sim.homoscedastic(n=2 * 10**3, random_seed=0)

In [5]:
X_train, X_calib, y_train, y_calib = train_test_split(sim.X, sim.y, test_size=10 ** 3)

In [6]:
rng = np.random.RandomState(0)

In [7]:
test_size = 5 * 10**3
X_test = rng.uniform(-1.5, 1.5, size=(test_size, sim.dim))

In [8]:
n_y = 10 ** 3
y_test = sim.homoscedastic_r(X_test[:, 0], B=n_y)

## Utils

In [9]:
def coverage_fraction(y, y_low, y_high):
    return np.mean(np.logical_and(y >= y_low, y <= y_high))

## Model

In [10]:
from lcv.locluster import KmeansSplit
from lcv.models import QuantileGradientBoosting
from lcv.scores import QuantileScore

In [11]:
locluster = KmeansSplit(QuantileScore, QuantileGradientBoosting, alpha=0.05)

In [12]:
locluster.fit(X_train, y_train)

## Calibration (explicit)

In [13]:
from sklearn.preprocessing import StandardScaler
from lcv.locluster import GradientBoostingQuantileEnsemble

In [14]:
scores = locluster.nc_score.compute(X_calib, y_calib)

In [15]:
locluster.update_limits(np.min(y_calib), np.max(y_calib))

In [16]:
X_train, X_val, res_train, res_val = train_test_split(X_calib, scores, test_size=0.5,
                                                  random_state=rng)

In [17]:
qmodel = GradientBoostingQuantileEnsemble()

In [18]:
qmodel.fit(X_train, res_train); # problems here

In [19]:
train_pred = qmodel.predict(X_train)
val_pred = qmodel.predict(X_val)

In [20]:
scaler = StandardScaler()

In [21]:
new_X = scaler.fit_transform(train_pred)
new_X_val = scaler.transform(val_pred)

In [22]:
current_sil = -1.0

In [23]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

In [24]:
current_sil = -1.0
for k in np.arange(2, 11):
    model = KMeans(n_clusters=k, random_state=rng, n_init=30)
    model.fit(new_X_val)
    labels = model.labels_
    new_sil = silhouette_score(new_X, labels, metric="euclidean")
    if new_sil > current_sil:
        current_sil = new_sil
        best_model = model

In [25]:
best_model

In [26]:
groups = best_model.predict(new_X_val)

In [27]:
np.unique(groups)

array([0, 1], dtype=int32)

In [28]:
cutoffs = []

In [29]:
cutoffs.append(np.quantile(res_val[groups == 0], q=0.95))

In [30]:
cutoffs.append(np.quantile(res_val[groups == 1], q=0.95))

In [31]:
cutoffs = np.array(cutoffs)

## Prediction (explicit)

In [32]:
y_grid = np.linspace(locluster.min_y, locluster.max_y, 2000)

In [33]:
interval_list = []

In [34]:
scaled_X_test = scaler.transform(qmodel.predict(X_test))

In [47]:
i = 0

In [36]:
a = locluster.nc_score.compute(X_test[i,:].reshape(1, -1), y_grid);

In [37]:
index = best_model.predict(scaled_X_test[i,:].reshape(1, -1))

In [38]:
cutoffs[index] 

array([-0.15345677])

In [39]:
b = np.diff((a <= cutoffs[index]).astype(np.int32))

In [40]:
c = np.where(b != 0)[0]

In [41]:
c

array([ 587, 1096])

In [42]:
b[c[0]], b[c[1]]

(1, -1)

In [43]:
np.tile(np.array([1,0]), c.shape[0] // 2)

array([1, 0])

In [44]:
int_idx = c + np.tile(np.array([1,0]), c.shape[0] // 2)

In [45]:
int_idx

array([ 588, 1096])

In [46]:
y_grid[int_idx]

array([-3.1974783,  0.8822879])