In [1]:
from sklearn.ensemble import HistGradientBoostingRegressor
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
err = {}

In [3]:
# o df_melted que retorna é só para printar as distribuicoes dos 
# lambdas com histplots e ver que elas mudam conforme theta

def generate_parameters(thetas, N, iters):
    values = pd.DataFrame()
    ## Parametros para arvore
    lambdas = []
    thetas_ = []
    ##
    for theta in thetas:
        diff = []
        theoretical = np.e**(-theta)
        for i in range(iters):
            exp = np.random.exponential(1/theta, N)
            empirical = (len([i for i in exp if i > 1])/len(exp))
            diff.append(abs(theoretical - empirical))
            lambdas.append(abs(theoretical - empirical))
            thetas_.append(theta)
        values[f"{theta}"] = diff
    df_melted = values.melt(var_name='theta')
    
    return lambdas, thetas_, df_melted

In [4]:
def eval_coverage(quantiles):
    # theta = parametro
    # lambda = estatistica de teste
    # Lambda: |P_teorica(exp > 1) - P_empirica(exp > 1)|
    j = 0
    err = 0
    for theta in thetas:
        theoretical = np.e**(-theta)
        lambdas_ = []
        for i in range(iters):
            distr = np.random.exponential(1/theta, N)
            empirical = len([i for i in distr if i > 1])/len(distr)
            diff = abs(theoretical - empirical)
            lambdas_.append(diff)
        alpha = len([i for i in lambdas_ if i >= quantiles[j]])/len(lambdas_)
        print(f"alpha for theta={theta} : {alpha} --- quantile: {quantiles[j]}")
        err += abs(alpha - 0.05)
        j += 1
    print(f"mean absolute error: {err/j}")
    return err/j

In [5]:
# isso foi usado em um experimento
def params_with_random_thetas(thetas, N):
    chosen_thetas = np.random.choice(thetas, N)
    lambdas = []
    thetas_ = []
    ##
    for theta in chosen_thetas:
        theoretical = np.e**(-theta)
        exp = np.random.exponential(1/theta, 10000)
        empirical = len([i for i in exp if i > 1])/len(exp)
        lambdas.append(abs(theoretical - empirical))
        thetas_.append(theta)
        
    return lambdas, thetas_

In [6]:
# Fica a vontade pra mudar isso aqui se quiser brincar
thetas = np.arange(0.5, 7, 0.5)
N = 10000
iters = 10000


lambdas, thetas_, df_melted = generate_parameters(thetas, N, iters)

# Estimador Ingenuo

In [7]:
# isso aqui eh o que eu to chamando de naive. Basicamente pega o
# parametro e tira o quantil dele. usei o df_melted pq era mais facil
# por algum motivo que nao consigo lembrar de cabeça agora
naive = [
    np.quantile(list(df_melted[df_melted["theta"] == str(theta)].value), 0.95)
    for theta in thetas
]


In [8]:
err["err_naive"] = eval_coverage(naive)

alpha for theta=0.5 : 0.0477 --- quantile: 0.009530659712633449
alpha for theta=1.0 : 0.0544 --- quantile: 0.009479441171442338
alpha for theta=1.5 : 0.0508 --- quantile: 0.008130160148429849
alpha for theta=2.0 : 0.0531 --- quantile: 0.006664716763387285
alpha for theta=2.5 : 0.052 --- quantile: 0.0054150013761011945
alpha for theta=3.0 : 0.0472 --- quantile: 0.004312931632136051
alpha for theta=3.5 : 0.051 --- quantile: 0.003397383422318507
alpha for theta=4.0 : 0.0485 --- quantile: 0.002615638888734187
alpha for theta=4.5 : 0.0493 --- quantile: 0.0020910034617576904
alpha for theta=5.0 : 0.0618 --- quantile: 0.0015620530009145314
alpha for theta=5.5 : 0.0632 --- quantile: 0.0012132285615359317
alpha for theta=6.0 : 0.054 --- quantile: 0.0009787521766663593
alpha for theta=6.5 : 0.0675 --- quantile: 0.000703439192977573
mean absolute error: 0.005007692307692307


# Arvore de regressao


In [10]:
# se colocar o boosting com 1 iteracao, ele vira uma arvore
# e o quantile loss eh o pinball. ref: 
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html
model = HistGradientBoostingRegressor(
    loss="quantile", quantile=.95, max_iter=1
)

model_thetas = np.array(thetas_).reshape(-1, 1)
model_lambdas = np.array(lambdas).reshape(-1, 1)
model.fit(model_thetas, model_lambdas)

  y = column_or_1d(y, warn=True)


In [11]:
quantiles = model.predict(thetas.reshape(-1, 1))
quantiles

array([0.00650547, 0.00650035, 0.00636542, 0.00622593, 0.00609091,
       0.00598331, 0.00589214, 0.00581397, 0.00570151, 0.00570151,
       0.00570151, 0.00570151, 0.00570151])

In [12]:
err["err_reg"] = eval_coverage(quantiles)

alpha for theta=0.5 : 0.1877 --- quantile: 0.006505472229893283
alpha for theta=1.0 : 0.1722 --- quantile: 0.006500350375774172
alpha for theta=1.5 : 0.1321 --- quantile: 0.006365422273472923
alpha for theta=2.0 : 0.0716 --- quantile: 0.006225934582291209
alpha for theta=2.5 : 0.0242 --- quantile: 0.0060909061210198175
alpha for theta=3.0 : 0.0057 --- quantile: 0.005983311472879463
alpha for theta=3.5 : 0.0002 --- quantile: 0.005892144600861789
alpha for theta=4.0 : 0.0 --- quantile: 0.005813970147503357
alpha for theta=4.5 : 0.0 --- quantile: 0.0057015066048057075
alpha for theta=5.0 : 0.0 --- quantile: 0.0057015066048057075
alpha for theta=5.5 : 0.0 --- quantile: 0.0057015066048057075
alpha for theta=6.0 : 0.0 --- quantile: 0.0057015066048057075
alpha for theta=6.5 : 0.0 --- quantile: 0.0057015066048057075
mean absolute error: 0.06026923076923078


## Regressor - random

In [13]:
# aqui, a gente gera N (=1000) thetas aleatoriamente entre
# os possiveis valores do grid e computa as metricas
lambdas_r, thetas_r = params_with_random_thetas(thetas, 1000)

In [14]:
model = HistGradientBoostingRegressor(
    loss="quantile", quantile=.95, max_iter=1
)

model_thetas = np.array(thetas_r).reshape(-1, 1)
model_lambdas = np.array(lambdas_r).reshape(-1, 1)
model.fit(model_thetas, model_lambdas)

  y = column_or_1d(y, warn=True)


In [15]:
quantiles = model.predict(thetas.reshape(-1, 1))
quantiles

array([0.00627967, 0.00634607, 0.00611782, 0.0060046 , 0.00608203,
       0.00580233, 0.00563381, 0.00563381, 0.00563381, 0.00563381,
       0.00563381, 0.00563381, 0.00549899])

In [16]:
err["err_reg_random"] = eval_coverage(quantiles)

alpha for theta=0.5 : 0.195 --- quantile: 0.0062796735316433
alpha for theta=1.0 : 0.184 --- quantile: 0.006346070425798208
alpha for theta=1.5 : 0.1463 --- quantile: 0.006117824707169677
alpha for theta=2.0 : 0.0804 --- quantile: 0.006004597984992704
alpha for theta=2.5 : 0.0267 --- quantile: 0.006082026391220048
alpha for theta=3.0 : 0.0074 --- quantile: 0.0058023331454403705
alpha for theta=3.5 : 0.0007 --- quantile: 0.005633807310625935
alpha for theta=4.0 : 0.0001 --- quantile: 0.005633807310625935
alpha for theta=4.5 : 0.0 --- quantile: 0.005633807310625935
alpha for theta=5.0 : 0.0 --- quantile: 0.005633807310625935
alpha for theta=5.5 : 0.0 --- quantile: 0.005633807310625935
alpha for theta=6.0 : 0.0 --- quantile: 0.005633807310625935
alpha for theta=6.5 : 0.0 --- quantile: 0.005498988740934872
mean absolute error: 0.06313846153846156


## 20 iters in each theta

In [17]:
# Aqui foi uma tentativa de fazer poucas iteracoes em cima de cada theta,
# que é algo mais proximo do que acontece na realidade
lambdas_few, thetas_few, _ = generate_parameters(thetas, N, 20)

In [18]:
model = HistGradientBoostingRegressor(
    loss="quantile", quantile=.95, max_iter=1
)

model_thetas = np.array(thetas_few).reshape(-1, 1)
model_lambdas = np.array(lambdas_few).reshape(-1, 1)
model.fit(model_thetas, model_lambdas)

  y = column_or_1d(y, warn=True)


In [19]:
quantiles = model.predict(thetas.reshape(-1, 1))
quantiles

array([0.00689843, 0.00694831, 0.00677935, 0.00681054, 0.00634721,
       0.00634721, 0.00634721, 0.00634721, 0.00634721, 0.00634721,
       0.00634721, 0.00634721, 0.00634721])

In [20]:
err["err_reg_few_iters"] = eval_coverage(quantiles)

alpha for theta=0.5 : 0.1556 --- quantile: 0.006898427750885749
alpha for theta=1.0 : 0.1498 --- quantile: 0.006948305896766647
alpha for theta=1.5 : 0.1003 --- quantile: 0.0067793457647794255
alpha for theta=2.0 : 0.0485 --- quantile: 0.006810537270917554
alpha for theta=2.5 : 0.0219 --- quantile: 0.006347208281449993
alpha for theta=3.0 : 0.003 --- quantile: 0.006347208281449993
alpha for theta=3.5 : 0.0 --- quantile: 0.006347208281449993
alpha for theta=4.0 : 0.0 --- quantile: 0.006347208281449993
alpha for theta=4.5 : 0.0 --- quantile: 0.006347208281449993
alpha for theta=5.0 : 0.0 --- quantile: 0.006347208281449993
alpha for theta=5.5 : 0.0 --- quantile: 0.006347208281449993
alpha for theta=6.0 : 0.0 --- quantile: 0.006347208281449993
alpha for theta=6.5 : 0.0 --- quantile: 0.006347208281449993
mean absolute error: 0.05248461538461539


# Boosting

In [21]:


model = HistGradientBoostingRegressor(
    loss="quantile", quantile=.95
)

model_thetas = np.array(thetas_).reshape(-1, 1)
model_lambdas = np.array(lambdas).reshape(-1, 1)
model.fit(model_thetas, model_lambdas)

  y = column_or_1d(y, warn=True)


In [22]:
model.n_iter_

60

In [23]:
boosting_quantiles = model.predict(thetas.reshape(-1, 1))
boosting_quantiles

array([0.00946341, 0.00941472, 0.00806643, 0.00666383, 0.00541636,
       0.00431627, 0.00330777, 0.00262173, 0.00209733, 0.00164571,
       0.00122275, 0.00098976, 0.00080699])

In [24]:
err["err_boosting"] = eval_coverage(boosting_quantiles)

alpha for theta=0.5 : 0.0503 --- quantile: 0.00946341105109887
alpha for theta=1.0 : 0.0484 --- quantile: 0.009414717253073916
alpha for theta=1.5 : 0.0544 --- quantile: 0.008066425532000305
alpha for theta=2.0 : 0.0525 --- quantile: 0.00666382746447956
alpha for theta=2.5 : 0.0486 --- quantile: 0.005416357828616384
alpha for theta=3.0 : 0.0445 --- quantile: 0.0043162685153323695
alpha for theta=3.5 : 0.0505 --- quantile: 0.003307769007436827
alpha for theta=4.0 : 0.0468 --- quantile: 0.0026217279599283984
alpha for theta=4.5 : 0.0427 --- quantile: 0.0020973333104983504
alpha for theta=5.0 : 0.0448 --- quantile: 0.0016457146385571587
alpha for theta=5.5 : 0.0491 --- quantile: 0.0012227481655570148
alpha for theta=6.0 : 0.04 --- quantile: 0.000989763680167742
alpha for theta=6.5 : 0.0258 --- quantile: 0.0008069877332895657
mean absolute error: 0.0051538461538461556


## Random - Boosting

In [25]:
# boosting com os parametros gerados aleatoriamente
model_r = HistGradientBoostingRegressor(
    loss="quantile", quantile=.95
)

In [26]:
model_thetas = np.array(thetas_r).reshape(-1, 1)
model_lambdas = np.array(lambdas_r).reshape(-1, 1)
model_r.fit(model_thetas, model_lambdas)

  y = column_or_1d(y, warn=True)


In [27]:
model_r.n_iter_

100

In [28]:
boosting_quantiles_r = model_r.predict(thetas.reshape(-1, 1))
boosting_quantiles_r

array([0.00852349, 0.00903557, 0.00688696, 0.00575472, 0.00652899,
       0.00373213, 0.00302059, 0.00282162, 0.00199557, 0.00145806,
       0.00123545, 0.00095736, 0.00069883])

In [29]:
err["err_boosting_random"] = eval_coverage(boosting_quantiles_r)

alpha for theta=0.5 : 0.0848 --- quantile: 0.008523491672294879
alpha for theta=1.0 : 0.059 --- quantile: 0.009035565817120372
alpha for theta=1.5 : 0.093 --- quantile: 0.0068869612852165835
alpha for theta=2.0 : 0.0951 --- quantile: 0.005754724138048181
alpha for theta=2.5 : 0.0163 --- quantile: 0.006528987634253779
alpha for theta=3.0 : 0.0858 --- quantile: 0.0037321294668956843
alpha for theta=3.5 : 0.0757 --- quantile: 0.0030205936997641254
alpha for theta=4.0 : 0.0309 --- quantile: 0.002821619854825546
alpha for theta=4.5 : 0.0571 --- quantile: 0.001995573011793745
alpha for theta=5.0 : 0.0738 --- quantile: 0.0014580565109485633
alpha for theta=5.5 : 0.0489 --- quantile: 0.0012354549859946032
alpha for theta=6.0 : 0.0517 --- quantile: 0.0009573558685116342
alpha for theta=6.5 : 0.072 --- quantile: 0.0006988294692999759
mean absolute error: 0.02322307692307692


## 20 iters in each theta - Boosting

In [30]:
# boosting com poucas observacoes sobre cada theta
model = HistGradientBoostingRegressor(
    loss="quantile", quantile=.95,
)

model_thetas = np.array(thetas_few).reshape(-1, 1)
model_lambdas = np.array(lambdas_few).reshape(-1, 1)
model.fit(model_thetas, model_lambdas)

  y = column_or_1d(y, warn=True)


In [31]:
model.n_iter_

100

In [32]:
quantiles = model.predict(thetas.reshape(-1, 1))
quantiles

array([0.00831773, 0.00879938, 0.0071356 , 0.00733605, 0.00382144,
       0.00363956, 0.00338805, 0.00214762, 0.002091  , 0.00156403,
       0.00123323, 0.00098568, 0.00092211])

In [33]:
err["err_boosting_few_iters"] = eval_coverage(quantiles)

alpha for theta=0.5 : 0.0891 --- quantile: 0.008317726660699446
alpha for theta=1.0 : 0.0733 --- quantile: 0.008799381541258689
alpha for theta=1.5 : 0.0837 --- quantile: 0.007135595090225507
alpha for theta=2.0 : 0.0295 --- quantile: 0.00733605132893255
alpha for theta=2.5 : 0.16 --- quantile: 0.003821435904523782
alpha for theta=3.0 : 0.0911 --- quantile: 0.0036395574469711544
alpha for theta=3.5 : 0.0457 --- quantile: 0.0033880544952941045
alpha for theta=4.0 : 0.1106 --- quantile: 0.0021476158919946572
alpha for theta=4.5 : 0.048 --- quantile: 0.0020909999479073477
alpha for theta=5.0 : 0.0537 --- quantile: 0.001564026549716039
alpha for theta=5.5 : 0.0491 --- quantile: 0.001233233098854368
alpha for theta=6.0 : 0.043 --- quantile: 0.0009856834840578704
alpha for theta=6.5 : 0.0149 --- quantile: 0.0009221077954422396
mean absolute error: 0.029330769230769233


In [34]:
for key, value in err.items():
    print(f"{key} = {value}")

err_naive = 0.005007692307692307
err_reg = 0.06026923076923078
err_reg_random = 0.06313846153846156
err_reg_few_iters = 0.05248461538461539
err_boosting = 0.0051538461538461556
err_boosting_random = 0.02322307692307692
err_boosting_few_iters = 0.029330769230769233
