In [7]:
import pymc as pm
import arviz as az
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)
N = 500
freq = np.random.choice([25, 33, 50, 66], size=N) # frecv procesorului
hd = np.random.choice([80, 120, 240, 540], size=N) # dimensiunea hard disk-ului
premium = np.random.choice([0, 1], size=N) # 0 sau 1
# pp o relatie reala pt a genera preturile
true_alpha, true_beta1, true_beta2, true_sigma = 1000, 10, 200, 150
log_hd = np.log(hd)
price = true_alpha + true_beta1 * freq + true_beta2 * log_hd + np.random.normal(0, true_sigma, N)

df = pd.DataFrame({'price': price, 'speed': freq, 'hd': hd, 'premium': premium})

# preprocesare: calculam ln pt hard disk
df['log_hd'] = np.log(df['hd'])

# datele observate
y_obs = df['price'].values
x1_obs = df['speed'].values
x2_obs = df['log_hd'].values

print("--- Start Modelare Bayesiana ---")

# a) modelul in PyMC
with pm.Model() as model:
    # i) definim priorii (weakly informative)
    # folosim distributii normale cu deviatie mare pentru a fi "neinformative"
    alpha = pm.Normal('alpha', mu=0, sigma=1000)
    beta1 = pm.Normal('beta1', mu=0, sigma=100) # coeficient pt frecv procesorului
    beta2 = pm.Normal('beta2', mu=0, sigma=100) # coeficient pt ln(HD)

    #  trebuie sƒÉ fie pozitiv (HalfNormal sau HalfCauchy)
    sigma = pm.HalfNormal('sigma', sigma=100)

    # ii) definim media determinista (media)
    # mu = alpha + beta1 * x1 + beta2 * x2
    mu = alpha + beta1 * x1_obs + beta2 * x2_obs

    # iii) definim likelihood-ul (distributia datelor observate)
    y = pm.Normal('y', mu=mu, sigma=sigma, observed=y_obs)

    # iv) Simulare  din distributia posterioara
    # Se extrag e»ôantioane folosind MCMC (NUTS sampler)
    idata = pm.sample(draws=200, tune=200, chains=2, return_inferencedata=True)

# b) EstimƒÉri 95% HDI pentru beta1 »ôi beta2
print("\n--- Rezultate (b) ---")
summary_b = az.summary(idata, var_names=['beta1', 'beta2'], hdi_prob=0.95)
print(summary_b[['mean', 'sd', 'hdi_2.5%', 'hdi_97.5%']])

# c) Sunt predictori utili?
# VerificƒÉm dacƒÉ 0 este √Æn intervalul HDI
b1_lower = summary_b.loc['beta1', 'hdi_2.5%']
b1_upper = summary_b.loc['beta1', 'hdi_97.5%']
b2_lower = summary_b.loc['beta2', 'hdi_2.5%']
b2_upper = summary_b.loc['beta2', 'hdi_97.5%']

useful_b1 = not (b1_lower < 0 < b1_upper)
useful_b2 = not (b2_lower < 0 < b2_upper)

print(f"\n--- Rezultate (c) ---")
print(f"Este Frecventa utilƒÉ? {useful_b1} (Intervalul nu con»õine 0)")
print(f"Este HD Size util? {useful_b2} (Intervalul nu con»õine 0)")

# d) + e) Predic»õii pentru un caz specific
# Specific: Frecventa = 33 MHz, HD = 540 MB
new_x1 = 33
new_x2 = np.log(540)

# Extragem lan»õurile posterioare (flattened)
post = idata.posterior
alpha_samples = post['alpha'].values.flatten()
beta1_samples = post['beta1'].values.flatten()
beta2_samples = post['beta2'].values.flatten()
sigma_samples = post['sigma'].values.flatten()

# CalculƒÉm distribu»õia mediei a»ôteptate (mu)
mu_samples = alpha_samples + beta1_samples * new_x1 + beta2_samples * new_x2

# d) 90% HDI pentru media pre»õului (Expected sale price)
hdi_mu = az.hdi(mu_samples, hdi_prob=0.90)
print(f"\n--- Rezultate (d) ---")
print(f"Pre»õ mediu a»ôteptat (mu) 90% HDI: [{hdi_mu[0]:.2f}, {hdi_mu[1]:.2f}]")

# e) 90% HDI pentru o predic»õie concretƒÉ (Posterior Predictive)
# Aici adƒÉugƒÉm zgomotul (sigma) la medie
y_pred_samples = np.random.normal(loc=mu_samples, scale=sigma_samples)
hdi_y = az.hdi(y_pred_samples, hdi_prob=0.90)

print(f"\n--- Rezultate (e) ---")
print(f"Predic»õie v√¢nzare specificƒÉ (y) 90% HDI: [{hdi_y[0]:.2f}, {hdi_y[1]:.2f}]")



--- Start Modelare Bayesiana ---


Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [alpha, beta1, beta2, sigma]


Output()

Sampling 2 chains for 200 tune and 200 draw iterations (400 + 400 draws total) took 531 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics
The rhat statistic is larger than 1.01 for some parameters. This indicates problems during sampling. See https://arxiv.org/abs/1903.08008 for details
The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details



--- Rezultate (b) ---
          mean     sd  hdi_2.5%  hdi_97.5%
beta1   10.029  0.395     9.252     10.778
beta2  202.069  8.076   185.178    217.616

--- Rezultate (c) ---
Este Frecventa utilƒÉ? True (Intervalul nu con»õine 0)
Este HD Size util? True (Intervalul nu con»õine 0)

--- Rezultate (d) ---
Pre»õ mediu a»ôteptat (mu) 90% HDI: [2583.62, 2622.46]

--- Rezultate (e) ---
Predic»õie v√¢nzare specificƒÉ (y) 90% HDI: [2379.34, 2852.83]
