In [None]:
from sklearn.datasets import fetch_openml
# Fetch the 'boston' dataset (version 1) from OpenML as a pandas-backed bunch.
boston = fetch_openml(
    name='boston',
    version=1,
    as_frame=True,
)
df_boston = boston.frame

# Draw a fixed-seed random subsample of 300 rows so reruns give the same rows.
df_sample = df_boston.sample(n=300, random_state=4713)
print(df_sample.head())

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Summary statistics for every column of the sample (include="all" keeps
# non-numeric columns in the table as well).
desc_stats = df_sample.describe(include="all")
print("Estadísticas descriptivas:")
print(desc_stats)

In [None]:
# Pairwise correlations, restricted to the numeric columns of the sample.
corr = df_sample.corr(numeric_only=True)

# Annotated heatmap with the colour scale centred on zero correlation.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, cmap="coolwarm", center=0, annot=True, ax=ax)
ax.set_title("Matriz de correlación - Muestra Boston (n=300)")
plt.show()

# Correlation of every variable with MEDV, strongest positive first.
medv_corr = corr['MEDV'].sort_values(ascending=False)
print("\nCorrelaciones con MEDV:")
print(medv_corr)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Columns whose distributions are inspected: MEDV first, then the other variables.
cols_plot = ['MEDV','LSTAT','RM','PTRATIO','TAX','CRIM', 'NOX', 'AGE', 'DIS', 'INDUS','B', 'ZN']

# Only plot columns that exist in the sample, so a missing column does not
# leave a blank panel in the middle of the grid.
cols_present = [c for c in cols_plot if c in df_sample.columns]
n = len(cols_present)

# Size the grid from the number of plotted columns instead of hard-coding 4x3.
n_cols = 3
n_rows = max(1, -(-n // n_cols))  # ceiling division; at least one row
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 2.5 * n_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, c in zip(flat_axes, cols_present):
    sns.histplot(df_sample[c], bins=30, kde=True, ax=ax)
    ax.set_title(f"Histograma de {c}")

# Hide any trailing panels that did not receive a histogram.
for ax in flat_axes[n:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# The four variables with the largest absolute correlation with MEDV
# (MEDV itself is removed before ranking).
medv_abs_corr = corr['MEDV'].drop('MEDV').abs()
top_corr = medv_abs_corr.sort_values(ascending=False).head(4).index

# One scatter plot per selected variable against MEDV.
for c in top_corr:
    fig, ax = plt.subplots()
    sns.scatterplot(x=df_sample[c], y=df_sample['MEDV'], ax=ax)
    ax.set_xlabel(c)
    ax.set_ylabel('MEDV')
    ax.set_title(f"Dispersión: {c} vs MEDV")
    plt.show()

In [None]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats import diagnostic as diag
import scipy.stats as stats
import matplotlib.pyplot as plt

# Response vector: MEDV as a plain array.
y = df_sample['MEDV'].values
# Design matrix: every other column, plus an explicit intercept column
# (has_constant='add' appends one even if a constant column is already present).
X = sm.add_constant(df_sample.drop(columns=['MEDV']), has_constant='add')

In [None]:
#import numpy as np
import pandas as pd
import statsmodels.api as sm

# 1) Resolve the target column name (either 'MEDV' or 'medv').
target_col = 'MEDV' if 'MEDV' in df_sample.columns else 'medv'

# 2) Coerce every column to numeric; values that cannot be parsed become NaN.
df_num = df_sample.apply(pd.to_numeric, errors='coerce')

# A column with no numeric values at all is entirely NaN after coercion.
# Left in place, the row-wise dropna below would discard every row, so such
# columns are removed first (and reported).
all_nan_cols = df_num.columns[df_num.isna().all()].tolist()
if target_col in all_nan_cols:
    raise ValueError(f"La variable objetivo '{target_col}' no contiene valores numéricos.")
if all_nan_cols:
    print(f"Columnas descartadas por no tener valores numéricos: {all_nan_cols}")
    df_num = df_num.drop(columns=all_nan_cols)

# Drop rows that still contain NaN and report how many were removed.
before = len(df_num)
df_num = df_num.dropna(axis=0).reset_index(drop=True)
print(f"Filas eliminadas por NaN: {before - len(df_num)}")

# 3) Split into response (y) and predictors (X), both as floats.
y = df_num[target_col].astype(float).values
X = df_num.drop(columns=[target_col]).select_dtypes(include=['number']).astype(float)

# 4) Add the intercept column.
X = sm.add_constant(X, has_constant='add')

# 5) Fit a GLM with a Gaussian family and print the coefficient table.
glm_gauss = sm.GLM(y, X, family=sm.families.Gaussian())
res = glm_gauss.fit()
print(res.summary())


In [None]:
# Fit the Gaussian-family GLM on the current y / X and print its summary
# (coefficient table with p-values).
glm_gauss = sm.GLM(endog=y, exog=X, family=sm.families.Gaussian())
res = glm_gauss.fit()
print(res.summary())

In [None]:
# Coefficient table of the fitted model, copied into a standalone DataFrame.
summary_df = res.summary2().tables[1].copy()

# Keep only the rows whose p-value is below the 5% level.
is_significant = summary_df['P>|z|'].lt(0.05)
sig = summary_df.loc[is_significant]

print("Variables significativas (p<0.05):")
display(sig.loc[:, ['Coef.', 'Std.Err.', 'P>|z|']])


In [None]:
# Refit by ordinary least squares on the same response / design matrices as
# the GLM, in order to read R² and adjusted R² from the OLS results.
ols = sm.OLS(endog=res.model.endog, exog=res.model.exog).fit()
r2 = round(ols.rsquared, 4)
r2_adj = round(ols.rsquared_adj, 4)
print("\nR²:", r2)
print("R² ajustado:", r2_adj)


In [None]:
# Response residuals and fitted values of the GLM, reused by the diagnostics below.
residuals, fitted = res.resid_response, res.fittedvalues

In [None]:
# Stand-alone normal Q-Q plot of the residuals.
# A single axes is used: only one panel is drawn in this cell, so a 1x2 grid
# would leave an empty second panel in the output.
fig, ax1 = plt.subplots(figsize=(6, 5))
stats.probplot(residuals, dist="norm", plot=ax1)
ax1.set_title("Q-Q plot de residuales")
plt.tight_layout()
plt.show()

In [None]:

# Two-panel residual diagnostics: normal Q-Q plot (left) and
# residuals against fitted values (right).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Left panel: residual quantiles against normal quantiles.
stats.probplot(residuals, dist="norm", plot=ax1)
ax1.set_title("Q-Q plot de residuales")

# Right panel: residuals vs fitted values with a zero reference line.
ax2.scatter(fitted, residuals, alpha=0.7)
ax2.axhline(y=0, color='red', linestyle='--')
ax2.set(xlabel="Ajustados", ylabel="Residuales", title="Residuales vs Ajustados")

fig.tight_layout()
plt.show()

In [None]:
# Breusch–Pagan test on the model residuals against the model's design matrix.
# Returns the LM statistic, its p-value, the F statistic and its p-value.
bp_lm, bp_lm_p, bp_f, bp_f_p = diag.het_breuschpagan(residuals, res.model.exog)
bp_report = (
    f"\nBreusch–Pagan: LM p-value={bp_lm_p:.4f} | F p-value={bp_f_p:.4f}"
    "  (p>0.05 sugiere homocedasticidad)"
)
print(bp_report)

In [None]:
from statsmodels.stats.stattools import durbin_watson

# Durbin–Watson statistic of the residuals, taken in their current row order.
# NOTE(review): the rows come from a random sample, so their order may carry
# no meaning for an autocorrelation check — confirm this is intended.
dw = durbin_watson(residuals)
dw_msg = f"Durbin-Watson: {dw:.3f} (≈2 sugiere no autocorrelación)"
print(dw_msg)

In [None]:
# Rebuild the design matrix as a labelled DataFrame (the constant column is included).
X_for_vif = pd.DataFrame(res.model.exog, columns=res.model.exog_names)

# One variance inflation factor per column of the design matrix.
design = X_for_vif.values
vif_values = [variance_inflation_factor(design, idx) for idx in range(design.shape[1])]
vif = pd.DataFrame({"Variable": X_for_vif.columns, "VIF": vif_values})

print("\nVIF (multicolinealidad):")
display(vif)