In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import r2_score
from sklearn import metrics

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this would also hide any deprecation notices from later cells.
warnings.filterwarnings("ignore")

import os
# Directory that holds the data files. The original hard-coded location stays
# as the default; set the KC_HOUSE_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
DATA_DIR = os.environ.get(
    "KC_HOUSE_DATA_DIR",
    "E:/EBAC/Material/10 - Modelos de regresión lineal y series de tiempo",
)
os.chdir(DATA_DIR)

In [5]:
# Read the house data CSV from the current working directory.
csv_path = "kc_house_data.csv"
data_house = pd.read_csv(csv_path)

In [6]:
# Add a constant column of ones (used later as the intercept term) and keep
# only the price plus the candidate predictors, in a fixed column order.
model_columns = [
    "Intercepto", "price", "bedrooms", "bathrooms", "sqft_living", "sqft_lot",
    "floors", "waterfront", "view", "condition", "grade", "sqft_above",
    "yr_built", "sqft_living15", "sqft_lot15",
]
data_house = data_house.assign(Intercepto=1)[model_columns]
print(data_house.shape)
data_house.head()

(21613, 15)


Unnamed: 0,Intercepto,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,yr_built,sqft_living15,sqft_lot15
0,1,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,1955,1340,5650
1,1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,1951,1690,7639
2,1,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,1933,2720,8062
3,1,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,1965,1360,5000
4,1,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,1987,1800,7503


In [7]:
# Design matrix (constant column first) and response as NumPy arrays.
# The position of each name in this list is the column index used in X below.
feature_cols = [
    "Intercepto", "bedrooms", "bathrooms", "sqft_living", "waterfront", "view",
    "condition", "grade", "sqft_lot", "floors", "sqft_above", "yr_built",
    "sqft_living15", "sqft_lot15",
]
Xdata = data_house[feature_cols].values
Ydata = data_house[["price"]].values  # double brackets keep it as a column

# Hold out 30% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    Xdata, Ydata, test_size=0.30, random_state=1
)
# Short aliases for the training portion, used by the manual OLS algebra.
X, Y = X_train, Y_train

In [8]:
# Print floats in fixed-point notation.
np.set_printoptions(formatter={'float_kind': '{:f}'.format})

# Building blocks of the normal equations: X'X, its inverse, and X'Y.
XT_X = X.T @ X
XT_X_inv = np.linalg.inv(XT_X)
XT_Y = X.T @ Y

In [9]:
# OLS coefficient vector: (X'X)^-1 X'Y, one row per column of X.
betas = XT_X_inv @ XT_Y
betas

array([[6233524.831323],
       [-34981.645716],
       [44452.053283],
       [159.083283],
       [567566.675566],
       [41728.788872],
       [17099.355296],
       [119256.271171],
       [-0.020600],
       [30359.292761],
       [-5.053054],
       [-3584.174161],
       [23.321091],
       [-0.485498]])

In [10]:
# Total sum of squares (TSS): Y'Y - n * mean(Y)^2.
TSS = Y.T @ Y - len(Y) * (Y.mean() ** 2)
TSS

array([[1900668073054776.000000]])

In [11]:
# Explained sum of squares (ESS): (b'X')(Xb) - n * mean(Y)^2.
ESS = (betas.T @ X.T) @ (X @ betas) - len(Y) * (Y.mean() ** 2)
ESS

array([[1240221837700666.000000]])

In [12]:
# Residual sum of squares (RSS), obtained as the difference TSS - ESS.
RSS = np.subtract(TSS, ESS)
RSS

array([[660446235354110.000000]])

In [19]:
# Coefficient of determination: R^2 = 1 - RSS / TSS.
# RSS and TSS are 1x1 arrays, so the single element is extracted with .item();
# calling float() on an array with ndim > 0 is deprecated in recent NumPy.
RSq = (1 - RSS / TSS).item()
RSq

0.6525188986351347

In [20]:
# Adjusted R^2: 1 - (RSS / (n - k)) / (TSS / (n - 1)), with n rows and
# k columns in X.
# The result is a 1x1 array; .item() extracts the scalar without relying on
# the deprecated float() conversion of an array with ndim > 0.
RSqAj = (1 - (RSS / (X.shape[0] - X.shape[1])) / (TSS / (X.shape[0] - 1))).item()
RSqAj

0.6522200395998886

In [24]:
# Variance of the regression error: s^2 = RSS / (n - k), where n is the number
# of observations and k the number of columns in X.
residual_dof = Y.shape[0] - X.shape[1]
s_cuad = RSS / residual_dof
s_cuad

43694755896.083496

In [25]:
# Desviacion estandar del error de regresion
import math

# Standard error of the regression: s = sqrt(s^2).
# np.squeeze reduces a 1x1 array to a 0-d value before the scalar conversion,
# so this works whether s_cuad is an array or already a plain number, and it
# avoids the deprecated implicit conversion of an array with ndim > 0.
s = math.sqrt(float(np.squeeze(s_cuad)))
s

209032.90625182318

In [26]:
# Calculo de las t's estadisticas para cada coeficiente de regresion
# t statistic of every coefficient: beta_i / (s * sqrt([(X'X)^-1]_ii)).
# betas is a column vector, so its only column is selected explicitly instead
# of converting 1-element arrays with float() (deprecated in recent NumPy).
std_err = s * np.sqrt(np.diag(XT_X_inv))
result_t = (betas[:, 0] / std_err).tolist()  # plain list of Python floats
result_t

[41.26325152043999,
 -15.139556890112738,
 11.106955514750856,
 29.236168958086743,
 26.5421876398677,
 15.882280258335134,
 6.004160018070486,
 45.67334592756904,
 -0.3675389909198149,
 6.930846880608963,
 -0.9543341411142678,
 -46.153163963065325,
 5.572924592199558,
 -5.516880712973117]

# Criterio 1

In [27]:
# Obtener valor critico de la t de Student de tablas
import scipy.stats

# Residual degrees of freedom: observations minus columns of X.
grados_libertad = Y.shape[0] - X.shape[1]

# Two-sided critical value at 95% confidence (alpha = 5%): take the 2.5%
# quantile of Student's t and use its absolute value.
cuantil_inferior = scipy.stats.t.ppf(q=0.025, df=grados_libertad)
t_critico = abs(cuantil_inferior)
t_critico

1.9601209450413062

In [28]:
# Criterion 1: H0 is rejected (coefficient significant) when |t| exceeds the
# critical value; otherwise H0 is not rejected.
for i in range(X.shape[1]):
    veredicto = "es significativa" if abs(result_t[i]) > t_critico else "NO es significativa"
    print("Beta", i, veredicto)

Beta 0 es significativa
Beta 1 es significativa
Beta 2 es significativa
Beta 3 es significativa
Beta 4 es significativa
Beta 5 es significativa
Beta 6 es significativa
Beta 7 es significativa
Beta 8 NO es significativa
Beta 9 es significativa
Beta 10 NO es significativa
Beta 11 es significativa
Beta 12 es significativa
Beta 13 es significativa


# Criterio 2

In [29]:
# Calculo de valores p
# Criterion 2: two-sided p-value of each t statistic (upper-tail survival
# function of Student's t, doubled).
for i in range(X.shape[1]):
    t_abs = abs(result_t[i])
    valor_p = scipy.stats.t.sf(t_abs, df=grados_libertad) * 2
    print("Valor p de Beta", i, ":", valor_p)

Valor p de Beta 0 : 0.0
Valor p de Beta 1 : 2.1150588856773443e-51
Valor p de Beta 2 : 1.4966742060005931e-28
Valor p de Beta 3 : 7.912217847692225e-183
Valor p de Beta 4 : 9.28317470019821e-152
Valor p de Beta 5 : 2.4001519684988917e-56
Valor p de Beta 6 : 1.9672858743689154e-09
Valor p de Beta 7 : 0.0
Valor p de Beta 8 : 0.7132221592075667
Valor p de Beta 9 : 4.352519653172579e-12
Valor p de Beta 10 : 0.3399297931762092
Valor p de Beta 11 : 0.0
Valor p de Beta 12 : 2.5477892249271404e-08
Valor p de Beta 13 : 3.507380559018111e-08


# Criterio 3

In [30]:
# Calculo de intervalos de confianza del 95% para el verdadero valor del coeficiente de cada Beta
# Criterion 3: 95% confidence interval for each coefficient,
# beta_i +/- t_critico * s * sqrt([(X'X)^-1]_ii).
for i in range(X.shape[1]):
    # betas is a column vector: index the scalar directly rather than calling
    # float() on a 1-element array (deprecated in recent NumPy).
    beta_i = float(betas[i, 0])
    # Half-width of the interval, computed once and reused for both bounds.
    margen = t_critico * s * math.sqrt(XT_X_inv[i, i])
    print("El valor de Beta", i, "se encuentra entre", beta_i - margen,
          "y", beta_i + margen)

El valor de Beta 0 se encuentra entre 5937414.802833049 y 6529634.859813245
El valor de Beta 1 se encuentra entre -39510.72519694968 y -30452.566235253114
El valor de Beta 2 se encuentra entre 36607.293251724484 y 52296.8133140192
El valor de Beta 3 se encuentra entre 148.41764247614748 y 169.7489240087222
El valor de Beta 4 se encuentra entre 525652.2959534817 y 609481.0551785405
El valor de Beta 5 se encuentra entre 36578.805887782124 y 46878.77185596049
El valor de Beta 6 se encuentra entre 11517.091605214675 y 22681.618986712496
El valor de Beta 7 se encuentra entre 114138.25955579683 y 124374.28278562266
El valor de Beta 8 se encuentra entre -0.13046053937562024 y 0.08926093435069522
El valor de Beta 9 se encuentra entre 21773.345525187586 y 38945.239997244294
El valor de Beta 10 se encuentra entre -15.431595066357284 y 5.3254876889392
El valor de Beta 11 se encuentra entre -3736.3937327474923 y -3431.9545896514683
El valor de Beta 12 se encuentra entre 15.118546938188087 y 31.523

In [15]:
# Reporte Automatizado de la regresion en Python
# Reference fit with statsmodels on the same training data, to compare with
# the manual matrix computations. X already contains the constant column.
modelo_ols = sm.OLS(endog=Y, exog=X)
regressor = modelo_ols.fit()
print(regressor.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.653
Model:                            OLS   Adj. R-squared:                  0.652
Method:                 Least Squares   F-statistic:                     2183.
Date:                Mon, 03 Mar 2025   Prob (F-statistic):               0.00
Time:                        20:00:09   Log-Likelihood:            -2.0679e+05
No. Observations:               15129   AIC:                         4.136e+05
Df Residuals:                   15115   BIC:                         4.137e+05
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       6.234e+06   1.51e+05     41.263      0.0

In [18]:
# Variance of the regression error, s^2 = RSS / (n - k).
# NOTE(review): this cell repeats the s_cuad computation made earlier in the
# notebook and carries an out-of-sequence execution count; it looks like a
# leftover and could be removed.
s_cuad = RSS / (Y.shape[0] - X.shape[1])
s_cuad

array([[43694755895.078400]])

Tomando en cuenta las betas no significativas del criterio 1 y los valores p del criterio 2, la primera variable a eliminar sería la 8 (sqft_lot), porque su valor p (0.713) es el más cercano a 1; le seguiría la beta 10 (sqft_above, valor p 0.340) en caso de ser necesario.

In [39]:
# First elimination step: remove column 8 of X (`sqft_lot`, the coefficient
# with the largest p-value) and keep the remaining columns in the same order.
X_Nueva = np.delete(X, obj=8, axis=1)
X_Nueva

array([[1.000000, 3.000000, 2.000000, ..., 1998.000000, 2070.000000,
        6250.000000],
       [1.000000, 2.000000, 1.000000, ..., 1944.000000, 1090.000000,
        7158.000000],
       [1.000000, 3.000000, 1.000000, ..., 1954.000000, 1140.000000,
        11250.000000],
       ...,
       [1.000000, 3.000000, 1.750000, ..., 1980.000000, 2460.000000,
        36677.000000],
       [1.000000, 3.000000, 2.250000, ..., 1959.000000, 1550.000000,
        9240.000000],
       [1.000000, 5.000000, 4.000000, ..., 1983.000000, 3430.000000,
        35096.000000]])

In [40]:
# Refit OLS on the reduced design matrix and print the statsmodels report.
modelo_reducido = sm.OLS(endog=Y, exog=X_Nueva)
regressor = modelo_reducido.fit()
print(regressor.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.653
Model:                            OLS   Adj. R-squared:                  0.652
Method:                 Least Squares   F-statistic:                     2365.
Date:                Mon, 03 Mar 2025   Prob (F-statistic):               0.00
Time:                        22:43:13   Log-Likelihood:            -2.0679e+05
No. Observations:               15129   AIC:                         4.136e+05
Df Residuals:                   15116   BIC:                         4.137e+05
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       6.233e+06   1.51e+05     41.263      0.0

El valor p más alto es ahora el de la variable 9 de X_Nueva (sqft_above, la beta 10 del modelo original), por lo que la eliminamos y volvemos a ajustar el modelo.

In [41]:
# Second elimination step, rebuilt from X instead of from the previous
# X_Nueva so that re-running this cell does not drop one more column each time.
# Columns of X removed so far: 8 = sqft_lot, 10 = sqft_above (sqft_above is
# position 9 once sqft_lot is gone, which is the index the step refers to).
columnas_eliminadas = [8, 10]
X_Nueva = np.delete(X, columnas_eliminadas, axis=1)
regressor = sm.OLS(Y, X_Nueva).fit()
print(regressor.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.652
Model:                            OLS   Adj. R-squared:                  0.652
Method:                 Least Squares   F-statistic:                     2580.
Date:                Mon, 03 Mar 2025   Prob (F-statistic):               0.00
Time:                        22:43:27   Log-Likelihood:            -2.0679e+05
No. Observations:               15129   AIC:                         4.136e+05
Df Residuals:                   15117   BIC:                         4.137e+05
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        6.25e+06    1.5e+05     41.640      0.0

Utilizamos Recursive Feature Elimination (RFE) para seguir identificando las variables menos importantes y eliminarlas primero.

In [42]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Recursive feature elimination down to a single feature, removing one column
# per round; ranking_ then holds one rank per column of X_Nueva.
# NOTE(review): RFE with LinearRegression presumably ranks columns by
# coefficient magnitude, which depends on each column's units. The columns are
# not standardized here and X_Nueva still includes the constant `Intercepto`
# column, so confirm the ranking is meaningful before acting on it.
model = LinearRegression()
selector = RFE(estimator=model, n_features_to_select=1, step=1).fit(X_Nueva, Y)

print(selector.ranking_)

[12  8  5  9  1  3  4  2  6  7 10 11]


In [43]:
# Third elimination step, rebuilt from X so the cell is idempotent (the
# original deleted from the previous X_Nueva and shrank it on every re-run).
# Columns of X removed so far: 8 = sqft_lot, 10 = sqft_above,
# 12 = sqft_living15 (position 10 of the previous X_Nueva).
columnas_eliminadas = [8, 10, 12]
X_Nueva = np.delete(X, columnas_eliminadas, axis=1)
regressor = sm.OLS(Y, X_Nueva).fit()
print(regressor.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.652
Model:                            OLS   Adj. R-squared:                  0.652
Method:                 Least Squares   F-statistic:                     2830.
Date:                Mon, 03 Mar 2025   Prob (F-statistic):               0.00
Time:                        23:20:03   Log-Likelihood:            -2.0681e+05
No. Observations:               15129   AIC:                         4.136e+05
Df Residuals:                   15118   BIC:                         4.137e+05
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       6.173e+06    1.5e+05     41.267      0.0

In [44]:
# Fourth elimination step, rebuilt from X so the cell is idempotent.
# Columns of X removed so far: 8 = sqft_lot, 10 = sqft_above, 11 = yr_built
# (position 9 of the previous X_Nueva), 12 = sqft_living15.
# NOTE(review): in the RFE ranking printed for the 12-column matrix, yr_built
# has rank 7 while sqft_lot15 has rank 11 and is kept — confirm yr_built is
# the column meant to be removed here.
columnas_eliminadas = [8, 10, 11, 12]
X_Nueva = np.delete(X, columnas_eliminadas, axis=1)
regressor = sm.OLS(Y, X_Nueva).fit()
print(regressor.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.603
Model:                            OLS   Adj. R-squared:                  0.603
Method:                 Least Squares   F-statistic:                     2550.
Date:                Mon, 03 Mar 2025   Prob (F-statistic):               0.00
Time:                        23:20:46   Log-Likelihood:            -2.0781e+05
No. Observations:               15129   AIC:                         4.156e+05
Df Residuals:                   15119   BIC:                         4.157e+05
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -6.711e+05      2e+04    -33.480      0.0

In [45]:
# Re-rank the 10 remaining columns of X_Nueva with recursive feature
# elimination (one column dropped per round, down to one feature).
model = LinearRegression()
selector = RFE(estimator=model, n_features_to_select=1, step=1)
selector = selector.fit(X=X_Nueva, y=Y)

print(selector.ranking_)

[10  7  5  8  1  3  4  2  6  9]


In [46]:
# Fifth elimination step, rebuilt from X so the cell is idempotent.
# Columns of X removed so far: 8 = sqft_lot, 9 = floors (position 8 of the
# previous X_Nueva), 10 = sqft_above, 11 = yr_built, 12 = sqft_living15.
columnas_eliminadas = [8, 9, 10, 11, 12]
X_Nueva = np.delete(X, columnas_eliminadas, axis=1)
regressor = sm.OLS(Y, X_Nueva).fit()
print(regressor.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.603
Model:                            OLS   Adj. R-squared:                  0.602
Method:                 Least Squares   F-statistic:                     2866.
Date:                Mon, 03 Mar 2025   Prob (F-statistic):               0.00
Time:                        23:21:09   Log-Likelihood:            -2.0781e+05
No. Observations:               15129   AIC:                         4.156e+05
Df Residuals:                   15120   BIC:                         4.157e+05
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -6.777e+05   1.99e+04    -34.036      0.0

In [47]:
# Sixth elimination step, rebuilt from X so the cell is idempotent.
# Newly removed: column 5 of X = view (also position 5 of the previous
# X_Nueva). Already removed: 8 sqft_lot, 9 floors, 10 sqft_above,
# 11 yr_built, 12 sqft_living15.
columnas_eliminadas = [5, 8, 9, 10, 11, 12]
X_Nueva = np.delete(X, columnas_eliminadas, axis=1)
regressor = sm.OLS(Y, X_Nueva).fit()
print(regressor.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.589
Model:                            OLS   Adj. R-squared:                  0.589
Method:                 Least Squares   F-statistic:                     3101.
Date:                Mon, 03 Mar 2025   Prob (F-statistic):               0.00
Time:                        23:21:25   Log-Likelihood:            -2.0806e+05
No. Observations:               15129   AIC:                         4.161e+05
Df Residuals:                   15121   BIC:                         4.162e+05
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -7.159e+05   2.02e+04    -35.503      0.0

In [48]:
# Seventh elimination step, rebuilt from X so the cell is idempotent.
# Newly removed: column 1 of X = bedrooms. Already removed: 5 view,
# 8 sqft_lot, 9 floors, 10 sqft_above, 11 yr_built, 12 sqft_living15.
columnas_eliminadas = [1, 5, 8, 9, 10, 11, 12]
X_Nueva = np.delete(X, columnas_eliminadas, axis=1)
regressor = sm.OLS(Y, X_Nueva).fit()
print(regressor.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.585
Model:                            OLS   Adj. R-squared:                  0.584
Method:                 Least Squares   F-statistic:                     3546.
Date:                Mon, 03 Mar 2025   Prob (F-statistic):               0.00
Time:                        23:22:59   Log-Likelihood:            -2.0815e+05
No. Observations:               15129   AIC:                         4.163e+05
Df Residuals:                   15122   BIC:                         4.164e+05
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -8.068e+05   1.91e+04    -42.220      0.0

In [49]:
# Re-rank the 7 remaining columns of X_Nueva with recursive feature
# elimination (one column dropped per round, down to one feature).
model = LinearRegression()
rfe_7 = RFE(model, n_features_to_select=1, step=1)
selector = rfe_7.fit(X_Nueva, Y)

print(selector.ranking_)

[7 4 5 1 3 2 6]


In [50]:
# Eighth elimination step, rebuilt from X so the cell is idempotent.
# Newly removed: column 7 of X = grade (position 5 of the previous X_Nueva).
# Already removed: 1 bedrooms, 5 view, 8 sqft_lot, 9 floors, 10 sqft_above,
# 11 yr_built, 12 sqft_living15.
# NOTE(review): the RFE output printed just before gives this column rank 2
# of 7 — confirm it is the column meant to be removed.
columnas_eliminadas = [1, 5, 7, 8, 9, 10, 11, 12]
X_Nueva = np.delete(X, columnas_eliminadas, axis=1)
regressor = sm.OLS(Y, X_Nueva).fit()
print(regressor.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.534
Model:                            OLS   Adj. R-squared:                  0.534
Method:                 Least Squares   F-statistic:                     3461.
Date:                Mon, 03 Mar 2025   Prob (F-statistic):               0.00
Time:                        23:23:18   Log-Likelihood:            -2.0902e+05
No. Observations:               15129   AIC:                         4.181e+05
Df Residuals:                   15123   BIC:                         4.181e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -1.614e+05   1.25e+04    -12.875      0.0

In [51]:
# Ninth elimination step, rebuilt from X so the cell is idempotent.
# Newly removed: column 2 of X = bathrooms (position 1 of the previous
# X_Nueva). Columns of X kept: 0 Intercepto, 3 sqft_living, 4 waterfront,
# 6 condition, 13 sqft_lot15.
columnas_eliminadas = [1, 2, 5, 7, 8, 9, 10, 11, 12]
X_Nueva = np.delete(X, columnas_eliminadas, axis=1)
regressor = sm.OLS(Y, X_Nueva).fit()
print(regressor.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.534
Model:                            OLS   Adj. R-squared:                  0.534
Method:                 Least Squares   F-statistic:                     4327.
Date:                Mon, 03 Mar 2025   Prob (F-statistic):               0.00
Time:                        23:23:23   Log-Likelihood:            -2.0902e+05
No. Observations:               15129   AIC:                         4.180e+05
Df Residuals:                   15124   BIC:                         4.181e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -1.58e+05   1.17e+04    -13.503      0.0

In [52]:
# Re-rank the 5 remaining columns of X_Nueva with recursive feature
# elimination (one column dropped per round, down to one feature).
model = LinearRegression()
selector = RFE(estimator=model, step=1, n_features_to_select=1)
selector = selector.fit(X_Nueva, Y)

print(selector.ranking_)

[5 3 1 2 4]


In [53]:
# Tenth elimination step, rebuilt from X so the cell is idempotent.
# Newly removed: column 6 of X = condition (position 3 of the previous
# X_Nueva). Columns of X kept: 0 Intercepto, 3 sqft_living, 4 waterfront,
# 13 sqft_lot15.
columnas_eliminadas = [1, 2, 5, 6, 7, 8, 9, 10, 11, 12]
X_Nueva = np.delete(X, columnas_eliminadas, axis=1)
regressor = sm.OLS(Y, X_Nueva).fit()
print(regressor.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.529
Model:                            OLS   Adj. R-squared:                  0.528
Method:                 Least Squares   F-statistic:                     5651.
Date:                Mon, 03 Mar 2025   Prob (F-statistic):               0.00
Time:                        23:23:38   Log-Likelihood:            -2.0910e+05
No. Observations:               15129   AIC:                         4.182e+05
Df Residuals:                   15125   BIC:                         4.182e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -2.088e+04   4973.233     -4.199      0.0

In [54]:
# Eleventh elimination step, rebuilt from X so the cell is idempotent.
# Newly removed: column 3 of X = sqft_living (position 1 of the previous
# X_Nueva). Columns of X kept: 0 Intercepto, 4 waterfront, 13 sqft_lot15.
columnas_eliminadas = [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12]
X_Nueva = np.delete(X, columnas_eliminadas, axis=1)
regressor = sm.OLS(Y, X_Nueva).fit()
print(regressor.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.076
Model:                            OLS   Adj. R-squared:                  0.076
Method:                 Least Squares   F-statistic:                     620.7
Date:                Mon, 03 Mar 2025   Prob (F-statistic):          8.48e-260
Time:                        23:23:48   Log-Likelihood:            -2.1419e+05
No. Observations:               15129   AIC:                         4.284e+05
Df Residuals:                   15126   BIC:                         4.284e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       5.176e+05   3056.191    169.354      0.0

In [55]:
# Re-rank the 3 remaining columns of X_Nueva with recursive feature
# elimination (one column dropped per round, down to one feature).
model = LinearRegression()
rfe_3 = RFE(estimator=model, n_features_to_select=1, step=1)
selector = rfe_3.fit(X_Nueva, Y)

print(selector.ranking_)

[3 1 2]


In [56]:
# Last elimination step, rebuilt from X so the cell is idempotent.
# Newly removed: column 4 of X = waterfront (position 1 of the previous
# X_Nueva). Columns of X kept: 0 Intercepto, 13 sqft_lot15.
# NOTE(review): the RFE output printed just before gives this column rank 1
# of 3 — confirm it is the column meant to be removed.
columnas_eliminadas = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
X_Nueva = np.delete(X, columnas_eliminadas, axis=1)
regressor = sm.OLS(Y, X_Nueva).fit()
print(regressor.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     98.34
Date:                Mon, 03 Mar 2025   Prob (F-statistic):           4.15e-23
Time:                        23:24:10   Log-Likelihood:            -2.1474e+05
No. Observations:               15129   AIC:                         4.295e+05
Df Residuals:                   15127   BIC:                         4.295e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       5.246e+05   3161.361    165.943      0.0