# Regresión múltiple sin penalización

Dataset: California housing

Target: Mediana de los valores de las casas 

Predictoras: variables numéricas y categóricas

# Librerías

In [1]:
import pandas as pd
import os
import tarfile
import urllib.request
import numpy as np
from pandas.core.common import flatten
from plotnine import *
from array import *
import scipy.stats as stats
import math
import matplotlib as mpl
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms

# Los datos

Carga de datos

In [2]:
# Path to the California housing CSV. Set the HOUSING_CSV environment variable
# to point at a different copy of the file; when it is unset, the original
# default location is used. "~" is expanded to the current user's home directory.
path = os.path.expanduser(
    os.environ.get("HOUSING_CSV", r'~/Documents/GitHub/Supervisado/dataset/housing.csv')
)
housing = pd.read_csv(path)

Variable respuesta y descriptoras

In [3]:
# Split the response column off from the predictors.
# NOTE: `housing` is rebound to the predictor-only frame, so this cell cannot
# be re-run on its own: the column is gone after the first execution and the
# lookup raises KeyError until the data is reloaded.
y = housing.loc[:, "median_house_value"].copy()
housing = housing.drop(columns=["median_house_value"])

Variables numéricas

In [4]:
# Predictors without the "ocean_proximity" column (handled separately as the
# categorical attribute further down).
housing_num = housing.drop(columns=["ocean_proximity"])

El imputador

In [5]:
from sklearn.impute import SimpleImputer

El estandarizador

In [6]:
from sklearn.preprocessing import StandardScaler

La clase pipeline para variables numéricas

In [7]:
from sklearn.pipeline import Pipeline

Definición del pipeline que trabaja variables numéricas

In [8]:
# Preprocessing for the numeric columns, applied in order:
#   1. "imputador":  fill missing values with the column median
#   2. "std_scaler": standardize each column
num_pipeline = Pipeline(
    steps=[
        ("imputador", SimpleImputer(strategy="median")),
        ("std_scaler", StandardScaler()),
    ]
)

Clases para procesar las variables categóricas

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

Variables numéricas y categóricas

In [10]:
# Column names routed to each branch of the preprocessing.
num_attribs = housing_num.columns.tolist()
cat_attribs = ["ocean_proximity"]

Definición del pipeline que trabaja variables numéricas y categóricas

In [11]:
# Combined preprocessing: numeric columns go through `num_pipeline`, the
# categorical column is one-hot encoded. drop="first" omits the first category
# of the encoded feature, so one fewer dummy column is produced.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(drop="first"), cat_attribs),
    ]
)

Aplicación del `full_pipeline`

In [12]:
# Fit the preprocessing on the predictor frame and keep the transformed
# result as the design matrix used by both regression models below.
X = full_pipeline.fit_transform(housing)

# Regresión Lineal

## Primer modelo

In [13]:
# First model: scikit-learn linear regression on the preprocessed predictors.
# `fit` returns the estimator itself, so it can be chained; the bare name on
# the last line keeps the estimator repr as the cell output.
lm_reg1 = LinearRegression().fit(X, y)
lm_reg1

LinearRegression()

Coeficientes del modelo

In [14]:
# Fitted intercept of the scikit-learn model.
print("Intercepto (LM 1):", lm_reg1.intercept_)

Intercepto (LM 1): 219237.0006433122


In [15]:
# Fitted slope coefficients of the scikit-learn model, one per column of X.
print("Coeficientes de regresión (LM 1):", lm_reg1.coef_)

Coeficientes de regresión (LM 1): [-52952.95152846 -53767.62485624  13312.88334575 -10320.06092603
  29920.76507621 -44490.47744263  29746.22226671  73636.15586366
 -39766.3987444  156065.71982235  -3697.40166109   4758.75361226]


## Segundo modelo

Es necesario añadir una columna constante (intercepto) a las variables predictoras, ya que `sm.OLS` no la incluye automáticamente

In [16]:
# Design matrix for statsmodels: X with a constant column placed first
# (prepend=True is the default, spelled out here for the reader).
W = sm.add_constant(X, prepend=True)

In [17]:
# Second model: statsmodels OLS on the constant-augmented design matrix.
# Built and fitted in one expression, so `lm_reg2` only ever refers to the
# fitted results object.
lm_reg2 = sm.OLS(endog=y, exog=W).fit()

Coeficientes del modelo

In [18]:
# Estimated parameters of the statsmodels fit (constant first, then x1..xk).
print("Parámetros (LM 2):", lm_reg2.params)

Parámetros (LM 2): const    219237.000643
x1       -52952.951528
x2       -53767.624856
x3        13312.883346
x4       -10320.060926
x5        29920.765076
x6       -44490.477443
x7        29746.222267
x8        73636.155864
x9       -39766.398744
x10      156065.719822
x11       -3697.401661
x12        4758.753612
dtype: float64


Resumen del modelo

In [19]:
# Regression report for the statsmodels fit. Kept as the cell's last bare
# expression so the notebook renders the returned summary object.
lm_reg2.summary()

0,1,2,3
Dep. Variable:,median_house_value,R-squared:,0.645
Model:,OLS,Adj. R-squared:,0.645
Method:,Least Squares,F-statistic:,3129.0
Date:,"Mon, 03 Jan 2022",Prob (F-statistic):,0.0
Time:,12:51:19,Log-Likelihood:,-259170.0
No. Observations:,20640,AIC:,518400.0
Df Residuals:,20627,BIC:,518500.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.192e+05,836.139,262.202,0.000,2.18e+05,2.21e+05
x1,-5.295e+04,2031.313,-26.068,0.000,-5.69e+04,-4.9e+04
x2,-5.377e+04,2135.527,-25.178,0.000,-5.8e+04,-4.96e+04
x3,1.331e+04,550.049,24.203,0.000,1.22e+04,1.44e+04
x4,-1.032e+04,1681.185,-6.139,0.000,-1.36e+04,-7024.805
x5,2.992e+04,2487.720,12.027,0.000,2.5e+04,3.48e+04
x6,-4.449e+04,1204.800,-36.928,0.000,-4.69e+04,-4.21e+04
x7,2.975e+04,2545.710,11.685,0.000,2.48e+04,3.47e+04
x8,7.364e+04,631.147,116.670,0.000,7.24e+04,7.49e+04

0,1,2,3
Omnibus:,5177.939,Durbin-Watson:,0.968
Prob(Omnibus):,0.0,Jarque-Bera (JB):,19747.113
Skew:,1.213,Prob(JB):,0.0
Kurtosis:,7.132,Cond. No.,127.0


Coeficientes del modelo

# Comparativa de modelos

In [20]:
# Side-by-side comparison of the two fits: `score` for the scikit-learn model
# and `rsquared` for the statsmodels results, both evaluated on the same X, y.
mod = ["Lm1", "Lm2"]
score = [lm_reg1.score(X, y), lm_reg2.rsquared]

# Build the table in a single constructor call instead of growing an empty frame.
df = pd.DataFrame({"Modelo": mod, "Score": score})
df

Unnamed: 0,Modelo,Score
0,Lm1,0.645453
1,Lm2,0.645453


Elaborado por Jairo Rojas