### 선형회귀 진단과 모델링 - StatsModels 라이브러리 사용

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import scipy.stats as st
from sklearn.datasets import fetch_california_housing
from statsmodels.stats.outliers_influence import variance_inflation_factor
%matplotlib inline

#### 1. Scikit-Learn의 California Housing 데이터셋을 읽어와서 DataFrame 으로 만들어 둔다.

In [2]:
data = fetch_california_housing()

In [3]:
# 설명변수.
X = data['data']
header = data['feature_names']

In [4]:
# 반응변수.
Y = data['target']
Y = Y.reshape(-1, 1)

In [5]:
df = pd.DataFrame(np.append(X,Y,axis = 1))
df.columns = list(header)+['PRICE']

In [6]:
df.head(5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,PRICE
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


#### 2. R 스타일의 수식을 사용해서 모델링 해본다.

In [7]:
# 모든 설명 변수들을 사용해 본다.
my_model = smf.ols(formula = 'PRICE ~ MedInc + HouseAge + AveRooms + AveBedrms + AveOccup + Latitude + Longitude ', data=df)
my_model = my_model.fit()
my_model.summary()

0,1,2,3
Dep. Variable:,PRICE,R-squared:,0.606
Model:,OLS,Adj. R-squared:,0.606
Method:,Least Squares,F-statistic:,4538.0
Date:,"Tue, 13 Jun 2023",Prob (F-statistic):,0.0
Time:,08:13:29,Log-Likelihood:,-22624.0
No. Observations:,20640,AIC:,45260.0
Df Residuals:,20632,BIC:,45330.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-36.9175,0.658,-56.085,0.000,-38.208,-35.627
MedInc,0.4368,0.004,104.089,0.000,0.429,0.445
HouseAge,0.0096,0.000,22.602,0.000,0.009,0.010
AveRooms,-0.1071,0.006,-18.217,0.000,-0.119,-0.096
AveBedrms,0.6449,0.028,22.922,0.000,0.590,0.700
AveOccup,-0.0038,0.000,-7.861,0.000,-0.005,-0.003
Latitude,-0.4207,0.007,-58.763,0.000,-0.435,-0.407
Longitude,-0.4340,0.008,-57.782,0.000,-0.449,-0.419

0,1,2,3
Omnibus:,4406.193,Durbin-Watson:,0.885
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14155.786
Skew:,1.084,Prob(JB):,0.0
Kurtosis:,6.429,Cond. No.,16800.0


In [8]:
# 모형의 속성을 살펴본다.
print(dir(my_model))

['HC0_se', 'HC1_se', 'HC2_se', 'HC3_se', '_HCCM', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abat_diagonal', '_cache', '_data_attr', '_data_in_cache', '_get_robustcov_results', '_is_nested', '_use_t', '_wexog_singular_values', 'aic', 'bic', 'bse', 'centered_tss', 'compare_f_test', 'compare_lm_test', 'compare_lr_test', 'condition_number', 'conf_int', 'conf_int_el', 'cov_HC0', 'cov_HC1', 'cov_HC2', 'cov_HC3', 'cov_kwds', 'cov_params', 'cov_type', 'df_model', 'df_resid', 'diagn', 'eigenvals', 'el_test', 'ess', 'f_pvalue', 'f_test', 'fittedvalues', 'fvalue', 'get_influence', 'get_prediction', 'get_robustcov_results', 'info_criteria', 'initialize', 'k_constant', 'llf', 'load', 'model', 'mse_mo

In [9]:
# 모형의 파라미터 출력.
my_model.params

Intercept   -36.917537
MedInc        0.436760
HouseAge      0.009555
AveRooms     -0.107134
AveBedrms     0.644902
AveOccup     -0.003819
Latitude     -0.420699
Longitude    -0.434042
dtype: float64

In [10]:
# 설명 변수의 수.
X.shape[1]

8

In [11]:
# VIF를 계산해 본다.
vifs = [variance_inflation_factor(X, i) for i in range(X.shape[1])]
pd.Series(vifs, index=header)

MedInc         11.511140
HouseAge        7.195917
AveRooms       45.993601
AveBedrms      43.590314
Population      2.935745
AveOccup        1.095243
Latitude      559.874071
Longitude     633.711654
dtype: float64

**주의: Adjusted $R^2 = 0.606$, $AIC = 45260$.**