In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import seaborn as sns
import statsmodels.formula.api as smf
from statsmodels.api import qqplot, add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV 
from scipy.stats import randint

from sklearn.metrics import confusion_matrix 


# - MEDV : 주택가격(중앙값)
# - CRIM : 범죄율
# - ZN : 주거지 비율
# - INDUS : 비소매업 비율
# - CHAS : 강 조망 여부(1-조망,0-비조망)
# - NOX : 산화질소 농도
# - RM : 주거당 평균 객실 수
# - AGE : 노후 건물 비율
# - DIS : 중심지(노동센터) 접근 거리
# - RAD : 고속도로 접근 편이성 지수
# - TAX : 재산세율
# - PTRATIO : 학생당 교사 비율
# - B : 흑인 인구 비율
# - LSTAT : 저소득층 비율

In [15]:
df_raw = pd.read_csv('C:/Users/moon/Documents/github/posco_academy/boston.csv',engine='python',encoding='CP949')

In [16]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   癤풫EDV    506 non-null    float64
 1   CRIM     506 non-null    float64
 2   ZN       506 non-null    float64
 3   INDUS    506 non-null    int64  
 4   CHAS     506 non-null    int64  
 5   NOX      506 non-null    float64
 6   RM       506 non-null    float64
 7   AGE      506 non-null    float64
 8   DIS      506 non-null    float64
 9   RAD      506 non-null    int64  
 10  TAX      506 non-null    int64  
 11  PTRATIO  506 non-null    float64
 12  B        506 non-null    float64
 13  LSTAT    506 non-null    float64
dtypes: float64(10), int64(4)
memory usage: 55.5 KB


In [17]:
df_raw.describe()

Unnamed: 0,癤풫EDV,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,22.532806,3.613524,11.363636,2.337945,0.06917,0.554695,6.284634,68.574901,3.795043,2.274704,2.539526,18.455534,356.67403,12.653063
std,9.197104,8.601545,23.322453,0.990261,0.253994,0.115878,0.702617,28.148862,2.10571,1.217705,1.132516,2.164946,91.294863,7.141062
min,5.0,0.00632,0.0,1.0,0.0,0.385,3.561,2.9,1.1296,1.0,1.0,12.6,0.32,1.73
25%,17.025,0.082045,0.0,1.0,0.0,0.449,5.8855,45.025,2.100175,1.0,2.0,17.4,375.377487,6.95
50%,21.200001,0.25651,0.0,2.0,0.0,0.538,6.2085,77.5,3.20745,2.0,3.0,19.05,391.440002,11.36
75%,25.0,3.677083,12.5,3.0,0.0,0.624,6.6235,94.074999,5.188425,4.0,4.0,20.200001,396.225006,16.954999
max,50.0,88.976196,100.0,4.0,1.0,0.871,8.78,100.0,12.1265,4.0,4.0,22.0,396.899994,37.970001


In [18]:
df_raw.CHAS = df_raw.CHAS.astype('object')
df_raw.RAD = df_raw.RAD.astype('object')
df_raw.TAX = df_raw.TAX.astype('object')
df_raw.INDUS = df_raw.INDUS.astype('object')

In [19]:
df_raw.rename(columns={'癤풫EDV':'MEDV'}, inplace=True)

reg_model=smf.ols(formula="MEDV ~ CRIM+ZN+C(INDUS)+C(CHAS)+NOX+RM+AGE+DIS+C(RAD)+C(TAX)+PTRATIO+B+LSTAT", data=df_raw)
reg_result = reg_model.fit()
print(reg_result.summary())

                            OLS Regression Results                            
Dep. Variable:                   MEDV   R-squared:                       0.746
Model:                            OLS   Adj. R-squared:                  0.736
Method:                 Least Squares   F-statistic:                     74.99
Date:                Mon, 23 Nov 2020   Prob (F-statistic):          5.12e-131
Time:                        14:46:18   Log-Likelihood:                -1493.9
No. Observations:                 506   AIC:                             3028.
Df Residuals:                     486   BIC:                             3112.
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        38.4897      5.377      7.158

In [21]:
#다중공선성 확인
df_raw_x2 = df_raw.drop("MEDV",axis=1)[['CRIM', 'ZN', 'NOX', 'RM', 'AGE', 'DIS','PTRATIO', 'B', 'LSTAT']]
df_raw_x2_const = add_constant(df_raw_x2)
df_vif = pd.DataFrame()
df_vif["variable"] = df_raw_x2_const.columns
df_vif["VIF"] = [variance_inflation_factor(df_raw_x2_const.values, i) for i in range(df_raw_x2_const.shape[1])]
df_vif.sort_values("VIF",inplace=True)
df_vif.round(3)

Unnamed: 0,variable,VIF
8,B,1.307
7,PTRATIO,1.387
1,CRIM,1.475
4,RM,1.842
2,ZN,2.154
9,LSTAT,2.908
5,AGE,3.069
3,NOX,3.281
6,DIS,3.746
0,const,516.108


In [23]:
reg_model=smf.ols(formula="MEDV ~ CRIM+C(INDUS)+C(CHAS)+NOX+RM+DIS+C(RAD)+C(TAX)+PTRATIO+B+LSTAT", data=df_raw)
reg_result = reg_model.fit()
print(reg_result.summary())

                            OLS Regression Results                            
Dep. Variable:                   MEDV   R-squared:                       0.744
Model:                            OLS   Adj. R-squared:                  0.735
Method:                 Least Squares   F-statistic:                     83.40
Date:                Mon, 23 Nov 2020   Prob (F-statistic):          3.09e-132
Time:                        15:24:16   Log-Likelihood:                -1495.6
No. Observations:                 506   AIC:                             3027.
Df Residuals:                     488   BIC:                             3103.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        40.0513      5.304      7.551