In [141]:
pip install statsmodels

Note: you may need to restart the kernel to use updated packages.


DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063


In [142]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy import stats
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [143]:
# Directory (relative to the notebook) that holds the input CSV files.
path = './data/'

# Read the merged monthly dataset into a DataFrame.
df = pd.read_csv(f'{path}전세가격예측프로젝트_월별데이터_총병합.csv')

In [144]:
# Show the loaded frame as the cell output for a quick visual check.
df

Unnamed: 0,Year,Month,Region_Name,Region_Code,Building_Use,Sell_Price,Sell_BA,Sell_PPA,Sell_PPP,Sell_Count,...,JS_Count,CR,CR_PPA,CR_PPP,IR,UR,LC_index,CA_index,TC_index,SDT_index
0,2012,7,강남구,11680,단독다가구,316562.50,344.92,943.75,285.48,8,...,135,3.89,22.31,22.31,3.0,3.1,74.5,81.5,78.5,102.461258
1,2012,7,강남구,11680,아파트,82549.28,81.56,1023.00,309.46,133,...,750,46.13,44.26,44.26,3.0,3.1,74.5,81.5,78.5,102.461258
2,2012,7,강남구,11680,연립다세대,28362.79,48.58,565.35,171.02,38,...,140,61.63,61.95,61.95,3.0,3.1,74.5,81.5,78.5,102.461258
3,2012,7,강남구,11680,오피스텔,25309.88,39.07,671.99,203.28,42,...,10,157.65,62.87,62.86,3.0,3.1,74.5,81.5,78.5,102.461258
4,2012,7,강동구,11740,단독다가구,103757.14,336.59,480.88,145.47,21,...,217,7.61,33.75,33.75,3.0,3.1,74.5,81.5,78.5,102.461258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12185,2022,8,중구,11140,오피스텔,33270.39,29.85,1108.02,335.18,162,...,72,74.87,74.52,74.52,2.5,2.1,109.0,109.0,110.3,87.677816
12186,2022,8,중랑구,11260,단독다가구,113917.78,221.65,517.06,156.41,81,...,271,11.79,66.34,66.34,2.5,2.1,109.0,109.0,110.3,87.677816
12187,2022,8,중랑구,11260,아파트,54521.62,58.36,989.66,299.37,74,...,414,53.66,44.39,44.39,2.5,2.1,109.0,109.0,110.3,87.677816
12188,2022,8,중랑구,11260,연립다세대,33390.06,48.04,743.72,224.98,327,...,276,72.34,95.16,95.16,2.5,2.1,109.0,109.0,110.3,87.677816


## Categorical:

- Nominal(variables that have two or more categories, but which do not have an intrinsic order.)

    - Region_Name : 자치구 명
    - Building_Use : 건물 용도
    
- Ordinal (variables that have two or more categories, just like nominal variables, except that the categories can also be ordered or ranked.)

    
## Numeric:

- Discrete
    - Year : 년
    - Month : 월
    - Region_Code : 자치구 코드
    - JS_Count : 전세 거래량
   
- Continuous
    - Sell : 매매
    - Sell_Price : 매매 가격
    - Sell_BA = Sell_building Area : 매매 건물 면적
    - Sell_PPA = Sell_Price Per Area : 면적 당 매매 가격
    - Sell_PPP = Sell_Price Per Pyeong : 평 당 매매 가격
    - Sell_Count : 매매 거래량
    - JS : 전세
    - JS_Price : 전세 가격
    - JS_BA = JS_Building Area : 임대 면적
    - JS_PPA = JS_Price Per Area : 임대 면적 당 전세 가격
    - JS_PPP = JS_Price Per Pyeong : 평 당 전세 가격
    - CR = Charter Rate : 전세가율
    - CR_PPA  = Charter_Rate_Price Per Area : 면적 당 전세가율
    - CR_PPP = Charter Rate_Price Per Pyeong : 평 당 전세가율
    - IR = Interest Rate : 금리
    - UR = Unemployment Rate : 실업률
    - LC_index = Leading Composite index : 선행종합 지수
    - CA_index = Comprehensive Accompany index : 동행종합 지수
    - TC_index = Trailing Composite index : 후행종합 지수
    - SDT_index = Supply and Demand Trend index : 전세수급동향 지수
    

In [145]:
# Print the frame summary: entry count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12190 entries, 0 to 12189
Data columns (total 24 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Year          12190 non-null  int64  
 1   Month         12190 non-null  int64  
 2   Region_Name   12190 non-null  object 
 3   Region_Code   12190 non-null  int64  
 4   Building_Use  12190 non-null  object 
 5   Sell_Price    12190 non-null  float64
 6   Sell_BA       12190 non-null  float64
 7   Sell_PPA      12190 non-null  float64
 8   Sell_PPP      12190 non-null  float64
 9   Sell_Count    12190 non-null  int64  
 10  JS_Price      12190 non-null  float64
 11  JS_BA         12190 non-null  float64
 12  JS_PPA        12190 non-null  float64
 13  JS_PPP        12190 non-null  float64
 14  JS_Count      12190 non-null  int64  
 15  CR            12190 non-null  float64
 16  CR_PPA        12190 non-null  float64
 17  CR_PPP        12190 non-null  float64
 18  IR            12190 non-nu

In [146]:
# List the column labels of the loaded frame.
df.columns

Index(['Year', 'Month', 'Region_Name', 'Region_Code', 'Building_Use',
       'Sell_Price', 'Sell_BA', 'Sell_PPA', 'Sell_PPP', 'Sell_Count',
       'JS_Price', 'JS_BA', 'JS_PPA', 'JS_PPP', 'JS_Count', 'CR', 'CR_PPA',
       'CR_PPP', 'IR', 'UR', 'LC_index', 'CA_index', 'TC_index', 'SDT_index'],
      dtype='object')

## Feature_Scaling

In [147]:
# Disabled alternative: standardize the regressors with scikit-learn's
# StandardScaler. Everything in this cell is commented out and does not run.
# NOTE(review): the column list below names 'BA_CR' and 'PPA_CR', which are not
# among the labels printed by df.columns above ('CR_PPA' and 'CR_PPP' are);
# update the names before re-enabling this cell.
# from sklearn.preprocessing import StandardScaler

# # Select the variables to use in the regression analysis
# X = df[['Year', 'Month', 'Sell_Price', 'Sell_BA', 'Sell_PPA', 'Sell_PPP', 'Sell_Count',
#         'JS_BA', 'JS_PPA', 'JS_PPP', 'JS_Count', 'CR', 'BA_CR',
#        'PPA_CR', 'IR', 'UR', 'LC_index', 'CA_index', 'TC_index', 'SDT_index']]

# # Create the StandardScaler object used for standardization
# scaler = StandardScaler()

# # Standardize the data
# X_scaled = scaler.fit_transform(X)

In [148]:
def standardize(df):
    """Return a z-scored copy of ``df``.

    Every column whose dtype equals ``float`` (i.e. float64) is replaced by
    ``(x - mean) / std``, where ``std`` is what ``np.std`` returns for the
    column. All other columns are carried over unchanged.

    The input frame is left untouched, so the raw data stays available to the
    caller and re-running the calling cell is safe. A column whose standard
    deviation is zero is only centred (its values become 0.0) instead of being
    divided by zero, which would fill it with NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to standardize.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    # Work on a copy so the caller's frame is never mutated.
    scaled = df.copy()
    for column in scaled.columns:
        values = scaled[column]
        if values.dtype != float:
            continue
        centred = values - np.mean(values)
        spread = np.std(values)
        # Zero (or NaN) spread: keep the centred values rather than emit 0/0.
        scaled[column] = centred / spread if spread > 0 else centred
    return scaled

In [149]:
# Rebind df to its standardized version: float64 columns become z-scores, so
# the regression target JS_Price is rescaled along with the features.
df = standardize(df)

In [150]:
def oh_encoding(df):
    """One-hot encode every object-dtype column of ``df``.

    Each object column is replaced by indicator columns named
    ``<column>_<value>`` via ``pd.get_dummies``; the name of every encoded
    column is printed as it is processed. The encoding is applied to a copy,
    which is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns should be dummy-encoded.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the object columns expanded into dummies.
    """
    # Lazily pick the object-dtype columns, in the frame's column order.
    object_columns = (name for name in df.columns if df[name].dtype == object)

    encoded_df = df.copy()
    for name in object_columns:
        encoded_df = pd.get_dummies(encoded_df, columns=[name], prefix=name)
        print(name)
    return encoded_df

In [151]:
# One-hot encode the object columns of df; oh_encoding prints the name of each
# column it encodes.
df_encoded = oh_encoding(df)

Region_Name
Building_Use


## Regression Analysis

In [153]:
# Feature selection
# NOTE(review): per the data dictionary, Sell_PPA/Sell_PPP and JS_PPA/JS_PPP
# look like the same price expressed per area vs. per pyeong, and the JS_*
# features are derived from the target JS_Price — check for perfect
# collinearity / target leakage before interpreting the coefficients.
selected_features = ['JS_BA', 'Sell_PPA', 'Sell_PPP', 'JS_PPA', 'JS_PPP', 'JS_Count', 'Sell_Count', 'LC_index', 'TC_index', 'Year',
                     'CA_index', 'Sell_Price', 'CR', 'Sell_BA', 'IR', 'Building_Use_아파트', 'Region_Name_강남구', 'Region_Name_서초구', 'Building_Use_단독다가구']

# Select the X data.
# Cast to float: on pandas >= 2.0 get_dummies produces bool columns, and a
# mixed float/bool design matrix is converted to an object array that
# statsmodels rejects. The cast does not change the values being fitted.
X = df_encoded[selected_features].astype(float)

# Add the constant (intercept) term
X = sm.add_constant(X)

# Select the dependent variable
y = df_encoded['JS_Price']

# Fit the regression model
model = sm.OLS(y, X).fit()

# Print the result summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               JS_Price   R-squared:                       0.954
Model:                            OLS   Adj. R-squared:                  0.954
Method:                 Least Squares   F-statistic:                 1.329e+04
Date:                Tue, 12 Sep 2023   Prob (F-statistic):               0.00
Time:                        13:42:03   Log-Likelihood:                 1474.7
No. Observations:               12190   AIC:                            -2909.
Df Residuals:                   12170   BIC:                            -2761.
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 14.9123     12