In [58]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
import math
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
import pingouin as pg
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings

# Regression Analysis

In [59]:
warnings.simplefilter('ignore')

In [60]:
path = './data/'

# 데이터 불러오기
df = pd.read_csv(path+'Coordinates_Preprocessed.csv')

In [61]:
# 필요 없는 열 삭제
df.drop(['YearMonth', 'Region_Name', 'Building_Use'], axis=1, inplace=True)

In [62]:
# 데이터 전처리 함수
def data_preprocessing(df):
    # 이상치 제거
    def detect_outliers(df, features_to_process):
        df_cleaned = df.copy()
        for feature in features_to_process:
            Q1 = df_cleaned[feature].quantile(0.25)
            Q3 = df_cleaned[feature].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            df_cleaned = df_cleaned[(df_cleaned[feature] >= lower_bound) & (df_cleaned[feature] <= upper_bound)]
        return df_cleaned

    df = detect_outliers(df, df.columns)

    # 결측치 제거
    df.dropna(inplace=True)

#     # 범주형 변수 더미 인코딩
#     df_encoded = pd.get_dummies(df, columns=['Region_Name', 'Building_Use'], prefix=['Region_Name', 'Building_Use'])

    df_encoded = df
    
    # Min-Max 스케일링
    scaler = MinMaxScaler()
    numeric_columns = df_encoded.select_dtypes(include=['float', 'int']).columns
    df_encoded[numeric_columns] = scaler.fit_transform(df_encoded[numeric_columns])

    return df_encoded

df_encoded = data_preprocessing(df)


In [63]:
# 회귀 분석
selected_features = ['Shortest_Distance_to_Park',
 'LC_index',
 'TC_index',
 'SDT_index',
 'Sell_Price',
 'Building_Age',
 'IR',
 'HSP_index',
 'JS_BA',
 'Shortest_Distance_to_Subway',
 'CA_index']

# 데이터 선택
X = df_encoded[selected_features]
X = sm.add_constant(X)
Y = df_encoded['JS_Price']

# 선형 회귀 모델
model = pg.linear_regression(X, Y)

In [64]:
model

Unnamed: 0,names,coef,se,T,pval,r2,adj_r2,CI[2.5%],CI[97.5%]
0,Intercept,-0.011481,0.015361,-0.747384,0.4548786,0.762553,0.761855,-0.041599,0.018637
1,Shortest_Distance_to_Park,-0.011552,0.007174,-1.610383,0.1073986,0.762553,0.761855,-0.025616,0.002512
2,LC_index,0.317255,0.11684,2.715286,0.006652271,0.762553,0.761855,0.088178,0.546332
3,TC_index,-0.129178,0.069552,-1.857287,0.06334895,0.762553,0.761855,-0.265541,0.007186
4,SDT_index,0.06181,0.012779,4.836969,1.371464e-06,0.762553,0.761855,0.036756,0.086864
5,Sell_Price,0.47767,0.008626,55.373758,0.0,0.762553,0.761855,0.460757,0.494582
6,Building_Age,-0.227402,0.007535,-30.179923,2.866584e-179,0.762553,0.761855,-0.242175,-0.212629
7,IR,-0.006765,0.010284,-0.657857,0.5106705,0.762553,0.761855,-0.026928,0.013397
8,HSP_index,-0.23429,0.025814,-9.076175,1.777655e-19,0.762553,0.761855,-0.284901,-0.18368
9,JS_BA,0.400102,0.009087,44.03118,0.0,0.762553,0.761855,0.382287,0.417918


In [65]:
# 변수 선택법
X_without_constant = X.drop('const', axis=1)
vif = pd.DataFrame()
vif["Features"] = X_without_constant.columns
vif["VIF"] = [variance_inflation_factor(X_without_constant.values, i) for i in range(X_without_constant.shape[1])]

In [66]:
vif

Unnamed: 0,Features,VIF
0,Shortest_Distance_to_Park,4.568639
1,LC_index,1485.532447
2,TC_index,473.620319
3,SDT_index,9.708609
4,Sell_Price,5.417724
5,Building_Age,4.128381
6,IR,9.374678
7,HSP_index,47.976464
8,JS_BA,9.056481
9,Shortest_Distance_to_Subway,4.249364


In [67]:
# 변수 제거
delete_columns = []
for i in range(1, len(model.index)):
    if model['pval'][i] > 0.05 and vif['VIF'].iloc[i-1] > 10:
        delete_columns.append(model['names'][i-1])

selected_features = list(set(selected_features) - set(delete_columns))

# 변수 제거 후 다시 회귀 모델 생성
X = df_encoded[selected_features]
X = sm.add_constant(X)
Y = df_encoded['JS_Price']
model = pg.linear_regression(X, Y)

model

Unnamed: 0,names,coef,se,T,pval,r2,adj_r2,CI[2.5%],CI[97.5%]
0,Intercept,-0.019488,0.015089,-1.29159,0.1965791,0.762085,0.761449,-0.049071,0.010094
1,HSP_index,-0.175461,0.014046,-12.492269,4.091203e-35,0.762085,0.761449,-0.202999,-0.147924
2,SDT_index,0.077705,0.011369,6.835024,9.533623e-12,0.762085,0.761449,0.055416,0.099994
3,CA_index,0.359453,0.052488,6.848346,8.696854e-12,0.762085,0.761449,0.256546,0.46236
4,TC_index,-0.033526,0.060022,-0.558558,0.5764966,0.762085,0.761449,-0.151205,0.084153
5,Building_Age,-0.227123,0.007541,-30.120085,1.2122589999999999e-178,0.762085,0.761449,-0.241907,-0.212339
6,JS_BA,0.39976,0.009094,43.960291,0.0,0.762085,0.761449,0.381931,0.417589
7,Shortest_Distance_to_Park,-0.0115,0.00718,-1.60183,0.1092776,0.762085,0.761449,-0.025577,0.002576
8,Sell_Price,0.478075,0.008632,55.38199,0.0,0.762085,0.761449,0.461151,0.495
9,IR,-0.019779,0.009106,-2.172003,0.02991803,0.762085,0.761449,-0.037633,-0.001925


In [70]:
# 변수 제거 후 다시 회귀 모델 생성
selected_features = [feature for feature in selected_features if feature not in ['TC_index', 'Shortest_Distance_to_Park']]

# 변수 제거 후 다시 회귀 모델 생성
X = df_encoded[selected_features]
X = sm.add_constant(X)
Y = df_encoded['JS_Price']
model = pg.linear_regression(X, Y)

model

Unnamed: 0,names,coef,se,T,pval,r2,adj_r2,CI[2.5%],CI[97.5%]
0,Intercept,-0.022264,0.014372,-1.549168,0.1214258,0.761903,0.761394,-0.050442,0.005913
1,HSP_index,-0.179831,0.01159,-15.515811,1.145858e-52,0.761903,0.761394,-0.202555,-0.157108
2,SDT_index,0.076103,0.011006,6.914818,5.484865e-12,0.761903,0.761394,0.054525,0.097681
3,CA_index,0.331645,0.016459,20.149208,8.328501999999999e-86,0.761903,0.761394,0.299375,0.363916
4,Building_Age,-0.228761,0.007477,-30.596396,1.070302e-183,0.761903,0.761394,-0.24342,-0.214102
5,JS_BA,0.400724,0.009076,44.154314,0.0,0.761903,0.761394,0.38293,0.418517
6,Sell_Price,0.478726,0.008625,55.506196,0.0,0.761903,0.761394,0.461816,0.495635
7,IR,-0.022018,0.008118,-2.712257,0.006713218,0.761903,0.761394,-0.037934,-0.006102
8,Shortest_Distance_to_Subway,-0.042766,0.007453,-5.738163,1.032992e-08,0.761903,0.761394,-0.057378,-0.028154
