## 라이브러리

In [92]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr, kendalltau
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import pingouin as pg

## 데이터 불러오기

In [93]:
path = './data/'

df = pd.read_csv(path+'Coordinates_Preprocessed.csv')

In [94]:
df

Unnamed: 0,Building_Age,JS_Price,JS_BA,Population,UR,LC_index,CA_index,TC_index,SDT_index,HSP_index,Sell_Price,Crime_Rates,IR,Region_Name,Building_Use,YearMonth,Shortest_Distance_to_Subway,Shortest_Distance_to_School,Shortest_Distance_to_Univ,Shortest_Distance_to_Park
0,14,22500,84.70,433809,4.1,90.4,95.3,91.0,107.634598,91.7,39900.00,0.967620,1.25,강동구,아파트,201703,218.546662,342.320637,2080.047982,159.232767
1,0,16000,17.45,662019,3.4,98.0,101.1,99.1,112.039216,131.7,18000.00,0.834577,1.25,송파구,오피스텔,201912,365.167081,428.396368,2078.432085,1250.766345
2,30,42000,108.47,553927,2.7,78.0,84.3,81.7,120.439963,74.7,135000.00,1.537764,2.50,강남구,아파트,201310,698.127221,334.807784,1514.222790,918.048403
3,4,48000,84.95,674828,2.9,72.9,80.0,77.1,114.366829,79.4,91646.15,1.145652,3.25,송파구,아파트,201110,536.947700,24.176463,3817.518298,838.633151
4,0,70000,84.99,302243,2.1,109.0,109.0,110.3,87.677816,167.9,108000.00,0.725826,2.50,서대문구,아파트,202208,1173.890039,335.949816,1165.416466,701.477137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,133000,84.86,530126,3.4,102.6,101.3,100.8,128.819696,158.1,193333.33,1.352069,0.50,강남구,아파트,202011,1341.605321,298.254673,3567.318940,1272.216183
4996,10,49000,84.91,427540,3.1,74.5,81.5,78.5,102.461258,74.2,81850.00,1.221012,3.00,서초구,아파트,201207,440.715060,269.506677,1053.568719,2804.609144
4997,0,23000,30.00,425539,4.5,93.9,98.0,94.1,94.786910,106.1,27038.00,0.907344,1.50,강동구,연립다세대,201803,364.897534,391.843327,1835.115994,251.807047
4998,0,71000,84.65,571614,3.8,106.9,103.8,103.4,117.233889,177.7,110000.00,0.671993,0.50,강서구,아파트,202106,0.000000,809.669099,2549.064034,864.197138


In [95]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Building_Age                 5000 non-null   int64  
 1   JS_Price                     5000 non-null   int64  
 2   JS_BA                        5000 non-null   float64
 3   Population                   5000 non-null   int64  
 4   UR                           5000 non-null   float64
 5   LC_index                     5000 non-null   float64
 6   CA_index                     5000 non-null   float64
 7   TC_index                     5000 non-null   float64
 8   SDT_index                    5000 non-null   float64
 9   HSP_index                    5000 non-null   float64
 10  Sell_Price                   5000 non-null   float64
 11  Crime_Rates                  5000 non-null   float64
 12  IR                           5000 non-null   float64
 13  Region_Name       

In [96]:
df.columns

Index(['Building_Age', 'JS_Price', 'JS_BA', 'Population', 'UR', 'LC_index',
       'CA_index', 'TC_index', 'SDT_index', 'HSP_index', 'Sell_Price',
       'Crime_Rates', 'IR', 'Region_Name', 'Building_Use', 'YearMonth',
       'Shortest_Distance_to_Subway', 'Shortest_Distance_to_School',
       'Shortest_Distance_to_Univ', 'Shortest_Distance_to_Park'],
      dtype='object')

In [97]:
df.drop('YearMonth', axis=1, inplace=True)
df.drop('Region_Name', axis=1, inplace=True)
# df.drop('Building_Use', axis=1, inplace=True)

## Categorical:

- Nominal(variables that have two or more categories, but which do not have an intrinsic order.)

    - Region_Name : 자치구 명
    - Building_Use : 건물 용도
    
- Ordinal(variables that have two or more categories just like nominal variables. Only the categories can also be ordered or ranked.)

    
## Numeric:

- Discrete
    - YearMonth : 년월
    - Building_Age : 건물연식
    - JS_Price : 전세가
   
- Continous
    - Sell_Price : 매매 가격
    - JS_BA = JS_Building Area : 임대 면적
    - lR = Interest Rate : 금리
    - UR = Unemployment Rate : 실업률
    - LC_index = Leading Composite index : 선행종합 지수
    - CA_index = Comprehensive Accompany index : 동행종합 지수
    - TC_index = Trailing Composite index : 후행종합 지수
    - SDT_index = Supply and Demand Trend index = 전세수급동향 지수
    - HSP_index = 
    - Population : 인구수
    - Crime_Rates : 범죄율
    - Shortest_Distance_to_Subway : 가장 가까운 지하철역과의 거리
    - Shortest_Distance_to_School : 가장 가까운 초중고등학교와의 거리
    - Shortest_Distance_to_Univ : 가장 가까운 대학교와의 거리
    - Shortest_Distance_to_Park : 가장 가까운 공원과의 거리
    

## 범주형 변수 인코딩

In [98]:
# 범주형 변수 더미화 함수, 범주형 변수의 범주 레벨 간의 관계가 중요할 시 사용
def oh_encoding(df):
    # DataFrame의 복사본을 만듭니다.
    df_encoded = df.copy()
    for column in df.columns:
        if df[column].dtype == object:
            df_encoded = pd.get_dummies(df_encoded, columns=[column], prefix=column)
    return df_encoded

In [99]:
df_encoded = oh_encoding(df)

## PCC

In [100]:
pcc_result = pg.pairwise_corr(df, columns=list(df.select_dtypes(include=np.number).columns) + ['JS_Price'], method='pearson')
pcc_result

Unnamed: 0,X,Y,method,alternative,n,r,CI95%,p-unc,BF10,power
0,Building_Age,JS_Price,pearson,two-sided,5000,-0.093166,"[-0.12, -0.07]",4.096523e-11,5.084e+07,0.999998
1,Building_Age,JS_BA,pearson,two-sided,5000,0.080944,"[0.05, 0.11]",9.948288e-09,2.399e+05,0.999920
2,Building_Age,Population,pearson,two-sided,5000,0.122222,"[0.09, 0.15]",4.218035e-18,3.773e+14,1.000000
3,Building_Age,UR,pearson,two-sided,5000,0.051539,"[0.02, 0.08]",2.665162e-04,13.616,0.954183
4,Building_Age,LC_index,pearson,two-sided,5000,0.089791,"[0.06, 0.12]",2.011918e-10,1.073e+07,0.999995
...,...,...,...,...,...,...,...,...,...,...
148,Shortest_Distance_to_School,Shortest_Distance_to_Park,pearson,two-sided,5000,0.025905,"[-0.0, 0.05]",6.701391e-02,0.095,0.449063
149,Shortest_Distance_to_School,JS_Price,pearson,two-sided,5000,-0.095770,"[-0.12, -0.07]",1.153990e-11,1.757e+08,0.999999
150,Shortest_Distance_to_Univ,Shortest_Distance_to_Park,pearson,two-sided,5000,-0.131758,"[-0.16, -0.1]",8.347701e-21,1.767e+17,1.000000
151,Shortest_Distance_to_Univ,JS_Price,pearson,two-sided,5000,0.047784,"[0.02, 0.08]",7.250688e-04,5.351,0.922304


In [101]:
pcc_result = pcc_result[(pcc_result['Y'] == 'JS_Price') & (pcc_result['X'] != 'JS_Price')]

In [102]:
pcc_result.sort_values('p-unc', ascending=False).reset_index(drop=True)

Unnamed: 0,X,Y,method,alternative,n,r,CI95%,p-unc,BF10,power
0,Crime_Rates,JS_Price,pearson,two-sided,5000,0.005477,"[-0.02, 0.03]",0.6986339,0.019,0.06734
1,UR,JS_Price,pearson,two-sided,5000,0.012145,"[-0.02, 0.04]",0.3905811,0.026,0.137779
2,Population,JS_Price,pearson,two-sided,5000,-0.02114,"[-0.05, 0.01]",0.1350055,0.054,0.321166
3,Shortest_Distance_to_Park,JS_Price,pearson,two-sided,5000,0.024396,"[-0.0, 0.05]",0.08454691,0.078,0.407247
4,Shortest_Distance_to_Univ,JS_Price,pearson,two-sided,5000,0.047784,"[0.02, 0.08]",0.0007250688,5.351,0.922304
5,Building_Age,JS_Price,pearson,two-sided,5000,-0.093166,"[-0.12, -0.07]",4.096523e-11,50840000.0,0.999998
6,Building_Age,JS_Price,pearson,two-sided,5000,-0.093166,"[-0.12, -0.07]",4.096523e-11,50840000.0,0.999998
7,Shortest_Distance_to_School,JS_Price,pearson,two-sided,5000,-0.09577,"[-0.12, -0.07]",1.15399e-11,175700000.0,0.999999
8,Shortest_Distance_to_Subway,JS_Price,pearson,two-sided,5000,-0.126082,"[-0.15, -0.1]",3.59244e-19,4294000000000000.0,1.0
9,SDT_index,JS_Price,pearson,two-sided,5000,-0.133945,"[-0.16, -0.11]",1.873168e-21,7.746e+17,1.0


## 제거변수
- p-value값 고려
    - PCC 결과 a=0.05일 때 p-value값이 0.05초과인 변수는 통계적으로 상관관계가 유의하지 않으므로 변수 삭제 리스트에 추가
- correlation값 고려
    - PCC결과 상관계수 절댓값이 0.1이하인 변수는 삭제 리스트에 추가

In [103]:
# PCC 결과 데이터프레임에서 p-value가 0.05 이상이거나 상관계수 절댓값이 0.1 이하인 변수를 삭제 리스트에 추가
delete_columns = []

for index, row in pcc_result.iterrows():
    if (row['p-unc'] > 0.05) or (abs(row['r']) <= 0.1):
        delete_columns.append(row['X'])

# 중복된 변수 제거
delete_columns = list(set(delete_columns))
delete_columns.append('JS_Price')

# 삭제할 변수 리스트 출력
print(delete_columns)

['Crime_Rates', 'Shortest_Distance_to_Park', 'Shortest_Distance_to_School', 'Shortest_Distance_to_Univ', 'UR', 'Population', 'Building_Age', 'JS_Price']


In [104]:
selected_features = list(set(df_encoded.columns) - set(delete_columns))
selected_features

['LC_index',
 'Building_Use_오피스텔',
 'IR',
 'JS_BA',
 'Building_Use_연립다세대',
 'CA_index',
 'Shortest_Distance_to_Subway',
 'Building_Use_아파트',
 'HSP_index',
 'TC_index',
 'Sell_Price',
 'SDT_index']