## 라이브러리

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr, kendalltau
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

## 데이터 불러오기

In [2]:
# Directory holding the input CSV files (relative to the notebook's working directory).
path = './data/'

# Read the preprocessed dataset into the working DataFrame.
df = pd.read_csv(f'{path}Coordinates_Preprocessed.csv')

In [3]:
# Show the loaded frame; the notebook renders only the first and last rows.
df

Unnamed: 0,Building_Age,JS_Price,JS_BA,Population,UR,LC_index,CA_index,TC_index,SDT_index,HSP_index,Sell_Price,Crime_Rates,IR,Region_Name,Building_Use,YearMonth,Shortest_Distance_to_Subway,Shortest_Distance_to_School,Shortest_Distance_to_Univ,Shortest_Distance_to_Park
0,14,22500,84.70,433809,4.1,90.4,95.3,91.0,107.634598,91.7,39900.00,0.967620,1.25,강동구,아파트,201703,218.546662,342.320637,2080.047982,159.232767
1,0,16000,17.45,662019,3.4,98.0,101.1,99.1,112.039216,131.7,18000.00,0.834577,1.25,송파구,오피스텔,201912,365.167081,428.396368,2078.432085,1250.766345
2,30,42000,108.47,553927,2.7,78.0,84.3,81.7,120.439963,74.7,135000.00,1.537764,2.50,강남구,아파트,201310,698.127221,334.807784,1514.222790,918.048403
3,4,48000,84.95,674828,2.9,72.9,80.0,77.1,114.366829,79.4,91646.15,1.145652,3.25,송파구,아파트,201110,536.947700,24.176463,3817.518298,838.633151
4,0,70000,84.99,302243,2.1,109.0,109.0,110.3,87.677816,167.9,108000.00,0.725826,2.50,서대문구,아파트,202208,1173.890039,335.949816,1165.416466,701.477137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,133000,84.86,530126,3.4,102.6,101.3,100.8,128.819696,158.1,193333.33,1.352069,0.50,강남구,아파트,202011,1341.605321,298.254673,3567.318940,1272.216183
4996,10,49000,84.91,427540,3.1,74.5,81.5,78.5,102.461258,74.2,81850.00,1.221012,3.00,서초구,아파트,201207,440.715060,269.506677,1053.568719,2804.609144
4997,0,23000,30.00,425539,4.5,93.9,98.0,94.1,94.786910,106.1,27038.00,0.907344,1.50,강동구,연립다세대,201803,364.897534,391.843327,1835.115994,251.807047
4998,0,71000,84.65,571614,3.8,106.9,103.8,103.4,117.233889,177.7,110000.00,0.671993,0.50,강서구,아파트,202106,0.000000,809.669099,2549.064034,864.197138


In [4]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Building_Age                 5000 non-null   int64  
 1   JS_Price                     5000 non-null   int64  
 2   JS_BA                        5000 non-null   float64
 3   Population                   5000 non-null   int64  
 4   UR                           5000 non-null   float64
 5   LC_index                     5000 non-null   float64
 6   CA_index                     5000 non-null   float64
 7   TC_index                     5000 non-null   float64
 8   SDT_index                    5000 non-null   float64
 9   HSP_index                    5000 non-null   float64
 10  Sell_Price                   5000 non-null   float64
 11  Crime_Rates                  5000 non-null   float64
 12  IR                           5000 non-null   float64
 13  Region_Name       

In [5]:
# List the column labels for reference in the sections below.
df.columns

Index(['Building_Age', 'JS_Price', 'JS_BA', 'Population', 'UR', 'LC_index',
       'CA_index', 'TC_index', 'SDT_index', 'HSP_index', 'Sell_Price',
       'Crime_Rates', 'IR', 'Region_Name', 'Building_Use', 'YearMonth',
       'Shortest_Distance_to_Subway', 'Shortest_Distance_to_School',
       'Shortest_Distance_to_Univ', 'Shortest_Distance_to_Park'],
      dtype='object')

In [6]:
# Cast YearMonth (values such as 201703 — presumably parsed as integers; confirm) to object
# so it is treated as a category label; it is target-encoded together with the other
# categorical columns further down.
df['YearMonth'] = df['YearMonth'].astype('object')

## Categorical:

- Nominal(variables that have two or more categories, but which do not have an intrinsic order.)

    - Region_Name : 자치구 명
    - Building_Use : 건물 용도
    
- Ordinal(variables that have two or more categories just like nominal variables. Only the categories can also be ordered or ranked.)

    
## Numeric:

- Discrete
    - YearMonth : 년월
    - Building_Age : 건물연식
    - JS_Price : 전세가
   
- Continuous
    - Sell_Price : 매매 가격
    - JS_BA = JS_Building Area : 임대 면적
    - IR = Interest Rate : 금리
    - UR = Unemployment Rate : 실업률
    - LC_index = Leading Composite index : 선행종합 지수
    - CA_index = Comprehensive Accompany index : 동행종합 지수
    - TC_index = Trailing Composite index : 후행종합 지수
    - SDT_index = Supply and Demand Trend index : 전세수급동향 지수
    - HSP_index = (TODO: 약어 풀이 및 설명 추가 필요)
    - Population : 인구수
    - Crime_Rates : 범죄율
    - Shortest_Distance_to_Subway : 가장 가까운 지하철역과의 거리
    - Shortest_Distance_to_School : 가장 가까운 초중고등학교와의 거리
    - Shortest_Distance_to_Univ : 가장 가까운 대학교와의 거리
    - Shortest_Distance_to_Park : 가장 가까운 공원과의 거리
    

## PCC

- 범주형 변수 타겟인코딩

In [7]:
def target_encoding(df, categorical_columns, target_column):
    """Replace each categorical column with the per-category mean of the target.

    For every name in ``categorical_columns`` a new column ``<name>_encoded`` is
    appended, holding the mean of ``target_column`` over all rows that share the
    row's category; the original categorical column is then removed.

    Note: the frame passed in is modified in place and the very same object is
    returned, so the caller's ``df`` and the return value are aliases.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the categorical columns and the target column.
    categorical_columns : iterable of str
        Names of the columns to encode.
    target_column : str
        Name of the column whose group means are used as the encoding.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after encoding.
    """
    for col in categorical_columns:
        # The means must be computed while the categorical column is still present.
        category_means = df.groupby(col)[target_column].mean().to_dict()
        encoded_name = col + '_encoded'
        # pop() removes the original column in place and hands back its values for mapping.
        df[encoded_name] = df.pop(col).map(category_means)
    return df

# Columns to be treated as categorical, and the target whose group means encode them.
categorical_columns = ['Region_Name', 'YearMonth', 'Building_Use']
target_column = 'JS_Price'

# Apply target encoding. target_encoding modifies df in place and returns it,
# so df_encoded and df refer to the same frame after this call.
df_encoded = target_encoding(df, categorical_columns, target_column)

In [8]:
df_encoded

Unnamed: 0,Building_Age,JS_Price,JS_BA,Population,UR,LC_index,CA_index,TC_index,SDT_index,HSP_index,Sell_Price,Crime_Rates,IR,Shortest_Distance_to_Subway,Shortest_Distance_to_School,Shortest_Distance_to_Univ,Shortest_Distance_to_Park,Region_Name_encoded,YearMonth_encoded,Building_Use_encoded
0,14,22500,84.70,433809,4.1,90.4,95.3,91.0,107.634598,91.7,39900.00,0.967620,1.25,218.546662,342.320637,2080.047982,159.232767,33726.793220,40437.837838,39231.789867
1,0,16000,17.45,662019,3.4,98.0,101.1,99.1,112.039216,131.7,18000.00,0.834577,1.25,365.167081,428.396368,2078.432085,1250.766345,47540.026030,46627.590909,20770.125373
2,30,42000,108.47,553927,2.7,78.0,84.3,81.7,120.439963,74.7,135000.00,1.537764,2.50,698.127221,334.807784,1514.222790,918.048403,54890.388489,34813.620690,39231.789867
3,4,48000,84.95,674828,2.9,72.9,80.0,77.1,114.366829,79.4,91646.15,1.145652,3.25,536.947700,24.176463,3817.518298,838.633151,47540.026030,22473.529412,39231.789867
4,0,70000,84.99,302243,2.1,109.0,109.0,110.3,87.677816,167.9,108000.00,0.725826,2.50,1173.890039,335.949816,1165.416466,701.477137,34823.335878,59168.750000,39231.789867
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,133000,84.86,530126,3.4,102.6,101.3,100.8,128.819696,158.1,193333.33,1.352069,0.50,1341.605321,298.254673,3567.318940,1272.216183,54890.388489,40992.459459,39231.789867
4996,10,49000,84.91,427540,3.1,74.5,81.5,78.5,102.461258,74.2,81850.00,1.221012,3.00,440.715060,269.506677,1053.568719,2804.609144,58066.764706,27004.545455,39231.789867
4997,0,23000,30.00,425539,4.5,93.9,98.0,94.1,94.786910,106.1,27038.00,0.907344,1.50,364.897534,391.843327,1835.115994,251.807047,33726.793220,40960.181818,20913.237705
4998,0,71000,84.65,571614,3.8,106.9,103.8,103.4,117.233889,177.7,110000.00,0.671993,0.50,0.000000,809.669099,2549.064034,864.197138,25190.794702,52163.285714,39231.789867


- pcc

In [9]:
# Collect one record per feature and build the result frame once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# copies the whole frame on every iteration.
pcc_records = []

# Pearson correlation coefficient (PCC) and p-value between 'JS_Price' and every
# other non-object column. df_encoded is the frame returned by target_encoding,
# so the *_encoded columns are part of the comparison.
for column in df_encoded.columns:
    if column == 'JS_Price' or df_encoded[column].dtype == object:
        continue
    correlation, p_value = pearsonr(df_encoded['JS_Price'], df_encoded[column])
    pcc_records.append({'Column_Name': column, 'PCC': correlation, 'p-value': p_value})

# Passing the column list explicitly keeps the schema even when no feature qualifies.
result_df = pd.DataFrame(pcc_records, columns=['Column_Name', 'PCC', 'p-value'])

In [10]:
# Rank the features by PCC, highest first, with a fresh 0..n-1 index for display.
result_df.sort_values(by='PCC', ascending=False).reset_index(drop=True)

Unnamed: 0,Column_Name,PCC,p-value
0,Sell_Price,0.747533,0.0
1,JS_BA,0.586589,0.0
2,Region_Name_encoded,0.45277,2.586575e-251
3,YearMonth_encoded,0.353157,8.504635e-147
4,CA_index,0.307229,9.138304e-110
5,LC_index,0.306841,1.764852e-109
6,TC_index,0.303019,1.100381e-106
7,HSP_index,0.273446,1.85826e-86
8,Building_Use_encoded,0.229512,9.131655e-61
9,Shortest_Distance_to_Univ,0.047784,0.0007250688


In [11]:
# Absolute PCC of the JS_BA feature (a one-element Series that keeps its original row label).
result_df.loc[result_df['Column_Name'] == 'JS_BA', 'PCC'].abs()

1    0.586589
Name: PCC, dtype: float64

## 제거변수
- p-value값 고려
    - PCC 결과 a=0.05일 때 p-value값이 0.05초과인 변수는 통계적으로 상관관계가 유의하지 않으므로 변수 삭제 리스트에 추가
- correlation값 고려
    - PCC 결과 상관계수 절댓값이 0.1 미만인 변수는 삭제 리스트에 추가

In [12]:
# Selection thresholds: a feature is dropped when its correlation with the target
# is weak (|PCC| below PCC_MIN_ABS) or not significant (p-value above P_VALUE_ALPHA).
PCC_MIN_ABS = 0.1
P_VALUE_ALPHA = 0.05

pcc_values = result_df['PCC'].astype(float)
p_values = result_df['p-value'].astype(float)

# A NaN PCC or p-value fails both comparisons below, so without the explicit
# isna() terms such a feature would be kept by accident; treat it as
# "no usable correlation" and drop it.
drop_mask = (
    pcc_values.isna()
    | p_values.isna()
    | (pcc_values.abs() < PCC_MIN_ABS)
    | (p_values > P_VALUE_ALPHA)
)

# Row order of result_df is preserved, so the list order matches a row-by-row scan.
delete_columns = result_df.loc[drop_mask, 'Column_Name'].tolist()

delete_columns

['Building_Age',
 'Population',
 'UR',
 'Crime_Rates',
 'Shortest_Distance_to_School',
 'Shortest_Distance_to_Univ',
 'Shortest_Distance_to_Park']

In [13]:
# Keep every column of df_encoded that was not marked for deletion.
# Filtering in column order (rather than list(set - set)) makes the result
# identical on every run; set iteration order for strings can change between
# kernel sessions. Membership is unchanged, so the target 'JS_Price' stays in the list.
excluded_columns = set(delete_columns)
selected_features = [column for column in df_encoded.columns if column not in excluded_columns]
selected_features

['HSP_index',
 'IR',
 'CA_index',
 'SDT_index',
 'Sell_Price',
 'TC_index',
 'Building_Use_encoded',
 'Region_Name_encoded',
 'LC_index',
 'YearMonth_encoded',
 'JS_Price',
 'JS_BA',
 'Shortest_Distance_to_Subway']