## 라이브러리

In [12]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr, kendalltau
from scipy import stats
from sklearn.ensemble import RandomForestRegressor

## 데이터 불러오기

In [13]:
# Directory that holds the input data files.
path = './data/'

# Read the officetel feature table from the data directory.
csv_file = path + 'Total_Officetel_for_Target_Features.csv'
df = pd.read_csv(csv_file)

In [14]:
# Display the loaded DataFrame as the cell output.
df

Unnamed: 0,Sell_Price,Sell_Count,JS_Price,JS_Count,CR,UR,LC_index,CA_index,TC_index,SDT_index,IR,Crime_Rates,Total_Pop,Univ_Counts,Park_Counts,School_Counts,Subway_Counts
0,25309.88,42,39900.00,10,157.65,3.1,74.5,81.5,78.5,102.461258,3.0,1.548846,554870,1.0,7,77,21
1,15573.33,15,13035.71,14,83.71,3.1,74.5,81.5,78.5,102.461258,3.0,1.548846,484742,0.0,7,60,14
2,13150.00,7,8200.00,10,62.36,3.1,74.5,81.5,78.5,102.461258,3.0,1.548846,338041,1.0,4,34,3
3,12285.00,44,12953.85,26,105.44,3.1,74.5,81.5,78.5,102.461258,3.0,1.548846,561431,2.0,9,80,9
4,12520.38,24,7614.38,40,60.82,3.1,74.5,81.5,78.5,102.461258,3.0,1.548846,517095,1.0,2,55,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3035,60298.48,66,31588.97,72,52.39,2.1,109.0,109.0,110.3,87.677816,2.5,0.865139,215891,2.0,2,34,10
3036,16641.24,97,18504.30,79,111.20,2.1,109.0,109.0,110.3,87.677816,2.5,0.865139,465727,1.0,7,66,13
3037,27591.03,78,22345.39,64,80.99,2.1,109.0,109.0,110.3,87.677816,2.5,0.865139,140477,6.0,12,36,15
3038,33270.39,162,24908.53,72,74.87,2.1,109.0,109.0,110.3,87.677816,2.5,0.865139,119206,2.0,4,31,23


In [15]:
# Print the column names, non-null counts, dtypes and memory usage of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Sell_Price     3040 non-null   float64
 1   Sell_Count     3040 non-null   int64  
 2   JS_Price       3040 non-null   float64
 3   JS_Count       3040 non-null   int64  
 4   CR             3040 non-null   float64
 5   UR             3040 non-null   float64
 6   LC_index       3040 non-null   float64
 7   CA_index       3040 non-null   float64
 8   TC_index       3040 non-null   float64
 9   SDT_index      3040 non-null   float64
 10  IR             3040 non-null   float64
 11  Crime_Rates    3040 non-null   float64
 12  Total_Pop      3040 non-null   int64  
 13  Univ_Counts    3040 non-null   float64
 14  Park_Counts    3040 non-null   int64  
 15  School_Counts  3040 non-null   int64  
 16  Subway_Counts  3040 non-null   int64  
dtypes: float64(11), int64(6)
memory usage: 403.9 KB


## Categorical:

- Nominal(variables that have two or more categories, but which do not have an intrinsic order.)

    - Region_Name : 자치구 명
    - Building_Use : 건물 용도
    
- Ordinal(variables that have two or more categories, just like nominal variables, except that the categories can also be ordered or ranked.)

    
## Numeric:

- Discrete
    - Year : 년
    - Month : 월
    - Region_Code : 자치구 코드
    - JS_Count : 전세 거래량
    - Sell_Count : 매매 거래량
    - School_Counts : 자치구 내 초중고 수
    - Subway_Counts : 자치구 내 지하철역 수
    - Univ_Counts : 자치구 내 대학교 수
    - Park_Counts : 자치구 내 공원 수
   
- Continuous
    - Sell : 매매
    - Sell_Price : 매매 가격
    - Sell_BA = Sell_building Area : 매매 건물 면적
    - Sell_PPA = Sell_Price Per Area : 면적 당 매매 가격
    - Sell_PPP = Sell_Price Per Pyeong : 평 당 매매 가격
    - JS : 전세
    - JS_Price : 전세 가격
    - JS_BA = JS_Building Area : 임대 면적
    - JS_PPA = JS_Price Per Area : 임대 면적 당 전세 가격
    - JS_PPP = JS_Price Per Pyeong : 평 당 전세 가격
    - CR = Charter Rate : 전세가율
    - CR_PPA  = Charter_Rate_Price Per Area : 면적 당 전세가율
    - CR_PPP = Charter Rate_Price Per Pyeong : 평 당 전세가율
    - IR = Interest Rate : 금리
    - UR = Unemployment Rate : 실업률
    - LC_index = Leading Composite index : 선행종합 지수
    - CA_index = Comprehensive Accompany index : 동행종합 지수
    - TC_index = Trailing Composite index : 후행종합 지수
    - SDT_index = Supply and Demand Trend index : 전세수급동향 지수
    

## PCC

In [16]:
# Collect one row per feature and build the result table once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call, so rows are gathered in a plain list instead.
pcc_rows = []

# Compute the Pearson correlation coefficient (PCC) and its p-value between
# 'JS_Price' and every other non-object column.
for column in df.columns:
    if column == 'JS_Price':
        continue  # the target is not correlated against itself
    if df[column].dtype == object:
        continue  # pearsonr cannot handle object (non-numeric) columns
    correlation, p_value = pearsonr(df['JS_Price'], df[column])
    pcc_rows.append({'Column_Name': column, 'PCC': correlation, 'p-value': p_value})

# Passing `columns` keeps the expected layout even when no row was collected.
result_df = pd.DataFrame(pcc_rows, columns=['Column_Name', 'PCC', 'p-value'])

In [17]:
# Show the correlation table ordered from highest to lowest PCC, with a fresh index.
result_df.sort_values(by='PCC', ascending=False).reset_index(drop=True)

Unnamed: 0,Column_Name,PCC,p-value
0,Sell_Price,0.738398,0.0
1,LC_index,0.481152,5.815308999999999e-176
2,TC_index,0.47864,6.808391e-174
3,CA_index,0.475413,2.919273e-171
4,CR,0.320423,1.5391300000000002e-73
5,Sell_Count,0.255177,2.154376e-46
6,Subway_Counts,0.231702,2.459732e-38
7,JS_Count,0.229884,9.528542e-38
8,School_Counts,0.067214,0.0002084095
9,UR,0.066216,0.0002587596


## ANOVA (continuous vs discrete)

In [18]:
# def anova(df):
#     for column in df.columns:
#         if np.issubdtype(df[column].dtype, np.integer):
#             # 이 코드는 정수형 열에 대해서만 분산분석을 수행합니다.
#             f_statistic, p_value = stats.f_oneway(*[group for name, group in df.groupby(column)['JS_Price']])
#             print(column)
#             print("F-statistic:", f_statistic, "p-value:", p_value)

In [19]:
# anova(df)

## KCC (continuous vs categorical)

- 범주형 변수 더미화

In [20]:
# One-hot (dummy) encoding helper for the categorical columns of a DataFrame.
def oh_encoding(df):
    """Return a copy of *df* with every object-dtype column dummy-encoded.

    Columns are selected by the dtypes of the input frame. Each selected
    column is replaced by indicator columns prefixed with its own name, and
    its name is printed once it has been encoded. The input frame is not
    modified.
    """
    # Decide up front which columns to encode; the input frame never changes.
    object_columns = [name for name in df.columns if df[name].dtype == object]

    encoded_df = df.copy()
    for name in object_columns:
        encoded_df = pd.get_dummies(encoded_df, columns=[name], prefix=name)
        print(name)
    return encoded_df

# Build the encoded frame; oh_encoding works on a copy, so df itself is left unchanged.
df_encoded = oh_encoding(df)

In [21]:
# Collect one row per dummy column and build the result table once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call, so rows are gathered in a plain list instead.
kcc_rows = []

# Compute the Kendall correlation coefficient (KCC) and its p-value between
# 'JS_Price' and every encoded column whose name starts with 'Region' or
# 'Building'.
for column in df_encoded.columns:
    if column == 'JS_Price':
        continue  # the target is not correlated against itself
    if not column.startswith(('Region', 'Building')):
        continue  # only the dummy-encoded categorical columns are of interest
    kendall_corr, p_value = kendalltau(df_encoded['JS_Price'], df_encoded[column])
    kcc_rows.append({'Column_Name': column, 'KCC': kendall_corr, 'p-value': p_value})

# Passing `columns` keeps the expected layout even when no column matched.
result_df = pd.DataFrame(kcc_rows, columns=['Column_Name', 'KCC', 'p-value'])

In [22]:
# Show the correlation table ordered from highest to lowest KCC, with a fresh index.
result_df.sort_values(by='KCC', ascending=False).reset_index(drop=True)

Unnamed: 0,Column_Name,KCC,p-value


- p-value값 고려
    - PCC 결과 유의수준 α=0.05일 때 Park_Counts, Total_Pop은 통계적으로 상관관계가 유의하지 않으므로 변수에서 제외한다.
- correlation값 고려
    - PCC결과 상관계수 절댓값이 0.1이하인 변수 제외 -> UR, School_Counts, Park_Counts, Total_Pop, Crime_Rates를 변수에서 제외