## 라이브러리

In [18]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr, kendalltau
from scipy import stats
from sklearn.ensemble import RandomForestRegressor

## 데이터 불러오기

In [19]:
path = './data/'

df = pd.read_csv(path+'월별데이터 총병합.csv')

In [20]:
df

Unnamed: 0.1,Unnamed: 0,Year,Month,Region_Name,Region_Code,Building_Use,Sell_Price,Sell_BA,Sell_Count,JS_Price,...,UR,LC_index,CA_index,TC_index,SDT_index,Crime_Rates,School_Counts,Subway_Counts,Univ_Counts,Park_Counts
0,0,2014,1,강남구,11680,단독다가구,231221.43,321.47,14,14987.43,...,3.4,78.9,85.3,82.5,120.590868,1517.02,77,21,1,7
1,1,2014,1,강남구,11680,아파트,89505.48,86.55,631,47138.67,...,3.4,78.9,85.3,82.5,120.590868,1517.02,77,21,1,7
2,2,2014,1,강남구,11680,연립다세대,34570.70,60.10,76,19115.34,...,3.4,78.9,85.3,82.5,120.590868,1517.02,77,21,1,7
3,3,2014,1,강남구,11680,오피스텔,25451.99,40.23,70,31225.00,...,3.4,78.9,85.3,82.5,120.590868,1517.02,77,21,1,7
4,4,2014,2,강남구,11680,단독다가구,374166.85,452.19,13,14679.90,...,4.5,79.2,85.6,82.7,121.038496,1517.02,77,21,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8051,8051,2021,11,중랑구,11260,오피스텔,17302.79,29.17,14,13128.67,...,2.6,107.1,105.3,105.7,100.134564,819.12,48,8,1,6
8052,8052,2021,12,중랑구,11260,단독다가구,97187.70,198.71,30,12778.38,...,3.5,107.3,106.2,106.3,94.655902,819.12,48,8,1,6
8053,8053,2021,12,중랑구,11260,아파트,62155.66,66.94,53,34043.88,...,3.5,107.3,106.2,106.3,94.655902,819.12,48,8,1,6
8054,8054,2021,12,중랑구,11260,연립다세대,31058.43,45.39,166,21929.65,...,3.5,107.3,106.2,106.3,94.655902,819.12,48,8,1,6


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8056 entries, 0 to 8055
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     8056 non-null   int64  
 1   Year           8056 non-null   int64  
 2   Month          8056 non-null   int64  
 3   Region_Name    8056 non-null   object 
 4   Region_Code    8056 non-null   int64  
 5   Building_Use   8056 non-null   object 
 6   Sell_Price     8056 non-null   float64
 7   Sell_BA        8056 non-null   float64
 8   Sell_Count     8056 non-null   int64  
 9   JS_Price       8056 non-null   float64
 10  JS_BA          8056 non-null   float64
 11  JS_Count       8056 non-null   int64  
 12  CR             8056 non-null   float64
 13  IR             8056 non-null   float64
 14  UR             8056 non-null   float64
 15  LC_index       8056 non-null   float64
 16  CA_index       8056 non-null   float64
 17  TC_index       8056 non-null   float64
 18  SDT_inde

In [22]:
df.drop('Unnamed: 0', axis=1, inplace=True)

## Categorical:

- Nominal(variables that have two or more categories, but which do not have an intrinsic order.)

    - Region_Name : 자치구 명
    - Building_Use : 건물 용도
    
- Ordinal(variables that have two or more categories just like nominal variables. Only the categories can also be ordered or ranked.)

    
## Numeric:

- Discrete
    - Year : 년
    - Month : 월
    - Region_Code : 자치구 코드
    - JS_Count : 전세 거래량
    - Sell_Count : 매매 거래량
    - School_Counts : 자치구 내 초중고 수
    - Subway_Counts : 자치구 내 지하철역 수
    - Univ_Counts : 자치구 내 대학교 수
    - Park_Counts : 자치구 내 공원 수
   
- Continous
    - Sell : 매매
    - Sell_Price : 매매 가격
    - Sell_BA = Sell_building Area : 매매 건물 면적
    - Sell_PPA = Sell_Price Per Area : 면적 당 매매 가격
    - Sell_PPP = Sell_Price Per Pyeong : 평 당 매매 가격
    - JS : 전세
    - JS_Price : 전세 가격
    - JS_BA = JS_Building Area : 임대 면적
    - JS_PPA = JS_Price Per Area : 임대 면적 당 전세 가격
    - JS_PPP = JS_Price Per Pyeong : 평 당 전세 가격
    - CR = Charter Rate : 전세가율
    - CR_PPA  = Charter_Rate_Price Per Area : 면적 당 전세가율
    - CR_PPP = Charter Rate_Price Per Pyeong : 평 당 전세가율
    - lR = Interest Rate : 금리
    - UR = Unemployment Rate : 실업률
    - LC_index = Leading Composite index : 선행종합 지수
    - CA_index = Comprehensive Accompany index : 동행종합 지수
    - TC_index = Trailing Composite index : 후행종합 지수
    - SDT_index = Supply and Demand Trend index = 전세수급동향 지수
    

## PCC

In [23]:
# 결과를 저장할 DataFrame 생성
result_df = pd.DataFrame(columns=['Column_Name', 'PCC', 'p-value'])

# 'JS_Price'와 다른 열 간의 PCC 및 p-value 계산 및 저장
for column in df.columns:
    if column != 'JS_Price':
        if df[column].dtype != object:
            correlation, p_value = pearsonr(df['JS_Price'], df[column])
            result_df = result_df.append({'Column_Name': column, 'PCC': correlation, 'p-value': p_value}, ignore_index=True)

In [24]:
result_df.sort_values(by='PCC', ascending=False).reset_index(drop=True)

Unnamed: 0,Column_Name,PCC,p-value
0,JS_BA,0.751189,0.0
1,JS_Count,0.462311,0.0
2,Sell_Count,0.396765,6.035104e-302
3,Subway_Counts,0.236888,3.722441e-103
4,LC_index,0.235439,6.938073e-102
5,Sell_Price,0.232716,1.60399e-99
6,TC_index,0.232022,6.341413e-99
7,Year,0.22998,3.542651e-97
8,CA_index,0.228192,1.1605330000000001e-95
9,CR,0.193933,4.117143e-69


In [25]:
a = result_df.sort_values(by='PCC', ascending=False).reset_index(drop=True)['Column_Name'][0:13].tolist()

In [26]:
a

['JS_BA',
 'JS_Count',
 'Sell_Count',
 'Subway_Counts',
 'LC_index',
 'Sell_Price',
 'TC_index',
 'Year',
 'CA_index',
 'CR',
 'Region_Code',
 'Park_Counts',
 'Crime_Rates']

In [27]:
b = result_df.sort_values(by='PCC', ascending=False).reset_index(drop=True)['Column_Name'][20:].tolist()

In [28]:
b

[]

In [29]:
df.columns[0]

'Year'

## ANOVA (continous vs discrete)

In [30]:
def anova(df):
    for column in df.columns:
        if np.issubdtype(df[column].dtype, np.integer):
            # 이 코드는 정수형 열에 대해서만 분산분석을 수행합니다.
            f_statistic, p_value = stats.f_oneway(*[group for name, group in df.groupby(column)['JS_Price']])
            print(column)
            print("F-statistic:", f_statistic, "p-value:", p_value)

In [31]:
anova(df)

Year
F-statistic: 66.00579538575329 p-value: 5.965089137111651e-93
Month
F-statistic: 1.135427880521386 p-value: 0.32817113312900476
Region_Code
F-statistic: 71.86728876950895 p-value: 2.3916424780170578e-268
Sell_Count
F-statistic: 4.526449293724414 p-value: 2.7405050527380674e-249
JS_Count
F-statistic: 4.9273107783463574 p-value: 4.80454e-319
School_Counts
F-statistic: 65.59216213312669 p-value: 3.7566824453487053e-234
Subway_Counts
F-statistic: 80.72638867672627 p-value: 2.672818906391762e-245
Univ_Counts
F-statistic: 43.582352230813015 p-value: 1.0744179727875513e-52
Park_Counts
F-statistic: 36.219572131903874 p-value: 3.205325348163452e-50


## KCC (continous vs categorical)

- 범주형 변수 더미화

In [32]:
# 범주형 변수 더미화 함수, 범주형 변수의 범주 레벨 간의 관계가 중요할 시 사용
def oh_encoding(df):
    # DataFrame의 복사본을 만듭니다.
    encoded_df = df.copy()
    for column in df.columns:
        if df[column].dtype == object:
            encoded_df = pd.get_dummies(encoded_df, columns=[column], prefix=column)
            print(column)
    return encoded_df

df_encoded = oh_encoding(df)

Region_Name
Building_Use


In [33]:
# 결과를 저장할 DataFrame 생성
result_df = pd.DataFrame(columns=['Column_Name', 'KCC', 'p-value'])

# 'JS_Price'와 다른 열 간의 Kendall 상관 계수 및 p-value 계산 및 저장
for column in df_encoded.columns:
    if column != 'JS_Price':
        if column.startswith('Region') or column.startswith('Building'):  
            kendall_corr, p_value = kendalltau(df_encoded['JS_Price'], df_encoded[column])
            result_df = result_df.append({'Column_Name': column, 'KCC': kendall_corr, 'p-value': p_value}, ignore_index=True)

In [34]:
result_df.sort_values(by='KCC', ascending=False).reset_index(drop=True)

Unnamed: 0,Column_Name,KCC,p-value
0,Building_Use_아파트,0.590369,0.0
1,Region_Name_강남구,0.155815,9.475711000000001e-66
2,Region_Name_서초구,0.142332,3.64018e-55
3,Region_Name_용산구,0.080321,1.062926e-18
4,Region_Name_송파구,0.064505,1.34183e-12
5,Region_Name_종로구,0.063375,3.266451e-12
6,Region_Name_중구,0.050741,2.44578e-08
7,Region_Name_광진구,0.049931,4.062673e-08
8,Region_Code,0.039303,2.35321e-07
9,Region_Name_마포구,0.036537,5.920896e-05


- p-value값 고려
    - PCC 결과 a=0.05일 때 UR은 통계적으로 상관관계가 유의하지 않으므로 변수에서 제외한다.
    - Anova 결과 a=0.05일 때 Month는 통계적으로 상관관계가 유의하지 않으므로 변수에서 제외한다.
    - KCC 결과 a=0.05일 때 Region_Name_영등포구, Region_Name_동작구, Region_Name_성동구, Region_Name_서대문구, Building_Use_연립다세대 등은 통계적으로 상관관계가 유의하지 않으므로 변수에서 제외한다.
- correlation값 고려
    - PCC결과 상관계수 절댓값이 0.1이하인 변수 제외 -> Region_Code, Crime_Rates, Month, UR, SDT_index, School_Counts, Univ_Counts, Park_Counts를 변수에서 제외
    - KCC결과 상관계수 절댓값이 0.1이하인 변수 제외 -> Building_Use_아파트, Region_Name_강남구, Region_Name_서초구, Building_Use_단독다가구만을 변수에 포함