## 라이브러리

In [124]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr, kendalltau

## 데이터 불러오기

In [125]:
path = './data/'

df = pd.read_csv(path+'월별데이터 총병합.csv')

In [126]:
df

Unnamed: 0.1,Unnamed: 0,Year,Month,Region_Name,Region_Code,Building_Use,Sell_Price,Sell_BA,Sell_PPA,Sell_PPP,...,CR,CR_PPA,CR_PPP,IR,UR,LC_index,CA_index,TC_index,SDT_index,Crime_Rates
0,0,2014,1,강남구,11680,단독다가구,231221.43,321.47,927.69,280.63,...,6.48,30.37,30.37,2.5,3.4,78.9,85.3,82.5,120.590868,1517.02
1,1,2014,1,강남구,11680,아파트,89505.48,86.55,1039.87,314.56,...,52.67,51.45,51.45,2.5,3.4,78.9,85.3,82.5,120.590868,1517.02
2,2,2014,1,강남구,11680,연립다세대,34570.70,60.10,617.49,186.79,...,55.29,66.15,66.15,2.5,3.4,78.9,85.3,82.5,120.590868,1517.02
3,3,2014,1,강남구,11680,오피스텔,25451.99,40.23,622.11,188.19,...,122.68,120.12,120.11,2.5,3.4,78.9,85.3,82.5,120.590868,1517.02
4,4,2014,2,강남구,11680,단독다가구,374166.85,452.19,1156.48,349.83,...,3.92,24.93,24.93,2.5,4.5,79.2,85.6,82.7,121.038496,1517.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9587,9587,2021,11,중랑구,11260,오피스텔,17302.79,29.17,597.86,180.85,...,75.88,79.14,79.14,1.0,2.6,107.1,105.3,105.7,100.134564,819.12
9588,9588,2021,12,중랑구,11260,단독다가구,97187.70,198.71,493.70,149.34,...,13.15,71.09,71.09,1.0,3.5,107.3,106.2,106.3,94.655902,819.12
9589,9589,2021,12,중랑구,11260,아파트,62155.66,66.94,949.29,287.16,...,54.77,56.17,56.17,1.0,3.5,107.3,106.2,106.3,94.655902,819.12
9590,9590,2021,12,중랑구,11260,연립다세대,31058.43,45.39,745.88,225.63,...,70.61,89.25,89.25,1.0,3.5,107.3,106.2,106.3,94.655902,819.12


In [127]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9592 entries, 0 to 9591
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    9592 non-null   int64  
 1   Year          9592 non-null   int64  
 2   Month         9592 non-null   int64  
 3   Region_Name   9592 non-null   object 
 4   Region_Code   9592 non-null   int64  
 5   Building_Use  9592 non-null   object 
 6   Sell_Price    9592 non-null   float64
 7   Sell_BA       9592 non-null   float64
 8   Sell_PPA      9592 non-null   float64
 9   Sell_PPP      9592 non-null   float64
 10  Sell_Count    9592 non-null   int64  
 11  JS_Price      9592 non-null   float64
 12  JS_BA         9592 non-null   float64
 13  JS_PPA        9592 non-null   float64
 14  JS_PPP        9592 non-null   float64
 15  JS_Count      9592 non-null   int64  
 16  CR            9592 non-null   float64
 17  CR_PPA        9592 non-null   float64
 18  CR_PPP        9592 non-null 

In [132]:
df.drop('Unnamed: 0')

KeyError: "['Unnamed: 0'] not found in axis"

## Categorical:

- Nominal(variables that have two or more categories, but which do not have an intrinsic order.)

    - Region_Name : 자치구 명
    - Building_Use : 건물 용도
    
- Ordinal(variables that have two or more categories just like nominal variables. Only the categories can also be ordered or ranked.)

    
## Numeric:

- Discrete
    - Year : 년
    - Month : 월
    - Region_Code : 자치구 코드
    - JS_Count : 전세 거래량
    - Sell_Count : 매매 거래량
   
- Continous
    - Sell : 매매
    - Sell_Price : 매매 가격
    - Sell_BA = Sell_building Area : 매매 건물 면적
    - Sell_PPA = Sell_Price Per Area : 면적 당 매매 가격
    - Sell_PPP = Sell_Price Per Pyeong : 평 당 매매 가격
    - JS : 전세
    - JS_Price : 전세 가격
    - JS_BA = JS_Building Area : 임대 면적
    - JS_PPA = JS_Price Per Area : 임대 면적 당 전세 가격
    - JS_PPP = JS_Price Per Pyeong : 평 당 전세 가격
    - CR = Charter Rate : 전세가율
    - CR_PPA  = Charter_Rate_Price Per Area : 면적 당 전세가율
    - CR_PPP = Charter Rate_Price Per Pyeong : 평 당 전세가율
    - lR = Interest Rate : 금리
    - UR = Unemployment Rate : 실업률
    - LC_index = Leading Composite index : 선행종합 지수
    - CA_index = Comprehensive Accompany index : 동행종합 지수
    - TC_index = Trailing Composite index : 후행종합 지수
    - SDT_index = Supply and Demand Trend index = 전세수급동향 지수
    

## PCC

In [128]:
# 결과를 저장할 DataFrame 생성
result_df = pd.DataFrame(columns=['Column_Name', 'PCC', 'p-value'])

# 'JS_Price'와 다른 열 간의 PCC 및 p-value 계산 및 저장
for column in df.columns:
    if column != 'JS_Price':
        if df[column].dtype != object:
            correlation, p_value = pearsonr(df['JS_Price'], df[column])
            result_df = result_df.append({'Column_Name': column, 'PCC': correlation, 'p-value': p_value}, ignore_index=True)

In [129]:
result_df.sort_values(by='PCC', ascending=False).reset_index(drop=True)

Unnamed: 0,Column_Name,PCC,p-value
0,JS_BA,0.750458,0.0
1,Sell_PPA,0.691118,0.0
2,Sell_PPP,0.691118,0.0
3,JS_PPA,0.641914,0.0
4,JS_PPP,0.641913,0.0
5,JS_Count,0.476498,0.0
6,Sell_Count,0.416285,0.0
7,LC_index,0.239574,2.7575690000000005e-125
8,Unnamed: 0,0.239518,3.1637460000000004e-125
9,TC_index,0.236141,1.125779e-121


## KCC (continous vs categorical)

In [130]:
# 결과를 저장할 DataFrame 생성
result_df = pd.DataFrame(columns=['Column_Name', 'KCC', 'p-value'])

# 'JS_Price'와 다른 열 간의 Kendall 상관 계수 및 p-value 계산 및 저장
for column in df.columns:
    if column != 'JS_Price':
        if df[column].dtype == object:  # 범주형 변수인 경우에만 계산
            kendall_corr, p_value = kendalltau(df['JS_Price'], df[column])
            result_df = result_df.append({'Column_Name': column, 'KCC': kendall_corr, 'p-value': p_value}, ignore_index=True)

In [131]:
result_df.sort_values(by='KCC', ascending=False).reset_index(drop=True)

Unnamed: 0,Column_Name,KCC,p-value
0,Building_Use,0.073227,6.535318e-22
1,Region_Name,0.051097,1.841213e-13
