In [1]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr, kendalltau
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import random
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Directory that holds the input data, relative to the notebook's working directory.
path = './data/'

# Load the merged dataset into the working DataFrame.
csv_name = 'merged_target.csv'
df = pd.read_csv(path + csv_name)

In [3]:
# Down-sample the frame to a fixed number of randomly chosen rows.
#
# A dedicated, seeded random.Random instance makes the selection reproducible
# across kernel restarts without touching the global `random` state.
SEED = 42

# Number of rows currently in the DataFrame.
total_rows = df.shape[0]

# Requested sample size; capped at the number of available rows so that a
# smaller input file does not make random.sample raise ValueError.
nrows_to_select = 10000
sample_size = min(nrows_to_select, total_rows)
random_indices = random.Random(SEED).sample(range(total_rows), sample_size)

# Keep only the rows at the sampled positions.
df = df.iloc[random_indices]

In [4]:
# Remove the 'Unnamed: 0' column and renumber the rows 0..n-1.
# NOTE(review): 'Unnamed: 0' is presumably an index column written out by an
# earlier to_csv call — confirm against the data export.
# errors='ignore' keeps the cell re-runnable: a second execution no longer
# raises KeyError once the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore').reset_index(drop=True)

In [5]:
# Display the sampled DataFrame after the index cleanup (pandas truncates long frames).
df

Unnamed: 0,Building_Age,JS_Price,JS_BA,Population,UR,LC_index,CA_index,TC_index,SDT_index,HSP_index,Sell_Price,Crime_Rates,IR,Region_Name,Building_Use,YearMonth,Shortest_Distance_to_Subway,Shortest_Distance_to_University,Shortest_Distance_to_School
0,4,1500,18.22,500389,3.6,86.4,91.7,89.0,115.250447,85.5,14950.0,1.080275,1.50,14428.599190,18749.306885,17803.776978,929.265692,162.240669,454.747668
1,8,100000,101.94,522782,2.4,109.7,109.4,111.3,78.964640,156.1,180000.0,0.656524,3.00,46291.251616,36298.957205,49663.822387,1614.895804,6015.776743,297.168208
2,2,20000,30.02,473512,3.0,96.9,100.5,98.7,99.980392,127.1,23500.0,0.800750,1.25,17222.396732,19873.401378,21331.830357,615.397430,1208.328286,82.292708
3,18,38500,59.91,654241,3.2,85.4,90.9,87.8,121.310376,84.4,63000.0,1.015461,1.50,47239.513155,36298.957205,38506.037037,195.218272,2890.497585,469.328907
4,8,20000,55.75,492793,2.9,75.3,82.1,79.4,109.252507,72.1,28000.0,1.092192,2.75,20699.148509,18749.306885,16801.896976,607.163330,2494.049500,233.547171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,20950,29.88,228342,3.6,101.1,100.0,100.5,119.333816,154.0,21500.0,1.048278,0.50,18602.468944,19873.401378,24088.879870,2969.107862,4305.021913,309.674692
9996,3,18500,74.74,494712,3.0,77.5,83.6,81.1,117.277727,73.7,19800.0,1.062302,2.50,17222.396732,19873.401378,13694.414587,348.036287,2411.408893,330.976145
9997,2,45000,48.98,337801,4.0,99.6,98.8,100.0,117.451122,147.2,105750.0,0.952624,0.50,34182.781979,36298.957205,43319.290288,184.408602,1524.092036,384.293260
9998,0,17000,29.52,486578,4.9,85.5,91.0,88.4,116.592129,84.5,23500.0,0.907575,1.50,17222.396732,19873.401378,17848.888889,509.264817,2569.054566,281.960299


In [6]:
# Bin JS_Price into equal-width intervals and replace the raw price column with
# the integer bin index (labels=False yields 0 .. N_PRICE_BINS-1).
# The guard keeps the cell re-runnable: once JS_Price has been dropped, running
# the cell again is a no-op instead of a KeyError.
N_PRICE_BINS = 5
if 'JS_Price' in df.columns:
    df['JS_Price_Category'] = pd.cut(df['JS_Price'], bins=N_PRICE_BINS, labels=False)
    df.drop('JS_Price', axis=1, inplace=True)

# No one-hot encoding is applied; the frame is used as-is.
# NOTE(review): presumably the categorical columns were already numerically
# encoded upstream — confirm before relying on this.
df_encoded = df





# Split the frame into predictors (X) and the binned-price label (y).
target_col = 'JS_Price_Category'
y = df_encoded[target_col]  # dependent variable
X = df_encoded.drop(columns=[target_col])  # independent variables

# Random over-sampling with a fixed seed so the resampled set is reproducible.
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Reassemble predictors and label into a single DataFrame (label column last).
df_encoded = pd.concat((X_resampled, y_resampled), axis='columns')

# Show the class distribution after over-sampling.
print(df_encoded[target_col].value_counts())




# Correlation-based feature filter against the binned target.
#
# A feature is kept only when |PCC| >= PCC_MIN_ABS and p-value <= P_VALUE_MAX.
PCC_MIN_ABS = 0.1
P_VALUE_MAX = 0.05

# Compute the Pearson correlation coefficient (PCC) and its p-value between
# 'JS_Price_Category' and every other column. Records are collected in a list
# and turned into a DataFrame once: DataFrame.append was removed in pandas 2.0
# and growing a frame row by row is quadratic.
correlation_records = []
for column in df_encoded.columns:
    if column == 'JS_Price_Category':
        continue
    correlation, p_value = pearsonr(df_encoded['JS_Price_Category'], df_encoded[column])
    correlation_records.append({'Column_Name': column, 'PCC': correlation, 'p-value': p_value})

result_df = pd.DataFrame(
    correlation_records, columns=['Column_Name', 'PCC', 'p-value']
).astype({'PCC': float, 'p-value': float})

# Columns to discard: weak or non-significant correlation. Expressing the test
# as "keep if both conditions hold" also discards columns whose PCC is NaN
# (e.g. constant columns), which the original `<` / `>` comparisons let through.
is_relevant = (result_df['PCC'].abs() >= PCC_MIN_ABS) & (result_df['p-value'] <= P_VALUE_MAX)
delete_columns = result_df.loc[~is_relevant, 'Column_Name'].tolist()
delete_columns.append('JS_Price_Category')

# Selected feature columns, in the DataFrame's column order so the result is
# deterministic (a set difference would give an arbitrary order between runs).
excluded = set(delete_columns)
selected_features = [column for column in df_encoded.columns if column not in excluded]

0    8131
2    8131
1    8131
3    8131
4    8131
Name: JS_Price_Category, dtype: int64


In [7]:
# Show the correlation table ordered from the highest PCC to the lowest, with a fresh 0..n-1 index.
result_df.sort_values(by='PCC', ascending=False, ignore_index=True)

Unnamed: 0,Column_Name,PCC,p-value
0,Sell_Price,0.815558,0.0
1,JS_BA,0.766363,0.0
2,Region_Name,0.734274,0.0
3,Building_Use,0.57017,0.0
4,YearMonth,0.515329,0.0
5,Crime_Rates,0.360512,0.0
6,CA_index,0.075746,8.341868e-53
7,IR,0.066875,1.591419e-41
8,Population,0.052099,7.637089e-26
9,TC_index,0.047306,1.382025e-21


In [8]:
# Display the feature names that passed the PCC / p-value filter.
selected_features

['Region_Name',
 'JS_BA',
 'Building_Use',
 'Building_Age',
 'SDT_index',
 'Crime_Rates',
 'Shortest_Distance_to_School',
 'Sell_Price',
 'HSP_index',
 'YearMonth',
 'Shortest_Distance_to_Subway']