In [88]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr, kendalltau
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import random
from imblearn.over_sampling import RandomOverSampler

In [89]:
# Directory that holds the input CSV files, relative to the notebook's working directory.
path = './data/'

# Read the merged dataset into the working frame `df`.
df = pd.read_csv(f'{path}merged_target.csv')

In [90]:
# Number of rows in the full dataset.
total_rows = df.shape[0]

# Work on a random subsample of at most 10,000 rows. The request is capped at
# the dataset size because random.sample raises ValueError when asked for more
# items than the population contains.
nrows_to_select = min(10000, total_rows)

# Draw the row positions from a dedicated, seeded generator so that
# "Restart Kernel -> Run All" always selects the same rows; the module-level
# random.sample would pick a different subsample on every run.
random_indices = random.Random(42).sample(range(total_rows), nrows_to_select)

# Keep only the rows at the sampled positions (their original index labels are
# preserved at this point).
df = df.iloc[random_indices]

In [91]:
# Remove the 'Unnamed: 0' column (presumably an index written out by an earlier
# to_csv call -- confirm against the data export) and renumber the rows 0..n-1.
# errors='ignore' keeps the cell idempotent: re-running it, or loading a CSV
# that lacks the column, no longer raises KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore').reset_index(drop=True)

In [92]:
# Display the sampled, re-indexed frame for a quick visual check.
df

Unnamed: 0,Building_Age,JS_Price,JS_BA,Population,UR,LC_index,CA_index,TC_index,SDT_index,HSP_index,Sell_Price,Crime_Rates,IR,Region_Name,Building_Use,YearMonth,Shortest_Distance_to_Subway,Shortest_Distance_to_University,Shortest_Distance_to_School
0,26,48000,83.06,658553,3.3,81.7,87.2,84.9,116.875559,77.4,87450.00,1.191437,2.00,47239.513155,36298.957205,35647.415274,558.881292,527.782208,228.498862
1,12,24000,84.50,406546,4.2,85.8,91.0,88.6,114.847943,84.5,21300.00,1.212720,1.50,19719.623803,19873.401378,16858.604651,172.915007,1830.694322,106.538215
2,1,27300,22.87,228373,4.1,107.6,107.0,106.7,91.878124,181.5,29400.00,0.897322,1.25,18602.468944,19873.401378,26008.305400,1887.985803,4338.009097,341.020281
3,1,8000,43.84,494712,2.7,77.7,83.8,81.4,120.394134,74.3,23700.00,1.090516,2.50,17222.396732,19873.401378,13548.262115,822.503918,209.593056,319.383357
4,16,13000,55.62,376908,3.0,76.4,82.8,80.6,112.511457,73.9,20000.00,1.092541,2.50,21368.501145,19873.401378,12654.214475,865.963582,2002.741260,189.418715
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,4,27000,31.45,285433,3.2,107.1,104.0,103.7,119.982699,181.7,33133.33,0.721627,0.50,18832.156357,19873.401378,25067.690476,296.106155,1179.806267,197.667674
9996,34,27000,49.94,504265,3.0,109.5,108.4,112.4,66.608626,141.8,84500.00,0.808294,3.25,21828.535914,36298.957205,53586.319549,449.527766,1286.123683,203.415410
9997,26,36000,59.67,353271,3.1,88.7,93.6,90.1,114.695499,91.8,67400.00,1.430245,1.25,45647.228435,36298.957205,39521.311037,181.383075,1153.221762,206.582240
9998,7,22500,23.92,372625,2.4,109.7,109.4,111.3,78.964640,156.1,23000.00,0.806356,3.00,21904.126996,18749.306885,18637.661738,322.165991,2821.404157,131.868223


In [93]:
# Bin JS_Price into 5 intervals with pd.cut; labels=False stores the integer
# bin code instead of an interval label. The raw price column is then removed
# so that only the binned version remains in `df`.
df['JS_Price_Category'] = pd.cut(x=df['JS_Price'], bins=5, labels=False)
del df['JS_Price']

# Disabled: one-hot encoding helper for object-dtype columns.
# def oh_encoding(df):
#     # Work on a copy of the DataFrame.
#     encoded_df = df.copy()
#     for column in df.columns:
#         if df[column].dtype == object:
#             encoded_df = pd.get_dummies(encoded_df, columns=[column], prefix=column)
#     return encoded_df

# Disabled: apply the one-hot encoding.
# df_encoded = oh_encoding(df)

# With the encoding step disabled, df_encoded is simply another name for the
# same object as df (no copy is made here).
df_encoded = df





# Split the frame into the label (binned price) and the predictor columns.
y = df_encoded['JS_Price_Category']  # dependent variable
X = df_encoded.drop(columns='JS_Price_Category')  # independent variables

# Randomly over-sample with a fixed seed so the resampled rows are reproducible.
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Reassemble predictors and label into one frame; from here on df_encoded
# refers to the over-sampled data, not to the original `df`.
df_encoded = pd.concat([X_resampled, y_resampled], axis=1)

# Print the per-class row counts after over-sampling.
print(df_encoded['JS_Price_Category'].value_counts())




# Pearson correlation (PCC) between the binned price label and every other
# column, followed by a filter that keeps only sufficiently correlated features.

# A feature is kept only when |PCC| >= min_abs_pcc and p-value <= max_p_value.
min_abs_pcc = 0.1
max_p_value = 0.05

target = df_encoded['JS_Price_Category']

# Collect one record per feature and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
correlation_records = []
for column in df_encoded.columns:
    if column == 'JS_Price_Category':
        continue
    correlation, p_value = pearsonr(target, df_encoded[column])
    correlation_records.append({'Column_Name': column, 'PCC': correlation, 'p-value': p_value})

# Passing `columns` keeps the expected layout even when there are no features.
result_df = pd.DataFrame(correlation_records, columns=['Column_Name', 'PCC', 'p-value'])

# Columns to discard: weak or statistically insignificant correlation.
# The condition is written as "not (passes both checks)" so that a NaN
# correlation (e.g. a constant column) is discarded as well; NaN makes every
# comparison False, so with "< 0.1 or > 0.05" such a column would slip through.
delete_columns = [
    record['Column_Name']
    for record in correlation_records
    if not (abs(record['PCC']) >= min_abs_pcc and record['p-value'] <= max_p_value)
]
# The label itself is never a feature.
delete_columns.append('JS_Price_Category')

# Selected feature columns, kept in df_encoded's column order so the list is
# identical on every run (a set difference yields an arbitrary order).
selected_features = [name for name in df_encoded.columns if name not in delete_columns]

0    9225
1    9225
2    9225
3    9225
4    9225
Name: JS_Price_Category, dtype: int64


In [96]:
# Show the correlation table ordered from the highest PCC to the lowest,
# with a fresh 0..n-1 index. result_df itself is not modified.
(
    result_df
    .sort_values(by='PCC', ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,Column_Name,PCC,p-value
0,JS_BA,0.920487,0.0
1,Sell_Price,0.79802,0.0
2,Region_Name,0.730195,0.0
3,Building_Use,0.530841,0.0
4,UR,0.347014,0.0
5,SDT_index,0.317434,0.0
6,Crime_Rates,0.307636,0.0
7,YearMonth,0.203227,0.0
8,IR,0.199062,0.0
9,Population,0.106982,1.778397e-117


In [95]:
# Display the feature names that passed the PCC / p-value filter.
selected_features

['Building_Age',
 'Population',
 'UR',
 'IR',
 'YearMonth',
 'Crime_Rates',
 'JS_BA',
 'Region_Name',
 'LC_index',
 'TC_index',
 'Building_Use',
 'Sell_Price',
 'HSP_index',
 'SDT_index',
 'CA_index',
 'Shortest_Distance_to_School']