In [None]:
# 데이터 로드
from google.colab import drive

import numpy as np
import pandas as pd
import tensorflow as tf

# 데이터 셋 분할 패키지
from sklearn.model_selection import train_test_split

# 표준, 정규화 패키지
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# 인공신경망 모델 패키지, 신경층 패키지
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Dense

# 의사결정 트리 패키지
from sklearn.tree import DecisionTreeRegressor

# 하이퍼 파라미터 튜닝용 그리드서치 패키지
from sklearn.model_selection import GridSearchCV

# metrics 패키지
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# plt 인코딩 패키지
import matplotlib.font_manager as fm


In [None]:
# Mount Google Drive at /content/drive; the data files loaded below live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Korean font setup for matplotlib
# Install the Nanum font packages and rebuild the fontconfig cache.
!apt-get update -qq
!apt-get install fonts-nanum* -qq
!fc-cache -fv

# Font properties object built from the NanumGothic font file.
font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
font_prop = fm.FontProperties(fname=font_path)


# DataFrame display settings: no column limit, no wrapping of wide frames
# across several lines, and Unicode East Asian Width used for text alignment.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.expand_frame_repr', False),
    ('display.unicode.east_asian_width', True),
):
  pd.set_option(option_name, option_value)

/usr/share/fonts: caching, new cache contents: 0 fonts, 1 dirs
/usr/share/fonts/truetype: caching, new cache contents: 0 fonts, 3 dirs
/usr/share/fonts/truetype/humor-sans: caching, new cache contents: 1 fonts, 0 dirs
/usr/share/fonts/truetype/liberation: caching, new cache contents: 16 fonts, 0 dirs
/usr/share/fonts/truetype/nanum: caching, new cache contents: 39 fonts, 0 dirs
/usr/local/share/fonts: caching, new cache contents: 0 fonts, 0 dirs
/root/.local/share/fonts: skipping, no such directory
/root/.fonts: skipping, no such directory
/usr/share/fonts/truetype: skipping, looped directory detected
/usr/share/fonts/truetype/humor-sans: skipping, looped directory detected
/usr/share/fonts/truetype/liberation: skipping, looped directory detected
/usr/share/fonts/truetype/nanum: skipping, looped directory detected
/var/cache/fontconfig: cleaning cache directory
/root/.cache/fontconfig: not cleaning non-existent cache directory
/root/.fontconfig: not cleaning non-existent cache director

In [None]:
### Function definitions
# Load a UTF-8 encoded CSV file
def upload_data_utf(file_name):
  """Read a UTF-8 encoded CSV file into a DataFrame.

  Args:
    file_name: Path (or any other source accepted by ``pd.read_csv``).

  Returns:
    The parsed ``pandas.DataFrame``.
  """
  # read_csv already returns a DataFrame, so no extra wrapping is needed.
  return pd.read_csv(file_name, encoding='utf-8')

def upload_data_cp(file_name):
  """Read a CP949 encoded, tab-delimited text file into a DataFrame.

  Args:
    file_name: Path (or any other source accepted by ``pd.read_csv``).

  Returns:
    The parsed ``pandas.DataFrame``.
  """
  # read_csv already returns a DataFrame, so no extra wrapping is needed.
  return pd.read_csv(file_name, encoding='cp949', delimiter='\t')

### Data normalization
# Min-Max scaling
# Rescales the selected columns to the range 0..1
def scaling(data, scaled_column):
  """Return a copy of ``data`` with the given columns min-max scaled.

  Args:
    data: DataFrame holding the columns to scale.
    scaled_column: List of column labels to scale. A list is required so
      that the selection passed to the scaler is two-dimensional.

  Returns:
    A new DataFrame; ``scaled_column`` columns are replaced by the output of
    ``MinMaxScaler().fit_transform`` and all other columns are unchanged.
  """
  # Work on a copy: the caller's frame is left untouched, and no
  # SettingWithCopyWarning is raised when ``data`` is itself a slice.
  rtn = data.copy()

  scaler = MinMaxScaler()
  rtn[scaled_column] = scaler.fit_transform(rtn[scaled_column])

  return rtn

# RMSE
def calculate_rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Adjusted R-squared
def calculate_adjusted_r2(y_true, y_pred, num_features):
    """Return R-squared adjusted for the number of predictors.

    Computes ``1 - (1 - R2) * (n - 1) / (n - num_features - 1)`` where ``n``
    is ``len(y_true)``.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values.
        num_features: Number of predictors used by the model.

    Returns:
        The adjusted R-squared value.

    Raises:
        ValueError: If ``n - num_features - 1`` is not positive; the
            statistic is undefined there and the formula would divide by
            zero or flip sign.
    """
    n = len(y_true)
    degrees_of_freedom = n - num_features - 1
    if degrees_of_freedom <= 0:
        raise ValueError(
            f"adjusted R-squared needs more than num_features + 1 samples "
            f"(got n={n}, num_features={num_features})"
        )

    r2 = r2_score(y_true, y_pred)
    return 1 - (1 - r2) * (n - 1) / degrees_of_freedom

In [None]:
# Load the Suwon and Anyang XDATA files
x_data1 = upload_data_utf("/content/drive/MyDrive/Colab Notebooks/헤도닉_집값예측/SuwonData/XDATA(Special).csv")
x_data2 = upload_data_utf("/content/drive/MyDrive/Colab Notebooks/헤도닉_집값예측/AnyangData/XDATA(Special).csv")

# Stack both files into one table with a fresh 0..n-1 index.
x_data = pd.concat([x_data1, x_data2], ignore_index=True)

# Per-file target frames (trade price and trade date).
# .copy() makes them independent of x_data1/x_data2, so adding or overwriting
# columns on them later does not raise SettingWithCopyWarning.
y_data1 = x_data1[['거래금액', '거래일자']].copy()
y_data2 = x_data2[['거래금액', '거래일자']].copy()

In [None]:
# Load the seasonal index data (Excel workbook) into `jisoo`
jisoo = pd.read_excel("/content/drive/MyDrive/Colab Notebooks/헤도닉_집값예측/계절지수.xlsx")

In [None]:
# Convert the date columns to datetime
# assign() builds a new DataFrame instead of writing into what may be a slice
# of another frame, which is what triggers SettingWithCopyWarning.
y_data1 = y_data1.assign(**{'거래일자': pd.to_datetime(y_data1['거래일자'])})
y_data2 = y_data2.assign(**{'거래일자': pd.to_datetime(y_data2['거래일자'])})
jisoo['일자'] = pd.to_datetime(jisoo['일자'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_data1['거래일자'] = pd.to_datetime(y_data1['거래일자'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_data2['거래일자'] = pd.to_datetime(y_data2['거래일자'])


In [None]:
# Add a '년월' (year-month) column holding only the month period of each date
jisoo['년월'] = jisoo['일자'].dt.to_period('M')
# assign() returns new frames, so nothing is written into a slice of another
# DataFrame (the cause of SettingWithCopyWarning).
y_data1 = y_data1.assign(**{'년월': y_data1['거래일자'].dt.to_period('M')})
y_data2 = y_data2.assign(**{'년월': y_data2['거래일자'].dt.to_period('M')})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_data1['년월'] = y_data1['거래일자'].dt.to_period('M')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_data2['년월'] = y_data2['거래일자'].dt.to_period('M')


In [None]:
# Divide each trade price by the index value of the same year-month (x 100)
def deflate_prices(trades, index_table, index_column):
  """Return ``trades`` with '거래금액' deflated by a monthly index.

  Each price is parsed to float (thousands separators removed) and replaced
  by ``price / index * 100``, where ``index`` is the ``index_column`` value of
  the ``index_table`` row whose '년월' equals the trade's '년월'.

  Args:
    trades: DataFrame with '거래금액' (price) and '년월' columns.
    index_table: DataFrame with a '년월' column and the index columns.
    index_column: Name of the index column to divide by.

  Returns:
    A new DataFrame; '거래금액' is a float column. Rows whose month has no
    entry in ``index_table`` keep their parsed, undeflated price and are
    reported with a printed message.
  """
  # One index value per month; if a month appears twice the first row wins.
  monthly_index = (
      index_table.drop_duplicates(subset='년월')
      .set_index('년월')[index_column]
  )
  matched_index = trades['년월'].map(monthly_index)

  # Prices are stored as text with ',' separators; strip them before parsing.
  raw_price = (
      trades['거래금액'].astype(str)
      .str.replace(',', '', regex=False)
      .astype(float)
  )

  has_index = trades['년월'].isin(monthly_index.index)
  if not has_index.all():
    print(f"{int((~has_index).sum())} rows have no '{index_column}' value for their month; price left undeflated")

  deflated = raw_price / matched_index * 100
  return trades.assign(**{'거래금액': deflated.where(has_index, raw_price)})


# NOTE: run once per fresh load; running this cell again would deflate the
# already-deflated prices a second time.
y_data1 = deflate_prices(y_data1, jisoo, '경부2권')
y_data2 = deflate_prices(y_data2, jisoo, '경부1권')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_data1['거래금액'][i] = float(y_data1['거래금액'][i].replace(',', '')) / jisoo['경부2권'][j] * 100
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_data2['거래금액'][i] = float(y_data2['거래금액'][i].replace(',', '')) / jisoo['경부1권'][j] * 100


In [None]:
# Location feature column names: the two coordinates, then one column per
# facility type, then a matching "<facility type> 수" column for each of them.
facility_types = ['약국', '의원', '병원', '대병원',
                  '대형상권', '공원', '도서관',
                  '유치원', '초등학교', '중학교', '고등학교', '학원',
                  '지하철']
location = (['위도', '경도']
            + facility_types
            + [f'{facility} 수' for facility in facility_types])

# Target: prices of both frames stacked in order, re-indexed 0..n-1.
y_data = pd.concat([y_data1, y_data2], ignore_index=True)[['거래금액']]

# Features: build year, floor area and the location columns.
x_data = x_data[['건축년도', '전용면적', *location]]

In [None]:
# Display the selected feature table (before scaling).
x_data

Unnamed: 0,건축년도,전용면적,위도,경도,약국,의원,병원,대병원,대형상권,공원,...,대병원 수,대형상권 수,공원 수,도서관 수,유치원 수,초등학교 수,중학교 수,고등학교 수,학원 수,지하철 수
0,2006,66.1000,37.255415,127.015434,0.441692,0.286298,0.649299,2.716930,0.733942,0.799211,...,0,0,0,1,4,2,0,0,24,0
1,2010,84.9800,37.259819,127.009835,0.319383,0.227208,1.053526,2.544017,0.696127,0.290106,...,0,1,1,1,2,1,1,0,9,0
2,2015,84.9800,37.260808,127.006641,0.126923,0.126923,1.350325,2.646423,0.587980,0.282061,...,0,2,1,0,2,2,1,0,16,0
3,1979,49.3200,37.255360,127.009545,0.211612,0.127644,1.126568,2.965217,1.142994,0.389391,...,0,0,1,1,2,1,1,0,13,0
4,2015,39.9100,37.262824,127.006542,0.127191,0.126411,1.298673,2.496756,0.377251,0.505459,...,0,3,1,0,1,1,1,0,49,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13356,2003,59.8820,37.363819,126.962413,0.175231,0.085904,0.571665,0.778303,0.177250,1.802944,...,0,1,0,0,1,1,1,0,19,0
13357,1992,76.2300,37.378071,126.957341,0.229779,0.175647,0.595744,0.943608,0.253860,0.628171,...,0,2,1,0,4,3,3,1,268,0
13358,2006,118.8100,37.369059,126.955208,0.222566,0.118175,0.204178,0.204178,0.527303,1.443282,...,1,1,0,0,1,1,0,0,19,0
13359,2014,18.9685,37.368194,126.955424,0.260377,0.040783,0.223618,0.234750,0.590511,1.515445,...,1,1,0,0,0,0,0,0,19,0


In [None]:
# Normalize the feature columns with scaling() (min-max scaler)
# NOTE(review): the scaler is fit on the full data set here, before the
# train/test split below, so test-set minima/maxima influence the scaling.
# Consider fitting on the training rows only.
x_data = scaling(x_data, ['건축년도', '전용면적'] + location)
x_data = x_data[['건축년도', '전용면적'] + location]

In [None]:
# Cast features and target to 32-bit floats.
x_data = x_data.astype(np.float32)
y_data = y_data.astype(np.float32)

In [None]:
# Split into training and test sets: 20% held out, fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

In [None]:
### Artificial neural network
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling repeat between runs (as far as the backend allows).
tf.keras.utils.set_random_seed(42)

# Feed-forward regression network: two ReLU hidden layers, each followed by
# dropout, and a single linear output unit.
# The input width is taken from the data instead of being hard-coded.
model = Sequential([
    Dense(120, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(60, activation='relu'),
    Dropout(0.2),
    Dense(1),
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
# NOTE(review): the test set doubles as validation data here. It is only
# monitored (no early stopping or checkpointing is configured), but a
# separate validation split would keep the test set fully unseen.
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model (the returned loss is the MSE chosen in compile)
loss = model.evaluate(X_test, y_test)
print(f'Mean Squared Error on Test Set: {loss}')

# Predict on the test set
predictions = model.predict(X_test)
print(f'Predictions for new data: {predictions}')

# R-squared
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
### Decision tree model

# Initialise the decision tree; seeded so that tie-breaking between equally
# good splits is the same on every run.
tree_model = DecisionTreeRegressor(random_state=42)

# Parameter grid for the grid search
params = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Hyper-parameter search with 5-fold cross-validation
grid_search = GridSearchCV(tree_model, params, cv=5)
grid_search.fit(X_train, y_train)

print("최적 파라미터 조합 : ", grid_search.best_params_)

# With the default refit=True, best_estimator_ has already been refit on the
# whole training set using the best parameters, so no further fit is needed.
tree_model = grid_search.best_estimator_

# Feature importances, paired with their column names, largest first
feature_impo = tree_model.feature_importances_
feature = sorted(zip(X_train.columns, feature_impo),
                 key=lambda pair: pair[1], reverse=True)

# The importances should sum to 1.
# Author's note from an earlier run: floor area ~40%, build year ~18%,
# location features ~40%.
sumFeature = sum(importance for _, importance in feature)
print(sumFeature)

# Save the ranked importances to CSV
feature = pd.DataFrame(feature)
feature.to_csv('특성값 비율.csv', encoding = 'utf-8')

# Predict on the test set
predictions = tree_model.predict(X_test)

# Model evaluation
mse = mean_squared_error(y_test, predictions)
print(f"MSE: {mse}")

# R-squared
print("R-score", r2_score(y_test, predictions))

# RMSE
rmse_value = calculate_rmse(y_test, predictions)
print(f"RMSE: {rmse_value}")

# Adjusted R-squared; the predictor count comes from the data, not a constant
adjusted_r2_value = calculate_adjusted_r2(y_test, predictions, X_test.shape[1])
print(f"Adjusted R-squared: {adjusted_r2_value}")

최적 파라미터 조합 :  {'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 10}
1.0000000000000004
MSE: 102158963.08155471
R-score 0.8625264120653701
RMSE: 10107.371719767445
Adjusted R-squared: 0.8609653947913205


In [None]:
from sklearn.svm import SVR

# SVR expects a 1-D target; y_train is a single-column frame, so flatten it
# to avoid scikit-learn's DataConversionWarning.
y_train_1d = y_train.to_numpy().ravel()

# Initialise and fit a baseline RBF SVR
svr_model = SVR(kernel='rbf')
svr_model.fit(X_train, y_train_1d)

# First-pass prediction on the test set
y_pred = svr_model.predict(X_test)

# Hyper-parameter ranges.
# gamma is not used by the linear kernel, so the linear candidates are listed
# without it rather than being refit once per gamma value.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf', 'poly'], 'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10]},
]

# Hyper-parameter tuning with 5-fold cross-validation
grid_search = GridSearchCV(SVR(), param_grid, cv=5)
grid_search.fit(X_train, y_train_1d)

print("Best parameters:", grid_search.best_params_)
print("Best SVR model:", grid_search.best_estimator_)

Best parameters: {'C': 10, 'gamma': 10, 'kernel': 'poly'}
Best SVR model: SVR(C=10, gamma=10, kernel='poly')


In [None]:
# SVR with the best parameter combination reported by the grid search
# (hard-coded so this cell can run without repeating the search)
bestSVR = SVR(C = 10, gamma = 10, kernel = 'poly')

# Train the model; the single-column target frame is flattened to 1-D,
# which is the shape SVR expects (avoids DataConversionWarning).
bestSVR.fit(X_train, y_train.to_numpy().ravel())

# Predict on the test set
y_pred = bestSVR.predict(X_test)

# R-squared
print("R-score", r2_score(y_test, y_pred))

# RMSE
rmse_value = calculate_rmse(y_test, y_pred)
print(f"RMSE: {rmse_value}")

# Adjusted R-squared; the predictor count comes from the data, not a constant
adjusted_r2_value = calculate_adjusted_r2(y_test, y_pred, X_test.shape[1])
print(f"Adjusted R-squared: {adjusted_r2_value}")

R-score 0.9353963084214771
RMSE: 6928.7879753714315
Adjusted R-squared: 0.9346627313028717


In [None]:
# Display the training feature table.
X_train

Unnamed: 0,건축년도,전용면적,위도,경도,약국,의원,병원,대병원,대형상권,공원,...,대병원 수,대형상권 수,공원 수,도서관 수,유치원 수,초등학교 수,중학교 수,고등학교 수,학원 수,지하철 수
4580,0.911111,0.337904,0.242142,0.861349,0.024841,0.348353,0.363206,0.156657,0.000000,0.104749,...,0.0,0.500,0.1,0.0,0.111111,0.500000,0.25,0.0,0.236145,0.0
2380,0.733333,0.338102,0.094090,0.699027,0.352957,0.148201,0.217103,0.437329,0.000000,0.133218,...,0.0,0.250,0.4,0.5,0.444444,0.666667,0.50,0.2,0.163855,0.0
12017,0.577778,0.223554,0.833882,0.245443,0.132633,0.112145,0.204855,0.241403,0.177281,0.261940,...,0.0,0.375,0.1,0.5,0.444444,0.500000,0.00,0.0,0.159036,0.0
2340,0.400000,0.223595,0.072008,0.728490,0.194235,0.135161,0.056392,0.520827,0.102008,0.152088,...,0.0,0.375,0.3,0.5,0.333333,0.500000,0.75,0.2,0.166265,0.0
265,0.822222,0.222681,0.154165,0.349144,0.357840,0.118561,0.319519,0.705881,0.426754,0.053814,...,0.0,0.000,0.3,0.5,0.444444,0.666667,0.50,0.2,0.115663,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11964,0.822222,0.223869,0.703280,0.305798,0.132625,0.307345,0.190482,0.114539,0.129635,0.469484,...,0.0,0.375,0.0,0.0,0.111111,0.166667,0.25,0.0,0.055422,0.0
5191,0.422222,0.337900,0.084179,0.902314,0.292936,0.074631,0.293936,0.567110,0.194976,0.190357,...,0.0,0.250,0.3,0.0,0.666667,0.500000,0.50,0.4,0.139759,0.5
5390,0.866667,0.236160,0.046081,0.815706,0.125944,0.164341,0.189556,0.637286,0.116983,0.129528,...,0.0,0.125,0.2,1.0,0.444444,0.500000,0.25,0.0,0.238554,0.0
860,0.755556,0.337279,0.039716,0.707705,0.181049,0.214167,0.278259,0.633300,0.210755,0.075689,...,0.0,0.125,0.2,0.5,0.222222,0.166667,0.25,0.0,0.096386,0.0
