## 랜덤포레스트

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib

In [13]:
# Load the raw dataset from CSV into a DataFrame.
default = pd.read_csv('default2.csv')

In [14]:
# Display the loaded DataFrame as the cell output.
default

Unnamed: 0.1,Unnamed: 0,기준년분기코드,행정동코드,월평균소득금액,음식지출총금액,유사업종점포수,개업점포수,폐업점포수,당월매출금액,총직장인구수,총상주인구수,총유동인구수,환산전체,환산1층,환산그외
0,0,20191,11290525,3664622,1011937000,90,2,4,4328200306,4429,12823,4444697,103305,119698,86911
1,1,20191,11200590,3654571,192261000,28,0,3,456182353,986,15796,4271714,126578,163240,89917
2,2,20191,11200520,3555413,300062000,48,2,0,1156401286,1218,18133,4486970,80865,109410,52321
3,3,20191,11170570,3955417,248116000,54,5,3,1103200412,5206,14933,3484892,79883,81935,77832
4,4,20191,11170510,3150859,548758000,86,0,1,3626619880,3345,18358,4805181,81171,111138,51203
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7950,7950,20234,11710632,3910198,1526342000,115,6,6,8085087192,10171,31488,6197742,94031,121008,67053
7951,7951,20234,11650520,5196131,4127886000,262,14,9,31165275563,67858,17183,6791734,181696,215355,148038
7952,7952,20234,11680630,4101144,5519540000,304,15,10,43643536399,33305,20756,10010985,160349,184079,136619
7953,7953,20234,11650621,4557937,1474917000,119,0,4,11522249834,21837,22692,6565906,105925,131794,80055


In [15]:
# Export the preprocessed data (currently disabled).
# NOTE(review): a later cell reads 'default_set.csv'; presumably this line
# produced that file — confirm before removing it.
# default.to_csv('default_set.csv')

---

#### 2. 모델 학습

In [9]:
# Reload the dataset that was saved separately above; this rebinds `default`.
default = pd.read_csv('default_set.csv')

In [16]:
# Split the frame into the nine model-input columns and the target column.
data = default.loc[
    :,
    [
        '월평균소득금액',
        '음식지출총금액',
        '유사업종점포수',
        '총직장인구수',
        '총상주인구수',
        '총유동인구수',
        '환산전체',
        '환산1층',
        '환산그외',
    ],
]
amt = default.loc[:, ['당월매출금액']]

In [17]:
# Feature column names, in the order the model is trained on.
features = [
    '월평균소득금액',
    '음식지출총금액',
    '유사업종점포수',
    '총직장인구수',
    '총상주인구수',
    '총유동인구수',
    '환산전체',
    '환산1층',
    '환산그외',
]

# Feature matrix (X) and target series (y).
X = data.loc[:, features]
y = amt.loc[:, '당월매출금액']

In [18]:
# Standardize the features and the target, each with its own scaler.
# NOTE(review): both scalers are fitted on the full dataset, before the
# train/test split, so test rows contribute to the scaling statistics.
scaler_X = StandardScaler().fit(X)
X_scaled = scaler_X.transform(X)

# StandardScaler expects 2-D input, so the target is reshaped to one column.
scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(y.to_numpy().reshape(-1, 1))

In [19]:
# Hold out 20% of the scaled rows as the test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_scaled,
    random_state=42,
    test_size=0.2,
)

In [20]:
# Initialize the random forest regression model (fixed seed).
rf_model = RandomForestRegressor(random_state=42)

In [21]:
# Hyper-parameter grid for the grid search
# (3 x 4 x 3 x 3 x 2 = 216 combinations).
# NOTE(review): the recorded cell output reports 162 candidates, which does
# not match this grid — it looks like output from an earlier grid; re-run to confirm.
param_grid = dict(
    n_estimators=[400, 500, 600],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

In [22]:
# Set up the grid search: 3-fold cross-validation, all available cores,
# and per-fit progress messages.
grid_search = GridSearchCV(
    rf_model,
    param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [23]:
# Run the grid search (trains every candidate on each CV fold).
# The target is flattened to 1-D, the shape the regressor expects.
grid_search.fit(X_train, y_train.reshape(-1))

# Report the best hyper-parameter combination found.
print(f"Best Parameters: {grid_search.best_params_}")

Fitting 3 folds for each of 162 candidates, totalling 486 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   9.5s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=  10.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=  10.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=  10.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=  12.6s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=  12.7s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=  12.9s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=600; tota



[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=  20.1s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=  13.9s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=600; total time=  22.6s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=  14.7s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=  14.7s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=600; total time=  24.0s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=600; total time=  24.0s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=  17.8s
[CV] END bootstrap=False, max_depth=None

In [24]:
# Predict on the test set with the best estimator found by the grid search.
# The model was fitted on the standardized target, so these predictions are
# in standardized units.
best_rf_model = grid_search.best_estimator_
y_pred_scaled = best_rf_model.predict(X_test)

In [25]:
# Map the predictions back to the original target scale; the scaler needs a
# 2-D single-column array, so a trailing axis is added.
y_pred = scaler_y.inverse_transform(y_pred_scaled[:, None])

In [26]:
# Evaluate against the test targets in their original (un-scaled) units.
y_test_original = scaler_y.inverse_transform(y_test)
mse = mean_squared_error(y_test_original, y_pred)
r2 = r2_score(y_test_original, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 1.92599383743284e+18
R-squared: 0.9794209653509578


---

#### 3. 사업성공여부를 측정할 파생 데이터 생성

In [27]:
# Train the final model on the training split and persist it to disk.
# NOTE(review): this refits a forest with hard-coded settings rather than
# saving `grid_search.best_estimator_` — confirm that is intended.
# NOTE(review): the model predicts in standardized units, and the scalers are
# not saved here — confirm they are persisted elsewhere if the pickle is reused.
reg = RandomForestRegressor(n_estimators=500, random_state=42)
reg.fit(X_train, y_train.ravel())
joblib.dump(reg, 'Random_Forest.pkl', compress=1)

['Random_Forest.pkl']

In [48]:
# Load the persisted model back from disk.
loaded_model = joblib.load('Random_Forest.pkl')

# Score the loaded model on the held-out test set (R^2 for a regressor).
score = loaded_model.score(X_test, y_test)
print('test score: {score:.3f}'.format(score=score))

test score: 0.980


In [49]:
# Build a DataFrame of the test rows in original units and attach the
# model's prediction (also converted back to original units).
predictions = scaler_y.inverse_transform(
    loaded_model.predict(X_test).reshape(-1, 1)
)
df = pd.DataFrame(data=scaler_X.inverse_transform(X_test), columns=features)
df['Predicted'] = predictions

In [50]:
# Convert every column back to integers.
# Round first: the scale / inverse-scale round trip can leave values such as
# 89.99999999999999, and a bare integer cast truncates those to 89. The
# following merge matches rows on exact feature values, so a truncated value
# would silently drop that row from the inner join.
# int64 is requested explicitly because the sales figures exceed the int32 range.
df = df.round().astype('int64')

In [51]:
# Join the test-set predictions back onto the originally loaded `default`
# frame by matching on the nine feature values, then drop the leftover
# 'Unnamed: 0' column.
# NOTE(review): rows that share identical feature values would be matched
# more than once by this join — verify the feature combinations are unique.
heaven = pd.merge(
    df,
    default,
    how='inner',
    on=[
        '월평균소득금액',
        '음식지출총금액',
        '유사업종점포수',
        '총직장인구수',
        '총상주인구수',
        '총유동인구수',
        '환산전체',
        '환산1층',
        '환산그외',
    ],
).drop(columns='Unnamed: 0')

In [54]:
# Reference table: median of actual monthly sales per 행정동코드, computed over
# the rows present in `heaven` and truncated to integers.
median = (
    heaven.groupby('행정동코드')[['당월매출금액']]
    .median()
    .astype(int)
    .rename(columns={'당월매출금액': 'median'})
    .reset_index()
)

In [57]:
# Attach each 행정동코드's median to every row of `heaven`.
last_set = heaven.merge(median, how='left', on='행정동코드')

In [58]:
# Add the success flag: 0 when the median exceeds the predicted sales,
# 1 otherwise.
# Computed as one vectorized comparison instead of a row-wise apply; negating
# the `>` test keeps the original "else 1" semantics for every row, and the
# expression also works when the frame is empty.
last_set['success'] = (~(last_set['median'] > last_set['Predicted'])).astype('int64')

In [60]:
# Keep only the reporting columns, in their final output order.
last_set = last_set.loc[
    :,
    [
        '행정동코드',
        'median',
        '기준년분기코드',
        '월평균소득금액',
        '음식지출총금액',
        '유사업종점포수',
        '개업점포수',
        '폐업점포수',
        '당월매출금액',
        '총직장인구수',
        '총상주인구수',
        '총유동인구수',
        '환산전체',
        '환산1층',
        '환산그외',
        'Predicted',
        'success',
    ],
]

In [61]:
# Preview the first rows of the final table.
last_set.head()

Unnamed: 0,행정동코드,median,기준년분기코드,월평균소득금액,음식지출총금액,유사업종점포수,개업점포수,폐업점포수,당월매출금액,총직장인구수,총상주인구수,총유동인구수,환산전체,환산1층,환산그외,Predicted,success
0,11230720,3985224711,20201,2722774,800191000,93,1,6,2302043834,1421,12359,5800073,100194,148726,51663,3679878926,0
1,11110650,8672181420,20211,3333112,2043928000,230,5,3,6392837438,6669,16799,7888102,117679,153779,81579,8194336200,0
2,11620735,4228504286,20191,2747467,2109396000,194,15,9,3719447268,2498,23709,7236595,106667,123159,90175,7143895708,1
3,11380520,6310358940,20192,3416366,1282355000,137,11,8,6114086699,3314,30158,4433577,99016,133751,64281,5641493392,0
4,11170650,9117761871,20232,3682946,3787935000,173,4,1,9117761871,13034,7126,2641988,182180,213394,150966,8958541241,0


In [62]:
# Write the final table to CSV.
# NOTE(review): the index is written as an extra unnamed column; reading the
# file back with default options would yield an 'Unnamed: 0' column — confirm
# whether downstream readers expect it.
last_set.to_csv('last_set.csv')