In [21]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import numpy as np
import joblib

In [8]:
total = pd.read_csv("ffinal_total4.csv")
total.drop(['Unnamed: 0'],axis=1,inplace=True)

In [9]:
total

Unnamed: 0,기준년분기코드,행정동코드,월평균소득금액,음식지출총금액,유사업종점포수,개업점포수,폐업점포수,당월매출금액,총직장인구수,총상주인구수,총유동인구수,환산전체,환산1층,환산그외
0,20191,11290525,3664622,1011937000,90,2,4,4328200306,4429,12823,4444697,103305,119698,86911
1,20191,11200590,3654571,192261000,28,0,3,456182353,986,15796,4271714,126578,163240,89917
2,20191,11200520,3555413,300062000,48,2,0,1156401286,1218,18133,4486970,80865,109410,52321
3,20191,11170570,3955417,248116000,54,5,3,1103200412,5206,14933,3484892,79883,81935,77832
4,20191,11170510,3150859,548758000,86,0,1,3626619880,3345,18358,4805181,81171,111138,51203
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7950,20234,11710632,3910198,1526342000,115,6,6,8085087192,10171,31488,6197742,94031,121008,67053
7951,20234,11650520,5196131,4127886000,262,14,9,31165275563,67858,17183,6791734,181696,215355,148038
7952,20234,11680630,4101144,5519540000,304,15,10,43643536399,33305,20756,10010985,160349,184079,136619
7953,20234,11650621,4557937,1474917000,119,0,4,11522249834,21837,22692,6565906,105925,131794,80055


In [15]:
#상수항 추가
proceed = sm.add_constant(total, has_constant="add")

#변수 설정
X = proceed.drop(['기준년분기코드', '행정동코드','개업점포수', '폐업점포수','당월매출금액'],axis = 1)
y = proceed['당월매출금액']

#훈련과 테스트 변수 나눔 비율은 8 : 2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [19]:
#Min_Max 스케일링 진행
minmax = MinMaxScaler()
minmax_X_train = minmax.fit_transform(X_train)
minmax_X_test = minmax.transform(X_test)

#랜덤포레스트 모델 훈련
forest_reg = RandomForestRegressor(n_estimators=200, random_state=42)
forest_reg.fit(minmax_X_train, y_train)

#훈련 데이터 평가, 예측 수행 및 MSE, RMSE 계산
forest_predictions = forest_reg.predict(minmax_X_train)
forest_mse = mean_squared_error(y_train, forest_predictions)
forest_rmse = np.sqrt(forest_mse)

#테스트 데이터 평가
forest_test_predictions = forest_reg.predict(minmax_X_test)
forest_test_mse = mean_squared_error(y_test, forest_test_predictions)
forest_test_rmse = np.sqrt(forest_test_mse)

#교차 검증 평가
forest_scores = cross_val_score(forest_reg, minmax_X_train, y_train,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)

#평과 결과 출력
print("<<<Random Forest>>>")
display_scores(forest_rmse_scores)
print("forest_rmse:", forest_rmse)
print("-----------------------------------------------")
print("minmax R2 score on training set : ", r2_score(y_train, forest_predictions))
print("minmax R2 score on test set : ", r2_score(y_test, forest_test_predictions))


# 교차 검증으로 훈련 세트의 평균 제곱 오차 (MSE) 계산
train_cv_scores = cross_val_score(forest_reg , minmax_X_train, y_train, cv=5, scoring='neg_mean_squared_error')
train_mse = -np.mean(train_cv_scores)  # 부호 변경하여 양수로 변환
print("Train CV MSE:", train_mse)

# 교차 검증으로 테스트 세트의 평균 제곱 오차 (MSE) 계산
test_cv_scores = cross_val_score(forest_reg , minmax_X_test, y_test, cv=5, scoring='neg_mean_squared_error')
test_mse = -np.mean(test_cv_scores)  # 부호 변경하여 양수로 변환
print("Test CV MSE:", test_mse)


# 과적합 여부 확인
if train_mse < test_mse:
    print("과적합 아닙니다")
else:
    print("과적합입니다")


<<<Random Forest>>>
Scores: [1.73212613e+09 1.20268309e+09 1.49868205e+09 1.76299059e+09
 1.39899729e+09 1.25377271e+09 1.54110673e+09 1.62052443e+09
 1.41189889e+09 1.55120423e+09]
Mean: 1497398613.0915318
Standard deviation: 175729052.93142664
forest_rmse: 558404309.3685626
-----------------------------------------------
minmax R2 score on training set :  0.9966659123816524
minmax R2 score on test set :  0.9794315617614621
Train CV MSE: 2.602789238533329e+18
Test CV MSE: 1.187708775644251e+19
과적합 아닙니다


In [23]:
# 최종 모델 저장
joblib.dump(forest_reg, 'minmax_Random_Forest.pkl', compress=1)

# 모델 로드
loaded_model = joblib.load('minmax_Random_Forest.pkl')

# 로드된 모델로 점수 계산
score = loaded_model.score(minmax_X_test, y_test)
print('test score: {score:.3f}'.format(score=score))

# 예측 결과 데이터프레임 생성
df = pd.DataFrame(X_test, columns= ['월평균소득금액', '음식지출총금액', '유사업종점포수', '총직장인구수', '총상주인구수', '총유동인구수', '환산전체', '환산1층', '환산그외'])
predict = RandomForestRegressor(n_estimators=200, random_state=42).fit(minmax_X_train, y_train).predict(minmax_X_test).astype(int)
df['prediction'] = predict

test score: 0.979


In [24]:
result = pd.merge(total,df,on=['월평균소득금액', '음식지출총금액', '유사업종점포수', '총직장인구수', '총상주인구수', '총유동인구수','환산전체',
       '환산1층', '환산그외'],how='inner')
result

Unnamed: 0,기준년분기코드,행정동코드,월평균소득금액,음식지출총금액,유사업종점포수,개업점포수,폐업점포수,당월매출금액,총직장인구수,총상주인구수,총유동인구수,환산전체,환산1층,환산그외,prediction
0,20191,11290525,3664622,1011937000,90,2,4,4328200306,4429,12823,4444697,103305,119698,86911,2990175851
1,20191,11110540,3772087,884788000,79,1,4,4569768323,3997,2831,1010831,238289,197561,279017,5631541700
2,20191,11680640,3747340,20051983000,1104,49,50,64954932801,192833,35362,23188491,135934,147116,124753,62110817066
3,20191,11590640,3954608,761344000,74,1,1,2611009529,2689,24423,4932708,100871,142486,59256,2859766298
4,20191,11590520,2461079,1301028000,117,4,8,10558758018,5545,14606,4131564,123865,163280,84450,7163115142
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586,20234,11710540,2435636,251428000,89,4,4,1695955926,955,22624,3625768,80056,81689,78423,1693882542
1587,20234,11710562,2928271,7551205000,343,21,15,43408131557,24911,24312,6510592,136490,159794,113185,26545407847
1588,20234,11710632,3910198,1526342000,115,6,6,8085087192,10171,31488,6197742,94031,121008,67053,7436720237
1589,20234,11650621,4557937,1474917000,119,0,4,11522249834,21837,22692,6565906,105925,131794,80055,10049185866


In [25]:
#성공여부의 기준인 중앙 값에 대한 컬럼 만들고 머지
median = result.groupby('행정동코드')[['당월매출금액']].median().reset_index().rename(columns ={"당월매출금액" : "중간값"}).astype(int)
result = pd.merge(result, median, on='행정동코드',how='inner')

In [26]:
#성공여부 컬럼 생성
result['success'] = result.apply(lambda row: 0 if row['중간값'] > row['prediction'] else 1, axis=1)

In [27]:
result.head()

Unnamed: 0,기준년분기코드,행정동코드,월평균소득금액,음식지출총금액,유사업종점포수,개업점포수,폐업점포수,당월매출금액,총직장인구수,총상주인구수,총유동인구수,환산전체,환산1층,환산그외,prediction,중간값,success
0,20191,11290525,3664622,1011937000,90,2,4,4328200306,4429,12823,4444697,103305,119698,86911,2990175851,4652199868,0
1,20191,11110540,3772087,884788000,79,1,4,4569768323,3997,2831,1010831,238289,197561,279017,5631541700,4981116371,1
2,20191,11680640,3747340,20051983000,1104,49,50,64954932801,192833,35362,23188491,135934,147116,124753,62110817066,69162584184,0
3,20191,11590640,3954608,761344000,74,1,1,2611009529,2689,24423,4932708,100871,142486,59256,2859766298,2543788117,1
4,20191,11590520,2461079,1301028000,117,4,8,10558758018,5545,14606,4131564,123865,163280,84450,7163115142,9264860520,0
