In [16]:
import pandas as pd
import numpy as np
import matplotlib as mp
import seaborn as sns
import matplotlib.pyplot as plt
import os
import time
import glob
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, mean_squared_error, classification_report, r2_score
from sklearn.model_selection import learning_curve, cross_val_score, KFold
import scipy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor as lgb
from catboost import CatBoostRegressor
import optuna
from optuna.samplers import TPESampler
plt.rcParams['font.family'] = 'Malgun Gothic'


In [76]:
data = pd.read_csv('ffinal_total4.csv')

In [77]:
data.drop('Unnamed: 0', axis= 1, inplace = True)

In [78]:
data = data[['기준년분기코드', '행정동코드', '월평균소득금액', '음식지출총금액', '유사업종점포수', '개업점포수', '폐업점포수',
       '총직장인구수', '총상주인구수', '총유동인구수', '환산전체', '환산1층', '환산그외','당월매출금액', ]]

In [79]:
data.drop(['기준년분기코드','개업점포수','폐업점포수'],axis = 1, inplace = True)

In [80]:
data

Unnamed: 0,행정동코드,월평균소득금액,음식지출총금액,유사업종점포수,총직장인구수,총상주인구수,총유동인구수,환산전체,환산1층,환산그외,당월매출금액
0,11290525,3664622,1011937000,90,4429,12823,4444697,103305,119698,86911,4328200306
1,11200590,3654571,192261000,28,986,15796,4271714,126578,163240,89917,456182353
2,11200520,3555413,300062000,48,1218,18133,4486970,80865,109410,52321,1156401286
3,11170570,3955417,248116000,54,5206,14933,3484892,79883,81935,77832,1103200412
4,11170510,3150859,548758000,86,3345,18358,4805181,81171,111138,51203,3626619880
...,...,...,...,...,...,...,...,...,...,...,...
7950,11710632,3910198,1526342000,115,10171,31488,6197742,94031,121008,67053,8085087192
7951,11650520,5196131,4127886000,262,67858,17183,6791734,181696,215355,148038,31165275563
7952,11680630,4101144,5519540000,304,33305,20756,10010985,160349,184079,136619,43643536399
7953,11650621,4557937,1474917000,119,21837,22692,6565906,105925,131794,80055,11522249834


In [81]:
min_max = MinMaxScaler()

In [82]:
data

Unnamed: 0,행정동코드,월평균소득금액,음식지출총금액,유사업종점포수,총직장인구수,총상주인구수,총유동인구수,환산전체,환산1층,환산그외,당월매출금액
0,11290525,3664622,1011937000,90,4429,12823,4444697,103305,119698,86911,4328200306
1,11200590,3654571,192261000,28,986,15796,4271714,126578,163240,89917,456182353
2,11200520,3555413,300062000,48,1218,18133,4486970,80865,109410,52321,1156401286
3,11170570,3955417,248116000,54,5206,14933,3484892,79883,81935,77832,1103200412
4,11170510,3150859,548758000,86,3345,18358,4805181,81171,111138,51203,3626619880
...,...,...,...,...,...,...,...,...,...,...,...
7950,11710632,3910198,1526342000,115,10171,31488,6197742,94031,121008,67053,8085087192
7951,11650520,5196131,4127886000,262,67858,17183,6791734,181696,215355,148038,31165275563
7952,11680630,4101144,5519540000,304,33305,20756,10010985,160349,184079,136619,43643536399
7953,11650621,4557937,1474917000,119,21837,22692,6565906,105925,131794,80055,11522249834


In [90]:
X = data.iloc[:,1:10]
y = data.iloc[:, -1:]

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [92]:
X_train

Unnamed: 0,월평균소득금액,음식지출총금액,유사업종점포수,총직장인구수,총상주인구수,총유동인구수,환산전체,환산1층,환산그외
5253,4370970,463608000,59,14196,20645,3401191,132918,157621,108216
739,3034862,1491785000,139,7547,23768,3115503,72989,101455,44523
4243,2795134,2006312000,165,1152,17745,7480502,109296,143625,74967
3265,3383749,1531450000,169,14856,36200,8606905,92404,93446,91363
223,4304601,4499905000,307,27945,20835,9327891,133965,172414,95516
...,...,...,...,...,...,...,...,...,...
6775,2955988,297024000,25,2626,23749,1913959,91444,109341,73547
5139,3682775,181459000,41,2705,15551,3044669,118645,126705,110584
6793,2551630,688946000,80,2466,30878,10043894,155292,156461,154124
578,3437396,473036000,54,1938,20586,3739630,72168,82374,61961


In [93]:
scaler_X = MinMaxScaler().fit(X_train)
scaler_y = MinMaxScaler().fit(y_train)

In [94]:
X_train_scaled = scaler_X.transform(X_train)
X_test_scaled = scaler_X.transform(X_test)
y_train_scaled = scaler_y.transform(y_train)
y_test_scaled = scaler_y.transform(y_test)

In [95]:
def objective(trial):
    
    cbrm_param = {
        'loss_function': 'RMSE',
        'iterations': trial.suggest_int("iterations", 100, 500),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 1),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 1e-5, 100),
        'depth': trial.suggest_int('depth', 1, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30),
        'early_stopping_rounds': 25,
        'task_type' : 'GPU'
    }

    model_cbrm = CatBoostRegressor(**cbrm_param)
    model_cbrm.fit(X_train_scaled, y_train_scaled, eval_set=[(X_test_scaled, y_test_scaled)], verbose=1, plot=True)
    
    y_pred = model_cbrm.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    
    return r2

In [96]:
sampler = TPESampler()

In [None]:
optuna_cbrm = optuna.create_study(direction='minimize', sampler=sampler)
optuna_cbrm.optimize(objective, n_trials=50)

In [None]:
print("Best trial:")
trial = optuna_cbrm.best_trial
print(f"  R^2: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
final_model_1 = CatBoostRegressor(iterations = 115,learning_rate = 0.012293620204899292, reg_lambda = 25.47275733606502,depth = 12,min_data_in_leaf = 14)
final_model_1.fit(X_train_scaled, y_train_scaled, eval_set=[(X_test_scaled, y_test_scaled)], verbose=1, plot=True)

In [None]:
final_pred_scaled = final_model_1.predict(X_test_scaled)
final_pred = scaler_y.inverse_transform(final_pred_scaled.reshape(-1, 1)).flatten()

final_r2 = r2_score(y_test, final_pred)
print(f"Final R^2 on test data: {final_r2}")

In [102]:
best_params = {
    'iterations' : 115,
    'learning_rate' : 0.012293620204899292, 
    'reg_lambda' : 25.47275733606502,
    'depth' : 12,
    'min_data_in_leaf' : 14
}

def cross_validate_model(X, y, params, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    r2_scores = []
    
    for train_index, val_index in kf.split(X):
        X_train_cv, X_val_cv = X[train_index], X[val_index]
        y_train_cv, y_val_cv = y[train_index], y[val_index]
        
        model = CatBoostRegressor(**best_params)
        model.fit(X_train_cv, y_train_cv, eval_set=[(X_val_cv, y_val_cv)], verbose=0)
        
        y_val_pred = model.predict(X_val_cv)
        y_val_orig = scaler_y.inverse_transform(y_val_cv.reshape(-1, 1)).flatten()
        y_val_pred_orig = scaler_y.inverse_transform(y_val_pred.reshape(-1, 1)).flatten()
        
        r2 = r2_score(y_val_orig, y_val_pred_orig)
        r2_scores.append(r2)
        
    return np.mean(r2_scores), np.std(r2_scores)

In [None]:
mean_r2, std_r2 = cross_validate_model(X_train_scaled, y_train_scaled, best_params)

In [None]:
print(f"Mean R^2 from cross-validation: {mean_r2}")
print(f"Standard deviation of R^2 from cross-validation: {std_r2}")

final_model_1 = CatBoostRegressor(**best_params)
final_model_1.fit(X_train_scaled, y_train_scaled, eval_set=[(X_test_scaled, y_test_scaled)], verbose=1, plot=True)

final_pred_scaled = final_model_1.predict(X_test_scaled)
final_pred = scaler_y.inverse_transform(final_pred_scaled.reshape(-1, 1)).flatten()

final_r2 = r2_score(y_test, final_pred)
print(f"Final R^2 on test data: {final_r2}")

In [None]:
print(f"교차검증 평균: {mean_r2}")
print(f"교차검증 표준편차: {std_r2}")


In [None]:
print(f"최종 R^2: {final_r2}")

In [None]:
final_model_1

In [107]:
import pickle

with open('cb_model_sample_1', 'wb') as f:
    pickle.dump(final_model_1, f)

In [108]:
with open('cb_model_sample_1', 'rb') as f:
    final_model_1 = pickle.load(f)

In [None]:
X_test.columns

In [110]:
df_sample = pd.DataFrame(X_test, columns=['월평균소득금액', '음식지출총금액', '유사업종점포수', '총직장인구수', '총상주인구수', '총유동인구수', '환산전체',
'환산1층'])

# 예측값
predictions = final_model_1.predict(X_test_scaled)

# 예측값을 DataFrame의 새로운 열로 추가
df_sample['Weekly_Sales'] = predictions

# 부동 소수점 형식 설정
pd.options.display.float_format = '{:.2f}'.format

# 결과 출력



In [111]:
df_sample

Unnamed: 0,월평균소득금액,음식지출총금액,유사업종점포수,총직장인구수,총상주인구수,총유동인구수,환산전체,환산1층,Weekly_Sales
2125,5195802,392211000,68,9490,21595,1782898,122048,157113,0.03
1403,3672088,1567216000,163,14774,21886,5401905,88007,99492,0.06
4447,2453690,1310589000,278,13937,24606,8964197,102100,118667,0.07
5480,5007768,6708940000,420,47822,9524,4023951,162108,217048,0.19
1984,3263464,759409000,85,6769,32810,8235912,92684,127557,0.04
...,...,...,...,...,...,...,...,...,...
82,3338065,3195926000,192,11632,18409,5993563,107191,125270,0.09
3987,2897346,1873906000,271,4641,23440,6739724,89965,114027,0.08
2470,4470889,257463000,26,3192,18657,1471235,111275,121929,0.03
3297,2551630,443967000,86,2466,29648,10425677,62478,88739,0.03


In [None]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 모델 학습 및 예측
final_model_1.fit(X_train_scaled, y_train)
predictions = final_model_1.predict(X_test_scaled)

# 스케일링되지 않은 원래 입력 데이터를 사용하여 데이터프레임 생성 및 예측값 추가
df = pd.DataFrame(X_test, columns=['월평균소득금액', '음식지출총금액', '유사업종점포수', '총직장인구수', '총상주인구수', '총유동인구수', '환산전체','환산1층'])
df['prediction_rf'] = predictions

# 소수점 표시 형식 설정
pd.options.display.float_format = '{:.2f}'.format


In [116]:
total = pd.read_csv('ffinal_total4.csv',encoding= 'utf-8')

In [None]:
df

In [138]:
total_predict = df.merge(total, on = ['월평균소득금액', '음식지출총금액', '유사업종점포수', '총직장인구수', '총상주인구수', '총유동인구수', '환산전체','환산1층'],how='inner')

In [None]:
total_predict

In [140]:
median = total_predict.groupby('행정동코드')[['당월매출금액']].median().reset_index().rename(columns={'당월매출금액' : 'median'})

In [142]:
final = median.merge(total_predict, on = ['행정동코드'],how = 'inner')

In [None]:
final

In [144]:
final['success'] = final.apply(lambda row: 0 if row['median'] > row['prediction_rf'] else 1, axis=1)

In [147]:
final.to_csv('ffinal_catboost.csv', index= False, encoding = 'cp949')

In [None]:
df