In [117]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression


# json 문자 삭제
import re


In [118]:
def drop_columns(df):
    columns_to_drop = ['날짜', '구분', '경기(행사)내용', '시설명', '지역', '인근_역', 'quantile']
    df = df.drop(columns=columns_to_drop)
    return df

In [119]:
df = pd.read_csv('oversampled_dataset.csv', encoding='cp949',index_col=0)

In [120]:
df = drop_columns(df)

In [121]:
def remove_pattern(text):
    if isinstance(text, str):  
        pattern = r'\([^)]*\)'
        return re.sub(pattern, '', text)
    else:
        return text

df.columns = df.columns.map(remove_pattern)

In [122]:
df

Unnamed: 0,관람인원,사용일수,강수량,평균기온,좌석수,좌석점유율,주말 여부,합계_승차,미세먼지.mean,초미세먼지.mean
0,1000,1,0.000000,14.900000,5003,19.990000,1,6146,39.600000,17.500000
1,15246,6,0.000000,15.900000,11000,22.860000,1,13492,65.800000,28.700000
2,17782,6,0.000000,17.600000,11000,26.660000,1,12518,50.000000,28.800000
3,928,5,0.000000,20.600000,1473,65.630000,1,13131,33.600000,19.800000
4,4000,3,3.000000,24.800000,15000,8.890000,0,5679,19.900000,10.500000
...,...,...,...,...,...,...,...,...,...,...
443,1285,7,0.000000,15.637079,1473,72.223684,1,11532,43.891901,23.882515
444,928,5,0.000000,15.978967,1473,71.385858,1,12533,32.594061,18.102478
445,928,7,0.000000,20.549143,1473,68.472910,1,12282,33.040572,17.969146
446,910,7,29.051371,15.392392,1391,66.293799,1,10189,18.728343,16.553619


In [123]:
# training and test
X = df.drop('좌석점유율', axis=1)
y = df['좌석점유율']

# test size 설정
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [124]:
# LightGBM 
lgbm_model = LGBMRegressor()

# LightGBM 파라미터 그리드
lgbm_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [5, 10, 20],
    'min_child_samples': [20, 50, 100],
    'subsample': [0.8, 1.0]
}

# LightGBM 그리드 서치
lgbm_grid_search = GridSearchCV(estimator=lgbm_model, param_grid=lgbm_params, scoring='neg_mean_squared_error', cv=5)
lgbm_grid_search.fit(X_train, y_train)
lgbm_best_params = lgbm_grid_search.best_params_
lgbm_best_model = lgbm_grid_search.best_estimator_

lgbm_best_model.fit(X_train, y_train)
lgbm_pred = lgbm_best_model.predict(X_test)
lgbm_rmse = mean_squared_error(y_test, lgbm_pred, squared=False)



print("LightGBM Best Parameters:", lgbm_best_params)
print("LightGBM RMSE:", lgbm_rmse)
print('R제곱 (Variance Score) : {0:.3f}'.format(r2_score(y_test, lgbm_pred)))

LightGBM Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'min_child_samples': 20, 'n_estimators': 300, 'subsample': 0.8}
LightGBM RMSE: 5.824954574321796
R제곱 (Variance Score) : 0.943


In [125]:
from sklearn.model_selection import cross_val_score
import numpy as np

# 5-fold 교차 검증으로 MSE 구함
neg_mse_scores = cross_val_score(lgbm_best_model, X_train, y_train,
                                 scoring='neg_mean_squared_error',
                                 cv=5)

# neg_mse: 음수값이므로 -1을 곱해서 사용
# RMSE는 np.sqrt() 사용
rmse_scores = np.sqrt(-1 * neg_mse_scores)
avg_rmse = np.mean(rmse_scores)

# 평가 지표 출력
print('5 Folds 교차검증 개별 Negative MSE scores:', np.round(neg_mse_scores, 2))
print('5 Folds 교차검증 개별 RMSE scores:', np.round(rmse_scores, 2))
print('5 Folds 교차검증 평균 RMSE:', np.round(avg_rmse, 3))

5 Folds 교차검증 개별 Negative MSE scores: [ -22.13 -105.76  -32.64  -39.68  -38.63]
5 Folds 교차검증 개별 RMSE scores: [ 4.7  10.28  5.71  6.3   6.22]
5 Folds 교차검증 평균 RMSE: 6.643


In [126]:
# XGBoost
xgb_model = XGBRegressor()

# XGBoost 파라미터 그리드
xgb_params = {
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'min_child_weight': [1, 2, 4]
}

# XGBoost 그리드 서치
xgb_grid_search = GridSearchCV(estimator=xgb_model, param_grid=xgb_params, scoring='neg_mean_squared_error', cv=5)
xgb_grid_search.fit(X_train, y_train)
xgb_best_params = xgb_grid_search.best_params_
xgb_best_model = xgb_grid_search.best_estimator_



xgb_best_model.fit(X_train, y_train)
xgb_pred = xgb_best_model.predict(X_test)
xgb_rmse = mean_squared_error(y_test, xgb_pred, squared=False)


print("XGBoost Best Parameters:", xgb_best_params)
print("XGBoost RMSE:", xgb_rmse)
print('R제곱 (Variance Score) : {0:.3f}'.format(r2_score(y_test, xgb_pred)))

XGBoost Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.9}
XGBoost RMSE: 5.759016973395443
R제곱 (Variance Score) : 0.944


In [127]:
# 5-fold 교차 검증으로 MSE 구함
neg_mse_scores = cross_val_score(xgb_best_model, X_train, y_train,
                                 scoring='neg_mean_squared_error',
                                 cv=5)

# neg_mse: 음수값이므로 -1을 곱해서 사용
# RMSE는 np.sqrt() 사용
rmse_scores = np.sqrt(-1 * neg_mse_scores)
avg_rmse = np.mean(rmse_scores)

# 평가 지표 출력
print('5 Folds 교차검증 개별 Negative MSE scores:', np.round(neg_mse_scores, 2))
print('5 Folds 교차검증 개별 RMSE scores:', np.round(rmse_scores, 2))
print('5 Folds 교차검증 평균 RMSE:', np.round(avg_rmse, 3))

5 Folds 교차검증 개별 Negative MSE scores: [-13.26 -43.76 -44.34 -17.72 -55.84]
5 Folds 교차검증 개별 RMSE scores: [3.64 6.62 6.66 4.21 7.47]
5 Folds 교차검증 평균 RMSE: 5.72


In [128]:

# 선형 회귀 모델 생성
lr_model = LinearRegression()

# 하이퍼파라미터 그리드 설정
param_grid = {
    'fit_intercept': [True, False]
}

# 그리드 서치를 통한 최적의 파라미터 탐색
grid_search = GridSearchCV(estimator=lr_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# 최적의 파라미터 확인
best_params = grid_search.best_params_

# 최적의 파라미터로 모델 재학습
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# 테스트 데이터에 대한 예측
lr_pred = best_model.predict(X_test)

# 평가 지표 계산
lr_rmse = mean_squared_error(y_test, lr_pred, squared=False)

print("선형 회귀 Best Parameters:", best_params)
print("선형 회귀 RMSE:", lr_rmse)
print('R제곱 (Variance Score) : {0:.3f}'.format(r2_score(y_test, lr_pred)))

선형 회귀 Best Parameters: {'fit_intercept': True}
선형 회귀 RMSE: 16.52285948194996
R제곱 (Variance Score) : 0.541


In [129]:
# 5-fold 교차 검증으로 MSE 구함
neg_mse_scores = cross_val_score(best_model, X_train, y_train,
                                 scoring='neg_mean_squared_error',
                                 cv=5)

# neg_mse: 음수값이므로 -1을 곱해서 사용
# RMSE는 np.sqrt() 사용
rmse_scores = np.sqrt(-1 * neg_mse_scores)
avg_rmse = np.mean(rmse_scores)

# 평가 지표 출력
print('5 Folds 교차검증 개별 Negative MSE scores:', np.round(neg_mse_scores, 2))
print('5 Folds 교차검증 개별 RMSE scores:', np.round(rmse_scores, 2))
print('5 Folds 교차검증 평균 RMSE:', np.round(avg_rmse, 3))

5 Folds 교차검증 개별 Negative MSE scores: [-282.58 -459.48 -214.68 -252.23 -242.73]
5 Folds 교차검증 개별 RMSE scores: [16.81 21.44 14.65 15.88 15.58]
5 Folds 교차검증 평균 RMSE: 16.872


In [130]:
import warnings
warnings.filterwarnings(action='ignore')

In [131]:
# Random Forest
rf_model = RandomForestRegressor()

# Random Forest 파라미터 그리드
rf_params = {
    'n_estimators': [100, 500, 1000],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [None, 5, 10],
}

# Random Forest 그리드 서치
rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_params, scoring='neg_mean_squared_error', cv=5)
rf_grid_search.fit(X_train, y_train)
rf_best_params = rf_grid_search.best_params_
rf_best_model = rf_grid_search.best_estimator_

rf_best_model.fit(X_train, y_train)
rf_pred = rf_best_model.predict(X_test)
rf_rmse = mean_squared_error(y_test, rf_pred, squared=False)

print("Random Forest Best Parameters:", rf_best_params)
print("Random Forest RMSE:", rf_rmse)
print('R제곱 (Variance Score) : {0:.3f}'.format(r2_score(y_test, rf_pred)))

Random Forest Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 1000}
Random Forest RMSE: 6.839203650475674
R제곱 (Variance Score) : 0.921


In [132]:
# 5-fold 교차 검증으로 MSE 구함
neg_mse_scores = cross_val_score(rf_best_model, X_train, y_train,
                                 scoring='neg_mean_squared_error',
                                 cv=5)

# neg_mse: 음수값이므로 -1을 곱해서 사용
# RMSE는 np.sqrt() 사용
rmse_scores = np.sqrt(-1 * neg_mse_scores)
avg_rmse = np.mean(rmse_scores)

# 평가 지표 출력
print('5 Folds 교차검증 개별 Negative MSE scores:', np.round(neg_mse_scores, 2))
print('5 Folds 교차검증 개별 RMSE scores:', np.round(rmse_scores, 2))
print('5 Folds 교차검증 평균 RMSE:', np.round(avg_rmse, 3))

5 Folds 교차검증 개별 Negative MSE scores: [-30.72 -84.77 -43.47 -29.84 -60.39]
5 Folds 교차검증 개별 RMSE scores: [5.54 9.21 6.59 5.46 7.77]
5 Folds 교차검증 평균 RMSE: 6.915


In [133]:

# Ridge 회귀
ridge_model = Ridge()
ridge_params = {
    'alpha': [0.1, 1, 10],
    'fit_intercept': [True, False]
}
ridge_grid_search = GridSearchCV(estimator=ridge_model, param_grid=ridge_params, scoring='neg_mean_squared_error', cv=5)
ridge_grid_search.fit(X_train, y_train)
ridge_best_params = ridge_grid_search.best_params_
ridge_best_model = ridge_grid_search.best_estimator_
ridge_best_model.fit(X_train, y_train)
ridge_pred = ridge_best_model.predict(X_test)
ridge_rmse = mean_squared_error(y_test, ridge_pred, squared=False)

print("Ridge 회귀 Best Parameters:", ridge_best_params)
print("Ridge 회귀 RMSE:", ridge_rmse)
print('R제곱 (Variance Score) : {0:.3f}'.format(r2_score(y_test, ridge_pred)))

Ridge 회귀 Best Parameters: {'alpha': 10, 'fit_intercept': True}
Ridge 회귀 RMSE: 16.516993360736116
R제곱 (Variance Score) : 0.541


In [134]:
# 5-fold 교차 검증으로 MSE 구함
neg_mse_scores = cross_val_score(ridge_best_model, X_train, y_train,
                                 scoring='neg_mean_squared_error',
                                 cv=5)

# neg_mse: 음수값이므로 -1을 곱해서 사용
# RMSE는 np.sqrt() 사용
rmse_scores = np.sqrt(-1 * neg_mse_scores)
avg_rmse = np.mean(rmse_scores)

# 평가 지표 출력
print('5 Folds 교차검증 개별 Negative MSE scores:', np.round(neg_mse_scores, 2))
print('5 Folds 교차검증 개별 RMSE scores:', np.round(rmse_scores, 2))
print('5 Folds 교차검증 평균 RMSE:', np.round(avg_rmse, 3))

5 Folds 교차검증 개별 Negative MSE scores: [-282.58 -460.1  -213.95 -252.31 -242.01]
5 Folds 교차검증 개별 RMSE scores: [16.81 21.45 14.63 15.88 15.56]
5 Folds 교차검증 평균 RMSE: 16.866


In [135]:
# Lasso 회귀
lasso_model = Lasso()
lasso_params = {
    'alpha': [0.1, 1, 10],
    'fit_intercept': [True, False]
}
lasso_grid_search = GridSearchCV(estimator=lasso_model, param_grid=lasso_params, scoring='neg_mean_squared_error', cv=5)
lasso_grid_search.fit(X_train, y_train)
lasso_best_params = lasso_grid_search.best_params_
lasso_best_model = lasso_grid_search.best_estimator_
lasso_best_model.fit(X_train, y_train)
lasso_pred = lasso_best_model.predict(X_test)
lasso_rmse = mean_squared_error(y_test, lasso_pred, squared=False)

print("Lasso 회귀 Best Parameters:", lasso_best_params)
print("Lasso 회귀 RMSE:", lasso_rmse)
print('R제곱 (Variance Score) : {0:.3f}'.format(r2_score(y_test, lasso_pred)))

Lasso 회귀 Best Parameters: {'alpha': 0.1, 'fit_intercept': True}
Lasso 회귀 RMSE: 16.516615459636377
R제곱 (Variance Score) : 0.542


In [136]:
# 5-fold 교차 검증으로 MSE 구함
neg_mse_scores = cross_val_score(lasso_best_model, X_train, y_train,
                                 scoring='neg_mean_squared_error',
                                 cv=5)

# neg_mse: 음수값이므로 -1을 곱해서 사용
# RMSE는 np.sqrt() 사용
rmse_scores = np.sqrt(-1 * neg_mse_scores)
avg_rmse = np.mean(rmse_scores)

# 평가 지표 출력
print('5 Folds 교차검증 개별 Negative MSE scores:', np.round(neg_mse_scores, 2))
print('5 Folds 교차검증 개별 RMSE scores:', np.round(rmse_scores, 2))
print('5 Folds 교차검증 평균 RMSE:', np.round(avg_rmse, 3))

5 Folds 교차검증 개별 Negative MSE scores: [-282.9  -461.39 -213.89 -252.3  -241.9 ]
5 Folds 교차검증 개별 RMSE scores: [16.82 21.48 14.62 15.88 15.55]
5 Folds 교차검증 평균 RMSE: 16.872


In [137]:
# ElasticNet 회귀
elasticnet_model = ElasticNet()
elasticnet_params = {
    'alpha': [0.1, 1, 10],
    'l1_ratio': [0.3, 0.5, 0.7],
    'fit_intercept': [True, False]
}
elasticnet_grid_search = GridSearchCV(estimator=elasticnet_model, param_grid=elasticnet_params, scoring='neg_mean_squared_error', cv=5)
elasticnet_grid_search.fit(X_train, y_train)
elasticnet_best_params = elasticnet_grid_search.best_params_
elasticnet_best_model = elasticnet_grid_search.best_estimator_
elasticnet_best_model.fit(X_train, y_train)
elasticnet_pred = elasticnet_best_model.predict(X_test)
elasticnet_rmse = mean_squared_error(y_test, elasticnet_pred, squared=False)

print("ElasticNet 회귀 Best Parameters:", elasticnet_best_params)
print("ElasticNet 회귀 RMSE:", elasticnet_rmse)
print('R제곱 (Variance Score) : {0:.3f}'.format(r2_score(y_test, elasticnet_pred)))

ElasticNet 회귀 Best Parameters: {'alpha': 0.1, 'fit_intercept': True, 'l1_ratio': 0.3}
ElasticNet 회귀 RMSE: 16.51282080217472
R제곱 (Variance Score) : 0.542


In [138]:
# 5-fold 교차 검증으로 MSE 구함
neg_mse_scores = cross_val_score(elasticnet_best_model, X_train, y_train,
                                 scoring='neg_mean_squared_error',
                                 cv=5)

# neg_mse: 음수값이므로 -1을 곱해서 사용
# RMSE는 np.sqrt() 사용
rmse_scores = np.sqrt(-1 * neg_mse_scores)
avg_rmse = np.mean(rmse_scores)

# 평가 지표 출력
print('5 Folds 교차검증 개별 Negative MSE scores:', np.round(neg_mse_scores, 2))
print('5 Folds 교차검증 개별 RMSE scores:', np.round(rmse_scores, 2))
print('5 Folds 교차검증 평균 RMSE:', np.round(avg_rmse, 3))

5 Folds 교차검증 개별 Negative MSE scores: [-282.86 -460.99 -213.36 -252.4  -241.43]
5 Folds 교차검증 개별 RMSE scores: [16.82 21.47 14.61 15.89 15.54]
5 Folds 교차검증 평균 RMSE: 16.864
