In [3]:
### 모듈 및 데이터 로딩
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
# 데이터 로딩
file = '../0. Data/2. output/1. Train_data.csv'
data = pd.read_csv(file)

In [4]:
### 모델링 및  Train 평가

# 중복값 제거
data.drop_duplicates(inplace=True)


# 피처와 타겟 분리
X = data.drop(columns=['scale_pv'])
y = data['scale_pv']


# 학습 데이터와 검증 데이터 분리
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)


# 데이터 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)


# 타겟 스케일링
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_valid_scaled = target_scaler.transform(y_valid.values.reshape(-1, 1))


### Grid Search를 사용한 최적의 파라미터 찾기

# Linear Regression - GridSearchCV
lr_param_grid = {
    'fit_intercept': [True, False],
    'copy_X': [True, False],
    'n_jobs': [None, -1],
    'positive': [True, False]
}
lr_grid_search = GridSearchCV(LinearRegression(), lr_param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
lr_grid_search.fit(X_train_scaled, y_train_scaled.ravel())

print(f"Best parameters for Linear Regression: {lr_grid_search.best_params_}")
print(f"Best CV score (MAE) for Linear Regression: {-lr_grid_search.best_score_}")

# Random Forest - GridSearchCV
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}
rf_grid_search = GridSearchCV(RandomForestRegressor(random_state=42), rf_param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
rf_grid_search.fit(X_train_scaled, y_train_scaled.ravel())

print(f"Best parameters for Random Forest: {rf_grid_search.best_params_}")
print(f"Best CV score (MAE) for Random Forest: {-rf_grid_search.best_score_}")

# LightGBM - GridSearchCV
lgb_param_grid = {
    'num_leaves': [31, 50, 100],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [50, 100, 200],
    'max_depth': [-1, 10, 20, 30]
}
lgb_grid_search = GridSearchCV(lgb.LGBMRegressor(random_state=42), lgb_param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
lgb_grid_search.fit(X_train_scaled, y_train_scaled.ravel())

print(f"Best parameters for LightGBM: {lgb_grid_search.best_params_}")
print(f"Best CV score (MAE) for LightGBM: {-lgb_grid_search.best_score_}")

### 최적의 모델로 학습 및 평가

# 최적의 파라미터로 모델 재학습
lr_best_model = lr_grid_search.best_estimator_
rf_best_model = rf_grid_search.best_estimator_
lgb_best_model = lgb_grid_search.best_estimator_


# 모델 학습 및 평가 함수
def train_and_evaluate_model(model, X_train, X_valid, y_train_scaled, y_valid_scaled, target_scaler):
    model.fit(X_train, y_train_scaled.ravel())
    y_train_pred_scaled = model.predict(X_train)
    y_valid_pred_scaled = model.predict(X_valid)

    # 역스케일링
    y_train_pred = target_scaler.inverse_transform(y_train_pred_scaled.reshape(-1, 1))
    y_valid_pred = target_scaler.inverse_transform(y_valid_pred_scaled.reshape(-1, 1))
    y_train_original = target_scaler.inverse_transform(y_train_scaled)
    y_valid_original = target_scaler.inverse_transform(y_valid_scaled)

    train_mae = mean_absolute_error(y_train_original, y_train_pred)
    valid_mae = mean_absolute_error(y_valid_original, y_valid_pred)
    train_mape = mean_absolute_percentage_error(y_train_original, y_train_pred)
    valid_mape = mean_absolute_percentage_error(y_valid_original, y_valid_pred)

    return train_mae, valid_mae, train_mape, valid_mape, y_train_pred, y_valid_pred


# 모델 학습 및 평가

# Multiple Regression
lr_train_mae, lr_valid_mae, lr_train_mape, lr_valid_mape, lr_y_train_pred, lr_y_valid_pred = train_and_evaluate_model(lr_best_model, X_train_scaled, X_valid_scaled, y_train_scaled, y_valid_scaled, target_scaler)

# Random Forest
rf_train_mae, rf_valid_mae, rf_train_mape, rf_valid_mape, rf_y_train_pred, rf_y_valid_pred = train_and_evaluate_model(rf_best_model, X_train_scaled, X_valid_scaled, y_train_scaled, y_valid_scaled, target_scaler)

# LightGBM
lgb_train_mae, lgb_valid_mae, lgb_train_mape, lgb_valid_mape, lgb_y_train_pred, lgb_y_valid_pred = train_and_evaluate_model(lgb_best_model, X_train_scaled, X_valid_scaled, y_train_scaled, y_valid_scaled, target_scaler)

# 결과 출력
print(f"Linear Regression - Train MAE: {lr_train_mae}, Train MAPE: {lr_train_mape*100}")
print(f"Linear Regression - Valid MAE: {lr_valid_mae}, Valid MAPE: {lr_valid_mape*100}")
print()
print(f"Random Forest - Train MAE: {rf_train_mae}, Train MAPE: {rf_train_mape*100}")
print(f"Random Forest - Valid MAE: {rf_valid_mae}, Valid MAPE: {rf_valid_mape*100}")
print()
print(f"LightGBM - Train MAE: {lgb_train_mae}, Train MAPE: {lgb_train_mape*100}")
print(f"LightGBM - Valid MAE: {lgb_valid_mae}, Valid MAPE: {lgb_valid_mape*100}")

Best parameters for Linear Regression: {'copy_X': True, 'fit_intercept': False, 'n_jobs': None, 'positive': False}
Best CV score (MAE) for Linear Regression: 0.6614026279955656
Best parameters for Random Forest: {'bootstrap': False, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score (MAE) for Random Forest: 0.5481775349382733
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001859 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 214
[LightGBM] [Info] Number of data points in the train set: 26844, number of used features: 4
[LightGBM] [Info] Start training from score -0.000000
Best parameters for LightGBM: {'learning_rate': 0.1, 'max_depth': 30, 'n_estimators': 200, 'num_leaves': 100}
Best CV score (MAE) for LightGBM: 0.583842624248817
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000500 se

In [9]:
import joblib


# 모델 및 스케일러 저장
scaler_data = {
    'scaler': scaler,
    'feature_names': X.columns.tolist(),
    'target_scaler': target_scaler
}
joblib.dump(lr_best_model, '../2. Modeling/model/psG_lr_model.pkl')
joblib.dump(scaler_data, '../2. Modeling/model/psG_lr_scaler.pkl')

print("Model saved successfully!")

Model saved successfully!


In [5]:
### Test 평가

# Test_data 로딩
test_file = '../0. Data/2. output/0. Test_data.csv'
test_data = pd.read_csv(test_file)


# 피처와 타겟 분리
X_test_final = test_data.drop(columns=['scale_pv'])
y_test_final = test_data['scale_pv']


# 테스트 데이터 스케일링
X_test_final_scaled = scaler.transform(X_test_final)
y_test_final_scaled = target_scaler.transform(y_test_final.values.reshape(-1, 1))


# 최종 테스트 데이터 예측 및 평가
def final_evaluate_model(model, X_test_scaled, y_test_scaled, target_scaler):
    y_test_pred_scaled = model.predict(X_test_scaled)

    # 역스케일링
    y_test_pred = target_scaler.inverse_transform(y_test_pred_scaled.reshape(-1, 1))
    y_test_original = target_scaler.inverse_transform(y_test_scaled)

    test_mae = mean_absolute_error(y_test_original, y_test_pred)
    test_mape = mean_absolute_percentage_error(y_test_original, y_test_pred)

    return test_mae, test_mape, y_test_pred


# 최종 평가 결과
lr_test_mae_final, lr_test_mape_final, lr_y_test_pred_final = final_evaluate_model(lr_best_model, X_test_final_scaled, y_test_final_scaled, target_scaler)
rf_test_mae_final, rf_test_mape_final, rf_y_test_pred_final = final_evaluate_model(rf_best_model, X_test_final_scaled, y_test_final_scaled, target_scaler)
lgb_test_mae_final, lgb_test_mape_final, lgb_y_test_pred_final = final_evaluate_model(lgb_best_model, X_test_final_scaled, y_test_final_scaled, target_scaler)

print(f"Final Test - Linear Regression MAE: {lr_test_mae_final}, MAPE: {lr_test_mape_final*100}")
print(f"Final Test - Random Forest MAE: {rf_test_mae_final}, MAPE: {rf_test_mape_final*100}")
print(f"Final Test - LightGBM MAE: {lgb_test_mae_final}, MAPE: {lgb_test_mape_final*100}")

Final Test - Linear Regression MAE: 0.02367770578477322, MAPE: 0.7737255505206652
Final Test - Random Forest MAE: 0.02656589468334562, MAPE: 0.8697699337806791
Final Test - LightGBM MAE: 0.02589248745148354, MAPE: 0.847608143528696
