In [1]:
### 모듈 및 데이터 로딩
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
# Path to the training CSV, given relative to the notebook's working directory.
file = '../0. Data/2. output/1. Train_data.csv'
# Load the full training table; later cells read it through the name `data`.
data = pd.read_csv(file)

In [2]:
### 모델링 및 Train 평가


# Remove exact duplicate rows.
# `data` is rebound to the de-duplicated copy instead of using `inplace=True`:
# the frame object created by the loading cell is never mutated, so this cell
# can be re-run safely and the discouraged in-place pandas path is avoided.
data = data.drop_duplicates()


# Separate the regression target ('scale_pv') from the remaining feature columns.
X = data.drop(columns=['scale_pv'])
y = data['scale_pv']


# Hold out 20% of the rows for validation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)


# 모델 학습 및 평가 함수
def train_and_evaluate_model(model, X_train, X_valid, y_train, y_valid):
    """Fit ``model`` on the training split and score it on both splits.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place on ``(X_train, y_train)``.
    X_train, X_valid : feature sets for the training and validation splits.
    y_train, y_valid : targets matching ``X_train`` and ``X_valid``.

    Returns
    -------
    tuple
        ``(train_mae, valid_mae, train_mape, valid_mape,
        y_train_pred, y_valid_pred)`` -- MAE and MAPE as returned by
        ``mean_absolute_error`` / ``mean_absolute_percentage_error``,
        followed by the raw predictions for each split.
    """
    model.fit(X_train, y_train)

    # Predict the training split first, then the validation split.
    train_pred, valid_pred = (model.predict(features) for features in (X_train, X_valid))

    # (truth, prediction) pairs in the order the scores are reported.
    scored_pairs = ((y_train, train_pred), (y_valid, valid_pred))
    mae_scores = [mean_absolute_error(truth, pred) for truth, pred in scored_pairs]
    mape_scores = [mean_absolute_percentage_error(truth, pred) for truth, pred in scored_pairs]

    return (*mae_scores, *mape_scores, train_pred, valid_pred)


# Fit each candidate regressor on the same train/validation split and keep
# its scores and predictions under model-prefixed names.

# Multiple linear regression
lr_model = LinearRegression()
(
    lr_train_mae, lr_valid_mae,
    lr_train_mape, lr_valid_mape,
    lr_y_train_pred, lr_y_valid_pred,
) = train_and_evaluate_model(lr_model, X_train, X_valid, y_train, y_valid)

# Random forest (100 trees, fixed seed, n_jobs=-1)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
(
    rf_train_mae, rf_valid_mae,
    rf_train_mape, rf_valid_mape,
    rf_y_train_pred, rf_y_valid_pred,
) = train_and_evaluate_model(rf_model, X_train, X_valid, y_train, y_valid)

# LightGBM (100 boosting rounds, fixed seed)
lgb_model = lgb.LGBMRegressor(n_estimators=100, random_state=42)
(
    lgb_train_mae, lgb_valid_mae,
    lgb_train_mape, lgb_valid_mape,
    lgb_y_train_pred, lgb_y_valid_pred,
) = train_and_evaluate_model(lgb_model, X_train, X_valid, y_train, y_valid)


# Report: a Train line and a Valid line per model, with an empty line between
# models. MAPE values are multiplied by 100 before being printed.
print(
    f"Linear Regression - Train MAE: {lr_train_mae}, Train MAPE: {lr_train_mape*100}",
    f"Linear Regression - Valid MAE: {lr_valid_mae}, Valid MAPE: {lr_valid_mape*100}",
    "",
    f"Random Forest - Train MAE: {rf_train_mae}, Train MAPE: {rf_train_mape*100}",
    f"Random Forest - Valid MAE: {rf_valid_mae}, Valid MAPE: {rf_valid_mape*100}",
    "",
    f"LightGBM - Train MAE: {lgb_train_mae}, Train MAPE: {lgb_train_mape*100}",
    f"LightGBM - Valid MAE: {lgb_valid_mae}, Valid MAPE: {lgb_valid_mape*100}",
    sep="\n",
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000474 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 214
[LightGBM] [Info] Number of data points in the train set: 26844, number of used features: 4
[LightGBM] [Info] Start training from score 3.041687
Linear Regression - Train MAE: 0.026006620096935367, Train MAPE: 0.8563967117358431
Linear Regression - Valid MAE: 0.026538280288540907, Valid MAPE: 0.8754376532083852

Random Forest - Train MAE: 0.009557685622079395, Train MAPE: 0.3146545374290152
Random Forest - Valid MAE: 0.022573473714241007, Valid MAPE: 0.7439481077144354

LightGBM - Train MAE: 0.023162779940527782, Train MAPE: 0.7625622551093778
LightGBM - Valid MAE: 0.02434221997550128, Valid MAPE: 0.8020035577248755


In [3]:
### Test 평가

# Read the held-out test table (path is relative to the notebook's working directory).
test_file = '../0. Data/2. output/0. Test_data.csv'
test_data = pd.read_csv(test_file)


# Pull out the target column, then keep every other column as the feature set.
y_test_final = test_data['scale_pv']
X_test_final = test_data.drop(columns=['scale_pv'])


# 최종 테스트 데이터 예측 및 평가
def final_evaluate_model(model, X_test, y_test):
    """Score an already-fitted ``model`` on a test set.

    Parameters
    ----------
    model : object exposing ``predict(X)``; it is not re-fitted here.
    X_test : features to predict on.
    y_test : true targets matching ``X_test``.

    Returns
    -------
    tuple
        ``(test_mae, test_mape, y_test_pred)`` -- the values returned by
        ``mean_absolute_error`` and ``mean_absolute_percentage_error``,
        followed by the raw predictions.
    """
    predictions = model.predict(X_test)
    return (
        mean_absolute_error(y_test, predictions),
        mean_absolute_percentage_error(y_test, predictions),
        predictions,
    )


# Score the three already-fitted models on the held-out test set.
(
    lr_test_mae_final,
    lr_test_mape_final,
    lr_y_test_pred_final,
) = final_evaluate_model(lr_model, X_test_final, y_test_final)
(
    rf_test_mae_final,
    rf_test_mape_final,
    rf_y_test_pred_final,
) = final_evaluate_model(rf_model, X_test_final, y_test_final)
(
    lgb_test_mae_final,
    lgb_test_mape_final,
    lgb_y_test_pred_final,
) = final_evaluate_model(lgb_model, X_test_final, y_test_final)

# One summary line per model; MAPE values are multiplied by 100 before printing.
print(
    f"Final Test - Linear Regression MAE: {lr_test_mae_final}, MAPE: {lr_test_mape_final*100}",
    f"Final Test - Random Forest MAE: {rf_test_mae_final}, MAPE: {rf_test_mape_final*100}",
    f"Final Test - LightGBM MAE: {lgb_test_mae_final}, MAPE: {lgb_test_mape_final*100}",
    sep="\n",
)

Final Test - Linear Regression MAE: 0.02367770578477326, MAPE: 0.7737255505206666
Final Test - Random Forest MAE: 0.026774825057899774, MAPE: 0.8766874021924751
Final Test - LightGBM MAE: 0.02423126942205223, MAPE: 0.7930422335720423
