In [1]:
### 모듈 및 데이터 로딩
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tpot import TPOTRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
# Path of the prepared training CSV, relative to the notebook's working directory.
file = '../0. Data/2. output/1. Train_data.csv'

# Read the whole training table into memory; later cells split it into
# features and the target column.
data = pd.read_csv(filepath_or_buffer=file)

In [2]:
### Modeling and train/validation evaluation
#
# Steps: de-duplicate -> split features/target -> hold out a validation set ->
# standardize features and target -> TPOT search -> report MAE/MAPE in the
# target's original units -> print and export the selected pipeline.


# Settings a reader is likely to tune, named once instead of inlined.
TARGET_COL = 'scale_pv'
VALID_SIZE = 0.2
RANDOM_STATE = 42


# Remove duplicate rows. The result is rebound to `data` instead of mutating
# the frame with `inplace=True`; the resulting rows are the same.
data = data.drop_duplicates()


# Split features and target
X = data.drop(columns=[TARGET_COL])
y = data[TARGET_COL]


# Hold out a validation set (seeded so the split is reproducible)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=VALID_SIZE, random_state=RANDOM_STATE
)


# Feature scaling: statistics are fitted on the training split only and then
# applied unchanged to the validation split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)


# Target scaling: StandardScaler expects a 2-D array, hence the reshape to a
# single column. `y_valid_scaled` is not needed for the metrics below but is
# still defined so that code relying on it keeps working.
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.to_numpy().reshape(-1, 1))
y_valid_scaled = target_scaler.transform(y_valid.to_numpy().reshape(-1, 1))


# Configure and run the TPOT search on the scaled data. `ravel()` flattens the
# scaled target back to 1-D for `fit`.
tpot = TPOTRegressor(
    verbosity=2,
    generations=5,
    population_size=50,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
tpot.fit(X_train_scaled, y_train_scaled.ravel())


# Predictions are produced in scaled target units
y_train_pred_scaled = tpot.predict(X_train_scaled)
y_valid_pred_scaled = tpot.predict(X_valid_scaled)


# Bring predictions back to the original target units
y_train_pred = target_scaler.inverse_transform(y_train_pred_scaled.reshape(-1, 1))
y_valid_pred = target_scaler.inverse_transform(y_valid_pred_scaled.reshape(-1, 1))

# Ground truth is taken straight from the unscaled labels. Scaling them and
# then inverse-scaling them again only reproduces the same values with extra
# floating-point error, so that round trip is skipped. The (n, 1) shape is
# kept to match the prediction arrays.
y_train_original = y_train.to_numpy().reshape(-1, 1)
y_valid_original = y_valid.to_numpy().reshape(-1, 1)


# Evaluation metrics in original target units
train_mae = mean_absolute_error(y_train_original, y_train_pred)
valid_mae = mean_absolute_error(y_valid_original, y_valid_pred)
train_mape = mean_absolute_percentage_error(y_train_original, y_train_pred)
valid_mape = mean_absolute_percentage_error(y_valid_original, y_valid_pred)

# MAPE is returned as a fraction; it is multiplied by 100 for display.
print(f"Train MAE: {train_mae}")
print(f"Valid MAE: {valid_mae}")
print(f"Train MAPE: {train_mape * 100}")
print(f"Valid MAPE: {valid_mape * 100}")


# Show the best pipeline found and export it as a Python script
print(tpot.fitted_pipeline_)
tpot.export('2.1 scaling_tpot_best_pipeline.py')

Version 0.12.1 of tpot is outdated. Version 0.12.2 was released Friday February 23, 2024.


                                                                              
Generation 1 - Current best internal CV score: -0.6754608515645353
                                                                              
Generation 2 - Current best internal CV score: -0.6754608515645353
                                                                              
Generation 3 - Current best internal CV score: -0.6754608515645353
                                                                              
Generation 4 - Current best internal CV score: -0.6706113498202649
                                                                              
Generation 5 - Current best internal CV score: -0.6706113498202649
                                                                              
Best pipeline: RandomForestRegressor(input_matrix, bootstrap=False, max_features=0.5, min_samples_leaf=1, min_samples_split=8, n_estimators=100)
Train MAE: 0.012408305003867139
Valid MAE: 0.0

In [5]:
### Test evaluation
#
# Relies on `scaler`, `target_scaler` and `tpot` created by the modeling cell;
# that cell must have been run first in the current kernel.


# Load the test data
test_file = '../0. Data/2. output/0. Test_data.csv'  # adjust to the appropriate path
test_data = pd.read_csv(test_file)


# Split features and target
X_test_final = test_data.drop(columns=['scale_pv'])
y_test_final = test_data['scale_pv']


# Scale the test data with the already-fitted scalers (transform only, no
# refitting). `y_test_final_scaled` is not needed for the metrics below but is
# still defined so that code relying on it keeps working.
X_test_final_scaled = scaler.transform(X_test_final)
y_test_final_scaled = target_scaler.transform(y_test_final.to_numpy().reshape(-1, 1))


# Predict on the test set (predictions are in scaled target units)
y_test_pred_scaled = tpot.predict(X_test_final_scaled)


# Bring predictions back to the original target units
y_test_pred = target_scaler.inverse_transform(y_test_pred_scaled.reshape(-1, 1))

# Ground truth is taken straight from the unscaled labels. Scaling them and
# then inverse-scaling them again only reproduces the same values with extra
# floating-point error, so that round trip is skipped. The (n, 1) shape is
# kept to match `y_test_pred`.
y_test_original = y_test_final.to_numpy().reshape(-1, 1)


# Evaluation metrics in original target units
test_mae = mean_absolute_error(y_test_original, y_test_pred)
test_mape = mean_absolute_percentage_error(y_test_original, y_test_pred)


# MAPE is returned as a fraction; it is multiplied by 100 for display.
print(f"Final Test MAE: {test_mae}")
print(f"Final Test MAPE: {test_mape * 100}")

Final Test MAE: 0.026272391289611938
Final Test MAPE: 0.8601835680490993
