In [1]:
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this hides every warning category for the rest of the session,
# including pandas/sklearn deprecation and convergence warnings — confirm that
# is intended rather than filtering specific categories.
warnings.filterwarnings('ignore')

# Core libraries: data splitting, scaling, dimensionality reduction, metrics
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Model libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
import xgboost as xgb

# Evaluation and visualization
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor

# SHAP (optional): the notebook keeps running without it, and SHAP_AVAILABLE
# records whether the import succeeded.
try:
    import shap
    SHAP_AVAILABLE = True
except ImportError:
    print("SHAP 라이브러리가 설치되지 않았습니다. pip install shap 실행하세요.")
    SHAP_AVAILABLE = False

# Font setup. NOTE(review): the original comment called this "Korean font
# setup", but the family selected here is 'DejaVu Sans' — confirm that Hangul
# labels actually render with it.
plt.rcParams['font.family'] = 'DejaVu Sans'
# Render the minus sign with a plain ASCII hyphen instead of the Unicode minus.
plt.rcParams['axes.unicode_minus'] = False

In [2]:
# Load the merged dataset from the working directory into `data`.
data = pd.read_csv('df_merged.csv')

In [None]:
def hybrid_outlier_treatment(df, preserve_vars=None, winsorize_vars=None,
                             winsor_limits=(0.025, 0.975), iqr_factor=1.5):
    """Apply a three-group outlier treatment and return a new DataFrame.

    * Group 1 (``preserve_vars``): values are left untouched and no flag
      column is created for them.
    * Group 2 (``winsorize_vars``): values are clipped to the
      ``winsor_limits`` quantiles of the column.
    * Group 3 (every other numeric column): values are kept as-is and a
      0/1 ``<col>_outlier`` column is appended, marking values outside
      ``[Q1 - iqr_factor*IQR, Q3 + iqr_factor*IQR]``.

    The input frame is never modified; all changes are made on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to process. Non-numeric columns are carried through unchanged
        and are never flagged.
    preserve_vars : list of str, optional
        Columns whose outliers are preserved. Defaults to
        ``['Temp', 'VPD', 'CO2ppm']``.
    winsorize_vars : list of str, optional
        Columns to winsorize. Defaults to
        ``['Fv-Fm', 'DF_abs', 'Leaf_TPC', 'Root_TPC', 'Humid']``.
    winsor_limits : tuple of float, optional
        ``(lower, upper)`` quantiles used as clipping bounds. The default
        keeps the central 95% of each column.
    iqr_factor : float, optional
        Multiplier of the IQR used for the flagging fences.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with winsorized columns, followed by one
        ``<col>_outlier`` integer column per flagged numeric column.

    Raises
    ------
    KeyError
        If any column named in ``winsorize_vars`` is absent from ``df``.
    """
    # Group 1: keep outliers (author's rationale: extreme climate conditions
    # must stay represented in the data).
    if preserve_vars is None:
        preserve_vars = ['Temp', 'VPD', 'CO2ppm']
    # Group 2: winsorize (author's note: these columns have 11-17% outliers).
    if winsorize_vars is None:
        winsorize_vars = ['Fv-Fm', 'DF_abs', 'Leaf_TPC', 'Root_TPC', 'Humid']
    preserve_vars = list(preserve_vars)
    winsorize_vars = list(winsorize_vars)

    # Fail up front with the full list of missing columns instead of a bare
    # KeyError on whichever column happens to be looked up first.
    missing = [col for col in winsorize_vars if col not in df.columns]
    if missing:
        raise KeyError(f"Columns to winsorize not found in DataFrame: {missing}")

    lower_q, upper_q = winsor_limits

    # Work on a copy so the caller's frame is not clipped as a side effect.
    result = df.copy()

    # Group 3: flag only (author's note: fewer than 5% outliers). String-typed
    # and other non-numeric columns are excluded by the dtype selection.
    excluded = set(preserve_vars) | set(winsorize_vars)
    num_cols = result.select_dtypes(include='number').columns.tolist()
    flag_vars = [col for col in num_cols if col not in excluded]

    # Winsorization: clip each column to its own quantile bounds.
    for col in winsorize_vars:
        lower = result[col].quantile(lower_q)
        upper = result[col].quantile(upper_q)
        result[col] = result[col].clip(lower=lower, upper=upper)

    # Outlier flagging: original values are kept, only indicator columns are
    # added. Collect them first and concatenate once to avoid growing a frame
    # column by column.
    flag_columns = []
    for col in flag_vars:
        q1 = result[col].quantile(0.25)
        q3 = result[col].quantile(0.75)
        iqr = q3 - q1
        is_outlier = ((result[col] < q1 - iqr_factor * iqr) |
                      (result[col] > q3 + iqr_factor * iqr))
        flag_columns.append(is_outlier.astype(int).rename(f'{col}_outlier'))

    if not flag_columns:
        return result
    return pd.concat([result, *flag_columns], axis=1)

In [4]:
# Run the hybrid outlier treatment on the loaded data; the result carries the
# treated columns plus the appended `<col>_outlier` flag columns.
df_processed = hybrid_outlier_treatment(data)

In [5]:
# Display the processed frame as the cell output.
# NOTE(review): this renders the whole frame; `df_processed.head()` would keep
# the saved output smaller if only a preview is needed.
df_processed

Unnamed: 0,month,CO2ppm,Temp,Humid,VPD,Chl_a,Chl_b,TChl,Car,Chl_a_b,...,ABS-RC_outlier,Dio-RC_outlier,Tro-RC_outlier,Eto-RC_outlier,PI_abs_outlier,SFI_abs_outlier,Leaf_ExtractionYield_outlier,Root_ExtractionYield_outlier,Leaf_TFC_outlier,Root_TFC_outlier
0,5,381.681033,16.918639,81.750838,1.532512,8.79,2.22,11.00,2.97,3.97,...,0,0,0,0,0,0,0,1,0,0
1,5,374.463441,16.922124,81.750838,1.532868,8.99,2.56,11.55,3.09,3.52,...,0,0,0,0,0,0,0,1,0,0
2,5,371.850683,16.930256,81.750838,1.534584,9.66,2.44,12.10,3.11,3.96,...,0,0,0,0,0,0,0,1,0,0
3,5,400.475202,16.921511,81.750838,1.534512,9.33,2.45,11.79,3.13,3.80,...,0,0,0,0,0,0,0,1,0,0
4,5,381.360788,16.921323,81.750838,1.531475,10.53,2.58,13.11,3.37,4.08,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400,9,1208.463000,25.901000,60.192000,2.560000,2.64,0.52,3.15,1.07,5.09,...,0,1,0,0,0,0,0,0,0,0
401,9,1211.911000,25.896000,60.254000,2.559000,4.74,1.12,5.86,1.53,4.22,...,0,1,0,0,0,0,0,0,0,0
402,9,1206.015000,25.923000,59.200486,2.565000,2.52,0.08,2.60,1.38,31.49,...,0,1,0,0,0,0,0,0,0,0
403,9,1225.166000,25.890000,59.200486,2.571000,2.63,0.39,3.02,1.17,6.73,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# List every column of the processed frame, including the `_outlier` flags.
df_processed.columns

Index(['month', 'CO2ppm', 'Temp', 'Humid', 'VPD', 'Chl_a', 'Chl_b', 'TChl',
       'Car', 'Chl_a_b', 'TCh-Car', 'ABS-RC', 'Dio-RC', 'Tro-RC', 'Eto-RC',
       'PI_abs', 'DF_abs', 'SFI_abs', 'Fv-Fm', 'Leaf_ExtractionYield',
       'Root_ExtractionYield', 'Leaf_TPC', 'Root_TPC', 'Leaf_TFC', 'Root_TFC',
       'scenario', 'month_outlier', 'Chl_a_outlier', 'Chl_b_outlier',
       'TChl_outlier', 'Car_outlier', 'Chl_a_b_outlier', 'TCh-Car_outlier',
       'ABS-RC_outlier', 'Dio-RC_outlier', 'Tro-RC_outlier', 'Eto-RC_outlier',
       'PI_abs_outlier', 'SFI_abs_outlier', 'Leaf_ExtractionYield_outlier',
       'Root_ExtractionYield_outlier', 'Leaf_TFC_outlier', 'Root_TFC_outlier'],
      dtype='object')

In [8]:
# Save the processed frame to CSV; index=False leaves the row index out of
# the file.
df_processed.to_csv('df_processed.csv', index=False)