In [255]:
# 데이터 처리
import pandas as pd
import numpy as np

# 시각화
import matplotlib.pyplot as plt
import seaborn as sns

# 사이킷런: 전처리
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

# 사이킷런: 모델 선택 및 평가
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

# 사이킷런: 대표 모델
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# 부스팅 계열 모델
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# 기타 (필요 시)
import warnings
warnings.filterwarnings('ignore')

# 시각화 설정
import matplotlib.pyplot as plt
import seaborn as sns

import platform

# Base look for every figure: seaborn's white-grid style.
sns.set_style("whitegrid")

# Font used for Korean text in figures. 'AppleGothic' is the macOS font and
# 'Malgun Gothic' is its Windows counterpart, so choose by operating system
# instead of hard-coding the macOS name. Every non-Windows platform keeps the
# previous value ('AppleGothic').
korean_font = 'Malgun Gothic' if platform.system() == 'Windows' else 'AppleGothic'
plt.rcParams['font.family'] = korean_font

# Keep minus signs from rendering as broken glyphs with the font above.
plt.rcParams['axes.unicode_minus'] = False

In [256]:
import os
from pathlib import Path

# Location of the input CSV. The default is the original author's local file;
# set the FATIGUE_CSV_PATH environment variable to point the notebook at a
# copy of the data on another machine without editing this cell.
FATIGUE_CSV_PATH = Path(
    os.environ.get(
        'FATIGUE_CSV_PATH',
        '/Users/joyongho/Desktop/code folder/fatigue_df_delta_ERA.csv',
    )
)

fatigue_df = pd.read_csv(FATIGUE_CSV_PATH)

In [257]:
# Discard every row that has a missing value in any column.
fatigue_df = fatigue_df.dropna(axis=0, how='any')

## 신체 기량 PCA

### 주성분 1개 설명력 : 59.59%

### 메모: GPT 조언 — PCA 결과와 별개로 SHAP 분석도 수행할 것

In [258]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import pandas as pd

# Skill metrics whose multicollinearity is being checked.
skill_features = ['ERA', 'WHIP', '직구_피안타율']

# Keep complete rows only, then add the constant column to the design matrix.
X_skill = fatigue_df[skill_features].dropna()
X_skill_const = add_constant(X_skill)

# One VIF per column of the design matrix (the constant column included).
design_matrix = X_skill_const.values
vif_skill = pd.DataFrame({
    '변수': X_skill_const.columns,
    'VIF': [
        variance_inflation_factor(design_matrix, col_pos)
        for col_pos in range(design_matrix.shape[1])
    ],
})

display(vif_skill)


Unnamed: 0,변수,VIF
0,const,26.029452
1,ERA,2.491147
2,WHIP,2.495516
3,직구_피안타율,1.006558


In [259]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Skill metrics to compress into a single component (complete rows only).
skill_vars = ['ERA', 'WHIP', '직구_피안타율']
X_skill = fatigue_df.loc[:, skill_vars].dropna()

# Standardize before PCA so each metric enters on the same scale.
scaler = StandardScaler()
X_scaled = scaler.fit(X_skill).transform(X_skill)

# Extract the first principal component only.
pca = PCA(n_components=1)
PC1 = pca.fit_transform(X_scaled)

# Write the scores back onto the rows they were computed from.
fatigue_df.loc[X_skill.index, '기량_PC1'] = PC1.ravel()


In [260]:
# Share of total variance captured by the first component of the fitted PCA.
pc1_ratio = pca.explained_variance_ratio_[0]
print(f"기량_PC1 설명력: {pc1_ratio:.2%}")


기량_PC1 설명력: 59.59%


In [261]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Skill metrics fed to the PCA (complete rows only).
skill_vars = ['ERA', 'WHIP', '직구_피안타율']
X_skill = fatigue_df.loc[:, skill_vars].dropna()

# Standardize so each metric enters the PCA on the same scale.
scaler = StandardScaler()
X_scaled = scaler.fit(X_skill).transform(X_skill)

# Two-component PCA on the standardized metrics.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Store each component's scores on the rows they were computed from.
for comp_pos, comp_col in enumerate(['기량_PC1', '기량_PC2']):
    fatigue_df.loc[X_skill.index, comp_col] = X_pca[:, comp_pos]

# Report per-component and cumulative explained variance.
pc1_ratio, pc2_ratio = pca.explained_variance_ratio_
print(f"PC1 설명력: {pc1_ratio:.2%}")
print(f"PC2 설명력: {pc2_ratio:.2%}")
print(f"총 설명력: {pca.explained_variance_ratio_.sum():.2%}")


PC1 설명력: 59.59%
PC2 설명력: 32.87%
총 설명력: 92.46%


# Z-score

In [262]:
#%pip install shap

### 외부환경지수 z-score, SHAP 가중치

In [263]:
# Encode the venue as a binary flag: 'Home' -> 1, 'Away' -> 0.
# A value with no entry in the mapping becomes NaN.
venue_codes = {'Home': 1, 'Away': 0}
fatigue_df['Venue_bin'] = fatigue_df['Venue'].map(venue_codes)

In [264]:
from sklearn.preprocessing import StandardScaler

# Environment variables that make up the external-environment index.
env_vars = ['누적이동거리', 'Temp', 'Venue_bin']

# z-score each environment variable.
scaler = StandardScaler()
env_z = scaler.fit_transform(fatigue_df[env_vars])

# Hold the standardized values in their own frame, one "<name>_z" column per
# variable, on a fresh 0..n-1 index.
env_z_df = pd.DataFrame(env_z, columns=[f"{col}_z" for col in env_vars])

# Attach the z-scores positionally. The index is reset so both frames share
# 0..n-1 labels, and any "<name>_z" columns left over from an earlier run of
# this cell are dropped first; without that, re-running the cell would append
# duplicate column names and break later lookups such as fatigue_df['Temp_z'].
fatigue_df = (
    fatigue_df
    .reset_index(drop=True)
    .drop(columns=env_z_df.columns, errors='ignore')
)
fatigue_df = pd.concat([fatigue_df, env_z_df], axis=1)

# Unweighted environment index: row-wise mean of the z-scores.
fatigue_df['환경지수_z'] = env_z_df.mean(axis=1)

In [265]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
import shap

# Target column and the standardized environment columns used as predictors.
target = 'delta_GSv2'
env_z_cols = [col + "_z" for col in env_vars]

X_env, y_env = fatigue_df[env_z_cols], fatigue_df[target]

# Hold out 20% of the rows, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X_env,
    y_env,
    test_size=0.2,
    random_state=42,
)

# Fit a silent CatBoost regressor on the training rows; the fitted model is
# the cell's displayed value.
model_env = CatBoostRegressor(random_state=42, verbose=0)
model_env.fit(X_train, y_train)


<catboost.core.CatBoostRegressor at 0x164fb30d0>

In [266]:
# SHAP values of the environment model on its training rows.
explainer = shap.Explainer(model_env)
shap_values = explainer(X_train)

# Per-feature importance: mean of the absolute SHAP values over the rows.
shap_importance = np.mean(np.abs(shap_values.values), axis=0)

# Tabulate the importances and normalize them into weights that sum to 1.
env_importance_df = pd.DataFrame(
    {'변수': env_z_cols, 'SHAP_중요도': shap_importance}
)
importance_total = env_importance_df['SHAP_중요도'].sum()
env_importance_df['가중치'] = env_importance_df['SHAP_중요도'] / importance_total

display(env_importance_df)


Unnamed: 0,변수,SHAP_중요도,가중치
0,누적이동거리_z,0.473886,0.420906
1,Temp_z,0.4885,0.433886
2,Venue_bin_z,0.163485,0.145208


In [267]:
# Environment index: SHAP-weighted sum of the standardized environment
# variables. The weights are read from env_importance_df (column '가중치',
# keyed by the z-score column name in '변수') instead of being retyped from
# its printed output, so they keep full precision and cannot go stale when the
# data or the model changes.
env_weights = env_importance_df.set_index('변수')['가중치']
fatigue_df['환경지수'] = sum(
    weight * fatigue_df[z_col] for z_col, weight in env_weights.items()
)

### 기량지수 z-score, SHAP 가중치

In [268]:
from sklearn.preprocessing import StandardScaler

# Skill metrics to standardize.
skill_vars = ['ERA', 'WHIP', '직구_피안타율']

# z-score each skill metric.
scaler = StandardScaler()
skill_z = scaler.fit_transform(fatigue_df[skill_vars])

# Hold the standardized values in their own frame, one "<name>_z" column per
# metric, on a fresh 0..n-1 index.
skill_z_df = pd.DataFrame(skill_z, columns=[f"{col}_z" for col in skill_vars])

# Attach the z-scores positionally. The index is reset so both frames share
# 0..n-1 labels, and any "<name>_z" columns left over from an earlier run of
# this cell are dropped first; without that, re-running the cell would append
# duplicate column names and break later lookups such as fatigue_df['ERA_z'].
fatigue_df = (
    fatigue_df
    .reset_index(drop=True)
    .drop(columns=skill_z_df.columns, errors='ignore')
)
fatigue_df = pd.concat([fatigue_df, skill_z_df], axis=1)


In [269]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
import shap

# Target column (example target) and the standardized skill columns used as
# predictors.
target = 'delta_GSv2'
skill_cols = ['ERA_z', 'WHIP_z', '직구_피안타율_z']

X, y = fatigue_df[skill_cols], fatigue_df[target]

# Hold out 20% of the rows, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a silent CatBoost regressor on the training rows; the fitted model is
# the cell's displayed value.
model = CatBoostRegressor(random_state=42, verbose=0)
model.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x162d78410>

In [270]:
import shap
import numpy as np
import pandas as pd

# SHAP values of the skill model on its training rows.
explainer = shap.Explainer(model)
shap_values = explainer(X_train)

# Per-feature importance: mean of the absolute SHAP values over the rows.
shap_importance = np.mean(np.abs(shap_values.values), axis=0)

# Tabulate the importances and normalize them into weights that sum to 1.
importance_df = pd.DataFrame(
    {'변수': X_train.columns, 'SHAP_중요도': shap_importance}
)
importance_total = importance_df['SHAP_중요도'].sum()
importance_df['가중치'] = importance_df['SHAP_중요도'] / importance_total

# Show the result.
display(importance_df)

Unnamed: 0,변수,SHAP_중요도,가중치
0,ERA_z,1.645411,0.341328
1,WHIP_z,2.635053,0.546622
2,직구_피안타율_z,0.54015,0.11205


In [271]:
# Skill index: SHAP-weighted sum of the standardized skill metrics. The
# weights are read from importance_df (column '가중치', keyed by the z-score
# column name in '변수') instead of being retyped from its printed output, so
# they keep full precision and cannot go stale when the data or the model
# changes.
# NOTE(review): ERA and WHIP are lower-is-better statistics, so a larger index
# presumably means weaker pitching; confirm the intended sign.
skill_weights = importance_df.set_index('변수')['가중치']
fatigue_df['기량지수'] = sum(
    weight * fatigue_df[z_col] for z_col, weight in skill_weights.items()
)