# 불러오기

In [1]:
# 데이터 처리
import pandas as pd
import numpy as np

# 시각화
import matplotlib.pyplot as plt
import seaborn as sns

# 사이킷런: 전처리
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

# 사이킷런: 모델 선택 및 평가
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

# 사이킷런: 대표 모델
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# 부스팅 계열 모델
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Miscellaneous (as needed)
import warnings
# Suppress every warning for the rest of the session: with no category given,
# the 'ignore' filter matches all Warning subclasses, including deprecation
# and pandas chained-assignment warnings.
warnings.filterwarnings('ignore')

# 시각화 설정
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: seaborn theme plus a font that can render Hangul labels.
import platform  # stdlib; imported here so this cell stays self-contained

# Apply the seaborn theme first: set_style rewrites font-related rcParams, so
# the font override below has to come after it.
sns.set_style("whitegrid")

# Choose the Hangul-capable font by operating system instead of assuming macOS.
# Any other platform keeps the previous 'AppleGothic' setting unchanged.
_hangul_font_by_os = {'Darwin': 'AppleGothic', 'Windows': 'Malgun Gothic'}
plt.rcParams['font.family'] = _hangul_font_by_os.get(platform.system(), 'AppleGothic')

# Draw negative tick labels with an ASCII hyphen; the Unicode minus glyph is
# missing from these fonts and would otherwise render as a broken box.
plt.rcParams['axes.unicode_minus'] = False


In [2]:
# Load the merged source CSV (absolute path on the author's machine) into fatigue_df.
fatigue_df = pd.read_csv(
    filepath_or_buffer='/Users/joyongho/Desktop/code folder/merged_최종최종_Total_IP.csv',
)

# 칼럼 수정

In [107]:
# Drop every row whose Total_IP is below 30 innings; rows at exactly 30 stay.
meets_innings_floor = fatigue_df['Total_IP'].ge(30)
fatigue_df = fatigue_df[meets_innings_floor]

In [109]:
# Remove the listed columns from fatigue_df in place: decision/venue/opponent
# metadata plus the per-pitch batting averages for every non-fastball pitch.
columns_to_remove = [
    'Dec', 'Stadium',
    '커터_피안타율', '커브_피안타율', '슬라_피안타율', '첸접_피안타율',
    '싱커_피안타율', '포크_피안타율', '너클_피안타율', '기타_피안타율',
    'BirthYear', 'Opp', 'GS', 'Time',
]
fatigue_df.drop(columns=columns_to_remove, inplace=True)

직구 피안타율 생성

In [110]:
# Discard rows where four-seam usage and two-seam usage are both 0, i.e. keep a
# row as soon as either usage value differs from 0.
has_four_seam = fatigue_df['포심_구사율'] != 0
has_two_seam = fatigue_df['투심_구사율'] != 0
fatigue_df = fatigue_df[has_four_seam | has_two_seam]

In [111]:
# Combined fastball batting average: the usage-weighted mean of the four-seam
# and two-seam batting averages.
four_seam_share = fatigue_df['포심_구사율']
two_seam_share = fatigue_df['투심_구사율']

# A pitch with zero usage must contribute nothing to the weighted sum. Without
# this mask, a missing (NaN) batting average on an unused pitch yields
# NaN * 0 = NaN and wipes out the value of the pitch that was actually thrown.
# Rows with a finite average are unaffected (x * 0 is already 0).
four_seam_term = (fatigue_df['포심_피안타율'] * four_seam_share).where(four_seam_share != 0, 0)
two_seam_term = (fatigue_df['투심_피안타율'] * two_seam_share).where(two_seam_share != 0, 0)

# Rows where both usages are 0 still divide 0 by 0 and stay NaN, as before.
fatigue_df['직구_피안타율'] = (four_seam_term + two_seam_term) / (four_seam_share + two_seam_share)

In [114]:
# The separate four-seam / two-seam batting averages are removed in place.
fatigue_df.drop(['포심_피안타율', '투심_피안타율'], axis='columns', inplace=True)

직구 구사율 생성

In [115]:
# Fastball usage = four-seam usage + two-seam usage (element-wise, NaN propagates).
fatigue_df['직구_구사율'] = fatigue_df['포심_구사율'].add(fatigue_df['투심_구사율'])

In [116]:
# Remove the two source usage columns in place now that their sum is stored.
fatigue_df.drop(['포심_구사율', '투심_구사율'], axis='columns', inplace=True)

변화구 구사율 생성

In [118]:
# Breaking-ball usage is taken as the remainder: 100 minus fastball usage.
fatigue_df['변화구_구사율'] = fatigue_df['직구_구사율'].rsub(100)

In [119]:
# Remove the individual non-fastball usage columns in place.
per_pitch_usage_columns = [
    '커터_구사율', '커브_구사율', '슬라_구사율',
    '첸접_구사율', '싱커_구사율',
    '포크_구사율', '너클_구사율', '기타_구사율',
]
fatigue_df.drop(columns=per_pitch_usage_columns, inplace=True)

In [None]:
# fatigue_df.to_csv('/Users/joyongho/Desktop/code folder/피로도_df.csv', index=False)

# WHIP 문제 해결 df + 거리 df 병합

In [150]:
# Load the two prepared CSVs: the WHIP-fixed fatigue table and the team travel
# distance table. Both are read as UTF-8 with BOM, and each has its date column
# parsed to datetime.
fatigue_df = pd.read_csv(
    filepath_or_buffer='/Users/joyongho/Desktop/code folder/피로도_df_보간완료.csv',
    encoding='utf-8-sig',
    parse_dates=['Date'],
)
distance_df = pd.read_csv(
    filepath_or_buffer='/Users/joyongho/Desktop/code folder/kbo_team_total_distances.csv',
    encoding='utf-8-sig',
    parse_dates=['날짜_정렬'],
)

In [152]:
# Give the distance table the same key column names that fatigue_df uses, so
# the two frames can be joined on them. Returns a new frame; distance_df itself
# is left untouched.
key_column_aliases = {'연도': 'Year', '구단': 'Team', '날짜_정렬': 'Date'}
distance_df_renamed = distance_df.rename(key_column_aliases, axis='columns')

In [153]:
# Attach the distance columns to every fatigue_df row. A left join keeps all
# fatigue_df rows; rows with no matching (Year, Team, Date) get NaN distances.
join_keys = ['Year', 'Team', 'Date']
merged_df = fatigue_df.merge(distance_df_renamed, on=join_keys, how='left')

In [161]:
# Notebook inspection only: display the column labels of the merged frame.
merged_df.columns

Index(['Name', 'Year', 'Team', 'Date', 'Role', 'ERA', 'WHIP', 'IP', 'NP', 'R',
       'ER', 'H', 'HR', 'SO', 'BB', 'HB', 'Venue', 'Temp', '연투 여부', '휴식일 수',
       '연투일', '2023WBC', '2021올림픽', '2023아시안게임', '2023아시아프로야구챔피언십',
       '2023 아시아 야구 선수권 대회', '2019프리미어12', 'PS_KS', 'PS_PO', 'PS_SP', 'PS_V',
       'PS_WC', 'Height', 'Weight', 'Age', '총이동거리', '전체구속', '2Seam', '4Seam',
       'Cutter', 'Curve', 'Slider', 'Changeup', 'Sinker', 'Forkball',
       'Knuckle', 'Other', '누적_IP', 'FIP', 'GSv2', 'avg_GSv2', 'delta_GSv2',
       'Total_IP', '직구_피안타율', '직구_구사율', '변화구_구사율', 'WHIP_missing_flag', '날짜',
       '구장', '월', '일', '이동거리(km)', '누적이동거리(km)'],
      dtype='object')

In [162]:
# Drop the ballpark / month / day / date columns that came in from the distance
# table; merged_df is rebound to the resulting new frame.
distance_helper_columns = ['구장', '월', '일', '날짜']
merged_df = merged_df.drop(columns=distance_helper_columns)

In [165]:
# Continue under the fatigue_df name. This only rebinds the name: fatigue_df
# and merged_df now refer to the same DataFrame object, not to a copy.
fatigue_df = merged_df

In [None]:
# fatigue_df.to_csv('/Users/joyongho/Desktop/code folder/fatigue_df_거리완료.csv', index=False)