# 1. 데이터 전처리 (Data Preprocessing)

## 목표
- KBO 타자 데이터 로드 및 정제
- 일별 데이터에서 선수별 일관성 지표 계산
- 결측치 처리 및 필요한 변수 생성

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

## 1.1 데이터 로드

In [6]:
data_path = '../data/'

regular_season = pd.read_csv(data_path + 'Regular_Season_Batter.csv')
pre_season = pd.read_csv(data_path + 'Pre_Season_Batter.csv')
day_by_day = pd.read_csv(data_path + 'Regular_Season_Batter_Day_by_Day_b4.csv')

print("Regular Season Shape:", regular_season.shape)
print("Pre Season Shape:", pre_season.shape)
print("Day by Day Shape:", day_by_day.shape)

Regular Season Shape: (2454, 29)
Pre Season Shape: (1393, 29)
Day by Day Shape: (112273, 20)


In [7]:
print("\n=== Regular Season 데이터 샘플 ===")
regular_season.head()


=== Regular Season 데이터 샘플 ===


Unnamed: 0,batter_id,batter_name,year,team,avg,G,AB,R,H,2B,3B,HR,TB,RBI,SB,CS,BB,HBP,SO,GDP,SLG,OBP,E,height/weight,year_born,position,career,starting_salary,OPS
0,0,가르시아,2018,LG,0.339,50,183,27,62,9,0,8,95,34,5,0,9,8,25,3,0.519,0.383,9,177cm/93kg,1985년 04월 12일,내야수(우투우타),쿠바 Ciego de Avila Maximo Gomez Baez(대),,0.902
1,1,강경학,2011,한화,0.0,2,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0.0,0.0,1,180cm/72kg,1992년 08월 11일,내야수(우투좌타),광주대성초-광주동성중-광주동성고,10000만원,0.0
2,1,강경학,2014,한화,0.221,41,86,11,19,2,3,1,30,7,0,0,13,2,28,1,0.349,0.337,6,180cm/72kg,1992년 08월 11일,내야수(우투좌타),광주대성초-광주동성중-광주동성고,10000만원,0.686
3,1,강경학,2015,한화,0.257,120,311,50,80,7,4,2,101,27,4,3,40,5,58,3,0.325,0.348,15,180cm/72kg,1992년 08월 11일,내야수(우투좌타),광주대성초-광주동성중-광주동성고,10000만원,0.673
4,1,강경학,2016,한화,0.158,46,101,16,16,3,2,1,26,7,0,0,8,2,30,5,0.257,0.232,7,180cm/72kg,1992년 08월 11일,내야수(우투좌타),광주대성초-광주동성중-광주동성고,10000만원,0.489


In [8]:
print("\n=== Day by Day 데이터 샘플 ===")
day_by_day.head()


=== Day by Day 데이터 샘플 ===


Unnamed: 0,batter_id,batter_name,date,opposing_team,avg1,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,HBP,SO,GDP,avg2,year
0,0,가르시아,3.24,NC,0.333,3,1,1,0,0,0,0,0,0,1,0,1,0,0.333,2018
1,0,가르시아,3.25,NC,0.0,4,0,0,0,0,0,0,0,0,0,0,1,0,0.143,2018
2,0,가르시아,3.27,넥센,0.2,5,0,1,0,0,0,0,0,0,0,0,0,0,0.167,2018
3,0,가르시아,3.28,넥센,0.2,5,1,1,0,0,0,1,0,0,0,0,0,0,0.176,2018
4,0,가르시아,3.29,넥센,0.25,4,0,1,0,0,0,3,0,0,0,0,0,1,0.19,2018


## 1.2 기본 정보 파싱 (키, 몸무게, 나이)

In [9]:
def parse_height_weight(df):
    df['height'] = df['height/weight'].str.extract(r'(\d+)cm').astype(float)
    df['weight'] = df['height/weight'].str.extract(r'(\d+)kg').astype(float)
    df['BMI'] = df['weight'] / ((df['height'] / 100) ** 2)
    return df

def parse_age(df):
    df['birth_year'] = df['year_born'].str.extract(r'(\d{4})').astype(float)
    df['age'] = df['year'] - df['birth_year'] + 1
    return df

regular_season = parse_height_weight(regular_season)
regular_season = parse_age(regular_season)

print("\n키, 몸무게, BMI, 나이 파싱 완료")
regular_season[['batter_name', 'year', 'height', 'weight', 'BMI', 'age']].head()


키, 몸무게, BMI, 나이 파싱 완료


Unnamed: 0,batter_name,year,height,weight,BMI,age
0,가르시아,2018,177.0,93.0,29.684956,34.0
1,강경학,2011,180.0,72.0,22.222222,20.0
2,강경학,2014,180.0,72.0,22.222222,23.0
3,강경학,2015,180.0,72.0,22.222222,24.0
4,강경학,2016,180.0,72.0,22.222222,25.0


## 1.3 팀명 통일 및 해체 구단 제외

In [10]:
print("\n=== 팀별 데이터 개수 ===")
print(regular_season['team'].value_counts())

regular_season = regular_season[regular_season['team'] != '현대'].copy()
regular_season['team'] = regular_season['team'].replace({'우리': '넥센', '히어로즈': '넥센'})

print("\n현대 구단 제외 및 팀명 통일 완료")
print(regular_season['team'].value_counts())


=== 팀별 데이터 개수 ===
team
한화      302
롯데      298
LG      294
두산      282
삼성      280
SK      254
KIA     254
넥센      172
NC      131
KT       85
현대       60
우리       13
히어로즈     13
해태       12
OB        3
쌍방울       1
Name: count, dtype: int64

현대 구단 제외 및 팀명 통일 완료
team
한화     302
롯데     298
LG     294
두산     282
삼성     280
SK     254
KIA    254
넥센     198
NC     131
KT      85
해태      12
OB       3
쌍방울      1
Name: count, dtype: int64


## 1.4 포지션 분류

In [11]:
def classify_position(position):
    if pd.isna(position):
        return 'Unknown'
    if '내야수' in position:
        return '내야수'
    elif '외야수' in position:
        return '외야수'
    elif '포수' in position:
        return '포수'
    else:
        return 'Other'

regular_season['position_group'] = regular_season['position'].apply(classify_position)

print("\n=== 포지션별 데이터 개수 ===")
print(regular_season['position_group'].value_counts())


=== 포지션별 데이터 개수 ===
position_group
내야수        814
Unknown    763
외야수        614
포수         203
Name: count, dtype: int64


## 1.5 경력 분류 (신인 vs 베테랑)

In [12]:
career_years = regular_season.groupby('batter_id')['year'].apply(lambda x: x.max() - x.min() + 1).reset_index()
career_years.columns = ['batter_id', 'career_years']

regular_season = regular_season.merge(career_years, on='batter_id', how='left')
regular_season['career_group'] = regular_season['career_years'].apply(lambda x: '5년차 이하' if x <= 5 else '6년차 이상')

print("\n=== 경력별 데이터 개수 ===")
print(regular_season['career_group'].value_counts())


=== 경력별 데이터 개수 ===
career_group
6년차 이상    2073
5년차 이하     321
Name: count, dtype: int64


## 1.6 장타형 vs 안타형 분류

In [13]:
regular_season['HR_per_AB'] = regular_season['HR'] / regular_season['AB'].replace(0, np.nan)

hr_threshold = regular_season.groupby('year')['HR_per_AB'].quantile(0.7).to_dict()
regular_season['batter_type'] = regular_season.apply(
    lambda row: '장타형' if row['HR_per_AB'] >= hr_threshold.get(row['year'], 0) else '안타형',
    axis=1
)

print("\n=== 타자 유형별 데이터 개수 ===")
print(regular_season['batter_type'].value_counts())


=== 타자 유형별 데이터 개수 ===
batter_type
안타형    1673
장타형     721
Name: count, dtype: int64


## 1.7 일별 데이터 전처리 및 일관성 지표 계산

In [14]:
day_by_day['daily_avg'] = day_by_day['H'] / day_by_day['AB'].replace(0, np.nan)
day_by_day = day_by_day.dropna(subset=['daily_avg'])

print("\n일별 타율 계산 완료")
print(f"총 {len(day_by_day)} 경기 기록")


일별 타율 계산 완료
총 100793 경기 기록


In [15]:
consistency_stats = day_by_day.groupby(['batter_id', 'year']).agg({
    'daily_avg': ['mean', 'std', 'count'],
    'H': 'sum',
    'AB': 'sum',
    'HR': 'sum'
}).reset_index()

consistency_stats.columns = ['batter_id', 'year', 'avg_daily_avg', 'std_daily_avg', 'games_count', 'total_H', 'total_AB', 'total_HR']

consistency_stats['CV'] = consistency_stats['std_daily_avg'] / consistency_stats['avg_daily_avg'].replace(0, np.nan)
consistency_stats['season_avg'] = consistency_stats['total_H'] / consistency_stats['total_AB'].replace(0, np.nan)

consistency_stats = consistency_stats[consistency_stats['games_count'] >= 30]

print("\n=== 일관성 지표 계산 완료 ===")
print(f"30경기 이상 출전 선수-시즌: {len(consistency_stats)}")
consistency_stats.head()


=== 일관성 지표 계산 완료 ===
30경기 이상 출전 선수-시즌: 1033


Unnamed: 0,batter_id,year,avg_daily_avg,std_daily_avg,games_count,total_H,total_AB,total_HR,CV,season_avg
0,0,2018,0.314667,0.280743,50,62,183,8,0.892193,0.338798
2,1,2014,0.221759,0.29851,36,19,86,1,1.3461,0.22093
3,1,2015,0.232039,0.272672,103,80,311,2,1.175115,0.257235
4,1,2016,0.136036,0.194028,37,16,101,1,1.426298,0.158416
5,1,2017,0.206589,0.349511,43,18,84,0,1.691817,0.214286


## 1.8 데이터 병합

In [16]:
final_data = regular_season.merge(
    consistency_stats[['batter_id', 'year', 'CV', 'std_daily_avg', 'games_count']], 
    on=['batter_id', 'year'], 
    how='inner'
)

final_data = final_data[final_data['AB'] >= 100].copy()

print("\n=== 최종 데이터 ===")
print(f"Shape: {final_data.shape}")
print(f"\n결측치 확인:")
print(final_data[['avg', 'OPS', 'SLG', 'OBP', 'CV', 'position_group', 'career_group']].isnull().sum())


=== 최종 데이터 ===
Shape: (903, 42)

결측치 확인:
avg               0
OPS               0
SLG               0
OBP               0
CV                0
position_group    0
career_group      0
dtype: int64


In [17]:
final_data = final_data.dropna(subset=['avg', 'OPS', 'SLG', 'OBP', 'CV', 'position_group'])

print("\n결측치 제거 후 데이터 Shape:", final_data.shape)
final_data.head()


결측치 제거 후 데이터 Shape: (903, 42)


Unnamed: 0,batter_id,batter_name,year,team,avg,G,AB,R,H,2B,3B,HR,TB,RBI,SB,CS,BB,HBP,SO,GDP,SLG,OBP,E,height/weight,year_born,position,career,starting_salary,OPS,height,weight,BMI,birth_year,age,position_group,career_years,career_group,HR_per_AB,batter_type,CV,std_daily_avg,games_count
0,0,가르시아,2018,LG,0.339,50,183,27,62,9,0,8,95,34,5,0,9,8,25,3,0.519,0.383,9,177cm/93kg,1985년 04월 12일,내야수(우투우타),쿠바 Ciego de Avila Maximo Gomez Baez(대),,0.902,177.0,93.0,29.684956,1985.0,34.0,내야수,1,5년차 이하,0.043716,장타형,0.892193,0.280743,50
2,1,강경학,2015,한화,0.257,120,311,50,80,7,4,2,101,27,4,3,40,5,58,3,0.325,0.348,15,180cm/72kg,1992년 08월 11일,내야수(우투좌타),광주대성초-광주동성중-광주동성고,10000만원,0.673,180.0,72.0,22.222222,1992.0,24.0,내야수,8,6년차 이상,0.006431,안타형,1.175115,0.272672,103
3,1,강경학,2016,한화,0.158,46,101,16,16,3,2,1,26,7,0,0,8,2,30,5,0.257,0.232,7,180cm/72kg,1992년 08월 11일,내야수(우투좌타),광주대성초-광주동성중-광주동성고,10000만원,0.489,180.0,72.0,22.222222,1992.0,25.0,내야수,8,6년차 이상,0.009901,안타형,1.426298,0.194028,37
5,1,강경학,2018,한화,0.278,77,245,42,68,11,1,5,96,27,6,3,38,4,59,7,0.392,0.382,2,180cm/72kg,1992년 08월 11일,내야수(우투좌타),광주대성초-광주동성중-광주동성고,10000만원,0.774,180.0,72.0,22.222222,1992.0,27.0,내야수,8,6년차 이상,0.020408,안타형,1.056978,0.271967,73
6,4,강민호,2005,롯데,0.243,104,214,20,52,11,2,2,73,18,1,4,5,2,56,7,0.341,0.267,9,185cm/100kg,1985년 08월 18일,포수(우투우타),제주신광초-포철중-포철공고-(국제디지털대)-롯데,9000만원,0.608,185.0,100.0,29.218408,1985.0,21.0,포수,15,6년차 이상,0.009346,안타형,1.378389,0.288523,93


## 1.9 데이터 저장

In [18]:
output_path = '../outputs/'

final_data.to_csv(output_path + 'preprocessed_data.csv', index=False, encoding='utf-8-sig')
consistency_stats.to_csv(output_path + 'consistency_stats.csv', index=False, encoding='utf-8-sig')

print("\n데이터 저장 완료!")
print(f"- {output_path}preprocessed_data.csv")
print(f"- {output_path}consistency_stats.csv")


데이터 저장 완료!
- ../outputs/preprocessed_data.csv
- ../outputs/consistency_stats.csv


## 1.10 기초 통계량 확인

In [19]:
print("\n=== 주요 변수 기초 통계량 ===")
final_data[['avg', 'OPS', 'HR', 'CV', 'std_daily_avg', 'age', 'BMI']].describe()


=== 주요 변수 기초 통계량 ===


Unnamed: 0,avg,OPS,HR,CV,std_daily_avg,age,BMI
count,903.0,903.0,903.0,903.0,903.0,903.0,903.0
mean,0.277224,0.765639,9.490587,1.056778,0.274056,28.355482,26.632997
std,0.039384,0.125,9.078353,0.228669,0.033471,4.130747,3.025589
min,0.124,0.376,0.0,0.635004,0.194028,19.0,20.145905
25%,0.251,0.684,3.0,0.878333,0.250834,25.0,24.6755
50%,0.278,0.761,7.0,1.009587,0.267433,28.0,26.52851
75%,0.305,0.846,14.0,1.207589,0.294401,31.0,28.293345
max,0.381,1.175,53.0,2.115343,0.400279,40.0,37.175784


In [20]:
print("\n=== 타자 유형별 평균 CV ===")
final_data.groupby('batter_type')['CV'].describe()


=== 타자 유형별 평균 CV ===


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
batter_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
안타형,514.0,1.109795,0.233389,0.635004,0.924306,1.08094,1.276105,2.115343
장타형,389.0,0.986725,0.20223,0.665845,0.836152,0.939176,1.101706,1.728254


In [21]:
print("\n=== 포지션별 평균 CV ===")
final_data.groupby('position_group')['CV'].describe()


=== 포지션별 평균 CV ===


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
position_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
내야수,467.0,1.064807,0.225028,0.635004,0.892081,1.025889,1.210318,1.938848
외야수,346.0,1.008578,0.213756,0.669442,0.837247,0.967296,1.160632,1.552492
포수,90.0,1.200418,0.239697,0.826354,1.033523,1.173029,1.342744,2.115343


In [22]:
print("\n=== 경력별 평균 CV ===")
final_data.groupby('career_group')['CV'].describe()


=== 경력별 평균 CV ===


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
career_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5년차 이하,95.0,1.102438,0.265138,0.690639,0.883327,1.080485,1.259218,2.115343
6년차 이상,808.0,1.051409,0.223567,0.635004,0.87788,1.003576,1.200928,1.938848
