In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

warnings.filterwarnings("ignore")

In [2]:
def get_font_family():
    """Return the name of a Korean-capable font family for the current OS.

    Returns:
        str: "AppleGothic" on macOS, "Malgun Gothic" on Windows, and
        "NanumBarunGothic" on any other system (Linux / Colab).

    Side effects (non-macOS, non-Windows only):
        Tries to install the ``fonts-nanum`` package with ``apt-get`` and to
        refresh the fontconfig cache, then makes matplotlib aware of the newly
        installed fonts. Both shell steps are best-effort: a failure leaves
        the font unavailable but does not raise.
    """
    import platform

    system_name = platform.system()

    if system_name == "Darwin":  # macOS
        return "AppleGothic"
    if system_name == "Windows":
        return "Malgun Gothic"

    # Linux and Colab: the Nanum fonts have to be installed first.
    import subprocess

    import matplotlib.backends.backend_agg as backend_agg
    import matplotlib.font_manager as fm

    # Plain subprocess calls instead of IPython "!" escapes keep this function
    # valid Python, so it can also live in a .py module.
    for command in (
        ["apt-get", "install", "-qq", "fonts-nanum"],
        ["fc-cache", "-f"],
    ):
        try:
            subprocess.run(
                command,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=False,
            )
        except OSError:
            # The tool is missing on this system; continue like the original
            # shell escapes did when a command failed.
            pass

    if hasattr(fm, "_rebuild"):
        # Older matplotlib: rebuild the font cache through the private hook.
        fm._rebuild()
        # _rebuild() replaces the module-level fontManager, so re-point the
        # cached findfont references at the current manager.
        findfont = fm.fontManager.findfont
        fm.findfont = findfont
        backend_agg.findfont = findfont
    else:
        # Newer matplotlib no longer has _rebuild(); register the freshly
        # installed Nanum font files with the existing manager instead.
        for font_path in fm.findSystemFonts():
            if "nanum" in font_path.lower():
                fm.fontManager.addfont(font_path)

    return "NanumBarunGothic"

plt.style.use("seaborn") # 전체 그래프를 seaborn 스타일로 고정
plt.rc("font", family=get_font_family())
matplotlib.rcParams['axes.unicode_minus'] = False # 그래프에 마이너스 허용

%config InlineBackend.figure_format = 'retina' # 그래프를 더 선명하게 사용


plt.rcParams['figure.figsize'] = [8, 4]
plt.rcParams['figure.dpi'] = 120
plt.rcParams.update({'font.size': 20})

# Data Load

In [3]:
# Load the 2014-2018 hitter statistics that serve as the training data.
train = pd.read_csv("./data/14_18_hitter.csv")

- 주어진 데이터만으로는 충분하지 않다고 판단하여, KBO 기록을 모아 놓은 스탯티즈(Statiz)에서 데이터를 추가로 크롤링하였다.
- 2014년부터 2019년까지의 데이터를 수집하였으며, 2014~2018년 데이터로 학습하고 2019년 데이터로 평가한다.

In [4]:
train.head()

Unnamed: 0,이름,팀,WAR,G,타석,타수,득점,안타,2타,3타,...,삼진,병살,희타,희비,타율,출루,장타,OPS,wOBA,WRC+
0,테임즈,15N1B,10.71,142,595,472,130,180,42,5,...,91,7,0,7,0.381,0.498,0.79,1.288,0.53,222.3
1,강정호,14넥SS,8.05,117,501,418,103,149,36,2,...,106,8,0,2,0.356,0.459,0.739,1.198,0.5,185.6
2,박병호,15넥1B,7.76,140,622,528,129,181,35,1,...,161,10,0,4,0.343,0.436,0.714,1.15,0.481,181.9
3,최형우,16삼LF,7.75,138,618,519,99,195,46,2,...,83,12,0,7,0.376,0.464,0.651,1.116,0.48,177.8
4,서건창,14넥2B,7.63,128,616,543,135,201,41,17,...,47,1,4,2,0.37,0.438,0.547,0.985,0.437,150.9


In [5]:
train.tail()

Unnamed: 0,이름,팀,WAR,G,타석,타수,득점,안타,2타,3타,...,삼진,병살,희타,희비,타율,출루,장타,OPS,wOBA,WRC+
995,14SP,0.0,1.0,0,0,0,0,0,0,0,...,0,0,0,,,,,,,
996,14한P,0.0,1.0,0,0,0,0,0,0,0,...,0,0,0,,,,,,,
997,17KP,0.0,1.0,0,0,0,0,0,0,0,...,0,0,0,,,,,,,
998,18KP,0.0,1.0,0,0,0,0,0,0,0,...,0,0,0,,,,,,,
999,15kP,0.0,1.0,0,0,0,0,0,0,0,...,0,0,0,,,,,,,


- 타석이 0인 행과 결측치가 있는 행이 존재한다.
- 이는 데이터 수집 과정에서 잘못 들어온 이상치이므로, 타석 수가 10 미만인 타자는 제거한다.

In [6]:
# Keep only hitters with at least 10 plate appearances ("타석").
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not write to a slice of the raw data
# (which would raise SettingWithCopyWarning).
train = train[train["타석"] >= 10].copy()

In [7]:
train

Unnamed: 0,이름,팀,WAR,G,타석,타수,득점,안타,2타,3타,...,삼진,병살,희타,희비,타율,출루,장타,OPS,wOBA,WRC+
0,테임즈,15N1B,10.71,142,595,472,130,180,42,5,...,91,7,0,7,.381,.498,.790,1.288,.530,222.3
1,강정호,14넥SS,8.05,117,501,418,103,149,36,2,...,106,8,0,2,.356,.459,.739,1.198,.500,185.6
2,박병호,15넥1B,7.76,140,622,528,129,181,35,1,...,161,10,0,4,.343,.436,.714,1.150,.481,181.9
3,최형우,16삼LF,7.75,138,618,519,99,195,46,2,...,83,12,0,7,.376,.464,.651,1.116,.480,177.8
4,서건창,14넥2B,7.63,128,616,543,135,201,41,17,...,47,1,4,2,.370,.438,.547,.985,.437,150.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
769,이형종,16LLF,0.00,61,147,124,14,35,4,2,...,20,3,5,1,.282,.366,.371,.737,.335,83.6
784,윤요섭,17kDH,0.00,22,43,39,6,9,1,0,...,7,1,0,0,.231,.302,.410,.713,.315,76.6
786,박기혁,15kSS,0.00,126,350,300,27,84,13,1,...,84,10,16,1,.280,.350,.340,.690,.311,71.7
801,양성우,16한RF,0.00,108,446,384,52,104,12,4,...,67,6,8,6,.271,.347,.354,.701,.324,76.1


- 타자를 평가할 때는 WAR이라는 지표를 가장 많이 본다.
- WAR(Wins Above Replacement)은 대체 선수 대비 승리 기여도로, 한 시즌 동안 이 선수를 대체 수준의 선수 대신 기용했을 때 팀이 몇 승을 더 거두었는지를 나타낸다.
- 예를 들어 2015년 NC 다이노스 테임즈의 WAR은 10.71인데, 이는 이 선수 한 명 덕분에 NC가 약 10승을 더 거뒀다고 볼 수 있다는 뜻이다.

- 최종 목표는 OPS를 예측하는 모델을 만드는 것이다.
- 이를 위해 타자의 가장 대중적인 평가 지표인 WAR과 함께, 타자 본연의 능력을 살펴볼 수 있는 BABIP을 파생 변수로 추가하여 분석에 사용한다.
- BABIP = (안타 − 홈런) / (타수 − 삼진 − 홈런 + 희생플라이)

In [8]:
train.columns

Index(['이름', '팀', 'WAR', 'G', '타석', '타수', '득점', '안타', '2타', '3타', '홈런', '루타',
       '타점', '도루', '도실', '볼넷', '사구', '고4', '삼진', '병살', '희타', '희비', '타율', '출루',
       '장타', 'OPS', 'wOBA', 'WRC+'],
      dtype='object')

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 711 entries, 0 to 802
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   이름      711 non-null    object 
 1   팀       711 non-null    object 
 2   WAR     711 non-null    float64
 3   G       711 non-null    int64  
 4   타석      711 non-null    int64  
 5   타수      711 non-null    int64  
 6   득점      711 non-null    int64  
 7   안타      711 non-null    int64  
 8   2타      711 non-null    int64  
 9   3타      711 non-null    int64  
 10  홈런      711 non-null    int64  
 11  루타      711 non-null    int64  
 12  타점      711 non-null    int64  
 13  도루      711 non-null    int64  
 14  도실      711 non-null    int64  
 15  볼넷      711 non-null    int64  
 16  사구      711 non-null    int64  
 17  고4      711 non-null    int64  
 18  삼진      711 non-null    int64  
 19  병살      711 non-null    int64  
 20  희타      711 non-null    int64  
 21  희비      711 non-null    object 
 22  타율

In [10]:
# Cast the sacrifice-fly count ("희비") to int and the rate statistics to
# float in one pass; each column ends up with the same dtype as before.
train = train.astype(
    {
        "희비": "int",
        **dict.fromkeys(["타율", "출루", "장타", "OPS", "wOBA", "WRC+"], "float"),
    }
)

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 711 entries, 0 to 802
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   이름      711 non-null    object 
 1   팀       711 non-null    object 
 2   WAR     711 non-null    float64
 3   G       711 non-null    int64  
 4   타석      711 non-null    int64  
 5   타수      711 non-null    int64  
 6   득점      711 non-null    int64  
 7   안타      711 non-null    int64  
 8   2타      711 non-null    int64  
 9   3타      711 non-null    int64  
 10  홈런      711 non-null    int64  
 11  루타      711 non-null    int64  
 12  타점      711 non-null    int64  
 13  도루      711 non-null    int64  
 14  도실      711 non-null    int64  
 15  볼넷      711 non-null    int64  
 16  사구      711 non-null    int64  
 17  고4      711 non-null    int64  
 18  삼진      711 non-null    int64  
 19  병살      711 non-null    int64  
 20  희타      711 non-null    int64  
 21  희비      711 non-null    int32  
 22  타율

In [12]:
# BABIP = (hits - home runs) / (at-bats - strikeouts - home runs + sacrifice flies)
train["BABIP"] = (
    train["안타"]
    .sub(train["홈런"])
    .div(
        train["타수"]
        .sub(train["삼진"])
        .sub(train["홈런"])
        .add(train["희비"])
    )
)

In [13]:
train

Unnamed: 0,이름,팀,WAR,G,타석,타수,득점,안타,2타,3타,...,병살,희타,희비,타율,출루,장타,OPS,wOBA,WRC+,BABIP
0,테임즈,15N1B,10.71,142,595,472,130,180,42,5,...,7,0,7,0.381,0.498,0.790,1.288,0.530,222.3,0.390029
1,강정호,14넥SS,8.05,117,501,418,103,149,36,2,...,8,0,2,0.356,0.459,0.739,1.198,0.500,185.6,0.397810
2,박병호,15넥1B,7.76,140,622,528,129,181,35,1,...,10,0,4,0.343,0.436,0.714,1.150,0.481,181.9,0.402516
3,최형우,16삼LF,7.75,138,618,519,99,195,46,2,...,12,0,7,0.376,0.464,0.651,1.116,0.480,177.8,0.398058
4,서건창,14넥2B,7.63,128,616,543,135,201,41,17,...,1,4,2,0.370,0.438,0.547,0.985,0.437,150.9,0.395112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
769,이형종,16LLF,0.00,61,147,124,14,35,4,2,...,3,5,1,0.282,0.366,0.371,0.737,0.335,83.6,0.326923
784,윤요섭,17kDH,0.00,22,43,39,6,9,1,0,...,1,0,0,0.231,0.302,0.410,0.713,0.315,76.6,0.233333
786,박기혁,15kSS,0.00,126,350,300,27,84,13,1,...,10,16,1,0.280,0.350,0.340,0.690,0.311,71.7,0.384259
801,양성우,16한RF,0.00,108,446,384,52,104,12,4,...,6,8,6,0.271,0.347,0.354,0.701,0.324,76.1,0.313480


In [14]:
train.columns

Index(['이름', '팀', 'WAR', 'G', '타석', '타수', '득점', '안타', '2타', '3타', '홈런', '루타',
       '타점', '도루', '도실', '볼넷', '사구', '고4', '삼진', '병살', '희타', '희비', '타율', '출루',
       '장타', 'OPS', 'wOBA', 'WRC+', 'BABIP'],
      dtype='object')

In [15]:
# Persist the prepared training frame without writing the row index.
train.to_csv(path_or_buf="./data/train.csv", index=False)

In [16]:
pd.read_csv("./data/train.csv")

Unnamed: 0,이름,팀,WAR,G,타석,타수,득점,안타,2타,3타,...,병살,희타,희비,타율,출루,장타,OPS,wOBA,WRC+,BABIP
0,테임즈,15N1B,10.71,142,595,472,130,180,42,5,...,7,0,7,0.381,0.498,0.790,1.288,0.530,222.3,0.390029
1,강정호,14넥SS,8.05,117,501,418,103,149,36,2,...,8,0,2,0.356,0.459,0.739,1.198,0.500,185.6,0.397810
2,박병호,15넥1B,7.76,140,622,528,129,181,35,1,...,10,0,4,0.343,0.436,0.714,1.150,0.481,181.9,0.402516
3,최형우,16삼LF,7.75,138,618,519,99,195,46,2,...,12,0,7,0.376,0.464,0.651,1.116,0.480,177.8,0.398058
4,서건창,14넥2B,7.63,128,616,543,135,201,41,17,...,1,4,2,0.370,0.438,0.547,0.985,0.437,150.9,0.395112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
706,이형종,16LLF,0.00,61,147,124,14,35,4,2,...,3,5,1,0.282,0.366,0.371,0.737,0.335,83.6,0.326923
707,윤요섭,17kDH,0.00,22,43,39,6,9,1,0,...,1,0,0,0.231,0.302,0.410,0.713,0.315,76.6,0.233333
708,박기혁,15kSS,0.00,126,350,300,27,84,13,1,...,10,16,1,0.280,0.350,0.340,0.690,0.311,71.7,0.384259
709,양성우,16한RF,0.00,108,446,384,52,104,12,4,...,6,8,6,0.271,0.347,0.354,0.701,0.324,76.1,0.313480


In [17]:
# Load the 2019 hitter statistics that serve as the evaluation data.
test = pd.read_csv("./data/19_hitter.csv")

In [18]:
test

Unnamed: 0,이름,팀,WAR,G,타석,타수,득점,안타,2타,3타,...,삼진,병살,희타,희비,타율,출루,장타,OPS,wOBA,WRC+
0,김하성,19키SS,7.17,139,625,540,112,166,38,2,...,80,12,1,7,0.307,0.389,0.491,0.880,0.397,145.7
1,양의지,19NC,6.69,118,459,390,61,138,26,0,...,43,13,0,6,0.354,0.438,0.574,1.012,0.452,179.8
2,최정,19S3B,6.32,141,606,503,86,147,27,0,...,92,4,0,8,0.292,0.399,0.519,0.918,0.411,148.9
3,샌즈,19키RF,6.16,139,613,525,100,160,39,1,...,101,17,0,5,0.305,0.396,0.543,0.939,0.421,163.0
4,로하스,19KCF,5.49,142,576,520,68,167,29,3,...,120,4,0,5,0.321,0.379,0.527,0.905,0.405,151.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,양성우,19한LF,-0.77,56,129,119,11,20,3,0,...,28,2,1,0,0.168,0.227,0.218,0.445,0.214,20.6
247,전병우,19롯1B,-0.79,28,54,50,2,5,1,0,...,22,0,1,0,0.100,0.151,0.120,0.271,0.134,-35.3
248,노시환,19한1B,-0.80,91,192,177,19,33,8,1,...,72,3,1,1,0.186,0.241,0.260,0.501,0.237,36.4
249,김동엽,19삼DH,-0.85,60,211,195,15,42,4,1,...,47,2,0,2,0.215,0.265,0.338,0.604,0.279,60.7


In [19]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251 entries, 0 to 250
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   이름      251 non-null    object 
 1   팀       251 non-null    object 
 2   WAR     251 non-null    float64
 3   G       251 non-null    int64  
 4   타석      251 non-null    int64  
 5   타수      251 non-null    int64  
 6   득점      251 non-null    int64  
 7   안타      251 non-null    int64  
 8   2타      251 non-null    int64  
 9   3타      251 non-null    int64  
 10  홈런      251 non-null    int64  
 11  루타      251 non-null    int64  
 12  타점      251 non-null    int64  
 13  도루      251 non-null    int64  
 14  도실      251 non-null    int64  
 15  볼넷      251 non-null    int64  
 16  사구      251 non-null    int64  
 17  고4      251 non-null    int64  
 18  삼진      251 non-null    int64  
 19  병살      251 non-null    int64  
 20  희타      251 non-null    int64  
 21  희비      251 non-null    int64  
 22  타율

In [20]:
# Apply the same dtype conversions as for the training frame: the
# sacrifice-fly count ("희비") to int and the rate statistics to float.
test = test.astype(
    {
        "희비": "int",
        **dict.fromkeys(["타율", "출루", "장타", "OPS", "wOBA", "WRC+"], "float"),
    }
)

In [21]:
# BABIP = (hits - home runs) / (at-bats - strikeouts - home runs + sacrifice flies)
test["BABIP"] = (
    test["안타"]
    .sub(test["홈런"])
    .div(
        test["타수"]
        .sub(test["삼진"])
        .sub(test["홈런"])
        .add(test["희비"])
    )
)

In [23]:
# Persist the prepared evaluation frame without writing the row index.
test.to_csv(path_or_buf="./data/test.csv", index=False)

In [24]:
pd.read_csv("./data/test.csv")

Unnamed: 0,이름,팀,WAR,G,타석,타수,득점,안타,2타,3타,...,병살,희타,희비,타율,출루,장타,OPS,wOBA,WRC+,BABIP
0,김하성,19키SS,7.17,139,625,540,112,166,38,2,...,12,1,7,0.307,0.389,0.491,0.880,0.397,145.7,0.328125
1,양의지,19NC,6.69,118,459,390,61,138,26,0,...,13,0,6,0.354,0.438,0.574,1.012,0.452,179.8,0.354354
2,최정,19S3B,6.32,141,606,503,86,147,27,0,...,4,0,8,0.292,0.399,0.519,0.918,0.411,148.9,0.302564
3,샌즈,19키RF,6.16,139,613,525,100,160,39,1,...,17,0,5,0.305,0.396,0.543,0.939,0.421,163.0,0.329177
4,로하스,19KCF,5.49,142,576,520,68,167,29,3,...,4,0,5,0.321,0.379,0.527,0.905,0.405,151.3,0.375328
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,양성우,19한LF,-0.77,56,129,119,11,20,3,0,...,2,1,0,0.168,0.227,0.218,0.445,0.214,20.6,0.211111
247,전병우,19롯1B,-0.79,28,54,50,2,5,1,0,...,0,1,0,0.100,0.151,0.120,0.271,0.134,-35.3,0.178571
248,노시환,19한1B,-0.80,91,192,177,19,33,8,1,...,3,1,1,0.186,0.241,0.260,0.501,0.237,36.4,0.304762
249,김동엽,19삼DH,-0.85,60,211,195,15,42,4,1,...,2,0,2,0.215,0.265,0.338,0.604,0.279,60.7,0.250000
