In [175]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
import warnings
def ignore_warn(*args, **kwargs):
    pass

warnings.warn = ignore_warn

from scipy import stats
from scipy.stats import norm, skew

pd.set_option('display.float_format',lambda x:'{:.3f}'.format(x))

In [176]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [177]:
train_ID =  train_data['id']
test_ID = test_data['id']

train_data.drop(['id'], axis = 1,inplace = True)
test_data.drop(['id'], axis = 1, inplace = True)

print("訓練資料集的維度",train_data.shape)
print("測試資料集的維度",test_data.shape)

訓練資料集的維度 (6284, 31)
測試資料集的維度 (1109, 30)


In [178]:
ntrain = train_data.shape[0] 
ntest = test_data.shape[0] #方便之後 訓練時資料切割
y_train = train_data.salary_next_year.values

all_data = pd.concat([train_data,test_data], ignore_index=True)
#all_data.drop(['salary_next_year'],axis = 1, inplace = True)

print("全部數據集的維度",all_data.shape)

全部數據集的維度 (7393, 31)


# 以下一年的salary 當作feature

In [179]:
all_data = all_data.sort_values(by = 'yearID')

In [180]:
all_data['last_year_salary'] = all_data.groupby('playerID')['salary_next_year'].apply(lambda i:i.shift(1)).fillna(method='ffill')
all_data['last_year_salary'] = all_data['last_year_salary'].fillna(method='bfill')

In [181]:
all_data[['last_year_salary','salary_next_year']].loc[all_data['playerID'] == 'tananfr01']

Unnamed: 0,last_year_salary,salary_next_year
5308,350000.0,662500.0
530,662500.0,662500.0
1022,662500.0,1100000.0
6947,1100000.0,
3679,377500.0,1100000.0
3453,1100000.0,1900000.0
2364,1900000.0,1700000.0
1178,1700000.0,1500000.0


In [182]:
all_data['salary_diff'] = all_data.groupby('playerID')['salary_next_year'].apply(lambda i:i.diff(1)).fillna(method= 'ffill')
all_data['salary_diff'] = all_data['salary_diff'].fillna(method='bfill')

In [183]:
all_data = all_data.sort_index()

In [184]:
#all_data.drop(['salary_next_year'],axis = 1, inplace = True)

# 補年資的特徵

In [185]:
all_data['Seniority'] = all_data.groupby('playerID')['yearID'].transform(lambda x: x - x.min()+1)

In [186]:
bin=[0,6,11,16,17,20,24,26]
all_data['Seniority'] = pd.cut(all_data['Seniority'],bin)

In [187]:
all_data['Seniority'] =all_data['Seniority'].astype('str')

In [188]:
all_data['Seniority'].value_counts()

(0, 6]      5260
(6, 11]     1647
(11, 16]     420
(17, 20]      35
(16, 17]      25
(20, 24]       5
(24, 26]       1
Name: Seniority, dtype: int64

In [189]:
all_data['Seniority'] = all_data['Seniority'].apply(lambda x : 4 if x  == '(0, 6]' else x)
all_data['Seniority'] = all_data['Seniority'].apply(lambda x : 2 if x  == '(6, 11]' else x)
all_data['Seniority'] = all_data['Seniority'].apply(lambda x : 1 if x  == '(11, 16]' else x)
all_data['Seniority'] = all_data['Seniority'].apply(lambda x : -1 if x  == '(17, 20]' else x)
all_data['Seniority'] = all_data['Seniority'].apply(lambda x : 2 if x  == '(16, 17]' else x)
all_data['Seniority'] = all_data['Seniority'].apply(lambda x : 3 if x  == '(20, 24]' else x)
all_data['Seniority'] = all_data['Seniority'].apply(lambda x : -1 if x  == '(24, 26]' else x)

# 將年份轉變為1985為1 以此後推

In [190]:
all_data['player_age'] = all_data['yearID'] - 1984

# one-hotencoding

In [191]:
lgID = pd.get_dummies(all_data['lgID'],prefix= 'lgID_')#做one-hotecoding
all_data = pd.concat([all_data,lgID],axis=1, join_axes=[all_data.index])
all_data.drop(['lgID'],axis = 1,inplace = True)

teamID = pd.get_dummies(all_data['teamID'],prefix= 'team_')#做one-hotecoding
all_data = pd.concat([all_data,teamID],axis=1, join_axes=[all_data.index])
all_data.drop(['teamID'],axis = 1,inplace = True)

# 訓練模型

In [192]:
all_data.drop(['SH','SF','GIDP','stint'], axis = 1, inplace = True) #刪除掉完全沒有值的資料欄位

# 填補缺失值

In [193]:
all_data['WP'].fillna(value=all_data['WP'].mean(), inplace=True)
all_data['IBB'].fillna(value=all_data['IBB'].mean(), inplace=True)
all_data['HBP'].fillna(value=all_data['HBP'].mean(), inplace=True)
all_data['BK'].fillna(value=all_data['BK'].mean(), inplace=True)  

In [194]:
all_data['BAOpp'].fillna(0,inplace = True)

In [195]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7393 entries, 0 to 7392
Data columns (total 65 columns):
BAOpp               7393 non-null float64
BB                  7393 non-null int64
BFP                 7393 non-null float64
BK                  7393 non-null float64
CG                  7393 non-null int64
ER                  7393 non-null int64
ERA                 7393 non-null float64
G                   7393 non-null int64
GF                  7393 non-null float64
GS                  7393 non-null int64
H                   7393 non-null int64
HBP                 7393 non-null float64
HR                  7393 non-null int64
IBB                 7393 non-null float64
IPouts              7393 non-null int64
L                   7393 non-null int64
R                   7393 non-null int64
SHO                 7393 non-null int64
SO                  7393 non-null int64
SV                  7393 non-null int64
W                   7393 non-null int64
WP                  7393 non-null float

In [196]:
all_data.columns

Index(['BAOpp', 'BB', 'BFP', 'BK', 'CG', 'ER', 'ERA', 'G', 'GF', 'GS', 'H',
       'HBP', 'HR', 'IBB', 'IPouts', 'L', 'R', 'SHO', 'SO', 'SV', 'W', 'WP',
       'playerID', 'salary_next_year', 'yearID', 'last_year_salary',
       'salary_diff', 'Seniority', 'player_age', 'lgID__AL', 'lgID__NL',
       'team__ANA', 'team__ARI', 'team__ATL', 'team__BAL', 'team__BOS',
       'team__CAL', 'team__CHA', 'team__CHN', 'team__CIN', 'team__CLE',
       'team__COL', 'team__DET', 'team__FLO', 'team__HOU', 'team__KCA',
       'team__LAA', 'team__LAN', 'team__MIL', 'team__MIN', 'team__ML4',
       'team__MON', 'team__NYA', 'team__NYN', 'team__OAK', 'team__PHI',
       'team__PIT', 'team__SDN', 'team__SEA', 'team__SFN', 'team__SLN',
       'team__TBA', 'team__TEX', 'team__TOR', 'team__WAS'],
      dtype='object')

In [197]:
#all_data.drop(['salary_next_year'],axis =1, inplace = True)

In [198]:
all_data.drop(['playerID'],axis =1,inplace = True)

In [199]:
train = all_data[:ntrain]
test = all_data[ntrain:]

# 訓練模型

In [203]:
import os

if __name__ == "__main__":
    try:
        from multiprocessing import set_start_method
    except ImportError:
        raise ImportError("Unable to import multiprocessing.set_start_method."
                          " This example only runs on Python 3.4")
    set_start_method("forkserver")



    rng = np.random.RandomState(31337)

    print("Parallel Parameter optimization")


    os.environ["OMP_NUM_THREADS"] = "2"  # or to whatever you want

    xgb_model = xgb.XGBRegressor()
    clf = GridSearchCV(xgb_model, parameters = {'colsample_bytree':[0.4603], #when use hyperthread, xgboost may become slower
              'gamma':[0.0468],
              'learning_rate': [.03, 0.05, .07], #so called `eta` value
              'max_depth': [3,5, 6, 7],
              'min_child_weight': [1.7817,2,4],
              'reg_alpha' : [0.4640],
              'silent': [1,2],
              'subsample': [0.5213,0.7,8],
              'colsample_bytree': [0.7],
              'random_state' : [7],
              'nthread' :[-1],
              'n_estimators': [250,500,100]}, verbose=1,
                       n_jobs=2)
    clf.fit(train, y_train)
    print(clf.best_score_)
    print(clf.best_params_)

RuntimeError: context has already been set