In [1]:
import pymysql.cursors
import pandas as pd
from sklearn import preprocessing
import numpy as np
import re
import datetime
from sklearn.metrics import accuracy_score


# 训练

## 获取全量的竞彩比赛列表

In [2]:
connection = pymysql.connect(host='localhost', user='root', password='breadt@2019', db='breadt-football-ml', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    sql = 'select * from `breadt_football_game_list`;'
    cursor.execute(sql)
    rows = cursor.fetchall()
    
    train_game_list_df = pd.DataFrame(rows)
    
    connection.close()

train_game_list_df['source'] = 'jc'

## 获取全量的胜负彩比赛列表

In [3]:
connection = pymysql.connect(host='localhost', user='root', password='breadt@2019', db='breadt-football-ml', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    sql = 'select * from `breadt_lottery_info`;'
    cursor.execute(sql)
    rows = cursor.fetchall()
    
    train_lottery_game_list_df = pd.DataFrame(rows)
    
    connection.close()

train_lottery_game_list_df['source'] = 'lottery'

## 合并竞彩比赛列表和胜负彩比赛列表

In [4]:
tmp = train_lottery_game_list_df.drop(['issue'], axis=1)
df = pd.concat([train_game_list_df, tmp])
df = df[['matchid', 'game', 'home_team', 'visit_team', 'gs', 'gd', 'gn', 'time', 'result', 'win_bet_return', 'draw_bet_return', 'lose_bet_return', 'source']]
df = df.drop_duplicates(subset=['matchid'])

## **设定训练范围** 并处理数据

In [5]:
# match_group = ['澳超', '英超', '德甲', '德乙', '法甲', '西甲', '意甲', '日职', '英甲', '英冠', '苏超', '法乙', '葡超', '荷甲', '荷乙', '韩K联', '瑞典超', '挪超', '美职', '日乙', '俄超', '比甲', '瑞典甲', '法丙', '挪甲', '英乙', '苏冠', '巴甲', '智利甲', '墨超', '智利乙', '阿甲', '欧冠', '欧罗巴']
match_group = ['澳超', '英超', '德甲', '德乙', '法甲', '西甲', '意甲', '日职', '英甲', '英冠', '苏超', '法乙', '葡超', '荷甲', '荷乙', '韩K联', '瑞典超', '挪超', '美职', '日乙', '俄超', '比甲', '瑞典甲', '法丙', '挪甲', '英乙', '苏冠', '巴甲', '智利甲', '墨超', '智利乙', '阿甲']
match_df = df[(df['game'].isin(match_group))]
match_df = match_df.dropna()

In [6]:
# 对team做encode 这个encoder后面预测的时候还会用到
teams = list(set(df['home_team'].values) | set(df['visit_team'].values))
team_encoder = preprocessing.LabelEncoder()
team_encoder.fit(teams)

LabelEncoder()

In [7]:
def encode_team(df):
    df['home_team_encoder'] = team_encoder.transform(df['home_team'])
    df['visit_team_encoder'] = team_encoder.transform(df['visit_team'])
    return df

In [8]:
# 比赛名称encode
games = list(set(match_df['game'].values))
game_encoder = preprocessing.LabelEncoder()
game_encoder.fit(games)

LabelEncoder()

In [9]:
def encode_game(df):
    df['game_encoder'] = game_encoder.transform(df['game'])
    return df

In [10]:
match_df['year'] = match_df.apply(lambda row: row.time.year, axis=1)
match_df['month'] = match_df.apply(lambda row: row.time.month, axis=1)
match_df['day'] = match_df.apply(lambda row: row.time.day, axis=1)
match_df['fix_result'] = match_df.apply(lambda row: int(row.result) if row.result < 3 else 2, axis=1)

In [11]:
match_df = encode_team(match_df)
match_df = encode_game(match_df)

In [12]:
match_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43477 entries, 1 to 12650
Data columns (total 20 columns):
matchid               43477 non-null int64
game                  43477 non-null object
home_team             43477 non-null object
visit_team            43477 non-null object
gs                    43477 non-null int64
gd                    43477 non-null int64
gn                    43477 non-null int64
time                  43477 non-null datetime64[ns]
result                43477 non-null int64
win_bet_return        43477 non-null float64
draw_bet_return       43477 non-null float64
lose_bet_return       43477 non-null float64
source                43477 non-null object
year                  43477 non-null int64
month                 43477 non-null int64
day                   43477 non-null int64
fix_result            43477 non-null int64
home_team_encoder     43477 non-null int64
visit_team_encoder    43477 non-null int64
game_encoder          43477 non-null int64
dtypes: date

## 获取特征数据

In [13]:
connection = pymysql.connect(host='localhost', user='root', password='breadt@2019', db='breadt-football-ml', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    sql = 'select * from `breadt_football_recent_feature_info`;'
    cursor.execute(sql)
    rows = cursor.fetchall()
    
    train_feature_df = pd.DataFrame(rows)
    
    connection.close()

In [14]:
def take_goal_info(prefix, df):
    target_cols = [
        '_0_1_gd', 
        '_0_1_gs', 
        '_0_gd', 
        '_0_gs', 
        '_1_gd', 
        '_1_gs',
        '_2_3_gd', 
        '_2_3_gs', 
        '_2_gd', 
        '_2_gs', 
        '_3_gd', 
        '_3_gs',
        '_4_gd', 
        '_4_gs', 
        '_5_gd', 
        '_5_gs', 
        '_6_gd', 
        '_6_gs',
        '_7_gd', 
        '_7_gs', 
        '_ab_4_gd', 
        '_ab_4_gs',
        '_abs_draw', 
        '_abs_lose', 
        '_abs_win']

    for k in target_cols:
        df[prefix + k + '_rate'] = df[prefix + k] / df[prefix + '_count']
    
    return df

In [15]:
def take_goal_pref_info(prefix, df):
    target_cols = [
        '_draw', '_g', '_gd',
        '_gs', '_lose', '_win',
    ]

    for k in target_cols:
        df[prefix + k + '_rate'] = df[prefix + k] / df[prefix + '_count']
    
    return df

In [16]:
train_feature_df = take_goal_info('h', train_feature_df)
train_feature_df = take_goal_info('v', train_feature_df)

train_feature_df = take_goal_pref_info('h_host', train_feature_df)
train_feature_df = take_goal_pref_info('v_visit', train_feature_df)

In [17]:
# 所有球队的主客场进球平均数

train_feature_df['h_avg_abs_gs'] = train_feature_df['h_abs_gs'].sum() / train_feature_df['h_count'].sum()
train_feature_df['h_avg_abs_gd'] = train_feature_df['h_abs_gd'].sum() / train_feature_df['h_count'].sum()
train_feature_df['v_avg_abs_gs'] = train_feature_df['v_abs_gs'].sum() / train_feature_df['v_count'].sum()
train_feature_df['v_avg_abs_gd'] = train_feature_df['v_abs_gd'].sum() / train_feature_df['v_count'].sum()

In [18]:
train_feature_df['h_avg_abs_win'] = train_feature_df['h_abs_win'].sum() / train_feature_df['h_count'].sum()
train_feature_df['h_avg_abs_draw'] = train_feature_df['h_abs_draw'].sum() / train_feature_df['h_count'].sum()
train_feature_df['h_avg_abs_lose'] = train_feature_df['h_abs_lose'].sum() / train_feature_df['h_count'].sum()

train_feature_df['v_avg_abs_win'] = train_feature_df['v_abs_win'].sum() / train_feature_df['v_count'].sum()
train_feature_df['v_avg_abs_draw'] = train_feature_df['v_abs_draw'].sum() / train_feature_df['v_count'].sum()
train_feature_df['v_avg_abs_lose'] = train_feature_df['v_abs_lose'].sum() / train_feature_df['v_count'].sum()

## 获取赔率信息

In [32]:
connection = pymysql.connect(host='localhost', user='root', password='breadt@2019', db='breadt-football-ml', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    sql = 'select * from `breadt_match_odd_info`;'
    cursor.execute(sql)
    rows = cursor.fetchall()
    
    train_match_odd_df = pd.DataFrame(rows)
    
    connection.close()

In [33]:
train_match_odd_df.head()

Unnamed: 0,avg_init_draw_odd,avg_init_lose_odd,avg_init_win_odd,avg_new_draw_kelly,avg_new_draw_odd,avg_new_draw_rate,avg_new_lose_kelly,avg_new_lose_odd,avg_new_lose_rate,avg_new_win_kelly,...,min_new_lose_kelly,min_new_lose_odd,min_new_lose_rate,min_new_win_kelly,min_new_win_odd,min_new_win_rate,min_pay_rate,std_draw,std_lose,std_win
0,13.46,30.07,1.03,0.97,13.33,7.28,1.02,31.71,3.21,0.92,...,0.18,5.5,1.17,0.9,1.0,66.71,0.83,1007.93,11029.6,0.04
1,3.38,4.29,1.75,0.91,3.43,26.55,0.91,4.46,20.5,0.91,...,0.76,3.7,16.79,0.79,1.5,48.58,0.83,1.83,12.46,0.35
2,6.72,14.49,1.14,0.92,6.97,13.17,0.95,15.31,6.19,0.91,...,0.48,7.8,3.38,0.85,1.05,69.75,0.83,37.8,1120.81,0.05
3,8.02,17.87,1.09,0.92,7.89,11.68,0.94,17.85,5.29,0.91,...,0.56,10.5,2.86,0.85,1.02,75.49,0.83,93.17,1417.25,0.04
4,3.98,6.79,1.42,0.91,4.06,22.43,0.92,7.17,12.88,0.91,...,0.69,5.35,9.54,0.84,1.3,59.57,0.83,4.23,90.73,0.14


## 合并训练数据

In [34]:
train_dataset_df = pd.merge(match_df, train_feature_df, on='matchid', how='left')
train_dataset_df = pd.merge(train_dataset_df, train_match_odd_df, on='matchid', how='left')
train_dataset_df = train_dataset_df.dropna()
train_dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30980 entries, 0 to 43278
Columns: 217 entries, matchid to std_win
dtypes: datetime64[ns](1), float64(200), int64(12), object(4)
memory usage: 51.5+ MB


In [63]:
train_dataset_df.columns.values

array(['matchid', 'game', 'home_team', 'visit_team', 'gs', 'gd', 'gn',
       'time', 'result', 'win_bet_return', 'draw_bet_return',
       'lose_bet_return', 'source', 'year', 'month', 'day', 'fix_result',
       'home_team_encoder', 'visit_team_encoder', 'game_encoder',
       'h_0_1_gd', 'h_0_1_gs', 'h_0_gd', 'h_0_gs', 'h_1_gd', 'h_1_gs',
       'h_2_3_gd', 'h_2_3_gs', 'h_2_gd', 'h_2_gs', 'h_3_gd', 'h_3_gs',
       'h_4_gd', 'h_4_gs', 'h_5_gd', 'h_5_gs', 'h_6_gd', 'h_6_gs',
       'h_7_gd', 'h_7_gs', 'h_ab_4_gd', 'h_ab_4_gs', 'h_abs_avg_g',
       'h_abs_avg_gd', 'h_abs_avg_gs', 'h_abs_draw', 'h_abs_g',
       'h_abs_gd', 'h_abs_gs', 'h_abs_lose', 'h_abs_win', 'h_count',
       'h_host_count', 'h_host_draw', 'h_host_g', 'h_host_gd',
       'h_host_gs', 'h_host_lose', 'h_host_win', 'id_x', 'v_0_1_gd',
       'v_0_1_gs', 'v_0_gd', 'v_0_gs', 'v_1_gd', 'v_1_gs', 'v_2_3_gd',
       'v_2_3_gs', 'v_2_gd', 'v_2_gs', 'v_3_gd', 'v_3_gs', 'v_4_gd',
       'v_4_gs', 'v_5_gd', 'v_5_gs', 'v_6_gd'

## xgb regressor 尝试预测分差

In [68]:
train_dataset_df['fix_result'] = train_dataset_df['gs'] - train_dataset_df['gd']

In [165]:
import xgboost as xgb

x_columns = [
#     'win_bet_return', 'draw_bet_return', 'lose_bet_return', 
#     'year', 'month',
#     'home_team_encoder', 'visit_team_encoder', 'game_encoder',
    
    'h_avg_abs_gs',
    'h_avg_abs_gd', 'v_avg_abs_gs', 'v_avg_abs_gd', 'h_avg_abs_win',
    'h_avg_abs_draw', 'h_avg_abs_lose', 'v_avg_abs_win',
    'v_avg_abs_draw', 'v_avg_abs_lose',
    
    'h_0_1_gd', 
    'h_0_1_gs', 
    'h_0_gd', 
    'h_0_gs', 
    'h_1_gd', 
    'h_1_gs',
    'h_2_3_gd', 
    'h_2_3_gs', 
    'h_2_gd', 'h_2_gs', 'h_3_gd', 'h_3_gs',
    'h_4_gd', 'h_4_gs', 'h_5_gd', 'h_5_gs', 'h_6_gd', 'h_6_gs',
    'h_7_gd', 'h_7_gs', 'h_ab_4_gd', 'h_ab_4_gs', 'h_abs_avg_g',
    'h_abs_avg_gd', 'h_abs_avg_gs', 'h_abs_draw', 'h_abs_g',
    'h_abs_gd', 'h_abs_gs', 'h_abs_lose', 'h_abs_win', 'h_count',
    'h_host_count', 'h_host_draw', 'h_host_g', 'h_host_gd',
    'h_host_gs', 'h_host_lose', 'h_host_win', 
    
    'v_0_1_gd',
    'v_0_1_gs', 'v_0_gd', 'v_0_gs', 'v_1_gd', 'v_1_gs', 'v_2_3_gd',
    'v_2_3_gs', 'v_2_gd', 'v_2_gs', 'v_3_gd', 'v_3_gs', 'v_4_gd',
    'v_4_gs', 'v_5_gd', 'v_5_gs', 'v_6_gd', 'v_6_gs', 'v_7_gd',
    'v_7_gs', 'v_ab_4_gd', 'v_ab_4_gs', 'v_abs_avg_g', 'v_abs_avg_gd',
    'v_abs_avg_gs', 'v_abs_draw', 'v_abs_g', 'v_abs_gd', 'v_abs_gs',
    'v_abs_lose', 'v_abs_win', 
    
    'v_count', 
    'v_visit_count',
    'v_visit_draw', 'v_visit_g', 'v_visit_gd', 'v_visit_gs',
    'v_visit_lose', 'v_visit_win',
    
    'h_0_1_gd_rate', 'h_0_1_gs_rate',
    'h_0_gd_rate', 'h_0_gs_rate', 'h_1_gd_rate', 'h_1_gs_rate',
    'h_2_3_gd_rate', 'h_2_3_gs_rate', 'h_2_gd_rate', 'h_2_gs_rate',
    'h_3_gd_rate', 'h_3_gs_rate', 'h_4_gd_rate', 'h_4_gs_rate',
    'h_5_gd_rate', 'h_5_gs_rate', 'h_6_gd_rate', 'h_6_gs_rate',
    'h_7_gd_rate', 'h_7_gs_rate', 'h_ab_4_gd_rate', 'h_ab_4_gs_rate',
    'h_abs_draw_rate', 'h_abs_lose_rate', 'h_abs_win_rate',
    'v_0_1_gd_rate', 'v_0_1_gs_rate', 'v_0_gd_rate', 'v_0_gs_rate',
    'v_1_gd_rate', 'v_1_gs_rate', 'v_2_3_gd_rate', 'v_2_3_gs_rate',
    'v_2_gd_rate', 'v_2_gs_rate', 'v_3_gd_rate', 'v_3_gs_rate',
    'v_4_gd_rate', 'v_4_gs_rate', 'v_5_gd_rate', 'v_5_gs_rate',
    'v_6_gd_rate', 'v_6_gs_rate', 'v_7_gd_rate', 'v_7_gs_rate',
    
    'v_ab_4_gd_rate', 
    'v_ab_4_gs_rate', 
    'v_abs_draw_rate',
    'v_abs_lose_rate', 
    'v_abs_win_rate',
    'h_host_draw_rate',
    'h_host_g_rate', 
    'h_host_gd_rate', 
    'h_host_gs_rate',
    'h_host_lose_rate', 
    'h_host_win_rate', 
    'v_visit_draw_rate',
    'v_visit_g_rate', 
    'v_visit_gd_rate', 
    'v_visit_gs_rate',
    'v_visit_lose_rate', 
    'v_visit_win_rate',
    
    'avg_init_draw_odd',
    'avg_init_lose_odd', 'avg_init_win_odd', 'avg_new_draw_kelly',
    'avg_new_draw_odd', 'avg_new_draw_rate', 'avg_new_lose_kelly',
    'avg_new_lose_odd', 'avg_new_lose_rate', 'avg_new_win_kelly',
    'avg_new_win_odd', 'avg_new_win_rate', 'avg_pay_rate',
    'dispersion_draw', 'dispersion_lose', 'dispersion_win', 'id_y',
    'max_init_draw_odd', 'max_init_lose_odd', 'max_init_win_odd',
    'max_new_draw_kelly', 'max_new_draw_odd', 'max_new_draw_rate',
    'max_new_lose_kelly', 'max_new_lose_odd', 'max_new_lose_rate',
    'max_new_win_kelly', 'max_new_win_odd', 'max_new_win_rate',
    'max_pay_rate', 'min_init_draw_odd', 'min_init_lose_odd',
    'min_init_win_odd', 'min_new_draw_kelly', 'min_new_draw_odd',
    'min_new_draw_rate', 'min_new_lose_kelly', 'min_new_lose_odd',
    'min_new_lose_rate', 'min_new_win_kelly', 'min_new_win_odd',
    'min_new_win_rate', 'min_pay_rate', 'std_draw', 'std_lose',
    'std_win'
]
    
params={
    'booster':'gbtree',
    # 这里手写数字是0-9，是一个多类的问题，因此采用了multisoft多分类器，
    'objective': 'reg:linear', 
#     'objective': 'multi:softprob',
#     'num_class':3, # 类数，与 multisoftmax 并用
    
    'gamma':0.01,  # 在树的叶子节点下一个分区的最小损失，越大算法模型越保守 。[0:]
    'max_depth':8, # 构建树的深度 [1:]
    
    #'lambda':450,  # L2 正则项权重
    'subsample':0.7, # 采样训练数据，设置为0.5，随机选择一般的数据实例 (0:1]
    'colsample_bytree':0.7, # 构建树树时的采样比率 (0:1]
    #'min_child_weight':12, # 节点的最少特征数
    'silent':1 ,
    
#     这部分需要调整
#     'eta': 0.05, # 如同学习率
    'eta': 0.01, # 如同学习率
    
    
    'seed':2018,
    'nthread':4,# cpu 线程数,根据自己U的个数适当调整
}

t = train_dataset_df

train_dataset = t[t['year'] < 2019]
test_dataset = t[t['year'] == 2019]

valid_dataset = test_dataset[test_dataset['month'] < 3]
test_dataset = test_dataset[test_dataset['month'] >= 3]

xgtrain = xgb.DMatrix(train_dataset[x_columns], label=train_dataset['fix_result'])
xgtest = xgb.DMatrix(test_dataset[x_columns], label=test_dataset['fix_result'])
xgvalid = xgb.DMatrix(valid_dataset[x_columns], label=valid_dataset['fix_result'])

watchlist = [(xgtrain, 'train'),(xgvalid, 'val')]

num_rounds = 10000
stop_rounds = 100

# num_rounds = 10000
# stop_rounds = 300

model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=stop_rounds)
print(model.best_iteration)

[0]	train-rmse:1.72901	val-rmse:1.76074
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 100 rounds.
[1]	train-rmse:1.72535	val-rmse:1.75802
[2]	train-rmse:1.72187	val-rmse:1.75476
[3]	train-rmse:1.71838	val-rmse:1.7519
[4]	train-rmse:1.71494	val-rmse:1.74894
[5]	train-rmse:1.71153	val-rmse:1.74608
[6]	train-rmse:1.70818	val-rmse:1.74288
[7]	train-rmse:1.70491	val-rmse:1.74039
[8]	train-rmse:1.7016	val-rmse:1.73806
[9]	train-rmse:1.69843	val-rmse:1.73527
[10]	train-rmse:1.69529	val-rmse:1.73271
[11]	train-rmse:1.69229	val-rmse:1.73002
[12]	train-rmse:1.68925	val-rmse:1.72717
[13]	train-rmse:1.68621	val-rmse:1.72499
[14]	train-rmse:1.68326	val-rmse:1.72203
[15]	train-rmse:1.68033	val-rmse:1.71957
[16]	train-rmse:1.67751	val-rmse:1.71712
[17]	train-rmse:1.67468	val-rmse:1.71478
[18]	train-rmse:1.67186	val-rmse:1.71214
[19]	train-rmse:1.66906	val-rmse:1.70982
[20]	train-rmse:1.66635	val-rmse:1.70753
[21]	trai

In [166]:
preds = model.predict(xgtest, ntree_limit=model.best_iteration)
preds

array([-0.14979899,  0.10005543,  1.000135  , ...,  1.1058667 ,
        0.8398075 , -0.6734235 ], dtype=float32)

In [168]:
test_dataset['preds'] = preds
test_dataset[['gs', 'gd', 'fix_result', 'preds']].head(20)

Unnamed: 0,gs,gd,fix_result,preds
40535,0,3,-3,-0.149799
41590,1,2,-1,0.100055
41851,1,1,0,1.000135
41852,2,1,1,0.224394
41853,2,0,2,0.444732
41855,0,0,0,0.397154
41856,0,1,-1,1.133074
41857,0,2,-2,0.026531
41858,1,0,1,-0.088672
41859,0,2,-2,0.614954


In [185]:
math.ceil(-0.14979899), math.ceil(0.14979899)

(0, 1)

In [203]:
import math

def get_score(row):
    if row.preds > 0:
        return -(math.floor(row.preds)) + 0.5
    else:
        return -(math.ceil(row.preds)) - 0.5

# test_dataset['rq'] = test_dataset.apply(lambda row: math.floor(math.floor(row.preds * 10) / 5) * 0.5, axis=1)
test_dataset['rq'] = test_dataset.apply(lambda row: get_score(row), axis=1)

In [205]:
test_dataset['rq_result'] = test_dataset['rq'] + test_dataset['fix_result']
test_dataset['pred_rq_result'] = test_dataset.apply(lambda row: 1 if row.rq_result > 0 else 0, axis=1)

# a = test_dataset[test_dataset['rq'] >= 0]
a = test_dataset
print(len(a[a['pred_rq_result'] == 1])/ len(a))
a[['matchid', 'gs', 'gd', 'fix_result', 'rq', 'rq_result', 'pred_rq_result']].head(20)

0.6074270557029178


Unnamed: 0,matchid,gs,gd,fix_result,rq,rq_result,pred_rq_result
40535,2432035,0,3,-3,-0.5,-3.5,0
41590,2432303,1,2,-1,0.5,-0.5,0
41851,2514240,1,1,0,-0.5,-0.5,0
41852,2514242,2,1,1,0.5,1.5,1
41853,2437256,2,0,2,0.5,2.5,1
41855,2514244,0,0,0,0.5,0.5,1
41856,2411805,0,1,-1,-0.5,-1.5,0
41857,2415112,0,2,-2,0.5,-1.5,0
41858,2415109,1,0,1,-0.5,0.5,1
41859,2406862,0,2,-2,0.5,-1.5,0


In [206]:
test_dataset['rq'].drop_duplicates().values

array([-0.5,  0.5,  1.5, -1.5, -3.5])

## xgb训练

In [56]:
import xgboost as xgb

x_columns = [
#     'win_bet_return', 'draw_bet_return', 'lose_bet_return', 
#     'year', 'month',
#     'home_team_encoder', 'visit_team_encoder', 'game_encoder',
    
#     'h_avg_abs_gs',
#     'h_avg_abs_gd', 'v_avg_abs_gs', 'v_avg_abs_gd', 'h_avg_abs_win',
#     'h_avg_abs_draw', 'h_avg_abs_lose', 'v_avg_abs_win',
#     'v_avg_abs_draw', 'v_avg_abs_lose',
    
#     'h_0_1_gd', 
#     'h_0_1_gs', 
#     'h_0_gd', 
#     'h_0_gs', 
#     'h_1_gd', 
#     'h_1_gs',
#     'h_2_3_gd', 
#     'h_2_3_gs', 
#     'h_2_gd', 'h_2_gs', 'h_3_gd', 'h_3_gs',
#     'h_4_gd', 'h_4_gs', 'h_5_gd', 'h_5_gs', 'h_6_gd', 'h_6_gs',
#     'h_7_gd', 'h_7_gs', 'h_ab_4_gd', 'h_ab_4_gs', 'h_abs_avg_g',
#     'h_abs_avg_gd', 'h_abs_avg_gs', 'h_abs_draw', 'h_abs_g',
#     'h_abs_gd', 'h_abs_gs', 'h_abs_lose', 'h_abs_win', 'h_count',
#     'h_host_count', 'h_host_draw', 'h_host_g', 'h_host_gd',
#     'h_host_gs', 'h_host_lose', 'h_host_win', 
    
#     'v_0_1_gd',
#     'v_0_1_gs', 'v_0_gd', 'v_0_gs', 'v_1_gd', 'v_1_gs', 'v_2_3_gd',
#     'v_2_3_gs', 'v_2_gd', 'v_2_gs', 'v_3_gd', 'v_3_gs', 'v_4_gd',
#     'v_4_gs', 'v_5_gd', 'v_5_gs', 'v_6_gd', 'v_6_gs', 'v_7_gd',
#     'v_7_gs', 'v_ab_4_gd', 'v_ab_4_gs', 'v_abs_avg_g', 'v_abs_avg_gd',
#     'v_abs_avg_gs', 'v_abs_draw', 'v_abs_g', 'v_abs_gd', 'v_abs_gs',
#     'v_abs_lose', 'v_abs_win', 
    
#     'v_count', 
#     'v_visit_count',
#     'v_visit_draw', 'v_visit_g', 'v_visit_gd', 'v_visit_gs',
#     'v_visit_lose', 'v_visit_win',
    
#     'h_0_1_gd_rate', 'h_0_1_gs_rate',
#     'h_0_gd_rate', 'h_0_gs_rate', 'h_1_gd_rate', 'h_1_gs_rate',
#     'h_2_3_gd_rate', 'h_2_3_gs_rate', 'h_2_gd_rate', 'h_2_gs_rate',
#     'h_3_gd_rate', 'h_3_gs_rate', 'h_4_gd_rate', 'h_4_gs_rate',
#     'h_5_gd_rate', 'h_5_gs_rate', 'h_6_gd_rate', 'h_6_gs_rate',
#     'h_7_gd_rate', 'h_7_gs_rate', 'h_ab_4_gd_rate', 'h_ab_4_gs_rate',
#     'h_abs_draw_rate', 'h_abs_lose_rate', 'h_abs_win_rate',
#     'v_0_1_gd_rate', 'v_0_1_gs_rate', 'v_0_gd_rate', 'v_0_gs_rate',
#     'v_1_gd_rate', 'v_1_gs_rate', 'v_2_3_gd_rate', 'v_2_3_gs_rate',
#     'v_2_gd_rate', 'v_2_gs_rate', 'v_3_gd_rate', 'v_3_gs_rate',
#     'v_4_gd_rate', 'v_4_gs_rate', 'v_5_gd_rate', 'v_5_gs_rate',
#     'v_6_gd_rate', 'v_6_gs_rate', 'v_7_gd_rate', 'v_7_gs_rate',
    
#     'v_ab_4_gd_rate', 
#     'v_ab_4_gs_rate', 
#     'v_abs_draw_rate',
#     'v_abs_lose_rate', 
#     'v_abs_win_rate',
#     'h_host_draw_rate',
#     'h_host_g_rate', 
#     'h_host_gd_rate', 
#     'h_host_gs_rate',
#     'h_host_lose_rate', 
#     'h_host_win_rate', 
#     'v_visit_draw_rate',
#     'v_visit_g_rate', 
#     'v_visit_gd_rate', 
#     'v_visit_gs_rate',
#     'v_visit_lose_rate', 
#     'v_visit_win_rate',
    
    'avg_init_draw_odd',
    'avg_init_lose_odd', 'avg_init_win_odd', 'avg_new_draw_kelly',
    'avg_new_draw_odd', 'avg_new_draw_rate', 'avg_new_lose_kelly',
    'avg_new_lose_odd', 'avg_new_lose_rate', 'avg_new_win_kelly',
    'avg_new_win_odd', 'avg_new_win_rate', 'avg_pay_rate',
    'dispersion_draw', 'dispersion_lose', 'dispersion_win', 'id_y',
    'max_init_draw_odd', 'max_init_lose_odd', 'max_init_win_odd',
    'max_new_draw_kelly', 'max_new_draw_odd', 'max_new_draw_rate',
    'max_new_lose_kelly', 'max_new_lose_odd', 'max_new_lose_rate',
    'max_new_win_kelly', 'max_new_win_odd', 'max_new_win_rate',
    'max_pay_rate', 'min_init_draw_odd', 'min_init_lose_odd',
    'min_init_win_odd', 'min_new_draw_kelly', 'min_new_draw_odd',
    'min_new_draw_rate', 'min_new_lose_kelly', 'min_new_lose_odd',
    'min_new_lose_rate', 'min_new_win_kelly', 'min_new_win_odd',
    'min_new_win_rate', 'min_pay_rate', 'std_draw', 'std_lose',
    'std_win'
]
    
params={
    'booster':'gbtree',
    # 这里手写数字是0-9，是一个多类的问题，因此采用了multisoft多分类器，
    'objective': 'multi:softmax', 
#     'objective': 'multi:softprob',
    'num_class':3, # 类数，与 multisoftmax 并用
    
    'gamma':0.01,  # 在树的叶子节点下一个分区的最小损失，越大算法模型越保守 。[0:]
    
    'max_depth':8, # 构建树的深度 [1:]
    
    #'lambda':450,  # L2 正则项权重
    'subsample':0.7, # 采样训练数据，设置为0.5，随机选择一般的数据实例 (0:1]
    'colsample_bytree':0.7, # 构建树树时的采样比率 (0:1]
    #'min_child_weight':12, # 节点的最少特征数
    'silent':1 ,
    
#     这部分需要调整
#     'eta': 0.05, # 如同学习率
    'eta': 0.01, # 如同学习率
    
    
    'seed':2018,
    'nthread':4,# cpu 线程数,根据自己U的个数适当调整
}

# t = train_dataset_df[
#     (train_dataset_df['win_bet_return'] <= 2) |
#     (train_dataset_df['draw_bet_return'] <= 2) |
#     (train_dataset_df['lose_bet_return'] <= 2)
# ]

t = train_dataset_df

print(len(t))

train_dataset = t[t['year'] < 2019]
test_dataset = t[t['year'] == 2019]

valid_dataset = test_dataset[test_dataset['month'] < 3]
test_dataset = test_dataset[test_dataset['month'] >= 3]

xgtrain = xgb.DMatrix(train_dataset[x_columns], label=train_dataset['fix_result'])
xgtest = xgb.DMatrix(test_dataset[x_columns], label=test_dataset['fix_result'])
xgvalid = xgb.DMatrix(valid_dataset[x_columns], label=valid_dataset['fix_result'])

watchlist = [(xgtrain, 'train'),(xgvalid, 'val')]

num_rounds = 10000
stop_rounds = 100

# num_rounds = 10000
# stop_rounds = 300


model = xgb.train(params, xgtrain, num_rounds, watchlist,early_stopping_rounds=stop_rounds)
print(model.best_iteration)

30980
[0]	train-merror:0.46384	val-merror:0.536404
Multiple eval metrics have been passed: 'val-merror' will be used for early stopping.

Will train until val-merror hasn't improved in 100 rounds.
[1]	train-merror:0.45606	val-merror:0.536404
[2]	train-merror:0.453146	val-merror:0.534918
[3]	train-merror:0.453695	val-merror:0.533432
[4]	train-merror:0.453935	val-merror:0.523031
[5]	train-merror:0.455066	val-merror:0.526003
[6]	train-merror:0.454209	val-merror:0.534918
[7]	train-merror:0.452667	val-merror:0.531946
[8]	train-merror:0.450747	val-merror:0.523031
[9]	train-merror:0.45061	val-merror:0.524517
[10]	train-merror:0.450644	val-merror:0.524517
[11]	train-merror:0.450781	val-merror:0.527489
[12]	train-merror:0.450713	val-merror:0.530461
[13]	train-merror:0.451056	val-merror:0.530461
[14]	train-merror:0.450233	val-merror:0.527489
[15]	train-merror:0.449925	val-merror:0.524517
[16]	train-merror:0.45037	val-merror:0.526003
[17]	train-merror:0.449685	val-merror:0.527489
[18]	train-merro

In [50]:
# 求可能性的时候用：'objective': 'multi:softprob',

pred_probs = model.predict(xgtest, ntree_limit=model.best_iteration)
pred_probs

array([[0.3452244 , 0.3264327 , 0.3283429 ],
       [0.3335006 , 0.3313826 , 0.33511677],
       [0.32226524, 0.32644787, 0.35128686],
       ...,
       [0.32169273, 0.32734713, 0.3509601 ],
       [0.32342005, 0.32687503, 0.34970492],
       [0.34933007, 0.32826293, 0.32240704]], dtype=float32)

In [51]:
test_dataset['fix_result'].values

array([0, 0, 1, ..., 1, 1, 1])

In [52]:
def get_result(items):
    if items[0] <= items[1] and items[0] <= items[2]:
        return [1,2]
    elif items[1] <= items[0] and items[1] <= items[2]:
        return [0,2]
    elif items[2] <= items[0] and items[2] <= items[1]:
        return [0,1]
    
fix_results = test_dataset['fix_result'].values

results = []
for i in range(len(pred_probs)):
    items = pred_probs[i]
    probs = get_result(items)
    
    if fix_results[i] in probs:
        results.append(1)
    else:
        results.append(0)

In [53]:
np.array(results).sum() / len(results)

0.7400530503978779

In [57]:
preds = model.predict(xgtest, ntree_limit=model.best_iteration)
preds

accuracy_score(test_dataset['fix_result'], preds)

0.4880636604774536

## SVM多分类训练

In [58]:
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV


In [59]:
# Seperating Predictors and Outcome values from train and test sets
X_train = train_dataset[x_columns]
Y_train_label = train_dataset['fix_result'].values.astype(object)

X_test = test_dataset[x_columns]
Y_test_label = test_dataset['fix_result'].values.astype(object)

encoder = preprocessing.LabelEncoder()

# encoding train labels 
encoder.fit(Y_train_label)
Y_train = encoder.transform(Y_train_label)

# encoding test labels 
encoder.fit(Y_test_label)
Y_test = encoder.transform(Y_test_label)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [60]:
# params_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
#                      'C': [1, 10, 100, 1000]},
#                     {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

params_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [10, 100]}]

In [61]:
# svm_model = GridSearchCV(SVC(), params_grid, cv=5)
# svm_model.fit(X_train_scaled, Y_train)
final_model = SVC(C=1, kernel='rbf', degree=3, gamma='auto', verbose=True)
final_model.fit(X_train_scaled, Y_train)

[LibSVM]

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=True)

In [62]:
# final_model = svm_model.best_estimator_

print("Training set score for SVM: %f" % final_model.score(X_train_scaled , Y_train))
print("Testing  set score for SVM: %f" % final_model.score(X_test_scaled  , Y_test ))

Training set score for SVM: 0.509186
Testing  set score for SVM: 0.503095
