# 结论

+ 该方案可以直接适用于新的数据源，可以在更大的数据结果里进行尝试
+ 77%的成果率，数据非常稳定

In [1]:
import pymysql.cursors
import pandas as pd
from sklearn import preprocessing
import numpy as np
import re
import datetime
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV

# 训练

## 获取全量的竞彩比赛列表

In [2]:
connection = pymysql.connect(host='localhost', user='root', password='breadt@2019', db='breadt-football-ml', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    sql = 'select * from `breadt_football_game_list`;'
    cursor.execute(sql)
    rows = cursor.fetchall()
    
    train_game_list_df = pd.DataFrame(rows)
    
    connection.close()

train_game_list_df['source'] = 'jc'

## 获取全量的胜负彩比赛列表

In [3]:
connection = pymysql.connect(host='localhost', user='root', password='breadt@2019', db='breadt-football-ml', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    sql = 'select * from `breadt_lottery_info`;'
    cursor.execute(sql)
    rows = cursor.fetchall()
    
    train_lottery_game_list_df = pd.DataFrame(rows)
    
    connection.close()

train_lottery_game_list_df['source'] = 'lottery'

## 合并竞彩比赛列表和胜负彩比赛列表

In [4]:
tmp = train_lottery_game_list_df.drop(['issue'], axis=1)
df = pd.concat([train_game_list_df, tmp])
df = df[['matchid', 'game', 'home_team', 'visit_team', 'gs', 'gd', 'gn', 'time', 'result', 'win_bet_return', 'draw_bet_return', 'lose_bet_return', 'source']]
df = df.drop_duplicates(subset=['matchid'])

In [5]:
# 对team做encode 这个encoder后面预测的时候还会用到
teams = list(set(df['home_team'].values) | set(df['visit_team'].values))
team_encoder = preprocessing.LabelEncoder()
team_encoder.fit(teams)

LabelEncoder()

In [6]:
def encode_team(df):
    df['home_team_encoder'] = team_encoder.transform(df['home_team'])
    df['visit_team_encoder'] = team_encoder.transform(df['visit_team'])
    return df

In [7]:
# 比赛名称encode
games = list(set(df['game'].values))
game_encoder = preprocessing.LabelEncoder()
game_encoder.fit(games)

LabelEncoder()

In [8]:
def encode_game(df):
    df['game_encoder'] = game_encoder.transform(df['game'])
    return df

In [9]:
df['year'] = df.apply(lambda row: row.time.year, axis=1)
df['month'] = df.apply(lambda row: row.time.month, axis=1)
df['day'] = df.apply(lambda row: row.time.day, axis=1)
df['fix_result'] = df.apply(lambda row: int(row.result) if row.result < 3 else 2, axis=1)

In [10]:
df = encode_team(df)
df = encode_game(df)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57053 entries, 0 to 12650
Data columns (total 20 columns):
matchid               57053 non-null int64
game                  57053 non-null object
home_team             57053 non-null object
visit_team            57053 non-null object
gs                    57053 non-null int64
gd                    57053 non-null int64
gn                    57053 non-null int64
time                  57053 non-null datetime64[ns]
result                57053 non-null int64
win_bet_return        57053 non-null float64
draw_bet_return       57053 non-null float64
lose_bet_return       57053 non-null float64
source                57053 non-null object
year                  57053 non-null int64
month                 57053 non-null int64
day                   57053 non-null int64
fix_result            57053 non-null int64
home_team_encoder     57053 non-null int64
visit_team_encoder    57053 non-null int64
game_encoder          57053 non-null int64
dtypes: date

## 获取赔率信息

In [12]:
connection = pymysql.connect(host='localhost', user='root', password='breadt@2019', db='breadt-football-ml', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    sql = 'select * from `breadt_match_odd_info`;'
    cursor.execute(sql)
    rows = cursor.fetchall()
    
    train_match_odd_df = pd.DataFrame(rows)
    
    connection.close()

In [13]:
train_match_odd_df['avg_win_odd_change'] = (train_match_odd_df['avg_new_win_odd'] - train_match_odd_df['avg_init_win_odd'])/train_match_odd_df['avg_init_win_odd']
train_match_odd_df['avg_draw_odd_change'] = (train_match_odd_df['avg_new_draw_odd'] - train_match_odd_df['avg_init_draw_odd'])/train_match_odd_df['avg_init_draw_odd']
train_match_odd_df['avg_lose_odd_change'] = (train_match_odd_df['avg_new_lose_odd'] - train_match_odd_df['avg_init_lose_odd'])/train_match_odd_df['avg_init_lose_odd']

## 合并训练数据

In [14]:
train_dataset_df = pd.merge(df, train_match_odd_df, on='matchid', how='left')
train_dataset_df = train_dataset_df.dropna()
train_dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42622 entries, 0 to 56503
Data columns (total 69 columns):
matchid                42622 non-null int64
game                   42622 non-null object
home_team              42622 non-null object
visit_team             42622 non-null object
gs                     42622 non-null int64
gd                     42622 non-null int64
gn                     42622 non-null int64
time                   42622 non-null datetime64[ns]
result                 42622 non-null int64
win_bet_return         42622 non-null float64
draw_bet_return        42622 non-null float64
lose_bet_return        42622 non-null float64
source                 42622 non-null object
year                   42622 non-null int64
month                  42622 non-null int64
day                    42622 non-null int64
fix_result             42622 non-null int64
home_team_encoder      42622 non-null int64
visit_team_encoder     42622 non-null int64
game_encoder           42622 non-nul

In [15]:
train_dataset_df.columns.values

array(['matchid', 'game', 'home_team', 'visit_team', 'gs', 'gd', 'gn',
       'time', 'result', 'win_bet_return', 'draw_bet_return',
       'lose_bet_return', 'source', 'year', 'month', 'day', 'fix_result',
       'home_team_encoder', 'visit_team_encoder', 'game_encoder',
       'avg_init_draw_odd', 'avg_init_lose_odd', 'avg_init_win_odd',
       'avg_new_draw_kelly', 'avg_new_draw_odd', 'avg_new_draw_rate',
       'avg_new_lose_kelly', 'avg_new_lose_odd', 'avg_new_lose_rate',
       'avg_new_win_kelly', 'avg_new_win_odd', 'avg_new_win_rate',
       'avg_pay_rate', 'dispersion_draw', 'dispersion_lose',
       'dispersion_win', 'id', 'max_init_draw_odd', 'max_init_lose_odd',
       'max_init_win_odd', 'max_new_draw_kelly', 'max_new_draw_odd',
       'max_new_draw_rate', 'max_new_lose_kelly', 'max_new_lose_odd',
       'max_new_lose_rate', 'max_new_win_kelly', 'max_new_win_odd',
       'max_new_win_rate', 'max_pay_rate', 'min_init_draw_odd',
       'min_init_lose_odd', 'min_init_win_odd'

# xgb训练

In [16]:
x_columns = [
    'year', 'month',
    'home_team_encoder', 'visit_team_encoder', 
    'game_encoder',
    'avg_init_draw_odd','avg_init_lose_odd', 'avg_init_win_odd', 
    
    'avg_new_draw_kelly',
    'avg_new_draw_odd', 
    'avg_new_draw_rate', 
    'avg_new_lose_kelly',
    'avg_new_lose_odd', 
    'avg_new_lose_rate', 'avg_new_win_kelly',
    'avg_new_win_odd', 
    'avg_new_win_rate', 'avg_pay_rate',
    'dispersion_draw', 'dispersion_lose', 'dispersion_win', 
    'max_init_draw_odd', 'max_init_lose_odd', 'max_init_win_odd',
    'max_new_draw_kelly', 'max_new_draw_odd', 'max_new_draw_rate',
    'max_new_lose_kelly', 'max_new_lose_odd', 'max_new_lose_rate',
    'max_new_win_kelly', 'max_new_win_odd', 'max_new_win_rate',
    'max_pay_rate', 'min_init_draw_odd', 'min_init_lose_odd',
    'min_init_win_odd', 'min_new_draw_kelly', 'min_new_draw_odd',
    'min_new_draw_rate', 'min_new_lose_kelly', 'min_new_lose_odd',
    'min_new_lose_rate', 'min_new_win_kelly', 'min_new_win_odd',
    'min_new_win_rate', 'min_pay_rate', 'std_draw', 'std_lose',
    'std_win', 'avg_win_odd_change', 'avg_draw_odd_change',
    'avg_lose_odd_change'
]

In [25]:
params={
    'booster':'gbtree',
    # 这里手写数字是0-9，是一个多类的问题，因此采用了multisoft多分类器，
    'objective': 'multi:softmax', 
#     'objective': 'multi:softprob',
    'num_class':3, # 类数，与 multisoftmax 并用
    'gamma':0.01,  # 在树的叶子节点下一个分区的最小损失，越大算法模型越保守 。[0:]
    'max_depth':8, # 构建树的深度 [1:]
    #'lambda':450,  # L2 正则项权重
    'subsample':0.7, # 采样训练数据，设置为0.5，随机选择一般的数据实例 (0:1]
    'colsample_bytree':0.7, # 构建树树时的采样比率 (0:1]
    #'min_child_weight':12, # 节点的最少特征数
    'silent':1 ,
    'eta': 0.01, # 如同学习率
    'seed':2018,
    'nthread':4,# cpu 线程数,根据自己U的个数适当调整
}

t = train_dataset_df

scaler = StandardScaler()
scaler.fit(t[x_columns])

train_dataset = t[t['year'] < 2019]
test_dataset = t[t['year'] == 2019]

valid_dataset = test_dataset[test_dataset['month'] < 3]
test_dataset = test_dataset[test_dataset['month'] >= 3]

xgtrain = xgb.DMatrix(scaler.transform(train_dataset[x_columns]), label=train_dataset['fix_result'])
xgtest = xgb.DMatrix(scaler.transform(test_dataset[x_columns]), label=test_dataset['fix_result'])
xgvalid = xgb.DMatrix(scaler.transform(valid_dataset[x_columns]), label=valid_dataset['fix_result'])

watchlist = [(xgtrain, 'train'),(xgvalid, 'val')]

num_rounds = 10000
stop_rounds = 300

  return self.partial_fit(X, y)
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


## 单场正确性预测

In [30]:
params['objective'] = 'multi:softmax'
model = xgb.train(params, xgtrain, num_rounds, watchlist,early_stopping_rounds=stop_rounds)
print(model.best_iteration)

[0]	train-merror:0.46154	val-merror:0.503394
Multiple eval metrics have been passed: 'val-merror' will be used for early stopping.

Will train until val-merror hasn't improved in 300 rounds.
[1]	train-merror:0.452661	val-merror:0.495475
[2]	train-merror:0.45182	val-merror:0.496606
[3]	train-merror:0.452216	val-merror:0.49095
[4]	train-merror:0.451326	val-merror:0.487557
[5]	train-merror:0.452216	val-merror:0.492081
[6]	train-merror:0.453502	val-merror:0.492081
[7]	train-merror:0.452043	val-merror:0.487557
[8]	train-merror:0.451721	val-merror:0.495475
[9]	train-merror:0.453106	val-merror:0.492081
[10]	train-merror:0.451796	val-merror:0.492081
[11]	train-merror:0.451153	val-merror:0.49095
[12]	train-merror:0.451375	val-merror:0.49095
[13]	train-merror:0.451326	val-merror:0.485294
[14]	train-merror:0.451672	val-merror:0.483032
[15]	train-merror:0.452364	val-merror:0.486425
[16]	train-merror:0.451771	val-merror:0.485294
[17]	train-merror:0.451573	val-merror:0.484163
[18]	train-merror:0.452

In [31]:
xgtest = xgb.DMatrix(scaler.transform(test_dataset[x_columns]), label=test_dataset['fix_result'])
preds_new_test = model.predict(xgtest, ntree_limit=model.best_iteration)
preds_new_test

  """Entry point for launching an IPython kernel.
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


array([0., 2., 2., ..., 2., 2., 0.], dtype=float32)

In [32]:
accuracy_score(test_dataset['fix_result'], preds_new_test)

0.4908116385911179

## 验证选2场的结果

In [26]:
params['objective'] = 'multi:softprob'
model = xgb.train(params, xgtrain, num_rounds, watchlist,early_stopping_rounds=stop_rounds)
print(model.best_iteration)

[0]	train-merror:0.46154	val-merror:0.503394
Multiple eval metrics have been passed: 'val-merror' will be used for early stopping.

Will train until val-merror hasn't improved in 300 rounds.
[1]	train-merror:0.452661	val-merror:0.495475
[2]	train-merror:0.45182	val-merror:0.496606
[3]	train-merror:0.452216	val-merror:0.49095
[4]	train-merror:0.451326	val-merror:0.487557
[5]	train-merror:0.452216	val-merror:0.492081
[6]	train-merror:0.453502	val-merror:0.492081
[7]	train-merror:0.452043	val-merror:0.487557
[8]	train-merror:0.451721	val-merror:0.495475
[9]	train-merror:0.453106	val-merror:0.492081
[10]	train-merror:0.451796	val-merror:0.492081
[11]	train-merror:0.451153	val-merror:0.49095
[12]	train-merror:0.451375	val-merror:0.49095
[13]	train-merror:0.451326	val-merror:0.485294
[14]	train-merror:0.451672	val-merror:0.483032
[15]	train-merror:0.452364	val-merror:0.486425
[16]	train-merror:0.451771	val-merror:0.485294
[17]	train-merror:0.451573	val-merror:0.484163
[18]	train-merror:0.452

In [27]:
pred_probs = model.predict(xgtest, ntree_limit=model.best_iteration)
pred_probs

array([[0.33936977, 0.32758775, 0.33304247],
       [0.33418375, 0.33119747, 0.3346188 ],
       [0.3138591 , 0.32415015, 0.36199075],
       ...,
       [0.31571126, 0.31986183, 0.36442688],
       [0.31429586, 0.3236578 , 0.36204633],
       [0.3512218 , 0.3286724 , 0.3201058 ]], dtype=float32)

In [28]:
def get_result(items):
    if items[0] <= items[1] and items[0] <= items[2]:
        return [1,2]
    elif items[1] <= items[0] and items[1] <= items[2]:
        return [0,2]
    elif items[2] <= items[0] and items[2] <= items[1]:
        return [0,1]
    
def cal_win_rate(fix_results, pred_probs):
    results = []
    for i in range(len(pred_probs)):
        items = pred_probs[i]
        probs = get_result(items)

        if fix_results[i] in probs:
            results.append(1)
        else:
            results.append(0)
    return results

In [29]:
fix_results = test_dataset['fix_result'].values
results = cal_win_rate(fix_results, pred_probs)
np.array(results).sum() / len(results)

0.7611026033690659