# 结论

+ 该方案可以直接适用于新的数据源，可以在更大的数据结果里进行尝试
+ 77%的成果率，数据非常稳定

In [1]:
import pymysql.cursors
import pandas as pd
from sklearn import preprocessing
import numpy as np
import re
import datetime
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV

# 训练

## 获取全量的竞彩比赛列表

In [2]:
connection = pymysql.connect(host='localhost', user='root', password='breadt@2019', db='breadt-football-ml', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    sql = 'select * from `breadt_football_game_list`;'
    cursor.execute(sql)
    rows = cursor.fetchall()
    
    train_game_list_df = pd.DataFrame(rows)
    
    connection.close()

train_game_list_df['source'] = 'jc'

## 获取全量的胜负彩比赛列表

In [3]:
connection = pymysql.connect(host='localhost', user='root', password='breadt@2019', db='breadt-football-ml', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    sql = 'select * from `breadt_lottery_info`;'
    cursor.execute(sql)
    rows = cursor.fetchall()
    
    train_lottery_game_list_df = pd.DataFrame(rows)
    
    connection.close()

train_lottery_game_list_df['source'] = 'lottery'

## 合并竞彩比赛列表和胜负彩比赛列表

In [18]:
tmp = train_lottery_game_list_df.drop(['issue'], axis=1)
df = pd.concat([train_game_list_df, tmp])
df = df[['matchid', 'game', 'home_team', 'visit_team', 'gs', 'gd', 'gn', 'time', 'result', 'win_bet_return', 'draw_bet_return', 'lose_bet_return', 'source']]
df = df.drop_duplicates(subset=['matchid'])

In [19]:
# 对team做encode 这个encoder后面预测的时候还会用到
teams = list(set(df['home_team'].values) | set(df['visit_team'].values))
team_encoder = preprocessing.LabelEncoder()
team_encoder.fit(teams)

LabelEncoder()

In [20]:
def encode_team(df):
    df['home_team_encoder'] = team_encoder.transform(df['home_team'])
    df['visit_team_encoder'] = team_encoder.transform(df['visit_team'])
    return df

In [23]:
# 比赛名称encode
games = list(set(df['game'].values))
game_encoder = preprocessing.LabelEncoder()
game_encoder.fit(games)

LabelEncoder()

In [24]:
def encode_game(df):
    df['game_encoder'] = game_encoder.transform(df['game'])
    return df

In [26]:
df['year'] = df.apply(lambda row: row.time.year, axis=1)
df['month'] = df.apply(lambda row: row.time.month, axis=1)
df['day'] = df.apply(lambda row: row.time.day, axis=1)
df['fix_result'] = df.apply(lambda row: int(row.result) if row.result < 3 else 2, axis=1)

In [27]:
df = encode_team(df)
df = encode_game(df)

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57053 entries, 0 to 12650
Data columns (total 20 columns):
matchid               57053 non-null int64
game                  57053 non-null object
home_team             57053 non-null object
visit_team            57053 non-null object
gs                    57053 non-null int64
gd                    57053 non-null int64
gn                    57053 non-null int64
time                  57053 non-null datetime64[ns]
result                57053 non-null int64
win_bet_return        57053 non-null float64
draw_bet_return       57053 non-null float64
lose_bet_return       57053 non-null float64
source                57053 non-null object
year                  57053 non-null int64
month                 57053 non-null int64
day                   57053 non-null int64
fix_result            57053 non-null int64
home_team_encoder     57053 non-null int64
visit_team_encoder    57053 non-null int64
game_encoder          57053 non-null int64
dtypes: date

## 获取赔率信息

In [29]:
connection = pymysql.connect(host='localhost', user='root', password='breadt@2019', db='breadt-football-ml', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    sql = 'select * from `breadt_match_odd_info`;'
    cursor.execute(sql)
    rows = cursor.fetchall()
    
    train_match_odd_df = pd.DataFrame(rows)
    
    connection.close()

In [30]:
train_match_odd_df['avg_win_odd_change'] = (train_match_odd_df['avg_new_win_odd'] - train_match_odd_df['avg_init_win_odd'])/train_match_odd_df['avg_init_win_odd']
train_match_odd_df['avg_draw_odd_change'] = (train_match_odd_df['avg_new_draw_odd'] - train_match_odd_df['avg_init_draw_odd'])/train_match_odd_df['avg_init_draw_odd']
train_match_odd_df['avg_lose_odd_change'] = (train_match_odd_df['avg_new_lose_odd'] - train_match_odd_df['avg_init_lose_odd'])/train_match_odd_df['avg_init_lose_odd']

## 合并训练数据

In [31]:
train_dataset_df = pd.merge(df, train_match_odd_df, on='matchid', how='left')
train_dataset_df = train_dataset_df.dropna()
train_dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42622 entries, 0 to 56503
Data columns (total 69 columns):
matchid                42622 non-null int64
game                   42622 non-null object
home_team              42622 non-null object
visit_team             42622 non-null object
gs                     42622 non-null int64
gd                     42622 non-null int64
gn                     42622 non-null int64
time                   42622 non-null datetime64[ns]
result                 42622 non-null int64
win_bet_return         42622 non-null float64
draw_bet_return        42622 non-null float64
lose_bet_return        42622 non-null float64
source                 42622 non-null object
year                   42622 non-null int64
month                  42622 non-null int64
day                    42622 non-null int64
fix_result             42622 non-null int64
home_team_encoder      42622 non-null int64
visit_team_encoder     42622 non-null int64
game_encoder           42622 non-nul

In [32]:
train_dataset_df.columns.values

array(['matchid', 'game', 'home_team', 'visit_team', 'gs', 'gd', 'gn',
       'time', 'result', 'win_bet_return', 'draw_bet_return',
       'lose_bet_return', 'source', 'year', 'month', 'day', 'fix_result',
       'home_team_encoder', 'visit_team_encoder', 'game_encoder',
       'avg_init_draw_odd', 'avg_init_lose_odd', 'avg_init_win_odd',
       'avg_new_draw_kelly', 'avg_new_draw_odd', 'avg_new_draw_rate',
       'avg_new_lose_kelly', 'avg_new_lose_odd', 'avg_new_lose_rate',
       'avg_new_win_kelly', 'avg_new_win_odd', 'avg_new_win_rate',
       'avg_pay_rate', 'dispersion_draw', 'dispersion_lose',
       'dispersion_win', 'id', 'max_init_draw_odd', 'max_init_lose_odd',
       'max_init_win_odd', 'max_new_draw_kelly', 'max_new_draw_odd',
       'max_new_draw_rate', 'max_new_lose_kelly', 'max_new_lose_odd',
       'max_new_lose_rate', 'max_new_win_kelly', 'max_new_win_odd',
       'max_new_win_rate', 'max_pay_rate', 'min_init_draw_odd',
       'min_init_lose_odd', 'min_init_win_odd'

# xgb训练

In [57]:
x_columns = [
#     'year', 'month',
#     'home_team_encoder', 'visit_team_encoder', 
#     'game_encoder',
    'avg_init_draw_odd',
    'avg_init_lose_odd', 'avg_init_win_odd', 'avg_new_draw_kelly',
    'avg_new_draw_odd', 'avg_new_draw_rate', 'avg_new_lose_kelly',
    'avg_new_lose_odd', 'avg_new_lose_rate', 'avg_new_win_kelly',
    'avg_new_win_odd', 'avg_new_win_rate', 'avg_pay_rate',
    'dispersion_draw', 'dispersion_lose', 'dispersion_win', 
    'max_init_draw_odd', 'max_init_lose_odd', 'max_init_win_odd',
    'max_new_draw_kelly', 'max_new_draw_odd', 'max_new_draw_rate',
    'max_new_lose_kelly', 'max_new_lose_odd', 'max_new_lose_rate',
    'max_new_win_kelly', 'max_new_win_odd', 'max_new_win_rate',
    'max_pay_rate', 'min_init_draw_odd', 'min_init_lose_odd',
    'min_init_win_odd', 'min_new_draw_kelly', 'min_new_draw_odd',
    'min_new_draw_rate', 'min_new_lose_kelly', 'min_new_lose_odd',
    'min_new_lose_rate', 'min_new_win_kelly', 'min_new_win_odd',
    'min_new_win_rate', 'min_pay_rate', 'std_draw', 'std_lose',
    'std_win', 'avg_win_odd_change', 'avg_draw_odd_change',
    'avg_lose_odd_change'
]

In [65]:
params={
    'booster':'gbtree',
    # 这里手写数字是0-9，是一个多类的问题，因此采用了multisoft多分类器，
    'objective': 'multi:softmax', 
#     'objective': 'multi:softprob',
    'num_class':3, # 类数，与 multisoftmax 并用
    'gamma':0.01,  # 在树的叶子节点下一个分区的最小损失，越大算法模型越保守 。[0:]
    'max_depth':10, # 构建树的深度 [1:]
    #'lambda':450,  # L2 正则项权重
    'subsample':0.7, # 采样训练数据，设置为0.5，随机选择一般的数据实例 (0:1]
    'colsample_bytree':0.7, # 构建树树时的采样比率 (0:1]
    #'min_child_weight':12, # 节点的最少特征数
    'silent':1 ,
    'eta': 0.01, # 如同学习率
    'seed':2018,
    'nthread':4,# cpu 线程数,根据自己U的个数适当调整
}

t = train_dataset_df

scaler = StandardScaler()
scaler.fit(t[x_columns])

train_dataset = t[t['year'] < 2019]
test_dataset = t[t['year'] == 2019]

valid_dataset = test_dataset[test_dataset['month'] < 3]
test_dataset = test_dataset[test_dataset['month'] >= 3]

xgtrain = xgb.DMatrix(scaler.transform(train_dataset[x_columns]), label=train_dataset['fix_result'])
xgtest = xgb.DMatrix(scaler.transform(test_dataset[x_columns]), label=test_dataset['fix_result'])
xgvalid = xgb.DMatrix(scaler.transform(valid_dataset[x_columns]), label=valid_dataset['fix_result'])

watchlist = [(xgtrain, 'train'),(xgvalid, 'val')]

num_rounds = 10000
stop_rounds = 300

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


## 单场正确性预测

In [66]:
params['objective'] = 'multi:softmax'
model = xgb.train(params, xgtrain, num_rounds, watchlist,early_stopping_rounds=stop_rounds)
print(model.best_iteration)

[0]	train-merror:0.440715	val-merror:0.495475
Multiple eval metrics have been passed: 'val-merror' will be used for early stopping.

Will train until val-merror hasn't improved in 300 rounds.
[1]	train-merror:0.426444	val-merror:0.480769
[2]	train-merror:0.420162	val-merror:0.49095
[3]	train-merror:0.41477	val-merror:0.479638
[4]	train-merror:0.415958	val-merror:0.478507
[5]	train-merror:0.414301	val-merror:0.486425
[6]	train-merror:0.412643	val-merror:0.488688
[7]	train-merror:0.414301	val-merror:0.49095
[8]	train-merror:0.414523	val-merror:0.488688
[9]	train-merror:0.415191	val-merror:0.492081
[10]	train-merror:0.415315	val-merror:0.489819
[11]	train-merror:0.413905	val-merror:0.493213
[12]	train-merror:0.414152	val-merror:0.493213
[13]	train-merror:0.415488	val-merror:0.493213
[14]	train-merror:0.416329	val-merror:0.492081
[15]	train-merror:0.41623	val-merror:0.488688
[16]	train-merror:0.416724	val-merror:0.494344
[17]	train-merror:0.415587	val-merror:0.493213
[18]	train-merror:0.41

In [67]:
preds = model.predict(xgtest, ntree_limit=model.best_iteration)
preds

accuracy_score(test_dataset['fix_result'], preds)

0.49540581929555894

## 验证选2场的结果

In [68]:
params['objective'] = 'multi:softprob'
model = xgb.train(params, xgtrain, num_rounds, watchlist,early_stopping_rounds=stop_rounds)
print(model.best_iteration)

[0]	train-merror:0.440715	val-merror:0.495475
Multiple eval metrics have been passed: 'val-merror' will be used for early stopping.

Will train until val-merror hasn't improved in 300 rounds.
[1]	train-merror:0.426444	val-merror:0.480769
[2]	train-merror:0.420162	val-merror:0.49095
[3]	train-merror:0.41477	val-merror:0.479638
[4]	train-merror:0.415958	val-merror:0.478507
[5]	train-merror:0.414301	val-merror:0.486425
[6]	train-merror:0.412643	val-merror:0.488688
[7]	train-merror:0.414301	val-merror:0.49095
[8]	train-merror:0.414523	val-merror:0.488688
[9]	train-merror:0.415191	val-merror:0.492081
[10]	train-merror:0.415315	val-merror:0.489819
[11]	train-merror:0.413905	val-merror:0.493213
[12]	train-merror:0.414152	val-merror:0.493213
[13]	train-merror:0.415488	val-merror:0.493213
[14]	train-merror:0.416329	val-merror:0.492081
[15]	train-merror:0.41623	val-merror:0.488688
[16]	train-merror:0.416724	val-merror:0.494344
[17]	train-merror:0.415587	val-merror:0.493213
[18]	train-merror:0.41

In [69]:
pred_probs = model.predict(xgtest, ntree_limit=model.best_iteration)
pred_probs

array([[0.33142418, 0.32970038, 0.33887544],
       [0.3333948 , 0.3333471 , 0.33325803],
       [0.32676286, 0.32840765, 0.3448295 ],
       ...,
       [0.3280665 , 0.33121812, 0.34071535],
       [0.32626048, 0.32933497, 0.3444045 ],
       [0.3401803 , 0.33019677, 0.32962295]], dtype=float32)

In [70]:
def get_result(items):
    if items[0] <= items[1] and items[0] <= items[2]:
        return [1,2]
    elif items[1] <= items[0] and items[1] <= items[2]:
        return [0,2]
    elif items[2] <= items[0] and items[2] <= items[1]:
        return [0,1]
    
def cal_win_rate(fix_results, pred_probs):
    results = []
    for i in range(len(pred_probs)):
        items = pred_probs[i]
        probs = get_result(items)

        if fix_results[i] in probs:
            results.append(1)
        else:
            results.append(0)
    return results

In [71]:
fix_results = test_dataset['fix_result'].values
results = cal_win_rate(fix_results, pred_probs)
np.array(results).sum() / len(results)

0.7687595712098009

# SVM训练

In [43]:
# Seperating Predictors and Outcome values from train and test sets
X_train = train_dataset[x_columns]
Y_train_label = train_dataset['fix_result'].values.astype(object)

X_test = test_dataset[x_columns]
Y_test_label = test_dataset['fix_result'].values.astype(object)

encoder = preprocessing.LabelEncoder()

# encoding train labels 
encoder.fit(Y_train_label)
Y_train = encoder.transform(Y_train_label)

# encoding test labels 
encoder.fit(Y_test_label)
Y_test = encoder.transform(Y_test_label)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [44]:
final_model = SVC(C=1000, kernel='rbf', degree=3, gamma=0.001, probability=True)
final_model.fit(X_train_scaled, Y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [45]:
# final_model = svm_model.best_estimator_

print("Training set score for SVM: %f" % final_model.score(X_train_scaled , Y_train))
print("Testing  set score for SVM: %f" % final_model.score(X_test_scaled  , Y_test ))

Training set score for SVM: 0.518797
Testing  set score for SVM: 0.497703


In [55]:
pred_probs = final_model.predict_proba(X_test_scaled)
pred_probs

array([[0.47405197, 0.23189443, 0.2940536 ],
       [0.32617446, 0.23190144, 0.4419241 ],
       [0.17796286, 0.28299914, 0.539038  ],
       ...,
       [0.18358106, 0.27806675, 0.53835219],
       [0.17718353, 0.28303152, 0.53978494],
       [0.56536336, 0.27501483, 0.15962181]])

In [56]:
fix_results = test_dataset['fix_result'].values
results = cal_win_rate(fix_results, pred_probs)
np.array(results).sum() / len(results)

0.7679938744257274

## GridSearch寻找最优解

In [60]:
params_grid = {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}
svm_model = GridSearchCV(SVC(), params_grid, cv=3)
svm_model.fit(X_train_scaled, Y_train)
final_model = svm_model.best_estimator_

In [62]:
final_model

SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [61]:
print("Training set score for SVM: %f" % final_model.score(X_train_scaled , Y_train))
print("Testing  set score for SVM: %f" % final_model.score(X_test_scaled  , Y_test ))

Training set score for SVM: 0.519094
Testing  set score for SVM: 0.500766
