In [1]:
import pymysql.cursors
import pandas as pd
from sklearn import preprocessing
import numpy as np
import re
import datetime
from sklearn.metrics import accuracy_score


## 获取全量的竞彩比赛列表

In [2]:
connection = pymysql.connect(host='localhost', user='root', password='breadt@2019', db='breadt-football-ml', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    sql = 'select * from `breadt_football_game_list`;'
    cursor.execute(sql)
    rows = cursor.fetchall()
    
    train_game_list_df = pd.DataFrame(rows)
    
    connection.close()

train_game_list_df['source'] = 'jc'

## 获取全量的胜负彩比赛列表

In [3]:
connection = pymysql.connect(host='localhost', user='root', password='breadt@2019', db='breadt-football-ml', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    sql = 'select * from `breadt_lottery_info`;'
    cursor.execute(sql)
    rows = cursor.fetchall()
    
    train_lottery_game_list_df = pd.DataFrame(rows)
    
    connection.close()

train_lottery_game_list_df['source'] = 'lottery'

## 合并竞彩比赛列表和胜负彩比赛列表

In [4]:
tmp = train_lottery_game_list_df.drop(['issue'], axis=1)
df = pd.concat([train_game_list_df, tmp])
df = df[['matchid', 'game', 'home_team', 'visit_team', 'gs', 'gd', 'gn', 'time', 'result', 'win_bet_return', 'draw_bet_return', 'lose_bet_return', 'source']]
df = df.drop_duplicates(subset=['matchid'])

## **设定训练范围** 并处理数据

In [5]:
# match_group = ['澳超', '英超', '德甲', '德乙', '法甲', '西甲', '意甲', '日职', '英甲', '英冠', '苏超', '法乙', '葡超', '荷甲', '荷乙', '韩K联', '瑞典超', '挪超', '美职', '日乙', '俄超', '比甲', '瑞典甲', '法丙', '挪甲', '英乙', '苏冠', '巴甲', '智利甲', '墨超', '智利乙', '阿甲', '欧冠', '欧罗巴']
match_group = ['澳超', '英超', '德甲', '德乙', '法甲', '西甲', '意甲', '日职', '英甲', '英冠', '苏超', '法乙', '葡超', '荷甲', '荷乙', '韩K联', '瑞典超', '挪超', '美职', '日乙', '俄超', '比甲', '瑞典甲', '法丙', '挪甲', '英乙', '苏冠', '巴甲', '智利甲', '墨超', '智利乙', '阿甲']
match_df = df[(df['game'].isin(match_group))]
match_df = match_df.dropna()

In [6]:
# 对team做encode 这个encoder后面预测的时候还会用到
teams = list(set(df['home_team'].values) | set(df['visit_team'].values))
team_encoder = preprocessing.LabelEncoder()
team_encoder.fit(teams)

LabelEncoder()

In [7]:
def encode_team(df):
    df['home_team_encoder'] = team_encoder.transform(df['home_team'])
    df['visit_team_encoder'] = team_encoder.transform(df['visit_team'])
    return df

In [8]:
# 比赛名称encode
games = list(set(match_df['game'].values))
game_encoder = preprocessing.LabelEncoder()
game_encoder.fit(games)

LabelEncoder()

In [9]:
def encode_game(df):
    df['game_encoder'] = game_encoder.transform(df['game'])
    return df

In [10]:
match_df['year'] = match_df.apply(lambda row: row.time.year, axis=1)
match_df['month'] = match_df.apply(lambda row: row.time.month, axis=1)
match_df['day'] = match_df.apply(lambda row: row.time.day, axis=1)
match_df['fix_result'] = match_df.apply(lambda row: int(row.result) if row.result < 3 else 2, axis=1)

In [11]:
match_df = encode_team(match_df)
match_df = encode_game(match_df)

In [12]:
match_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43477 entries, 1 to 12650
Data columns (total 20 columns):
matchid               43477 non-null int64
game                  43477 non-null object
home_team             43477 non-null object
visit_team            43477 non-null object
gs                    43477 non-null int64
gd                    43477 non-null int64
gn                    43477 non-null int64
time                  43477 non-null datetime64[ns]
result                43477 non-null int64
win_bet_return        43477 non-null float64
draw_bet_return       43477 non-null float64
lose_bet_return       43477 non-null float64
source                43477 non-null object
year                  43477 non-null int64
month                 43477 non-null int64
day                   43477 non-null int64
fix_result            43477 non-null int64
home_team_encoder     43477 non-null int64
visit_team_encoder    43477 non-null int64
game_encoder          43477 non-null int64
dtypes: date

## 获取特征数据

In [13]:
connection = pymysql.connect(host='localhost', user='root', password='breadt@2019', db='breadt-football-ml', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    sql = 'select * from `breadt_football_recent_feature_info`;'
    cursor.execute(sql)
    rows = cursor.fetchall()
    
    train_feature_df = pd.DataFrame(rows)
    
    connection.close()

In [14]:
def take_goal_info(prefix, df):
    target_cols = [
        '_0_1_gd', 
        '_0_1_gs', 
        '_0_gd', 
        '_0_gs', 
        '_1_gd', 
        '_1_gs',
        '_2_3_gd', 
        '_2_3_gs', 
        '_2_gd', 
        '_2_gs', 
        '_3_gd', 
        '_3_gs',
        '_4_gd', 
        '_4_gs', 
        '_5_gd', 
        '_5_gs', 
        '_6_gd', 
        '_6_gs',
        '_7_gd', 
        '_7_gs', 
        '_ab_4_gd', 
        '_ab_4_gs',
        '_abs_draw', 
        '_abs_lose', 
        '_abs_win']

    for k in target_cols:
        df[prefix + k + '_rate'] = df[prefix + k] / df[prefix + '_count']
    
    return df

In [15]:
def take_goal_pref_info(prefix, df):
    target_cols = [
        '_draw', '_g', '_gd',
        '_gs', '_lose', '_win',
    ]

    for k in target_cols:
        df[prefix + k + '_rate'] = df[prefix + k] / df[prefix + '_count']
    
    return df

In [16]:
train_feature_df = take_goal_info('h', train_feature_df)
train_feature_df = take_goal_info('v', train_feature_df)

train_feature_df = take_goal_pref_info('h_host', train_feature_df)
train_feature_df = take_goal_pref_info('v_visit', train_feature_df)

In [17]:
# 所有球队的主客场进球平均数

train_feature_df['h_avg_abs_gs'] = train_feature_df['h_abs_gs'].sum() / train_feature_df['h_count'].sum()
train_feature_df['h_avg_abs_gd'] = train_feature_df['h_abs_gd'].sum() / train_feature_df['h_count'].sum()
train_feature_df['v_avg_abs_gs'] = train_feature_df['v_abs_gs'].sum() / train_feature_df['v_count'].sum()
train_feature_df['v_avg_abs_gd'] = train_feature_df['v_abs_gd'].sum() / train_feature_df['v_count'].sum()

In [18]:
train_feature_df['h_avg_abs_win'] = train_feature_df['h_abs_win'].sum() / train_feature_df['h_count'].sum()
train_feature_df['h_avg_abs_draw'] = train_feature_df['h_abs_draw'].sum() / train_feature_df['h_count'].sum()
train_feature_df['h_avg_abs_lose'] = train_feature_df['h_abs_lose'].sum() / train_feature_df['h_count'].sum()

train_feature_df['v_avg_abs_win'] = train_feature_df['v_abs_win'].sum() / train_feature_df['v_count'].sum()
train_feature_df['v_avg_abs_draw'] = train_feature_df['v_abs_draw'].sum() / train_feature_df['v_count'].sum()
train_feature_df['v_avg_abs_lose'] = train_feature_df['v_abs_lose'].sum() / train_feature_df['v_count'].sum()

## 获取赔率信息

In [19]:
connection = pymysql.connect(host='localhost', user='root', password='breadt@2019', db='breadt-football-ml', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    sql = 'select * from `breadt_match_odd_info`;'
    cursor.execute(sql)
    rows = cursor.fetchall()
    
    train_match_odd_df = pd.DataFrame(rows)
    
    connection.close()

In [20]:
train_match_odd_df.head()

Unnamed: 0,avg_init_draw_odd,avg_init_lose_odd,avg_init_win_odd,avg_new_draw_kelly,avg_new_draw_odd,avg_new_draw_rate,avg_new_lose_kelly,avg_new_lose_odd,avg_new_lose_rate,avg_new_win_kelly,...,min_new_lose_kelly,min_new_lose_odd,min_new_lose_rate,min_new_win_kelly,min_new_win_odd,min_new_win_rate,min_pay_rate,std_draw,std_lose,std_win
0,13.46,30.07,1.03,0.97,13.33,7.28,1.02,31.71,3.21,0.92,...,0.18,5.5,1.17,0.9,1.0,66.71,0.83,1007.93,11029.6,0.04
1,3.38,4.29,1.75,0.91,3.43,26.55,0.91,4.46,20.5,0.91,...,0.76,3.7,16.79,0.79,1.5,48.58,0.83,1.83,12.46,0.35
2,6.72,14.49,1.14,0.92,6.97,13.17,0.95,15.31,6.19,0.91,...,0.48,7.8,3.38,0.85,1.05,69.75,0.83,37.8,1120.81,0.05
3,8.02,17.87,1.09,0.92,7.89,11.68,0.94,17.85,5.29,0.91,...,0.56,10.5,2.86,0.85,1.02,75.49,0.83,93.17,1417.25,0.04
4,3.98,6.79,1.42,0.91,4.06,22.43,0.92,7.17,12.88,0.91,...,0.69,5.35,9.54,0.84,1.3,59.57,0.83,4.23,90.73,0.14


## 合并训练数据

In [21]:
train_dataset_df = pd.merge(match_df, train_feature_df, on='matchid', how='left')
train_dataset_df = pd.merge(train_dataset_df, train_match_odd_df, on='matchid', how='left')
train_dataset_df = train_dataset_df.dropna()
train_dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31493 entries, 0 to 43278
Columns: 217 entries, matchid to std_win
dtypes: datetime64[ns](1), float64(200), int64(12), object(4)
memory usage: 52.4+ MB


In [22]:
train_dataset_df.columns.values

array(['matchid', 'game', 'home_team', 'visit_team', 'gs', 'gd', 'gn',
       'time', 'result', 'win_bet_return', 'draw_bet_return',
       'lose_bet_return', 'source', 'year', 'month', 'day', 'fix_result',
       'home_team_encoder', 'visit_team_encoder', 'game_encoder',
       'h_0_1_gd', 'h_0_1_gs', 'h_0_gd', 'h_0_gs', 'h_1_gd', 'h_1_gs',
       'h_2_3_gd', 'h_2_3_gs', 'h_2_gd', 'h_2_gs', 'h_3_gd', 'h_3_gs',
       'h_4_gd', 'h_4_gs', 'h_5_gd', 'h_5_gs', 'h_6_gd', 'h_6_gs',
       'h_7_gd', 'h_7_gs', 'h_ab_4_gd', 'h_ab_4_gs', 'h_abs_avg_g',
       'h_abs_avg_gd', 'h_abs_avg_gs', 'h_abs_draw', 'h_abs_g',
       'h_abs_gd', 'h_abs_gs', 'h_abs_lose', 'h_abs_win', 'h_count',
       'h_host_count', 'h_host_draw', 'h_host_g', 'h_host_gd',
       'h_host_gs', 'h_host_lose', 'h_host_win', 'id_x', 'v_0_1_gd',
       'v_0_1_gs', 'v_0_gd', 'v_0_gs', 'v_1_gd', 'v_1_gs', 'v_2_3_gd',
       'v_2_3_gs', 'v_2_gd', 'v_2_gs', 'v_3_gd', 'v_3_gs', 'v_4_gd',
       'v_4_gs', 'v_5_gd', 'v_5_gs', 'v_6_gd'

## xgb regressor 尝试预测净胜球

In [23]:
train_dataset_df['fix_result'] = train_dataset_df['gs'] - train_dataset_df['gd']

In [None]:
import xgboost as xgb

x_columns = [
#     'win_bet_return', 'draw_bet_return', 'lose_bet_return', 
    'year', 'month',
#     'home_team_encoder', 'visit_team_encoder', 'game_encoder',
    
    'h_avg_abs_gs',
    'h_avg_abs_gd', 
    'v_avg_abs_gs', 
    'v_avg_abs_gd', 
    'h_avg_abs_win',
    'h_avg_abs_draw', 'h_avg_abs_lose', 'v_avg_abs_win',
    'v_avg_abs_draw', 'v_avg_abs_lose',
    
    'h_0_1_gd', 
    'h_0_1_gs', 
    'h_0_gd', 
    'h_0_gs', 
    'h_1_gd', 
    'h_1_gs',
    'h_2_3_gd', 
    'h_2_3_gs', 
    'h_2_gd', 'h_2_gs', 'h_3_gd', 'h_3_gs',
    'h_4_gd', 'h_4_gs', 'h_5_gd', 'h_5_gs', 'h_6_gd', 'h_6_gs',
    'h_7_gd', 'h_7_gs', 'h_ab_4_gd', 'h_ab_4_gs', 'h_abs_avg_g',
    'h_abs_avg_gd', 'h_abs_avg_gs', 'h_abs_draw', 'h_abs_g',
    'h_abs_gd', 'h_abs_gs', 'h_abs_lose', 'h_abs_win', 'h_count',
    'h_host_count', 
    'h_host_draw', 
    'h_host_g', 'h_host_gd',
    'h_host_gs', 
    'h_host_lose', 'h_host_win', 
    
    'v_0_1_gd',
    'v_0_1_gs', 'v_0_gd', 'v_0_gs', 'v_1_gd', 'v_1_gs', 'v_2_3_gd',
    'v_2_3_gs', 'v_2_gd', 'v_2_gs', 'v_3_gd', 'v_3_gs', 'v_4_gd',
    'v_4_gs', 'v_5_gd', 'v_5_gs', 'v_6_gd', 'v_6_gs', 'v_7_gd',
    'v_7_gs', 'v_ab_4_gd', 'v_ab_4_gs', 'v_abs_avg_g', 'v_abs_avg_gd',
    'v_abs_avg_gs', 'v_abs_draw', 'v_abs_g', 'v_abs_gd', 'v_abs_gs',
    'v_abs_lose', 'v_abs_win', 
    
    'v_count', 
    'v_visit_count',
    'v_visit_draw', 
    'v_visit_g', 'v_visit_gd', 'v_visit_gs',
    'v_visit_lose', 'v_visit_win',
    
    'h_0_1_gd_rate', 'h_0_1_gs_rate',
    'h_0_gd_rate', 'h_0_gs_rate', 'h_1_gd_rate', 'h_1_gs_rate',
    'h_2_3_gd_rate', 'h_2_3_gs_rate', 'h_2_gd_rate', 'h_2_gs_rate',
    'h_3_gd_rate', 'h_3_gs_rate', 'h_4_gd_rate', 'h_4_gs_rate',
    'h_5_gd_rate', 'h_5_gs_rate', 'h_6_gd_rate', 'h_6_gs_rate',
    'h_7_gd_rate', 'h_7_gs_rate', 'h_ab_4_gd_rate', 'h_ab_4_gs_rate',
    'h_abs_draw_rate', 'h_abs_lose_rate', 'h_abs_win_rate',
    'v_0_1_gd_rate', 'v_0_1_gs_rate', 'v_0_gd_rate', 'v_0_gs_rate',
    'v_1_gd_rate', 'v_1_gs_rate', 'v_2_3_gd_rate', 'v_2_3_gs_rate',
    'v_2_gd_rate', 'v_2_gs_rate', 'v_3_gd_rate', 'v_3_gs_rate',
    'v_4_gd_rate', 'v_4_gs_rate', 'v_5_gd_rate', 'v_5_gs_rate',
    'v_6_gd_rate', 'v_6_gs_rate', 'v_7_gd_rate', 'v_7_gs_rate',
    
    'v_ab_4_gd_rate', 
    'v_ab_4_gs_rate', 
    'v_abs_draw_rate',
    'v_abs_lose_rate', 
    'v_abs_win_rate',
    'h_host_draw_rate',
    'h_host_g_rate', 
    'h_host_gd_rate', 
    'h_host_gs_rate',
    'h_host_lose_rate', 
    'h_host_win_rate', 
    'v_visit_draw_rate',
    'v_visit_g_rate', 
    'v_visit_gd_rate', 
    'v_visit_gs_rate',
    'v_visit_lose_rate', 
    'v_visit_win_rate',
    
    'avg_init_draw_odd',
    'avg_init_lose_odd', 'avg_init_win_odd', 'avg_new_draw_kelly',
    'avg_new_draw_odd', 'avg_new_draw_rate', 'avg_new_lose_kelly',
    'avg_new_lose_odd', 'avg_new_lose_rate', 'avg_new_win_kelly',
    'avg_new_win_odd', 'avg_new_win_rate', 'avg_pay_rate',
    'dispersion_draw', 'dispersion_lose', 'dispersion_win', 
    'max_init_draw_odd', 'max_init_lose_odd', 'max_init_win_odd',
    'max_new_draw_kelly', 'max_new_draw_odd', 'max_new_draw_rate',
    'max_new_lose_kelly', 'max_new_lose_odd', 'max_new_lose_rate',
    'max_new_win_kelly', 'max_new_win_odd', 'max_new_win_rate',
    'max_pay_rate', 'min_init_draw_odd', 'min_init_lose_odd',
    'min_init_win_odd', 'min_new_draw_kelly', 'min_new_draw_odd',
    'min_new_draw_rate', 'min_new_lose_kelly', 'min_new_lose_odd',
    'min_new_lose_rate', 'min_new_win_kelly', 'min_new_win_odd',
    'min_new_win_rate', 'min_pay_rate', 'std_draw', 'std_lose',
    'std_win'
]
    
params={
    'booster':'gbtree',
    # 这里手写数字是0-9，是一个多类的问题，因此采用了multisoft多分类器，
    'objective': 'reg:linear', 
#     'objective': 'multi:softprob',
#     'num_class':3, # 类数，与 multisoftmax 并用
    
    'gamma':0.01,  # 在树的叶子节点下一个分区的最小损失，越大算法模型越保守 。[0:]
    'max_depth':8, # 构建树的深度 [1:]
    
    #'lambda':450,  # L2 正则项权重
    'subsample':0.7, # 采样训练数据，设置为0.5，随机选择一般的数据实例 (0:1]
    'colsample_bytree':0.7, # 构建树树时的采样比率 (0:1]
    #'min_child_weight':12, # 节点的最少特征数
    'silent':1 ,
    
#     这部分需要调整
#     'eta': 0.05, # 如同学习率
    'eta': 0.01, # 如同学习率
    
    
    'seed':2018,
    'nthread':4,# cpu 线程数,根据自己U的个数适当调整
}

t = train_dataset_df

train_dataset = t[t['year'] < 2019]
test_dataset = t[t['year'] == 2019]

valid_dataset = test_dataset[test_dataset['month'] < 3]
test_dataset = test_dataset[test_dataset['month'] >= 3]

xgtrain = xgb.DMatrix(train_dataset[x_columns], label=train_dataset['fix_result'])
xgtest = xgb.DMatrix(test_dataset[x_columns], label=test_dataset['fix_result'])
xgvalid = xgb.DMatrix(valid_dataset[x_columns], label=valid_dataset['fix_result'])

watchlist = [(xgtrain, 'train'),(xgvalid, 'val')]

num_rounds = 10000
stop_rounds = 100

# num_rounds = 10000
# stop_rounds = 300

model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=stop_rounds)
print(model.best_iteration)

In [25]:
preds = model.predict(xgtest, ntree_limit=model.best_iteration)
preds

array([-0.19064993,  0.08287174,  0.9153619 , ...,  0.9269872 ,
        0.8028513 , -0.55736434], dtype=float32)

In [43]:
import math

test_dataset['preds'] = preds
# test_dataset['preds'] = test_dataset.apply(lambda row: math.floor(row.preds), axis=1)

## 通过对让球数据分析，求最终的胜率

### 获取让球的赔率信息

In [47]:
connection = pymysql.connect(host='localhost', user='root', password='breadt@2019', db='breadt-football-ml', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    sql = 'select * from `breadt_football_offset_info`;'
    cursor.execute(sql)
    rows = cursor.fetchall()
    
    train_game_offset_df = pd.DataFrame(rows)
    
    connection.close()

In [48]:
train_game_offset_df['new_offset'].drop_duplicates().values

array(['两半/三', '半/一', '两球', '一球', '受平/半', '平/半', '两/两半', '半球', '球半/两',
       '球半', '受半球', '受半/一', '受一/球半', '受一球', '一/球半', '平手', '受球半', '受球半/两',
       '受两球', '三球', '两半', '受两/两半', '受三球', '三/三半', '受三/三半', '六球', '四球',
       '四半', '受两半', '八半', '受两半/三', '三半/四', '三半', '受六球', '受四半/五', '受五半/六',
       '五球', '受三半', '四/四半', '五半', '受四/四半', '四半/五', '受六半', '六半/七', '七半',
       '受四半', '受五/五半', '受六/六半', '受四球', '六半', '受三半/四', '七/七半', '受五半'],
      dtype=object)

In [49]:
def take_value_from_work(row):
    map = {
        '两半/三': -2.75, 
        '半/一': -0.75, 
        '两球': -2, 
        '一球': -1, 
        '受平/半': 0.25, 
        '平/半': -0.25, 
        '两/两半': -2.25, 
        '半球': -0.5,
        '球半/两': -1.75,
        '球半': -1.5,
        '受半球': 0.5, 
        '受半/一': 0.75, 
        '受一/球半': 1.25, 
        '受一球': 1, 
        '一/球半': -1.25, 
        '平手': 0, 
        '受球半': 1.5, 
        '受球半/两': 1.75,
        '受两球': 2, 
        '三球': -3, 
        '两半': -2.5, 
        '受两/两半': 2.25, 
        '受三球': 3, 
        '三/三半': -3.25, 
        '受三/三半': 3.25, 
        '六球': -6, 
        '四球': -4,
        '四半': -4.5, 
        '受两半': 2.5, 
        '八半': -8.5, 
        '受两半/三': 2.75, 
        '三半/四': -3.75, 
        '三半': -3.5, 
        '受六球': 6, 
        '受四半/五': 4.75, 
        '受五半/六': 5.75,
        '五球': -5, 
        '受三半': 3.5, 
        '四/四半': -4.25, 
        '五半': -5.5, 
        '受四/四半': 4.25, 
        '四半/五': -4.75, 
        '受六半': 6.5, 
        '六半/七': -6.75, 
        '七半': -7.5,
        '受四半': 4.5, 
        '受五/五半': 5.25, 
        '受六/六半': 6.25, 
        '受四球': 4, 
        '六半': -6.5,
        '受三半/四': 3.75, 
        '七/七半': -7.25,
        '受五半': 5.5
    }
    
    v = map[row.new_offset]
    
    # 原来只需要返回v这里，我特别做处理
#     if v is not None:
#         num = v/0.25
#         if num % 4 > 0:
#             if num > 0:
#                 if num % 4 == 3:
#                     return (num + 1)*0.25
#                 elif num % 4 == 2
#                 return (num - 1)*0.25
#             else:
#                 return (num + 1)*0.25
    
    return v

train_game_offset_df['new_offset_val'] = train_game_offset_df.apply(lambda row: take_value_from_work(row), axis=1)

In [59]:
test_df = pd.merge(test_dataset, train_game_offset_df, on='matchid', how='left')
test_df = test_df.dropna()
test_df['offset_result'] = test_df.apply(lambda row: 1 if (row.fix_result + row.new_offset_val) > 0 else 0, axis=1)
test_dataset['rq'] = -test_dataset['preds']
test_df['gap'] = test_df['new_offset_val'] - test_df['rq']

In [174]:
def get_score(row):
    if row.preds > 0:
        return -(math.ceil(row.preds)) + 0.25
#         return -(math.floor(row.preds)) + 0.5
    else:
        return -(math.floor(row.preds)) + 0.25

test_df['pred_offset_val'] = test_df.apply(lambda row: get_score(row), axis=1)
test_df['pred_offset_result'] = test_df.apply(lambda row: 1 if (row.fix_result + row.pred_offset_val) > 0 else 0, axis=1)

In [194]:
b = test_df[['matchid', 'gs', 'gd', 'rq', 'new_offset_val', 
         'gap', 
         'pred_offset_val', 'offset_result', 'pred_offset_result']]

**猜想：让球过多，取盘口反向**，阈值：-0.38 ~ -0.5

In [195]:
b.sort_values(by=['gap'],ascending=True).head(30)

Unnamed: 0,matchid,gs,gd,rq,new_offset_val,gap,pred_offset_val,offset_result,pred_offset_result
94,2405428,4,2,-1.781775,-2.5,-0.718225,-1.75,0,1
1061,2413996,3,1,-3.037195,-3.75,-0.712805,-3.75,0,0
836,2406473,0,0,-1.340897,-2.0,-0.659103,-1.75,0,0
506,2514446,2,0,-1.002987,-1.5,-0.497013,-1.75,1,1
996,2428971,1,4,0.983111,0.5,-0.483111,1.25,0,0
1070,2429039,5,1,-2.285769,-2.75,-0.464231,-2.75,1,1
902,2415173,1,2,-1.039512,-1.5,-0.460488,-1.75,0,0
971,2437470,5,0,-1.044129,-1.5,-0.455871,-1.75,1,1
399,2436078,2,0,-1.819185,-2.25,-0.430815,-1.75,0,1
678,2508637,2,1,-0.578756,-1.0,-0.421244,-0.75,0,1


In [205]:
t = b[b['gap'] < -0.4]
(len(t) - t['offset_result'].sum())/len(t), len(t)

(0.7692307692307693, 13)

**猜想：受让球过多，取盘口相同**，阈值：0.45 ~ 0.5

In [196]:
b.sort_values(by=['gap'],ascending=False).head(30)

Unnamed: 0,matchid,gs,gd,rq,new_offset_val,gap,pred_offset_val,offset_result,pred_offset_result
59,2405431,0,2,1.337834,2.25,0.912166,2.25,1,1
989,2413991,1,1,1.848837,2.75,0.901163,2.25,1,1
227,2406712,1,0,-0.495294,0.25,0.745294,-0.75,1,1
202,2428903,1,2,1.114645,1.75,0.635355,2.25,1,1
592,2428953,1,4,0.910363,1.5,0.589637,1.25,0,0
293,2437360,3,1,-0.833288,-0.25,0.583288,-0.75,1,1
686,2514512,1,0,-1.061809,-0.5,0.561809,-1.75,1,0
319,2436063,1,4,0.69893,1.25,0.55107,1.25,0,0
816,2415165,3,0,0.707533,1.25,0.542467,1.25,1,1
408,2508530,2,1,-0.776911,-0.25,0.526911,-0.75,1,1


In [206]:
t = b[b['gap'] > 0.5]
t['offset_result'].sum()/len(t), len(t)

(0.7272727272727273, 11)

## 通过对总进球数据分析，求大小盘

In [209]:
train_dataset_df['fix_result'] = train_dataset_df['gn']

### xgboost

In [None]:
import xgboost as xgb

x_columns = [
#     'win_bet_return', 'draw_bet_return', 'lose_bet_return', 
    'year', 'month',
#     'home_team_encoder', 'visit_team_encoder', 'game_encoder',
    
    'h_avg_abs_gs',
    'h_avg_abs_gd', 
    'v_avg_abs_gs', 
    'v_avg_abs_gd', 
    'h_avg_abs_win',
    'h_avg_abs_draw', 'h_avg_abs_lose', 'v_avg_abs_win',
    'v_avg_abs_draw', 'v_avg_abs_lose',
    
    'h_0_1_gd', 
    'h_0_1_gs', 
    'h_0_gd', 
    'h_0_gs', 
    'h_1_gd', 
    'h_1_gs',
    'h_2_3_gd', 
    'h_2_3_gs', 
    'h_2_gd', 'h_2_gs', 'h_3_gd', 'h_3_gs',
    'h_4_gd', 'h_4_gs', 'h_5_gd', 'h_5_gs', 'h_6_gd', 'h_6_gs',
    'h_7_gd', 'h_7_gs', 'h_ab_4_gd', 'h_ab_4_gs', 'h_abs_avg_g',
    'h_abs_avg_gd', 'h_abs_avg_gs', 'h_abs_draw', 'h_abs_g',
    'h_abs_gd', 'h_abs_gs', 'h_abs_lose', 'h_abs_win', 'h_count',
    'h_host_count', 
    'h_host_draw', 
    'h_host_g', 'h_host_gd',
    'h_host_gs', 
    'h_host_lose', 'h_host_win', 
    
    'v_0_1_gd',
    'v_0_1_gs', 'v_0_gd', 'v_0_gs', 'v_1_gd', 'v_1_gs', 'v_2_3_gd',
    'v_2_3_gs', 'v_2_gd', 'v_2_gs', 'v_3_gd', 'v_3_gs', 'v_4_gd',
    'v_4_gs', 'v_5_gd', 'v_5_gs', 'v_6_gd', 'v_6_gs', 'v_7_gd',
    'v_7_gs', 'v_ab_4_gd', 'v_ab_4_gs', 'v_abs_avg_g', 'v_abs_avg_gd',
    'v_abs_avg_gs', 'v_abs_draw', 'v_abs_g', 'v_abs_gd', 'v_abs_gs',
    'v_abs_lose', 'v_abs_win', 
    
    'v_count', 
    'v_visit_count',
    'v_visit_draw', 
    'v_visit_g', 'v_visit_gd', 'v_visit_gs',
    'v_visit_lose', 'v_visit_win',
    
    'h_0_1_gd_rate', 'h_0_1_gs_rate',
    'h_0_gd_rate', 'h_0_gs_rate', 'h_1_gd_rate', 'h_1_gs_rate',
    'h_2_3_gd_rate', 'h_2_3_gs_rate', 'h_2_gd_rate', 'h_2_gs_rate',
    'h_3_gd_rate', 'h_3_gs_rate', 'h_4_gd_rate', 'h_4_gs_rate',
    'h_5_gd_rate', 'h_5_gs_rate', 'h_6_gd_rate', 'h_6_gs_rate',
    'h_7_gd_rate', 'h_7_gs_rate', 'h_ab_4_gd_rate', 'h_ab_4_gs_rate',
    'h_abs_draw_rate', 'h_abs_lose_rate', 'h_abs_win_rate',
    'v_0_1_gd_rate', 'v_0_1_gs_rate', 'v_0_gd_rate', 'v_0_gs_rate',
    'v_1_gd_rate', 'v_1_gs_rate', 'v_2_3_gd_rate', 'v_2_3_gs_rate',
    'v_2_gd_rate', 'v_2_gs_rate', 'v_3_gd_rate', 'v_3_gs_rate',
    'v_4_gd_rate', 'v_4_gs_rate', 'v_5_gd_rate', 'v_5_gs_rate',
    'v_6_gd_rate', 'v_6_gs_rate', 'v_7_gd_rate', 'v_7_gs_rate',
    
    'v_ab_4_gd_rate', 
    'v_ab_4_gs_rate', 
    'v_abs_draw_rate',
    'v_abs_lose_rate', 
    'v_abs_win_rate',
    'h_host_draw_rate',
    'h_host_g_rate', 
    'h_host_gd_rate', 
    'h_host_gs_rate',
    'h_host_lose_rate', 
    'h_host_win_rate', 
    'v_visit_draw_rate',
    'v_visit_g_rate', 
    'v_visit_gd_rate', 
    'v_visit_gs_rate',
    'v_visit_lose_rate', 
    'v_visit_win_rate',
    
    'avg_init_draw_odd',
    'avg_init_lose_odd', 'avg_init_win_odd', 'avg_new_draw_kelly',
    'avg_new_draw_odd', 'avg_new_draw_rate', 'avg_new_lose_kelly',
    'avg_new_lose_odd', 'avg_new_lose_rate', 'avg_new_win_kelly',
    'avg_new_win_odd', 'avg_new_win_rate', 'avg_pay_rate',
    'dispersion_draw', 'dispersion_lose', 'dispersion_win', 
    'max_init_draw_odd', 'max_init_lose_odd', 'max_init_win_odd',
    'max_new_draw_kelly', 'max_new_draw_odd', 'max_new_draw_rate',
    'max_new_lose_kelly', 'max_new_lose_odd', 'max_new_lose_rate',
    'max_new_win_kelly', 'max_new_win_odd', 'max_new_win_rate',
    'max_pay_rate', 'min_init_draw_odd', 'min_init_lose_odd',
    'min_init_win_odd', 'min_new_draw_kelly', 'min_new_draw_odd',
    'min_new_draw_rate', 'min_new_lose_kelly', 'min_new_lose_odd',
    'min_new_lose_rate', 'min_new_win_kelly', 'min_new_win_odd',
    'min_new_win_rate', 'min_pay_rate', 'std_draw', 'std_lose',
    'std_win'
]
    
params={
    'booster':'gbtree',
    # 这里手写数字是0-9，是一个多类的问题，因此采用了multisoft多分类器，
    'objective': 'reg:linear', 
#     'objective': 'multi:softprob',
#     'num_class':3, # 类数，与 multisoftmax 并用
    
    'gamma':0.01,  # 在树的叶子节点下一个分区的最小损失，越大算法模型越保守 。[0:]
    'max_depth':8, # 构建树的深度 [1:]
    
    #'lambda':450,  # L2 正则项权重
    'subsample':0.7, # 采样训练数据，设置为0.5，随机选择一般的数据实例 (0:1]
    'colsample_bytree':0.7, # 构建树树时的采样比率 (0:1]
    #'min_child_weight':12, # 节点的最少特征数
    'silent':1 ,
    
#     这部分需要调整
#     'eta': 0.05, # 如同学习率
    'eta': 0.01, # 如同学习率
    
    
    'seed':2018,
    'nthread':4,# cpu 线程数,根据自己U的个数适当调整
}

t = train_dataset_df

train_dataset = t[t['year'] < 2019]
test_dataset = t[t['year'] == 2019]

valid_dataset = test_dataset[test_dataset['month'] < 3]
test_dataset = test_dataset[test_dataset['month'] >= 3]

xgtrain = xgb.DMatrix(train_dataset[x_columns], label=train_dataset['fix_result'])
xgtest = xgb.DMatrix(test_dataset[x_columns], label=test_dataset['fix_result'])
xgvalid = xgb.DMatrix(valid_dataset[x_columns], label=valid_dataset['fix_result'])

watchlist = [(xgtrain, 'train'),(xgvalid, 'val')]

num_rounds = 10000
stop_rounds = 100

# num_rounds = 10000
# stop_rounds = 300

model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=stop_rounds)
print(model.best_iteration)

In [304]:
preds = model.predict(xgtest, ntree_limit=model.best_iteration)
preds

array([2.6338747, 2.4961886, 2.716594 , ..., 3.5467365, 3.0428164,
       2.411234 ], dtype=float32)

### svm

In [289]:
from sklearn.svm import SVC

svm_model = SVC(C=1, kernel='rbf', degree=3, gamma='auto', verbose=True)
svm_model.fit(train_dataset[x_columns], train_dataset['fix_result'])

[LibSVM]

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=True)

In [290]:
preds = svm_model.predict(test_dataset[x_columns])
preds

array([2, 2, 2, ..., 2, 2, 2])

### 策略

In [305]:
import math

test_dataset['preds'] = preds
# test_dataset['preds'] = test_dataset.apply(lambda row: math.floor(row.preds), axis=1)

大球

In [328]:
t = test_dataset[['preds', 'gn']]
t = t[t['preds']>= 3]
len(t[t['gn']>=2])/len(t), len(t[t['gn']>2.5])/len(t), len(t[t['gn']>=3])/len(t)

(0.8361204013377926, 0.5986622073578596, 0.5986622073578596)

In [307]:
t = test_dataset[['preds', 'gn']]
t = t[t['preds']> 2.5]
len(t[t['gn']>=2])/len(t), len(t[t['gn']>2.5])/len(t), len(t[t['gn']>=3])/len(t)

(0.8082010582010583, 0.5542328042328042, 0.5542328042328042)

小球

In [325]:
t = test_dataset[['preds', 'gn']]
t = t[t['preds']<= 2]
len(t[t['gn']<=2])/len(t), len(t[t['gn']<2.5])/len(t), len(t[t['gn']<=3])/len(t)

(0.6341463414634146, 0.6341463414634146, 0.8048780487804879)

In [326]:
t = test_dataset[['preds', 'gn']]
t = t[t['preds']< 2.5]
len(t[t['gn']<=2])/len(t), len(t[t['gn']<2.5])/len(t), len(t[t['gn']<=3])/len(t)

(0.5946666666666667, 0.5946666666666667, 0.792)

In [329]:
t.head(20)

Unnamed: 0,preds,gn
41866,3.016717,2
41869,3.098668,2
41879,3.145733,3
41881,3.257589,2
41886,3.075311,1
41892,3.054484,1
41895,3.108136,2
41900,3.19391,5
41901,3.59171,1
41902,3.18025,5
