In [1]:
import pymysql.cursors
import pandas as pd
from sklearn import preprocessing
import numpy as np
import re
import datetime

# 训练

## 获取全量的比赛平均赔率信息

In [2]:
connection = pymysql.connect(host='localhost', user='root', password='breadt@2019', db='breadt-football-ai', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    sql = "select * from `breadt_match_odd_info` where odd_type = 'avg';"
    cursor.execute(sql)
    rows = cursor.fetchall()
    
    train_odd_info_df = pd.DataFrame(rows)
    
    connection.close()

In [3]:
train_odd_info_df = train_odd_info_df.drop(['id', 'odd_type'], axis=1)

In [4]:
train_odd_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55484 entries, 0 to 55483
Data columns (total 14 columns):
init_draw_odd     55484 non-null float64
init_lose_odd     55484 non-null float64
init_win_odd      55484 non-null float64
matchid           55484 non-null int64
new_draw_kelly    55484 non-null float64
new_draw_odd      55484 non-null float64
new_draw_rate     55484 non-null float64
new_lose_kelly    55484 non-null float64
new_lose_odd      55484 non-null float64
new_lose_rate     55484 non-null float64
new_win_kelly     55484 non-null float64
new_win_odd       55484 non-null float64
new_win_rate      55484 non-null float64
pay_rate          55484 non-null float64
dtypes: float64(13), int64(1)
memory usage: 5.9 MB


## 获取全量的比赛结果信息

In [5]:
connection = pymysql.connect(host='localhost', user='root', password='breadt@2019', db='breadt-football-ai', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    sql = 'select * from `breadt_match_result_list`;'
    cursor.execute(sql)
    rows = cursor.fetchall()
    
    train_game_list_df = pd.DataFrame(rows)
    
    connection.close()

In [6]:
train_game_list_df['season_year'] = train_game_list_df.apply(lambda row: int(row.season.split('-')[0]), axis=1)
train_game_list_df = train_game_list_df.drop(['id', 'gd', 'gs', 'gn'], axis=1)

In [7]:
# 对team做encode 这个encoder后面预测的时候还会用到
teams = list(set(train_game_list_df['home_team'].values) | set(train_game_list_df['visit_team'].values))
team_encoder = preprocessing.LabelEncoder()
team_encoder.fit(teams)

def encode_team(df):
    df['home_team_encoder'] = team_encoder.transform(df['home_team'])
    df['visit_team_encoder'] = team_encoder.transform(df['visit_team'])
    return df

In [8]:
train_game_list_df['year'] = train_game_list_df.apply(lambda row: row.time.year, axis=1)
train_game_list_df['month'] = train_game_list_df.apply(lambda row: row.time.month, axis=1)
train_game_list_df['day'] = train_game_list_df.apply(lambda row: row.time.day, axis=1)

train_game_list_df = encode_team(train_game_list_df)

In [9]:
train_game_list_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70570 entries, 0 to 70569
Data columns (total 14 columns):
home_team             70570 non-null object
league                70570 non-null object
matchid               70570 non-null int64
result                70570 non-null int64
season                70570 non-null object
time                  70570 non-null object
turn                  70570 non-null int64
visit_team            70570 non-null object
season_year           70570 non-null int64
year                  70570 non-null int64
month                 70570 non-null int64
day                   70570 non-null int64
home_team_encoder     70570 non-null int64
visit_team_encoder    70570 non-null int64
dtypes: int64(9), object(5)
memory usage: 7.5+ MB


## 获取比赛进球数信息

In [10]:
connection = pymysql.connect(host='localhost', user='root', password='breadt@2019', db='breadt-football-ai', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    sql = 'select * from `breadt_match_goal_info`;'
    cursor.execute(sql)
    rows = cursor.fetchall()
    
    train_match_goal_df = pd.DataFrame(rows)
    
    connection.close()

In [11]:
def offset_result(row):
    if row.home_or_visit == 'visit':
        if row.result == 0:
            return 2
        elif row.result == 2:
            return 0
        else:
            return row.result
    else:
        return row.result

def get_season_goal(row, df):
    return df.xs((row.season, row.team_name))[:row.turn-1].sum()

def get_home_or_visit_df(df, tag):
    return df[df['home_or_visit']==tag]

def get_home_or_visit_goal(row, df):
    return get_season_goal(row, get_home_or_visit_df(df, row.home_or_visit))

def get_home_or_visit_avg(row, df, col):
    df = get_home_or_visit_df(df, row.home_or_visit)
    values = np.array(df.xs((row.season, row.team_name))[:row.turn-1].values)
    count = (values >= 0).sum()
    
    if count == 0:
        return 0
    else:
        return row[col] / count

def get_count(row, df, result):
    values = np.array(df.xs((row.season, row.team_name))[:row.turn-1].values)
    return (values == result).sum()

def get_home_or_visit_count(row, df, result):
    df = get_home_or_visit_df(df, row.home_or_visit)
    return get_count(row, df, result)

def get_pivot_table(df, values, fill_value):
    return df.pivot_table(index=["season","team_name"], columns='turn',values=values,fill_value=fill_value)

In [12]:
# 从结果的视角修改result
train_match_goal_df['result'] = train_match_goal_df.apply(lambda row: offset_result(row), axis=1)
train_match_goal_df['league'] = train_match_goal_df['league'].astype(np.int16)

In [13]:
stat_df = train_match_goal_df[['home_or_visit', 'league', 'team_name', 'turn', 'season', 'matchid']].drop_duplicates()
gd_df = train_match_goal_df[(train_match_goal_df['goal_type']=='gd')]
gs_df = train_match_goal_df[(train_match_goal_df['goal_type']=='gs')]

In [15]:
stat_df['season_gd'] = stat_df.apply(lambda row: get_season_goal(row, get_pivot_table(gd_df, 'goal_num', 0)), axis=1)
stat_df['season_gs'] = stat_df.apply(lambda row: get_season_goal(row, get_pivot_table(gs_df, 'goal_num', 0)), axis=1)
stat_df['season_ag'] = stat_df['season_gs'] - stat_df['season_gd']

stat_df['avg_season_gs'] = stat_df['season_gs'] / (stat_df['turn'] - 1)
stat_df['avg_season_gd'] = stat_df['season_gd'] / (stat_df['turn'] - 1)
stat_df['avg_season_ag'] = stat_df['season_ag'] / (stat_df['turn'] - 1)

stat_df['season_act_gs'] = stat_df.apply(lambda row: get_home_or_visit_goal(row, get_pivot_table(gs_df, 'goal_num', 0)), axis=1)
stat_df['season_act_gd'] = stat_df.apply(lambda row: get_home_or_visit_goal(row, get_pivot_table(gd_df, 'goal_num', 0)), axis=1)
stat_df['season_act_ag'] = stat_df['season_act_gs'] - stat_df['season_act_gd']

stat_df['avg_season_act_gs'] = stat_df.apply(lambda row: get_home_or_visit_avg(row, get_pivot_table(gs_df, 'goal_num', 0), 'season_act_gs'), axis=1)
stat_df['avg_season_act_gd'] = stat_df.apply(lambda row: get_home_or_visit_avg(row, get_pivot_table(gd_df, 'goal_num', 0), 'season_act_gd'), axis=1)
stat_df['avg_season_act_ag'] = stat_df['avg_season_act_gs'] - stat_df['avg_season_act_gd']

df = get_pivot_table(gs_df, 'result', -1)
stat_df['season_win_count'] = stat_df.apply(lambda row: get_count(row, df, 2), axis=1)
stat_df['season_draw_count'] = stat_df.apply(lambda row: get_count(row, df, 1), axis=1)
stat_df['season_lose_count'] = stat_df.apply(lambda row: get_count(row, df, 0), axis=1)

stat_df['season_win_rate'] = stat_df['season_win_count'] / (stat_df['turn'] - 1)
stat_df['season_draw_rate'] = stat_df['season_draw_count'] / (stat_df['turn'] - 1)
stat_df['season_lose_rate'] = stat_df['season_lose_count'] / (stat_df['turn'] - 1)

stat_df['season_act_win_count'] = stat_df.apply(lambda row: get_home_or_visit_count(row, df, 2), axis=1)
stat_df['season_act_draw_count'] = stat_df.apply(lambda row: get_home_or_visit_count(row, df, 1), axis=1)
stat_df['season_act_lose_count'] = stat_df.apply(lambda row: get_home_or_visit_count(row, df, 0), axis=1)

stat_df['season_act_win_rate'] = stat_df.apply(lambda row: get_home_or_visit_avg(row, df, 'season_act_win_count'), axis=1)
stat_df['season_act_draw_rate'] = stat_df.apply(lambda row: get_home_or_visit_avg(row, df, 'season_draw_count'), axis=1)
stat_df['season_act_lose_rate'] = stat_df.apply(lambda row: get_home_or_visit_avg(row, df, 'season_lose_count'), axis=1)

KeyError: ('home_or_visit', 'occurred at index 0')

## 合并比赛列表和比赛的赔率信息

In [None]:
train_dataset_df = pd.merge(train_odd_info_df, train_game_list_df, on='matchid', how='left')
train_dataset_df = train_dataset_df.dropna()

In [None]:
t = stat_df[stat_df['home_or_visit']=='home'].drop(['home_or_visit', 'league', 'turn', 'season', 'team_name'], axis=1)
for name in t.columns.values:
    if name != 'matchid':
        t = t.rename(columns={name:'home_'+name})
        
train_dataset_df = pd.merge(train_dataset_df, t, on='matchid', how='left')

In [None]:
t = stat_df[stat_df['home_or_visit']=='visit'].drop(['home_or_visit', 'league', 'turn', 'season', 'team_name'], axis=1)
for name in t.columns.values:
    if name != 'matchid':
        t = t.rename(columns={name:'visit_'+name})
        
train_dataset_df = pd.merge(train_dataset_df, t, on='matchid', how='left')

In [None]:
train_dataset_df = train_dataset_df.dropna()
train_dataset_df['league'] = train_dataset_df['league'].astype(np.int64)
train_dataset_df = train_dataset_df[train_dataset_df['turn'] > 5]

In [None]:
train_dataset_df.columns.values

## xgb训练

In [None]:
import xgboost as xgb

x_columns = [
    'init_draw_odd', 'init_lose_odd', 'init_win_odd', 'matchid',
#     'new_draw_kelly', 
    'new_draw_odd', 'new_draw_rate',
#     'new_lose_kelly', 
    'new_lose_odd', 'new_lose_rate', 
#     'new_win_kelly',
    'new_win_odd', 'new_win_rate', 
#     'pay_rate', 
    'league',
    'turn', 'season_year',
    'year', 'month', 'home_team_encoder', 'visit_team_encoder',
    'home_season_gs', 'home_season_gd', 'home_season_ag',
    'home_avg_season_gs', 'home_avg_season_gd', 'home_avg_season_ag',
    'home_season_act_gs', 'home_season_act_gd', 'home_season_act_ag',
    'home_avg_season_act_gs', 'home_avg_season_act_gd',
    'home_avg_season_act_ag', 'home_season_win_count',
    'home_season_draw_count', 'home_season_lose_count',
    'home_season_win_rate', 'home_season_draw_rate',
    'home_season_lose_rate', 'home_season_act_win_count',
    'home_season_act_draw_count', 'home_season_act_lose_count',
    'home_season_act_win_rate', 'home_season_act_draw_rate',
    'home_season_act_lose_rate', 'visit_season_gs', 'visit_season_gd',
    'visit_season_ag', 'visit_avg_season_gs', 'visit_avg_season_gd',
    'visit_avg_season_ag', 'visit_season_act_gs',
    'visit_season_act_gd', 'visit_season_act_ag',
    'visit_avg_season_act_gs', 'visit_avg_season_act_gd',
    'visit_avg_season_act_ag', 'visit_season_win_count',
    'visit_season_draw_count', 'visit_season_lose_count',
    'visit_season_win_rate', 'visit_season_draw_rate',
    'visit_season_lose_rate', 'visit_season_act_win_count',
    'visit_season_act_draw_count', 'visit_season_act_lose_count',
    'visit_season_act_win_rate', 'visit_season_act_draw_rate',
    'visit_season_act_lose_rate'
]
    
params={
    'booster':'gbtree',
    # 这里手写数字是0-9，是一个多类的问题，因此采用了multisoft多分类器，
    'objective': 'multi:softmax', 
    'num_class':3, # 类数，与 multisoftmax 并用
    
    'gamma':0.01,  # 在树的叶子节点下一个分区的最小损失，越大算法模型越保守 。[0:]
    
    'max_depth':7, # 构建树的深度 [1:]
    
    #'lambda':450,  # L2 正则项权重
    'subsample':0.3, # 采样训练数据，设置为0.5，随机选择一般的数据实例 (0:1]
    'colsample_bytree':0.3, # 构建树树时的采样比率 (0:1]
    #'min_child_weight':12, # 节点的最少特征数
    'silent':1 ,
    
#     这部分需要调整
    'eta': 0.01, # 如同学习率
#     'eta': 0.01, # 如同学习率
    
    
    'seed':2018,
    'nthread':4,# cpu 线程数,根据自己U的个数适当调整
}

train_dataset = train_dataset_df[train_dataset_df['year'] < 2019]
test_dataset = train_dataset_df[train_dataset_df['year'] == 2019]

valid_dataset = test_dataset[test_dataset['month'] < 4]
test_dataset = test_dataset[test_dataset['month'] >= 4]

xgtrain = xgb.DMatrix(train_dataset[x_columns], label=train_dataset['result'])
xgtest = xgb.DMatrix(test_dataset[x_columns], label=test_dataset['result'])
xgvalid = xgb.DMatrix(valid_dataset[x_columns], label=valid_dataset['result'])

watchlist = [(xgtrain, 'train'),(xgvalid, 'val')]

In [None]:
num_rounds = 10000
stop_rounds = 20

# num_rounds = 10000
# stop_rounds = 300

model = xgb.train(params, xgtrain, num_rounds, watchlist,early_stopping_rounds=stop_rounds)
print(model.best_iteration)

In [None]:
preds = model.predict(xgtest, ntree_limit=model.best_iteration)
preds

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(test_dataset['result'], preds)