In [1]:
import pymysql.cursors
import pandas as pd
from sklearn import preprocessing
import numpy as np
import re
import datetime
from sklearn.metrics import accuracy_score


# 训练

## 获取全量的竞彩比赛列表

In [2]:
connection = pymysql.connect(host='localhost', user='root', password='breadt@2019', db='breadt-football-ml', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    sql = 'select * from `breadt_football_game_list`;'
    cursor.execute(sql)
    rows = cursor.fetchall()
    
    train_game_list_df = pd.DataFrame(rows)
    
    connection.close()

train_game_list_df['source'] = 'jc'

## 获取全量的胜负彩比赛列表

In [3]:
connection = pymysql.connect(host='localhost', user='root', password='breadt@2019', db='breadt-football-ml', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    sql = 'select * from `breadt_lottery_info`;'
    cursor.execute(sql)
    rows = cursor.fetchall()
    
    train_lottery_game_list_df = pd.DataFrame(rows)
    
    connection.close()

train_lottery_game_list_df['source'] = 'lottery'

## 合并竞彩比赛列表和胜负彩比赛列表

In [4]:
tmp = train_lottery_game_list_df.drop(['issue'], axis=1)
df = pd.concat([train_game_list_df, tmp])
df = df[['matchid', 'game', 'home_team', 'visit_team', 'gs', 'gd', 'gn', 'time', 'result', 'win_bet_return', 'draw_bet_return', 'lose_bet_return', 'source']]
df = df.drop_duplicates(subset=['matchid'])

## **设定训练范围** 并处理数据

In [5]:
# match_group = ['澳超', '英超', '德甲', '德乙', '法甲', '西甲', '意甲', '日职', '英甲', '英冠', '苏超', '法乙', '葡超', '荷甲', '荷乙', '韩K联', '瑞典超', '挪超', '美职', '日乙', '俄超', '比甲', '瑞典甲', '法丙', '挪甲', '英乙', '苏冠', '巴甲', '智利甲', '墨超', '智利乙', '阿甲', '欧冠', '欧罗巴']
match_group = ['澳超', '英超', '德甲', '德乙', '法甲', '西甲', '意甲', '日职', '英甲', '英冠', '苏超', '法乙', '葡超', '荷甲', '荷乙', '韩K联', '瑞典超', '挪超', '美职', '日乙', '俄超', '比甲', '瑞典甲', '法丙', '挪甲', '英乙', '苏冠', '巴甲', '智利甲', '墨超', '智利乙', '阿甲']
match_df = df[(df['game'].isin(match_group))]
match_df = match_df.dropna()

In [6]:
# 对team做encode 这个encoder后面预测的时候还会用到
teams = list(set(df['home_team'].values) | set(df['visit_team'].values))
team_encoder = preprocessing.LabelEncoder()
team_encoder.fit(teams)

LabelEncoder()

In [7]:
def encode_team(df):
    df['home_team_encoder'] = team_encoder.transform(df['home_team'])
    df['visit_team_encoder'] = team_encoder.transform(df['visit_team'])
    return df

In [8]:
# 比赛名称encode
games = list(set(match_df['game'].values))
game_encoder = preprocessing.LabelEncoder()
game_encoder.fit(games)

LabelEncoder()

In [9]:
def encode_game(df):
    df['game_encoder'] = game_encoder.transform(df['game'])
    return df

In [10]:
match_df['year'] = match_df.apply(lambda row: row.time.year, axis=1)
match_df['month'] = match_df.apply(lambda row: row.time.month, axis=1)
match_df['day'] = match_df.apply(lambda row: row.time.day, axis=1)
match_df['fix_result'] = match_df.apply(lambda row: int(row.result) if row.result < 3 else 2, axis=1)

In [11]:
match_df = encode_team(match_df)
match_df = encode_game(match_df)

In [12]:
match_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43477 entries, 1 to 12650
Data columns (total 20 columns):
matchid               43477 non-null int64
game                  43477 non-null object
home_team             43477 non-null object
visit_team            43477 non-null object
gs                    43477 non-null int64
gd                    43477 non-null int64
gn                    43477 non-null int64
time                  43477 non-null datetime64[ns]
result                43477 non-null int64
win_bet_return        43477 non-null float64
draw_bet_return       43477 non-null float64
lose_bet_return       43477 non-null float64
source                43477 non-null object
year                  43477 non-null int64
month                 43477 non-null int64
day                   43477 non-null int64
fix_result            43477 non-null int64
home_team_encoder     43477 non-null int64
visit_team_encoder    43477 non-null int64
game_encoder          43477 non-null int64
dtypes: date

## 获取特征数据

In [13]:
connection = pymysql.connect(host='localhost', user='root', password='breadt@2019', db='breadt-football-ml', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
with connection.cursor() as cursor:
    sql = 'select * from `breadt_football_recent_feature_info`;'
    cursor.execute(sql)
    rows = cursor.fetchall()
    
    train_feature_df = pd.DataFrame(rows)
    
    connection.close()

In [14]:
def take_goal_info(prefix, df):
    target_cols = [
        '_0_1_gd', 
        '_0_1_gs', 
        '_0_gd', 
        '_0_gs', 
        '_1_gd', 
        '_1_gs',
        '_2_3_gd', 
        '_2_3_gs', 
        '_2_gd', 
        '_2_gs', 
        '_3_gd', 
        '_3_gs',
        '_4_gd', 
        '_4_gs', 
        '_5_gd', 
        '_5_gs', 
        '_6_gd', 
        '_6_gs',
        '_7_gd', 
        '_7_gs', 
        '_ab_4_gd', 
        '_ab_4_gs',
        '_abs_draw', 
        '_abs_lose', 
        '_abs_win']

    for k in target_cols:
        df[prefix + k + '_rate'] = df[prefix + k] / df[prefix + '_count']
    
    return df

In [15]:
def take_goal_pref_info(prefix, df):
    target_cols = [
        '_draw', '_g', '_gd',
        '_gs', '_lose', '_win',
    ]

    for k in target_cols:
        df[prefix + k + '_rate'] = df[prefix + k] / df[prefix + '_count']
    
    return df

In [16]:
train_feature_df = take_goal_info('h', train_feature_df)
train_feature_df = take_goal_info('v', train_feature_df)

train_feature_df = take_goal_pref_info('h_host', train_feature_df)
train_feature_df = take_goal_pref_info('v_visit', train_feature_df)

In [17]:
train_feature_df['h_avg_abs_gs'] = train_feature_df['h_abs_gs'].sum() / train_feature_df['h_count'].sum()
train_feature_df['h_avg_abs_gd'] = train_feature_df['h_abs_gd'].sum() / train_feature_df['h_count'].sum()
train_feature_df['v_avg_abs_gs'] = train_feature_df['v_abs_gs'].sum() / train_feature_df['v_count'].sum()
train_feature_df['v_avg_abs_gd'] = train_feature_df['v_abs_gd'].sum() / train_feature_df['v_count'].sum()

In [18]:
train_feature_df['h_avg_abs_win'] = train_feature_df['h_abs_win'].sum() / train_feature_df['h_count'].sum()
train_feature_df['h_avg_abs_draw'] = train_feature_df['h_abs_draw'].sum() / train_feature_df['h_count'].sum()
train_feature_df['h_avg_abs_lose'] = train_feature_df['h_abs_lose'].sum() / train_feature_df['h_count'].sum()

train_feature_df['v_avg_abs_win'] = train_feature_df['v_abs_win'].sum() / train_feature_df['v_count'].sum()
train_feature_df['v_avg_abs_draw'] = train_feature_df['v_abs_draw'].sum() / train_feature_df['v_count'].sum()
train_feature_df['v_avg_abs_lose'] = train_feature_df['v_abs_lose'].sum() / train_feature_df['v_count'].sum()

## 合并训练数据

In [19]:
train_dataset_df = pd.merge(match_df, train_feature_df, on='matchid', how='left')
train_dataset_df = train_dataset_df.dropna()
train_dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41306 entries, 0 to 43283
Columns: 171 entries, matchid to v_avg_abs_lose
dtypes: datetime64[ns](1), float64(154), int64(12), object(4)
memory usage: 54.2+ MB


In [20]:
train_dataset_df.columns.values

array(['matchid', 'game', 'home_team', 'visit_team', 'gs', 'gd', 'gn',
       'time', 'result', 'win_bet_return', 'draw_bet_return',
       'lose_bet_return', 'source', 'year', 'month', 'day', 'fix_result',
       'home_team_encoder', 'visit_team_encoder', 'game_encoder',
       'h_0_1_gd', 'h_0_1_gs', 'h_0_gd', 'h_0_gs', 'h_1_gd', 'h_1_gs',
       'h_2_3_gd', 'h_2_3_gs', 'h_2_gd', 'h_2_gs', 'h_3_gd', 'h_3_gs',
       'h_4_gd', 'h_4_gs', 'h_5_gd', 'h_5_gs', 'h_6_gd', 'h_6_gs',
       'h_7_gd', 'h_7_gs', 'h_ab_4_gd', 'h_ab_4_gs', 'h_abs_avg_g',
       'h_abs_avg_gd', 'h_abs_avg_gs', 'h_abs_draw', 'h_abs_g',
       'h_abs_gd', 'h_abs_gs', 'h_abs_lose', 'h_abs_win', 'h_count',
       'h_host_count', 'h_host_draw', 'h_host_g', 'h_host_gd',
       'h_host_gs', 'h_host_lose', 'h_host_win', 'id', 'v_0_1_gd',
       'v_0_1_gs', 'v_0_gd', 'v_0_gs', 'v_1_gd', 'v_1_gs', 'v_2_3_gd',
       'v_2_3_gs', 'v_2_gd', 'v_2_gs', 'v_3_gd', 'v_3_gs', 'v_4_gd',
       'v_4_gs', 'v_5_gd', 'v_5_gs', 'v_6_gd', 

## xgb训练

In [21]:
import xgboost as xgb

x_columns = [
    'win_bet_return', 'draw_bet_return', 'lose_bet_return', 
    'year', 'month',
    'home_team_encoder', 'visit_team_encoder', 'game_encoder',
    
    'h_avg_abs_gs',
    'h_avg_abs_gd', 'v_avg_abs_gs', 'v_avg_abs_gd', 'h_avg_abs_win',
    'h_avg_abs_draw', 'h_avg_abs_lose', 'v_avg_abs_win',
    'v_avg_abs_draw', 'v_avg_abs_lose',
    
    'h_0_1_gd', 
    'h_0_1_gs', 
    'h_0_gd', 
    'h_0_gs', 
    'h_1_gd', 
    'h_1_gs',
    'h_2_3_gd', 
    'h_2_3_gs', 
    'h_2_gd', 'h_2_gs', 'h_3_gd', 'h_3_gs',
    'h_4_gd', 'h_4_gs', 'h_5_gd', 'h_5_gs', 'h_6_gd', 'h_6_gs',
    'h_7_gd', 'h_7_gs', 'h_ab_4_gd', 'h_ab_4_gs', 'h_abs_avg_g',
    'h_abs_avg_gd', 'h_abs_avg_gs', 'h_abs_draw', 'h_abs_g',
    'h_abs_gd', 'h_abs_gs', 'h_abs_lose', 'h_abs_win', 'h_count',
    'h_host_count', 'h_host_draw', 'h_host_g', 'h_host_gd',
    'h_host_gs', 'h_host_lose', 'h_host_win', 
    
    'v_0_1_gd',
    'v_0_1_gs', 'v_0_gd', 'v_0_gs', 'v_1_gd', 'v_1_gs', 'v_2_3_gd',
    'v_2_3_gs', 'v_2_gd', 'v_2_gs', 'v_3_gd', 'v_3_gs', 'v_4_gd',
    'v_4_gs', 'v_5_gd', 'v_5_gs', 'v_6_gd', 'v_6_gs', 'v_7_gd',
    'v_7_gs', 'v_ab_4_gd', 'v_ab_4_gs', 'v_abs_avg_g', 'v_abs_avg_gd',
    'v_abs_avg_gs', 'v_abs_draw', 'v_abs_g', 'v_abs_gd', 'v_abs_gs',
    'v_abs_lose', 'v_abs_win', 
    
    'v_count', 
    'v_visit_count',
    'v_visit_draw', 'v_visit_g', 'v_visit_gd', 'v_visit_gs',
    'v_visit_lose', 'v_visit_win',
    
    'h_0_1_gd_rate', 'h_0_1_gs_rate',
    'h_0_gd_rate', 'h_0_gs_rate', 'h_1_gd_rate', 'h_1_gs_rate',
    'h_2_3_gd_rate', 'h_2_3_gs_rate', 'h_2_gd_rate', 'h_2_gs_rate',
    'h_3_gd_rate', 'h_3_gs_rate', 'h_4_gd_rate', 'h_4_gs_rate',
    'h_5_gd_rate', 'h_5_gs_rate', 'h_6_gd_rate', 'h_6_gs_rate',
    'h_7_gd_rate', 'h_7_gs_rate', 'h_ab_4_gd_rate', 'h_ab_4_gs_rate',
    'h_abs_draw_rate', 'h_abs_lose_rate', 'h_abs_win_rate',
    'v_0_1_gd_rate', 'v_0_1_gs_rate', 'v_0_gd_rate', 'v_0_gs_rate',
    'v_1_gd_rate', 'v_1_gs_rate', 'v_2_3_gd_rate', 'v_2_3_gs_rate',
    'v_2_gd_rate', 'v_2_gs_rate', 'v_3_gd_rate', 'v_3_gs_rate',
    'v_4_gd_rate', 'v_4_gs_rate', 'v_5_gd_rate', 'v_5_gs_rate',
    'v_6_gd_rate', 'v_6_gs_rate', 'v_7_gd_rate', 'v_7_gs_rate',
    
    'v_ab_4_gd_rate', 
    'v_ab_4_gs_rate', 
    'v_abs_draw_rate',
    'v_abs_lose_rate', 
    'v_abs_win_rate',
    'h_host_draw_rate',
    'h_host_g_rate', 
    'h_host_gd_rate', 
    'h_host_gs_rate',
    'h_host_lose_rate', 
    'h_host_win_rate', 
    'v_visit_draw_rate',
    'v_visit_g_rate', 
    'v_visit_gd_rate', 
    'v_visit_gs_rate',
    'v_visit_lose_rate', 
    'v_visit_win_rate'
]
    
params={
    'booster':'gbtree',
    # 这里手写数字是0-9，是一个多类的问题，因此采用了multisoft多分类器，
    'objective': 'multi:softmax', 
#     'objective': 'multi:softprob',
    'num_class':3, # 类数，与 multisoftmax 并用
    
    'gamma':0.01,  # 在树的叶子节点下一个分区的最小损失，越大算法模型越保守 。[0:]
    
    'max_depth':10, # 构建树的深度 [1:]
    
    #'lambda':450,  # L2 正则项权重
    'subsample':0.7, # 采样训练数据，设置为0.5，随机选择一般的数据实例 (0:1]
    'colsample_bytree':0.7, # 构建树树时的采样比率 (0:1]
    #'min_child_weight':12, # 节点的最少特征数
    'silent':1 ,
    
#     这部分需要调整
#     'eta': 0.05, # 如同学习率
    'eta': 0.01, # 如同学习率
    
    
    'seed':2018,
    'nthread':4,# cpu 线程数,根据自己U的个数适当调整
}

# t = train_dataset_df[
#     (train_dataset_df['win_bet_return'] <= 2) |
#     (train_dataset_df['draw_bet_return'] <= 2) |
#     (train_dataset_df['lose_bet_return'] <= 2)
# ]

t = train_dataset_df

print(len(t))

train_dataset = t[t['year'] < 2019]
test_dataset = t[t['year'] == 2019]

valid_dataset = test_dataset[test_dataset['month'] < 3]
test_dataset = test_dataset[test_dataset['month'] >= 3]

xgtrain = xgb.DMatrix(train_dataset[x_columns], label=train_dataset['fix_result'])
xgtest = xgb.DMatrix(test_dataset[x_columns], label=test_dataset['fix_result'])
xgvalid = xgb.DMatrix(valid_dataset[x_columns], label=valid_dataset['fix_result'])

watchlist = [(xgtrain, 'train'),(xgvalid, 'val')]

num_rounds = 10000
stop_rounds = 100

# num_rounds = 10000
# stop_rounds = 300


model = xgb.train(params, xgtrain, num_rounds, watchlist,early_stopping_rounds=stop_rounds)
print(model.best_iteration)

[122]	train-merror:0.379858	val-merror:0.514963
[123]	train-merror:0.379807	val-merror:0.516209
[124]	train-merror:0.379705	val-merror:0.516209
[125]	train-merror:0.379526	val-merror:0.514963
[126]	train-merror:0.379245	val-merror:0.514963
[127]	train-merror:0.379041	val-merror:0.514963
[128]	train-merror:0.378454	val-merror:0.514963
[129]	train-merror:0.378403	val-merror:0.513716
[130]	train-merror:0.377739	val-merror:0.513716
Stopping. Best iteration:
[30]	train-merror:0.400388	val-merror:0.503741

30


In [60]:
# 求可能性的时候用：'objective': 'multi:softprob',

pred_probs = model.predict(xgtest, ntree_limit=model.best_iteration)
pred_probs

array([[0.3329681 , 0.32910088, 0.337931  ],
       [0.33841   , 0.32967126, 0.33191878],
       [0.35295928, 0.32484668, 0.322194  ],
       ...,
       [0.32852158, 0.33240506, 0.33907336],
       [0.32646173, 0.3299921 , 0.34354618],
       [0.32665062, 0.32860482, 0.3447446 ]], dtype=float32)

In [61]:
test_dataset['fix_result'].values

array([2, 0, 0, ..., 1, 2, 0])

In [62]:
def get_result(items):
    if items[0] <= items[1] and items[0] <= items[2]:
        return [1,2]
    elif items[1] <= items[0] and items[1] <= items[2]:
        return [0,2]
    elif items[2] <= items[0] and items[2] <= items[1]:
        return [0,1]
    
fix_results = test_dataset['fix_result'].values

results = []
for i in range(len(pred_probs)):
    items = pred_probs[i]
    probs = get_result(items)
    
    if fix_results[i] in probs:
        results.append(1)
    else:
        results.append(0)

In [63]:
np.array(results).sum() / len(results)

0.7742175856929955

In [22]:
preds = model.predict(xgtest, ntree_limit=model.best_iteration)
preds

accuracy_score(test_dataset['fix_result'], preds)

0.5

## SVM多分类训练

In [23]:
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV


In [24]:
# Seperating Predictors and Outcome values from train and test sets
X_train = train_dataset[x_columns]
Y_train_label = train_dataset['fix_result'].values.astype(object)

X_test = test_dataset[x_columns]
Y_test_label = test_dataset['fix_result'].values.astype(object)

encoder = preprocessing.LabelEncoder()

# encoding train labels 
encoder.fit(Y_train_label)
Y_train = encoder.transform(Y_train_label)

# encoding test labels 
encoder.fit(Y_test_label)
Y_test = encoder.transform(Y_test_label)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [26]:
# params_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
#                      'C': [1, 10, 100, 1000]},
#                     {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

params_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [10, 100]}]

In [None]:
svm_model = GridSearchCV(SVC(), params_grid, cv=5)
svm_model.fit(X_train_scaled, Y_train)

In [None]:
final_model = svm_model.best_estimator_

print("Training set score for SVM: %f" % final_model.score(X_train_scaled , Y_train))
print("Testing  set score for SVM: %f" % final_model.score(X_test_scaled  , Y_test ))