In [1]:
import pandas as pd
import numpy as np
import os
pd.set_option('display.max_rows', 500)
%matplotlib inline

In [2]:
first_flag = True
df_all = None
for folder in os.listdir('football_data/'):
    if ('.' in folder):
        continue
    for csv in os.listdir('football_data/{}/'.format(folder)):
        if csv.endswith(".csv"):
            try:
                df_part = pd.read_csv('football_data/{}/{}'.format(folder, csv), error_bad_lines=False, warn_bad_lines=False)
            except Exception as e:
                print('football_data/{}/{}'.format(folder, csv))


            if 'Div' in (df_part.columns) and 'Country' not in (df_part.columns): #format1
                df_part['Country'] = folder
            else:
                df_part.rename(columns={'League': 'Div',
                                       'Home': 'HomeTeam',
                                       'Away': 'AwayTeam',
                                       'HG': 'FTHG',
                                       'AG': 'FTAG'}, inplace=True)
            df_part = df_part.loc[:, ['Country', 'Div', 'Date', 'HomeTeam',
                               'AwayTeam','FTHG','FTAG']]
            if first_flag:
                first_flag = False
                df_all = df_part
            else:
                df_all = df_all.append(df_part, ignore_index=True)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [3]:
df_all.dropna(inplace=True)
df_all.reset_index(inplace=True, drop=True)
df_all.tail()

Unnamed: 0,Country,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG
223526,Israel,I0,25/12/99,M. Ironi Ashdod,H. Haifa,0.0,0.0
223527,Israel,I0,25/12/99,M. Tel Aviv,H. Kfar Saba,0.0,1.0
223528,Israel,I0,25/12/99,Netanya,B. Jerusalem,2.0,3.0
223529,Israel,I0,27/12/99,M. Haifa,H. Tel Aviv,1.0,0.0
223530,Israel,I0,28/12/99,M. Petach Tikva,H. Ironi Rishon,0.0,0.0


In [4]:
df_all['Date'] = df_all['Date'].astype(str)
df_all['Country'] = df_all['Country'].str.strip()
df_all['Div'] = df_all['Div'].str.strip()

In [5]:
df_all.sort_values(['Country', 'Div', 'Date'], inplace=True)

In [6]:
df_all = df_all[['Country', 'Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']]

df_all['total_goals'] = df_all['FTHG'] + df_all['FTAG']

df_all['total_goals_odd'] = df_all['total_goals'] % 2

# df_all['total_goals_over2'] = df_all['total_goals'] > 2

# #drop 0 : 0
### df_all = df_all[df_all['total_goals'] > 0]

In [7]:
def year_fn(x):
    if len(x) > 2:
        return x
    elif int(x) > 30:
        return '19' + x
    else:
        return '20' + x

df_all['year'] = df_all['Date'].map(lambda x: x.split('/')[-1][-2:])
df_all['year'] = df_all['year'].map(lambda x: year_fn(x))
df_all['month'] = df_all['Date'].map(lambda x: x.split('/')[1])
#match season
df_all.loc[df_all['month']>'06', 'year'] = df_all.loc[df_all['month']>'06', 'year'].map(lambda x: str(int(x)+1))

In [8]:
df_all.groupby('Country').count()

Unnamed: 0_level_0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,total_goals,total_goals_odd,year,month
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Argentina,2679,2679,2679,2679,2679,2679,2679,2679,2679,2679
Australia,1700,1700,1700,1700,1700,1700,1700,1700,1700,1700
Austria,1190,1190,1190,1190,1190,1190,1190,1190,1190,1190
Belgium,6235,6235,6235,6235,6235,6235,6235,6235,6235,6235
Brazil,2659,2659,2659,2659,2659,2659,2659,2659,2659,2659
China,1200,1200,1200,1200,1200,1200,1200,1200,1200,1200
Denmark,1442,1442,1442,1442,1442,1442,1442,1442,1442,1442
England,57814,57814,57814,57814,57814,57814,57814,57814,57814,57814
Finland,1394,1394,1394,1394,1394,1394,1394,1394,1394,1394
France,17028,17028,17028,17028,17028,17028,17028,17028,17028,17028


In [9]:
df_all_raw = df_all.copy()
df_all_raw = df_all_raw[['Country', 'Div', 'year', 'month','total_goals_odd']]
df_all_raw = df_all_raw[df_all_raw['year'] >='2000']
df_all_raw['Country_Div'] = df_all_raw['Country'] + '_' +df_all_raw['Div']

In [10]:
df_all_raw['year'].unique()

array(['2013', '2014', '2015', '2016', '2017', '2018', '2019', '2006',
       '2007', '2012', '2008', '2011', '2009', '2010', '2004', '2000',
       '2001', '2002', '2005', '2003'], dtype=object)

In [11]:
df_all_raw['Country_Div'].value_counts()

England_E1                    10540
England_E3                    10350
England_E2                    10336
Spain_SP2                      8365
Italy_I2                       7831
England_EC                     7334
Spain_SP1                      7277
England_E0                     7246
France_F1                      6831
Italy_I1                       6828
France_F2                      6761
Germany_D1                     5839
Turkey_T1                      5739
Netherlands_N1                 5679
Germany_D2                     5487
Portugal_P1                    5156
Belgium_B1                     5011
Wales_W0                       4968
Israel_I1                      4556
Israel_I0                      4475
Scotland_SC0                   4126
Greece_G1                      3669
Scotland_SC2                   3408
Scotland_SC1                   3392
Scotland_SC3                   3316
SaudiArabia_SPL                2921
Argentina_Primera Division     2679
Brazil_Serie A              

# Get candidates

In [12]:
for start_year in ['2000', '2005', '2010', '2015']:
    df_all = df_all_raw[df_all_raw['year'] >= start_year]
    df_all_cnt = df_all.groupby(['Country_Div']).count().reset_index()
    df_all_cnt.rename(columns={'year': 'Count'}, inplace=True)
    df_all_cnt = df_all_cnt[df_all_cnt['Count'] > 50]
    df_all_mean = df_all.groupby(['Country_Div']).mean().reset_index()

    df_res = pd.merge(df_all_mean, df_all_cnt[['Country_Div', 'Count']])
    df_res['chance'] = (0.5 - df_res['total_goals_odd']).abs()
    df_res = df_res.sort_values('chance', ascending=False).reset_index(drop=True)
    exec("candidate_{}=df_res[['Country_Div', 'total_goals_odd', 'chance']].head(20)".format(start_year))

In [13]:
candidate_2000.columns = ['Country_Div_2000', 'total_goals_odd_2000', 'chance_2000']
candidate_2005.columns = ['Country_Div_2005', 'total_goals_odd_2005', 'chance_2005']
candidate_2010.columns = ['Country_Div_2010', 'total_goals_odd_2010', 'chance_2010']
candidate_2015.columns = ['Country_Div_2015', 'total_goals_odd_2015', 'chance_2015']

In [14]:
df_candidates = pd.concat([candidate_2000, 
                candidate_2005, 
                candidate_2010,
                candidate_2015], axis=1)

df_candidates

Unnamed: 0,Country_Div_2000,total_goals_odd_2000,chance_2000,Country_Div_2005,total_goals_odd_2005,chance_2005,Country_Div_2010,total_goals_odd_2010,chance_2010,Country_Div_2015,total_goals_odd_2015,chance_2015
0,Israel_I1,0.454346,0.045654,Israel_I1,0.458222,0.041778,Netherlands_N1,0.457516,0.042484,Israel_I1,0.453161,0.046839
1,Japan_J-League,0.539749,0.039749,Japan_J-League,0.539749,0.039749,Japan_J-League,0.539749,0.039749,Israel_I0,0.453686,0.046314
2,Netherlands_N1,0.466103,0.033897,Germany_D2,0.464747,0.035253,Israel_I1,0.461707,0.038293,Japan_J-League,0.540636,0.040636
3,Germany_D2,0.467286,0.032714,Netherlands_N1,0.465434,0.034566,Germany_D2,0.463649,0.036351,Netherlands_N1,0.461874,0.038126
4,France_F2,0.469457,0.030543,France_F1,0.467721,0.032279,Israel_I0,0.467337,0.032663,Germany_D1,0.461874,0.038126
5,Italy_I2,0.471715,0.028285,Italy_I2,0.472649,0.027351,France_F2,0.470344,0.029656,Germany_D2,0.463203,0.036797
6,France_F1,0.474162,0.025838,France_F2,0.474434,0.025566,Italy_I2,0.471462,0.028538,Italy_I2,0.4695,0.0305
7,Germany_D1,0.475424,0.024576,Israel_I0,0.476627,0.023373,Germany_D1,0.471964,0.028036,Australia_A1,0.529801,0.029801
8,Belgium_B1,0.47715,0.02285,Germany_D1,0.47735,0.02265,France_F1,0.474021,0.025979,Finland_Veikkausliiga,0.471111,0.028889
9,Poland_Ekstraklasa,0.477564,0.022436,Poland_Ekstraklasa,0.477564,0.022436,Belgium_B1,0.475546,0.024454,Poland_Ekstraklasa,0.476048,0.023952


In [15]:
candidates = []

for col in df_candidates.columns:
    if 'Coun' in col:
        candidates += df_candidates[col].tolist()

candidates = pd.Series(candidates)
print(candidates.value_counts().head(20))
candidates = candidates.value_counts().head(20).index.tolist()
candidates

Australia_A1                  4
France_F2                     4
Israel_I0                     4
Netherlands_N1                4
Italy_I2                      4
Austria_Bundesliga            4
Poland_Ekstraklasa            4
Japan_J-League                4
Germany_D1                    4
Israel_I1                     4
Finland_Veikkausliiga         4
Germany_D2                    4
Spain_SP2                     3
Italy_I1                      3
France_F1                     3
Switzerland_Super League      3
Argentina_Primera Division    3
Denmark_Superliga             3
Belgium_B1                    3
Scotland_SC1                  2
dtype: int64


['Australia_A1',
 'France_F2',
 'Israel_I0',
 'Netherlands_N1',
 'Italy_I2',
 'Austria_Bundesliga',
 'Poland_Ekstraklasa',
 'Japan_J-League',
 'Germany_D1',
 'Israel_I1',
 'Finland_Veikkausliiga',
 'Germany_D2',
 'Spain_SP2',
 'Italy_I1',
 'France_F1',
 'Switzerland_Super League',
 'Argentina_Primera Division',
 'Denmark_Superliga',
 'Belgium_B1',
 'Scotland_SC1']

# Global

In [16]:
df_res_all = None

for start_year in ['2000', '2005', '2010', '2015', '2019']:
    df_all = df_all_raw[df_all_raw['year'] >= start_year]
    df_all = df_all[df_all['Country_Div'].isin(candidates)]
    df_all_cnt = df_all.groupby(['Country_Div']).count().reset_index()
    df_all_cnt.rename(columns={'year': 'Count'}, inplace=True)
    df_all_cnt = df_all_cnt[df_all_cnt['Count'] > 50]
    df_all_mean = df_all.groupby(['Country_Div']).mean().reset_index()
    df_res = pd.merge(df_all_mean, df_all_cnt[['Country_Div']])
    df_res['chance'] = (0.5 - df_res['total_goals_odd']).abs()
    df_res = df_res.sort_values('chance', ascending=False).reset_index(drop=True)
    df_res['chance_rank'] = range(1, len(df_res)+1, 1)
    df_res.rename(columns={'total_goals_odd': 'odd{}'.format(start_year),
#                           'Count': 'Count_{}'.format(start_year),
                          'chance': 'chance{}'.format(start_year),
                          'chance_rank':'rank{}'.format(start_year)}, inplace=True)
    
    if df_res_all is None:
        df_res_all = df_res
    else:
        df_res_all = pd.merge(df_res_all, df_res)
    

In [17]:
df_res_all['rank'] = (df_res_all['rank2000'] + df_res_all['rank2005'] +
                                  df_res_all['rank2010'] + df_res_all['rank2015'] + 
                                  df_res_all['rank2019']) / 5
df_res_all.sort_values('rank', inplace=True)
df_res_all.reset_index(inplace=True, drop=True)

In [18]:
show_cols = ['Country_Div', 'odd2000',
       'rank2000', 'odd2005', 
       'rank2005', 'odd2010', 
       'rank2010', 'odd2015', 
       'rank2015', 'odd2019', 
       'rank2019', 'rank']

In [19]:
df_res_all[show_cols]

Unnamed: 0,Country_Div,odd2000,rank2000,odd2005,rank2005,odd2010,rank2010,odd2015,rank2015,odd2019,rank2019,rank
0,Israel_I1,0.454346,1,0.458222,1,0.461707,3,0.453161,1,0.429752,6,2.4
1,Japan_J-League,0.539749,2,0.539749,2,0.539749,2,0.540636,3,0.571429,5,2.8
2,Germany_D2,0.467286,4,0.464747,3,0.463649,4,0.463203,6,0.425926,3,4.0
3,Netherlands_N1,0.466103,3,0.465434,4,0.457516,1,0.461874,4,0.48366,15,5.4
4,Germany_D1,0.475424,8,0.47735,9,0.471964,8,0.461874,5,0.437908,7,7.4
5,Israel_I0,0.479777,15,0.476627,8,0.467337,5,0.453686,2,0.44898,8,7.6
6,Italy_I2,0.471715,6,0.472649,6,0.471462,7,0.4695,7,0.493421,18,8.8
7,Poland_Ekstraklasa,0.477564,10,0.477564,10,0.477564,11,0.476048,10,0.427632,4,9.0
8,France_F2,0.469457,5,0.474434,7,0.470344,6,0.481265,11,0.5,19,9.6
9,Belgium_B1,0.47715,9,0.48259,16,0.475546,10,0.491964,16,0.425,2,10.6


# Kelly formula

In [41]:
p = 0.53
b = 0.94
q = 1 - p
fare = (b*p - q) / b
fare

0.030000000000000006

In [94]:
df_all_year_mean = df_all.groupby(['Country_Div', 'year']).mean()

df_all_year_mean.reset_index(level='year', inplace=True)

for cd in  df_all_year_mean.index.unique():
    df_plot = df_all_year_mean.loc[cd]
    df_plot['total_goals_odd'] -= 0.5
    df_plot.set_index('year', inplace=True)
    df_plot.plot.bar(title=cd, legend=False)

# Simulation

In [1]:
import random
import pandas as pd

## case1: constant 10

In [56]:
res = []
game_rounds = []
max_round = 1000
yield_rate = 0.91
win_prob = 0.53
initial_money = 100


for cnt in range(100000):
    money = initial_money
    game_round = 1
    while money > 10 and game_round <= max_round:
        bet = min(10, money)

        if random.random() < win_prob:
            money += bet * yield_rate
        else:
            money -= bet
        game_round += 1
    
    res.append(money)
    game_rounds.append(game_round)
    
res = np.array(res)
print(np.mean(game_rounds))

(res.mean() - initial_money) / initial_money

487.84968


0.5984850500000195

## case2: ratio

In [68]:
for ratio in np.arange(0.01, 0.21, 0.01):
    res = []
    game_rounds = []
    max_round = 100
    yield_rate = 0.92
    win_prob = 0.53
#     ratio = 0.10
    initial_money = 100


    for cnt in range(100000):
        money = initial_money
        game_round = 1
        bet = 10
        loss_cnt = 0
        while money > 10 and game_round <= max_round:
#             if money >= 2 * initial_money:
#                 break
            if loss_cnt > 4:
        
            
            
            bet = max(money*ratio, 10)

            if random.random() < win_prob:
                money += bet * yield_rate
            else:
                money -= bet
            game_round += 1

    #     print(cnt, money)
        res.append(money)
        game_rounds.append(game_round)

    res = np.array(res)
    print(ratio, np.mean(game_rounds), (res.mean() - initial_money) / initial_money)


0.01 101.0 0.018220390879906744
0.02 101.0 0.03662048319799601
0.03 101.0 0.05496061613021902
0.04 101.0 0.07311732430894108
0.05 101.0 0.09333152419095996
0.060000000000000005 101.0 0.1113630146499122
0.06999999999999999 101.0 0.1335197296019403
0.08 100.99829 0.15185895906121483
0.09 100.99534 0.1680669553054713
0.09999999999999999 100.98413 0.1898350475061659
0.11 100.95465 0.21644061867102504
0.12 100.90448 0.23744493795022345
0.13 100.80381 0.2589276980271252
0.14 100.62841 0.288583088325709
0.15000000000000002 100.40499 0.2816149267020937
0.16 100.05268 0.3307174744228978
0.17 99.59645 0.3135486068276132
0.18000000000000002 98.99489 0.36906860559129084
0.19 98.33463 0.38676182020229455
0.2 97.3956 0.4369041672774538


# case3: multiple bet

In [75]:
for max_loss_cnt in range(1, 5, 1):
    for multiple in np.arange(1, 2.1, 0.1):
        res = []
        game_rounds = []
        max_round = 500
        yield_rate = 0.92
        win_prob = 0.53
        initial_money = 100
        initial_bet = 10



        for cnt in range(100000):
            money = initial_money
            game_round = 1
            loss_cnt = 0
            bet = initial_bet

            while game_round <= max_round:
                #pay
                money -= bet
                if money < 0:
                    break
                

                if random.random() < win_prob:
                    money += bet * (1 + yield_rate)
                    bet = max(initial_bet, money * 0.05)
                    loss_cnt = 0
                else:
                    bet *= multiple
                    bet = min(bet, money)
                    loss_cnt += 1
                    
                if loss_cnt >= max_loss_cnt:
                    bet = max(initial_bet, money * 0.05)
                    loss_cnt = 0

                game_round += 1

        #     print(cnt, money)
            res.append(money)
            game_rounds.append(game_round)

        res = np.array(res)
        print(max_loss_cnt, multiple, np.mean(game_rounds), (res.mean() - initial_money) / initial_money)


1 1.0 305.69577 0.6608708789274419
1 1.1 306.40975 0.663025775909891
1 1.2000000000000002 306.23135 0.6640035516901759
1 1.3000000000000003 305.62953 0.6587292951820575
1 1.4000000000000004 305.97651 0.6545685286851003
1 1.5000000000000004 306.93426 0.6738179790036876
1 1.6000000000000005 306.65499 0.6562617888499832
1 1.7000000000000006 306.63414 0.6589306337105952
1 1.8000000000000007 306.69091 0.6565035488437906
1 1.9000000000000008 306.01923 0.6482358676308649
1 2.000000000000001 305.67143 0.664368398407475
2 1.0 309.1374 0.6683903234757753
2 1.1 302.33539 0.6898223558138864
2 1.2000000000000002 295.41401 0.69480953984834
2 1.3000000000000003 287.88708 0.7446633541287536
2 1.4000000000000004 280.762 0.7774246919018373
2 1.5000000000000004 272.57473 0.7752640114942946
2 1.6000000000000005 264.59965 0.8158812159706994
2 1.7000000000000006 258.15348 0.8140406704686504
2 1.8000000000000007 249.44265 0.8184739555507474
2 1.9000000000000008 243.93001 0.8964249294106773
2 2.00000000000000

In [3]:
from collections import defaultdict

round_money_dict = {}


for max_loss_cnt in range(2, 5, 1):
    for multiple in np.arange(1.0, 2.1, 0.1):
        res = []
        game_rounds = []
        max_round = 1000
        yield_rate = 0.92
        win_prob = 0.53
        initial_money = 100
        initial_bet = 10

        round_dict = defaultdict(int)
        money_dict = defaultdict(int)

        for cnt in range(100000):
            money = initial_money
            game_round = 1
            loss_cnt = 0
            bet = initial_bet

            while game_round <= max_round:
                round_dict[game_round] += 1
                money_dict[game_round] += money
#                 round_money.append((game_round, money))
                
                if money < initial_bet:
                    break
                #pay
                money -= bet
                
                if random.random() < win_prob:
                    money += bet * (1 + yield_rate)
                    bet = max(initial_bet, money * 0.05)
                    loss_cnt = 0
                else:
                    bet *= multiple
                    bet = min(bet, money)
                    loss_cnt += 1
                    
                if loss_cnt >= max_loss_cnt:
                    bet = max(initial_bet, money * 0.05)
                    loss_cnt = 0
                
                game_round += 1

        #     print(cnt, money)
            res.append(money)
            game_rounds.append(game_round)
        
        
#         round_money = pd.DataFrame(round_money, columns=['round', 'money'])
#         round_money = round_money.groupby('round').mean()
        round_money_dict[(max_loss_cnt, multiple)] = (round_dict, money_dict)
        res = np.array(res)
        print(max_loss_cnt, multiple, np.mean(game_rounds), np.mean(res - initial_money) / initial_money, 
              np.percentile(res - initial_money, 50) / initial_money,
              np.percentile(res - initial_money, 70) / initial_money,
              np.percentile(res - initial_money, 80) / initial_money,
              np.percentile(res - initial_money, 90) / initial_money,
              np.percentile(res - initial_money, 95) / initial_money,
             )

2 1.0 497.34161 1.7321506385029302 -0.9173935614665993 0.5010617426645766 2.1177743092921775 5.796684062253616 11.379306575260046
2 1.1 484.3472 1.786139171998059 -0.9209329477142154 0.24259945301470814 2.0142261892414894 5.830130874277447 11.828039848975068
2 1.2000000000000002 468.4776 1.8363206652745152 -0.9228900137896031 -0.17062451801395062 1.8908430081332375 5.7904892018563086 11.907184977476389
2 1.3000000000000003 455.60052 1.903034259626721 -0.92603110514081 -0.9003999999999999 1.7123287023323757 5.744202180128225 12.092654729135438
2 1.4000000000000004 439.23783 1.9606454782626002 -0.9294079999999986 -0.9023999999999995 1.474082483814032 5.5205938395188765 12.225417319331722
2 1.5000000000000004 423.45731 2.192238778559939 -0.9319999999999999 -0.9048799999999982 1.2744763235052183 5.342549567493485 12.28701364817184
2 1.6000000000000005 406.76995 2.0346663048167506 -0.9352000000000003 -0.9057497190684711 0.9769142204266684 4.930957673027763 12.14601923941272
2 1.700000000000