In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import skellam
import seaborn as sns
%matplotlib inline




In [37]:
# Read the season file and keep only the team names, the goals of each side
# and the result code, under shorter column names.
Pred_Data = pd.read_csv("../Data/2009-2010.csv")[['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR']]
Pred_Data.columns = ['Home', 'Away', 'HG', 'AG', 'Result']
# Team labels, in the order produced by value_counts on the home column.
Team = Pred_Data['Home'].value_counts().index


In [38]:
# Share of matches whose recorded result is 'H' (home win).
len(Pred_Data.loc[Pred_Data['Result'] == 'H']) / len(Pred_Data)

0.48947368421052634

# Useful functions for our predictions


In [4]:
#Function to calculate the mean for all the Data
def score_coef(att,dif,delta,Data):
    """Compute the Poisson means of home and away goals for every match in ``Data``.

    Parameters
    ----------
    att, dif : pandas.Series
        Per-team coefficients, indexed by team name.
    delta : float
        Term added to the home side's log-mean only.
    Data : pandas.DataFrame
        One row per match, with ``Home`` and ``Away`` columns holding team names.

    Returns
    -------
    (mu1, mu2) : tuple of numpy.ndarray
        ``mu1[k] = exp(delta + att[home_k] - dif[away_k])`` and
        ``mu2[k] = exp(att[away_k] - dif[home_k])``, in the row order of ``Data``.

    Raises
    ------
    KeyError
        If a team in ``Data`` is missing from ``att`` or ``dif``.
    """
    # Work on the frame that was passed in (not on a notebook-level global),
    # so the function is correct for any match table.
    home = Data['Home'].to_numpy()
    away = Data['Away'].to_numpy()
    # Vectorised label lookups replace the per-row np.append loop.
    att_home = att.loc[home].to_numpy(dtype=float)
    att_away = att.loc[away].to_numpy(dtype=float)
    dif_home = dif.loc[home].to_numpy(dtype=float)
    dif_away = dif.loc[away].to_numpy(dtype=float)
    mu1 = np.exp(delta + att_home - dif_away)
    mu2 = np.exp(att_away - dif_home)
    return mu1, mu2

In [46]:
#Function to calculate the mean for a given match
def calc_score(att_h,dif_h,att_a,dif_a):
    """Return the pair of goal means (home side, away side) for a single match.

    Each mean is ``exp(att of the scoring side - dif of the opponent)``;
    no home-advantage term is added here.
    """
    return np.exp(att_h - dif_a), np.exp(att_a - dif_h)

In [47]:
#Calculate the probability of winning against another team without home coef
def probawin(A,B,k=0):
    """Probability that A outscores B by more than ``k`` goals (no home term).

    Parameters
    ----------
    A, B : sequence or pandas.Series
        The first two entries, by position, are used as (att, dif) of each team.
    k : int, default 0
        Goal-difference threshold; ``k=0`` gives the probability that A wins.

    Returns
    -------
    float
        ``1 - skellam.cdf(k, mu_h, mu_a)``, where ``mu_h`` and ``mu_a`` are
        the goal means returned by ``calc_score``.
    """
    # Unpack by position explicitly: ``A[0]`` on a label-indexed Series only
    # works through pandas' deprecated positional fallback.
    att_h, dif_h = np.asarray(A, dtype=float)[:2]
    att_a, dif_a = np.asarray(B, dtype=float)[:2]
    mu_h, mu_a = calc_score(att_h, dif_h, att_a, dif_a)
    return 1 - skellam.cdf(k, mu_h, mu_a)
 

# Prediction Using Metropolis within Gibbs results

In [7]:
#Get the results obtained with our MCMC
Coef_club=pd.read_csv('../Data/Coef_MCMC_Poiss.csv')
# Index by team name and sort, so rows can be looked up by label.
Coef_club=Coef_club.set_index('Team').sort_index()
Team=Pred_Data.Home.value_counts().index
# Take the first row's value by position: the index holds team names, so a
# bare [0] would rely on the deprecated positional fallback of Series.__getitem__.
delta=Coef_club['Home_adv'].iloc[0]
classement_mean=pd.DataFrame({'Team':Team})


In [8]:
# Keep only the Sum_Att and Sum_Dif columns of the coefficient table and display them.
Sum_Coef_club = Coef_club.loc[:, ['Sum_Att', 'Sum_Dif']]
Sum_Coef_club


Unnamed: 0_level_0,Sum_Att,Sum_Dif
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
Atalanta,-0.024221,-0.075977
Bari,-0.046574,0.028818
Bologna,-0.206872,-0.142026
Cagliari,-0.006719,-0.06977
Catania,-0.192182,-0.017661
Chievo,-0.290591,0.079018
Fiorentina,0.03582,0.153798
Genoa,0.040527,-0.067544
Inter,0.406135,0.310904
Juventus,0.290654,0.070085


In [9]:
# Poisson means of home (mu1) and away (mu2) goals for every match.
mu1, mu2 = score_coef(
    att=Coef_club['Sum_Att'],
    dif=Coef_club['Sum_Dif'],
    delta=delta,
    Data=Pred_Data,
)

In [10]:
#Results for one simulation of Poisson with our means mu1 and mu2
Pred_Data['predH']=np.random.poisson(mu1)
Pred_Data['predA']=np.random.poisson(mu2)

#Label every simulated match: 'H' home win, 'A' away win, 'D' draw
pred_result=np.where(Pred_Data['predH']>Pred_Data['predA'],'H',
                     np.where(Pred_Data['predH']<Pred_Data['predA'],'A','D'))
Pred_Data['pred_result']=pred_result
#Count the number of mispredicted matches (1 = wrong, 0 = right).
#Every row is compared, using len(Pred_Data) rather than a hard-coded count,
#so the last match is not silently scored as correct.
diff_score=(Pred_Data['Result']!=Pred_Data['pred_result']).to_numpy().astype(float)
wrong_res=int(diff_score.sum())

#Share of correctly predicted results
(len(Pred_Data)-wrong_res)/len(Pred_Data)

0.4342105263157895

In [27]:
# Run 1000 Poisson simulations of the whole season; each simulation is stored
# as a pair of columns and the column names are remembered for averaging.
totpredH = []
totpredA = []
for i in range(1000):
    home_pred, away_pred = "iter"+str(i)+"home", "iter"+str(i)+"away"
    # Home goals are drawn before away goals in every iteration.
    Pred_Data[home_pred] = np.random.poisson(mu1)
    Pred_Data[away_pred] = np.random.poisson(mu2)
    totpredH += [home_pred]
    totpredA += [away_pred]



In [28]:
# Per-match average of the simulated goals: add up all simulation columns
# (starting from 0) and divide by the number of simulations.
avg_home = sum(Pred_Data[col] for col in totpredH) / len(totpredH)
avg_away = sum(Pred_Data[col] for col in totpredA) / len(totpredA)


In [29]:
# Store the averaged simulated goals next to the real scores.
home_goal, away_goal = avg_home, avg_away
Pred_Data['New_HG'], Pred_Data['New_AG'] = home_goal, away_goal

In [30]:
#Create a column with the results
#Decision rule on the averaged goal difference (home - away):
#  > 0.25 -> 'H', < 0 -> 'A', anything in between -> 'D'.
#NOTE(review): the asymmetric 0.25 / 0 cut-offs look hand-tuned - confirm.
#Computed in one vectorised pass instead of row-by-row iloc and concatenate.
goal_diff=Pred_Data['New_HG']-Pred_Data['New_AG']
pred_result=np.where(goal_diff>0.25,'H',np.where(goal_diff<0.,'A','D'))
Pred_Data['New_pred_result']=pred_result


In [31]:
#Count the number of mispredicted results (1 = wrong, 0 = right).
#Every row is compared, using len(Pred_Data) rather than a hard-coded count,
#so the last match is not silently scored as correct.
diff_score=(Pred_Data['Result']!=Pred_Data['New_pred_result']).to_numpy().astype(float)
wrong_res=int(diff_score.sum())

#Share of correctly predicted results
(len(Pred_Data)-wrong_res)/len(Pred_Data)

0.5026315789473684

In [23]:
# Build a league table from the predicted results: a team earns 3 points for
# each match predicted as its win ('H' at home, 'A' away) and 1 point for each
# predicted draw.
Score = np.array([])
for club in Team:
    at_home = Pred_Data.loc[Pred_Data.Home == club, 'New_pred_result']
    on_road = Pred_Data.loc[Pred_Data.Away == club, 'New_pred_result']
    wins = (at_home == 'H').sum() + (on_road == 'A').sum()
    draws = (at_home == 'D').sum() + (on_road == 'D').sum()
    Score = np.append(Score, 3 * wins + draws)
classement_mean['Classifica'] = Score
classement_mean.sort_values(by='Classifica', ascending=False)

Unnamed: 0,Team,Classifica
10,Inter,110.0
9,Milan,101.0
11,Roma,99.0
14,Juventus,89.0
15,Fiorentina,76.0
4,Palermo,63.0
7,Napoli,52.0
1,Bari,52.0
18,Sampdoria,51.0
5,Genoa,51.0


Our ranking is consistent with the real final standings, but on its own it does not tell us how many individual match results were predicted correctly.

In [39]:
# Empty frame indexed by the alphabetically sorted team names; it is filled
# with the pairwise probabilities later on.
prob = pd.DataFrame({'Team': Team}).set_index('Team').sort_index()


In [52]:
# Pairwise matrix: proba_tot[i][j] is probawin(team i, team j); the diagonal
# (a team against itself) is set to 0.
n_clubs = len(Sum_Coef_club)
proba_tot = [
    [0 if col == row else probawin(Sum_Coef_club.iloc[row], Sum_Coef_club.iloc[col])
     for col in range(n_clubs)]
    for row in range(n_clubs)
]

    

In [50]:
# Copy the matrix into `prob`, one opponent column at a time (teams sorted by name).
XA = Team.sort_values()
by_opponent = np.transpose(proba_tot)
for j, i in enumerate(XA):
    prob[i] = by_opponent[j]

In [51]:
# Display the probability table with a diverging colour gradient.
diverging_cmap = sns.palettes.diverging_palette(h_neg=0, h_pos=243, s=75, l=40,
                                                as_cmap=True)
prob.style.background_gradient(cmap=diverging_cmap)


Unnamed: 0_level_0,Atalanta,Bari,Bologna,Cagliari,Catania,Chievo,Fiorentina,Genoa,Inter,Juventus,Lazio,Livorno,Milan,Napoli,Palermo,Parma,Roma,Sampdoria,Siena,Udinese
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Atalanta,0.0,0.323468,0.412146,0.344593,0.368024,0.357069,0.271713,0.33313,0.163102,0.237584,0.327003,0.477813,0.212732,0.313816,0.302274,0.412203,0.214793,0.320598,0.392243,0.345851
Bari,0.366332,0.0,0.425829,0.360625,0.381551,0.369056,0.286857,0.349713,0.17898,0.25548,0.34187,0.489169,0.228838,0.329404,0.320085,0.426416,0.232705,0.336334,0.40524,0.363618
Bologna,0.28285,0.26041,0.0,0.277507,0.300958,0.293115,0.215845,0.266892,0.123097,0.183549,0.263701,0.40116,0.16464,0.25146,0.23877,0.338942,0.16425,0.257127,0.32284,0.276323
Cagliari,0.357436,0.329979,0.419494,0.0,0.374876,0.363609,0.277547,0.339968,0.167412,0.243302,0.333534,0.485453,0.217835,0.320273,0.308887,0.419622,0.220177,0.327161,0.399296,0.353018
Catania,0.312682,0.288332,0.368712,0.307361,0.0,0.318399,0.242005,0.297019,0.145837,0.211614,0.291505,0.429218,0.189664,0.279676,0.26926,0.368774,0.191404,0.285766,0.350489,0.308389
Chievo,0.304057,0.280414,0.356172,0.299108,0.31763,0.0,0.236577,0.289614,0.146493,0.209552,0.2833,0.41279,0.187815,0.272529,0.264005,0.356576,0.19051,0.278402,0.33828,0.301135
Fiorentina,0.419874,0.389182,0.479062,0.414139,0.432295,0.416787,0.0,0.403495,0.220044,0.305496,0.392407,0.541488,0.273841,0.380331,0.374144,0.480492,0.281218,0.38793,0.456428,0.420183
Genoa,0.37328,0.344815,0.436571,0.367204,0.390742,0.378877,0.290666,0.0,0.176586,0.255817,0.34844,0.503501,0.228994,0.33491,0.323663,0.436814,0.231846,0.34205,0.415721,0.369186
Inter,0.590914,0.553849,0.649374,0.585075,0.598055,0.576597,0.490074,0.574867,0.0,0.466124,0.556932,0.70836,0.420874,0.545324,0.545728,0.652297,0.437401,0.554508,0.6232,0.597372
Juventus,0.494975,0.460399,0.559897,0.488593,0.508609,0.491463,0.397689,0.47675,0.26349,0.0,0.464028,0.626339,0.327267,0.450421,0.443835,0.561497,0.336411,0.459049,0.535201,0.495774


In [27]:
# Count of each result code in the recorded data, for comparison with the predicted counts.
Pred_Data['Result'].value_counts()

H    186
D    102
A     92
Name: Result, dtype: int64

In [28]:
# Count of each result code predicted from the averaged simulations.
Pred_Data['New_pred_result'].value_counts()

H    246
A     77
D     57
Name: New_pred_result, dtype: int64

In [29]:
# Count of each result code predicted by the single Poisson simulation.
Pred_Data['pred_result'].value_counts()

H    172
D    110
A     98
Name: pred_result, dtype: int64

# Prediction Using MALA results


In [55]:
#Get the coefficients obtained with MALA
Coef_club1=pd.read_csv('Coef_MALA_Poiss.csv')
# Index by team name and sort, so rows can be looked up by label.
Coef_club1=Coef_club1.set_index('Team').sort_index()
Team=Pred_Data.Home.value_counts().index
# Take the first row's value by position: the index holds team names, so a
# bare [0] would rely on the deprecated positional fallback of Series.__getitem__.
delta1=Coef_club1['Home_Adv'].iloc[0]
classement_mean=pd.DataFrame({'Team':Team})
Sum_Coef_club1=Coef_club1[['Sum_Att','Sum_Dif']]

In [56]:
#Poisson means of home and away goals under the MALA coefficients
mu1, mu2=score_coef(Coef_club1['Sum_Att'],Coef_club1['Sum_Dif'],delta1,Pred_Data)
totpredH_MALA= list()
totpredA_MALA= list()
#1000 Poisson simulations of every match, one pair of columns per simulation
for i in range(1000):
    home_pred_M = "iter"+str(i)+"home"
    away_pred_M = "iter"+str(i)+"away"
    Pred_Data[home_pred_M]=np.random.poisson(mu1)
    Pred_Data[away_pred_M]=np.random.poisson(mu2)
    totpredH_MALA.append(home_pred_M)
    totpredA_MALA.append(away_pred_M)

#Per-match average of the simulated goals
avg_home=sum(Pred_Data[col] for col in totpredH_MALA)/len(totpredH_MALA)
avg_away=sum(Pred_Data[col] for col in totpredA_MALA)/len(totpredA_MALA)

home_goal=(avg_home)
away_goal=(avg_away)

Pred_Data['New_HG_MALA']=home_goal
Pred_Data['New_AG_MALA']=away_goal

#Decision rule on the averaged goal difference (home - away):
#  > 0.25 -> 'H', < 0 -> 'A', anything in between -> 'D'.
#NOTE(review): the asymmetric 0.25 / 0 cut-offs look hand-tuned - confirm.
goal_diff=Pred_Data['New_HG_MALA']-Pred_Data['New_AG_MALA']
pred_result=np.where(goal_diff>0.25,'H',np.where(goal_diff<0,'A','D'))
Pred_Data['New_pred_result_MALA']=pred_result


#Count the mispredicted results (1 = wrong, 0 = right).
#Every row is compared, using len(Pred_Data) rather than a hard-coded count,
#so the last match is not silently scored as correct.
diff_score=(Pred_Data['Result']!=Pred_Data['New_pred_result_MALA']).to_numpy().astype(float)
wrong_res=int(diff_score.sum())

#Share of correctly predicted results
(len(Pred_Data)-wrong_res)/len(Pred_Data)

0.5026315789473684

In [57]:
# League table from the MALA predictions: 3 points for each match predicted
# as the team's win ('H' at home, 'A' away), 1 point for each predicted draw.
Score = np.array([])
for club in Team:
    at_home = Pred_Data.loc[Pred_Data.Home == club, 'New_pred_result_MALA']
    on_road = Pred_Data.loc[Pred_Data.Away == club, 'New_pred_result_MALA']
    wins = (at_home == 'H').sum() + (on_road == 'A').sum()
    draws = (at_home == 'D').sum() + (on_road == 'D').sum()
    Score = np.append(Score, 3 * wins + draws)
classement_mean['Classifica'] = Score
classement_mean.sort_values(by='Classifica', ascending=False)

Unnamed: 0,Team,Classifica
16,Inter,84.0
10,Milan,68.0
2,Roma,67.0
4,Palermo,57.0
6,Fiorentina,56.0
8,Juventus,55.0
0,Sampdoria,55.0
7,Catania,53.0
9,Cagliari,53.0
11,Napoli,53.0


In [58]:
# Count of each result code in the recorded data, for comparison with the MALA predictions.
Pred_Data['Result'].value_counts()

H    186
D    102
A     92
Name: Result, dtype: int64

In [59]:
# Count of each result code predicted with the MALA coefficients.
Pred_Data['New_pred_result_MALA'].value_counts()

H    339
D     36
A      5
Name: New_pred_result_MALA, dtype: int64

In [18]:
# Empty frame indexed by the alphabetically sorted team names.
prob = pd.DataFrame({'Team': Team}).set_index('Team').sort_index()

# Pairwise matrix under the MALA coefficients: proba_tot[i][j] is
# probawin(team i, team j); the diagonal is set to 0.
n_clubs = len(Sum_Coef_club1)
proba_tot = [
    [0 if col == row else probawin(Sum_Coef_club1.iloc[row], Sum_Coef_club1.iloc[col])
     for col in range(n_clubs)]
    for row in range(n_clubs)
]
# Copy the matrix into `prob`, one opponent column at a time.
XA = Team.sort_values()
by_opponent = np.transpose(proba_tot)
for j, i in enumerate(XA):
    prob[i] = by_opponent[j]
# Export the colour-graded table to Excel.
diverging_cmap = sns.palettes.diverging_palette(h_neg=0, h_pos=243, s=75, l=40,
                                                as_cmap=True)
prob.style.background_gradient(cmap=diverging_cmap).to_excel("proba_MALA.xlsx")


# Prediction Using Gradient Descent Method


In [60]:
#Get the coefficients obtained with gradient descent
Coef_club2=pd.read_csv('Coef_Gradient_Descent.csv')
# Index by team name and sort, so rows can be looked up by label.
Coef_club2=Coef_club2.set_index('Team').sort_index()
Team=Pred_Data.Home.value_counts().index
# Take the first row's value by position: the index holds team names, so a
# bare [0] would rely on the deprecated positional fallback of Series.__getitem__.
delta2=Coef_club2['Home_Adv'].iloc[0]
classement_mean=pd.DataFrame({'Team':Team})
# From here on only the two per-team coefficient columns are kept.
Coef_club2=Coef_club2[['Pred_Att', 'Pred_Dif']]

In [67]:
#Poisson means of home and away goals under the gradient-descent coefficients
mu1, mu2=score_coef(Coef_club2['Pred_Att'],Coef_club2['Pred_Dif'],delta2,Pred_Data)
totpredH_Grad= list()
totpredA_Grad= list()
#1000 Poisson simulations of every match, one pair of columns per simulation
for i in range(1000):
    home_pred_G = "iter"+str(i)+"home"
    away_pred_G = "iter"+str(i)+"away"
    Pred_Data[home_pred_G]=np.random.poisson(mu1)
    Pred_Data[away_pred_G]=np.random.poisson(mu2)
    totpredH_Grad.append(home_pred_G)
    totpredA_Grad.append(away_pred_G)

#Per-match average of the simulated goals
avg_home=sum(Pred_Data[col] for col in totpredH_Grad)/len(totpredH_Grad)
avg_away=sum(Pred_Data[col] for col in totpredA_Grad)/len(totpredA_Grad)

home_goal=(avg_home)
away_goal=(avg_away)

Pred_Data['New_HG_Grad']=home_goal
Pred_Data['New_AG_Grad']=away_goal

#Decision rule on the averaged goal difference (home - away):
#  > 0.25 -> 'H', < 0 -> 'A', anything in between -> 'D'.
#NOTE(review): the asymmetric 0.25 / 0 cut-offs look hand-tuned - confirm.
goal_diff=Pred_Data['New_HG_Grad']-Pred_Data['New_AG_Grad']
pred_result=np.where(goal_diff>0.25,'H',np.where(goal_diff<0,'A','D'))
Pred_Data['New_pred_result_Grad']=pred_result


#Count the mispredicted results (1 = wrong, 0 = right).
#Every row is compared, using len(Pred_Data) rather than a hard-coded count,
#so the last match is not silently scored as correct.
diff_score=(Pred_Data['Result']!=Pred_Data['New_pred_result_Grad']).to_numpy().astype(float)
wrong_res=int(diff_score.sum())

#Share of correctly predicted results
(len(Pred_Data)-wrong_res)/len(Pred_Data)

0.5078947368421053

In [68]:
# League table from the gradient-descent predictions: 3 points for each match
# predicted as the team's win ('H' at home, 'A' away), 1 point for each predicted draw.
Score = np.array([])
for club in Team:
    at_home = Pred_Data.loc[Pred_Data.Home == club, 'New_pred_result_Grad']
    on_road = Pred_Data.loc[Pred_Data.Away == club, 'New_pred_result_Grad']
    wins = (at_home == 'H').sum() + (on_road == 'A').sum()
    draws = (at_home == 'D').sum() + (on_road == 'D').sum()
    Score = np.append(Score, 3 * wins + draws)
classement_mean['Classifica'] = Score
classement_mean.sort_values(by='Classifica', ascending=False)

Unnamed: 0,Team,Classifica
16,Inter,108.0
2,Roma,99.0
10,Milan,97.0
8,Juventus,88.0
6,Fiorentina,74.0
4,Palermo,60.0
15,Udinese,56.0
11,Napoli,54.0
14,Genoa,51.0
0,Sampdoria,51.0


In [29]:
# Empty frame indexed by the alphabetically sorted team names.
prob=pd.DataFrame({'Team':Team})
prob=prob.set_index('Team').sort_index()

#Create Matrix containing all the probabilities
#Both teams of every pairing are read from the gradient-descent table
#(Coef_club2), so the matrix never mixes coefficients from two methods.
proba_tot=[]
for i in range(len(Coef_club2)):
    A=Coef_club2.iloc[i]
    proba_A=[]
    for j in range(len(Coef_club2)):
        if j==i:
            # A team never plays itself: fill the diagonal with 0.
            proba_A.append(0)
        else:
            B=Coef_club2.iloc[j]
            proba_A.append(probawin(A,B))
    proba_tot.append(proba_A)
# Copy the matrix into `prob`, one opponent column at a time.
XA=Team.sort_values()
by_opponent=np.transpose(proba_tot)
for j,i in enumerate(XA):
    prob[i]=by_opponent[j]
# Export the colour-graded table to Excel.
prob.style.background_gradient(
    cmap = sns.palettes.diverging_palette(h_neg=0, h_pos=243, s=75, l=40,
                                          as_cmap=True)).to_excel("prob_Gradient.xlsx")


In [69]:
# Count of each result code in the recorded data, for comparison with the gradient-descent predictions.
Pred_Data['Result'].value_counts()

H    186
D    102
A     92
Name: Result, dtype: int64

In [70]:
# Count of each result code predicted with the gradient-descent coefficients.
Pred_Data['New_pred_result_Grad'].value_counts()

H    248
A     68
D     64
Name: New_pred_result_Grad, dtype: int64

In [71]:
# Show the 'Home_Adv' value read from the gradient-descent coefficient file.
delta2

0.3917771531211016