In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('results.csv')

df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False


In [3]:
df = df.dropna(axis = 0)

df.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False


In [4]:
tourn_dict = {'Friendly':20,
             'FIFA World Cup':60,
             'AFC Asian Cup': 50,
             'African Cup of Nations':50,
             'African Nations Championship':50,
             'Gold Cup': 50,
             'CONCACAF Nations League': 50,
             'Copa América': 50,
             'Oceania Nations Cup': 50,
             'UEFA Euro': 50,
             'UEFA Nations League':50}


df['tournament_weight'] = df['tournament'].replace(tourn_dict)

df['tournament_weight'] = df.apply(lambda x: 40 if 'qualification' in x['tournament'] else x['tournament_weight'], axis=1)

df["tournament_weight"] = df["tournament_weight"].replace(regex='([a-zA-Z])', value=30)


In [5]:
df['home_goal_difference'] = df['home_score']-df['away_score']

In [6]:
def parse_homeresult(x):
    if x > 0:
       return 1
    elif x == 0:
       return 0.5
    else:
       return 0

df['result'] = df['home_goal_difference'].apply(parse_homeresult)

In [7]:
df['home_bonus'] = [100 if x else 0 for x in df['neutral']]

In [8]:
def parse_goaldiffindex(x):
    x = np.abs(x)
    if x <= 1:
        return 1
    elif x == 2:
        return 3/2
    else:
        return (11+x)/8

df['g_index'] = df['home_goal_difference'].apply(parse_goaldiffindex)


In [9]:
df['home_elo'] = 0
df['away_elo'] = 0

In [10]:
df

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,tournament_weight,home_goal_difference,result,home_bonus,g_index,home_elo,away_elo
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False,20,0.0,0.5,0,1.00,0,0
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False,20,2.0,1.0,0,1.50,0,0
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False,20,1.0,1.0,0,1.00,0,0
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False,20,0.0,0.5,0,1.00,0,0
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False,20,3.0,1.0,0,1.75,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44099,2022-11-16,Turkey,Scotland,2.0,1.0,Friendly,Diyarbakir,Turkey,False,20,1.0,1.0,0,1.00,0,0
44100,2022-11-16,United Arab Emirates,Argentina,0.0,5.0,Friendly,Abu Dhabi,United Arab Emirates,False,20,-5.0,0.0,0,2.00,0,0
44101,2022-11-16,Uzbekistan,Kazakhstan,2.0,0.0,Friendly,Tashkent,Uzbekistan,False,20,2.0,1.0,0,1.50,0,0
44102,2022-11-16,Lithuania,Iceland,0.0,0.0,Baltic Cup,Kaunas,Lithuania,False,30,0.0,0.5,0,1.00,0,0


In [11]:
class PremTeam:
    def __init__(self, name):
        self.team_name = name
        self.elo_rating = 1500

In [12]:
class EloCalculator:

    def update_single_fixture(self,fixture, teams):

        home_rating = teams[fixture['home_team']].elo_rating + fixture['home_bonus']
        away_rating = teams[fixture['away_team']].elo_rating

        dr = home_rating-away_rating

        expected_result = 1/(10**(-dr/600)+1)
        
        points_change = fixture['tournament_weight']*fixture['g_index']*(fixture['result'] - expected_result)

      
        # update elo ranking by adding the coefficient
        teams[fixture['home_team']].elo_rating += points_change
        teams[fixture['away_team']].elo_rating -= points_change

        

In [13]:
# Create list of all team names over parsed seasons.
# For every team that has a fixture in our data, add
# this team to the set of all team names

team_names = set()
for team in np.unique(df[['home_team', 'away_team']].values):
    team_names.add(team)

In [14]:
# Create a dictionary of all teams.
# The key is the team name and the value an
# instance of PremTeam class
teams = {}
for team in team_names:
    teams[team] = PremTeam(team)

In [15]:
elo = EloCalculator()

In [16]:
  
# Count the number of matches in the season
n_matches = df.shape[0]
    
# As we have sorted our SeasonData dataframe, we can access each fixture
# in order and update the each teams elo rating. The teams are stored in the
# dict called teams which we pass into this function
for i in range(n_matches):
    df.loc[i,['home_elo']] = teams[df.iloc[i]['home_team']].elo_rating
    df.loc[i,['away_elo']] = teams[df.iloc[i]['away_team']].elo_rating
    
    elo.update_single_fixture(df.iloc[i], teams)

# Print the up to date elo_ratings
for team in teams.keys():
    print(team, teams[team].elo_rating) 

San Marino 610.4957930241227
Timor-Leste 701.8043826930356
Åland Islands 1520.4712138879593
El Salvador 1622.9466454938408
Serbia 2038.8507820324926
Iraqi Kurdistan 1664.3905141173384
Cambodia 959.1325148242402
Benin 1521.6299512638443
Ivory Coast 1821.8904051291734
Uzbekistan 1806.9258242254325
Bermuda 1400.6981814152573
Biafra 1512.5641370972644
Moldova 1180.8000626992077
Saint Vincent and the Grenadines 1233.3902971412538
Latvia 1312.6096080675436
Chameria 1544.4574580823198
Russia 1895.8661378516238
Republic of St. Pauli 1497.9527109455262
Artsakh 1582.9590203160665
Western Australia 1443.1380412707686
Belize 1176.9127293314887
Romani people 1491.9230588083742
Singapore 1280.2903329156109
Yemen 1134.5162726929157
Chad 1227.443936947773
Estonia 1408.650538411149
Sint Maarten 1084.989576638182
Brittany 1522.3418753245662
Barbados 1111.2693715018768
Gibraltar 1021.0318756990733
Isle of Wight 1651.8868344954903
Cape Verde 1600.4543262935526
Bangladesh 920.0139562228017
South Sudan 1118

In [17]:
df['date'] =  pd.to_datetime(df['date'], format='%Y-%m-%d')
df

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,tournament_weight,home_goal_difference,result,home_bonus,g_index,home_elo,away_elo
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False,20,0.0,0.5,0,1.00,1500.000000,1500.000000
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False,20,2.0,1.0,0,1.50,1500.000000,1500.000000
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False,20,1.0,1.0,0,1.00,1485.000000,1515.000000
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False,20,0.0,0.5,0,1.00,1504.424989,1495.575011
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False,20,3.0,1.0,0,1.75,1495.744810,1504.255190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44099,2022-11-16,Turkey,Scotland,2.0,1.0,Friendly,Diyarbakir,Turkey,False,20,1.0,1.0,0,1.00,1793.369721,1885.664434
44100,2022-11-16,United Arab Emirates,Argentina,0.0,5.0,Friendly,Abu Dhabi,United Arab Emirates,False,20,-5.0,0.0,0,2.00,1739.182993,2345.898568
44101,2022-11-16,Uzbekistan,Kazakhstan,2.0,0.0,Friendly,Tashkent,Uzbekistan,False,20,2.0,1.0,0,1.50,1801.970610,1379.777685
44102,2022-11-16,Lithuania,Iceland,0.0,0.0,Baltic Cup,Kaunas,Lithuania,False,30,0.0,0.5,0,1.00,1179.660053,1621.812622


In [18]:
teams['Qatar'].elo_rating

1904.5789896389106

In [19]:
teams['Ecuador'].elo_rating

1987.5760642929533

In [20]:
diff = teams['Qatar'].elo_rating + 100 - teams['Ecuador'].elo_rating
Homewin = 1/(10**(-diff/600)+1)

In [21]:
Homewin

0.5163069988861212

In [22]:
df['home_xgf'] = 0

In [23]:
home_team_stats = df[['date','home_team','home_score','away_score']]
away_team_stats = df[['date','away_team','home_score','away_score']]

home_team_stats.columns = ['date','team','goals_for','goals_against']
away_team_stats.columns = ['date','team','goals_against','goals_for']

team_stats_per_match = home_team_stats.append(away_team_stats)

team_stats_per_match

  team_stats_per_match = home_team_stats.append(away_team_stats)


Unnamed: 0,date,team,goals_for,goals_against
0,1872-11-30,Scotland,0.0,0.0
1,1873-03-08,England,4.0,2.0
2,1874-03-07,Scotland,2.0,1.0
3,1875-03-06,England,2.0,2.0
4,1876-03-04,Scotland,3.0,0.0
...,...,...,...,...
44099,2022-11-16,Scotland,1.0,2.0
44100,2022-11-16,Argentina,5.0,0.0
44101,2022-11-16,Kazakhstan,0.0,2.0
44102,2022-11-16,Iceland,0.0,0.0


In [24]:
team_stats_per_match['xG10'] = team_stats_per_match['goals_for'].rolling(10,min_periods=1).mean()
team_stats_per_match['xGA10'] = team_stats_per_match['goals_against'].rolling(10,min_periods=1).mean()

team_stats_per_match

Unnamed: 0,date,team,goals_for,goals_against,xG10,xGA10
0,1872-11-30,Scotland,0.0,0.0,0.0,0.00
1,1873-03-08,England,4.0,2.0,2.0,1.00
2,1874-03-07,Scotland,2.0,1.0,2.0,1.00
3,1875-03-06,England,2.0,2.0,2.0,1.25
4,1876-03-04,Scotland,3.0,0.0,2.2,1.00
...,...,...,...,...,...,...
44099,2022-11-16,Scotland,1.0,2.0,0.9,1.00
44100,2022-11-16,Argentina,5.0,0.0,1.2,0.80
44101,2022-11-16,Kazakhstan,0.0,2.0,1.0,0.90
44102,2022-11-16,Iceland,0.0,0.0,0.8,0.80


In [25]:
# Re-segment the home and away teams.

home_team_stats = team_stats_per_match.iloc[:int(team_stats_per_match.shape[0]/2),:]
away_team_stats = team_stats_per_match.iloc[int(team_stats_per_match.shape[0]/2):,:]

In [26]:
home_team_stats.columns = ['team_1_'+str(col) for col in home_team_stats.columns]
away_team_stats.columns = ['team_2_'+str(col) for col in away_team_stats.columns]

match_stats = pd.concat([home_team_stats, away_team_stats.reset_index(drop=True)], axis=1, ignore_index=False)

In [27]:
df = pd.concat([df, match_stats], axis=1, ignore_index=False)

In [28]:
df.columns

Index(['date', 'home_team', 'away_team', 'home_score', 'away_score',
       'tournament', 'city', 'country', 'neutral', 'tournament_weight',
       'home_goal_difference', 'result', 'home_bonus', 'g_index', 'home_elo',
       'away_elo', 'home_xgf', 'team_1_date', 'team_1_team',
       'team_1_goals_for', 'team_1_goals_against', 'team_1_xG10',
       'team_1_xGA10', 'team_2_date', 'team_2_team', 'team_2_goals_for',
       'team_2_goals_against', 'team_2_xG10', 'team_2_xGA10'],
      dtype='object')

In [29]:
df_final = df[['home_score', 'home_elo', 'away_elo', 'team_1_xG10', 'team_1_xGA10', 'team_2_xG10', 'team_2_xGA10']]

In [30]:
df_final['xG'] = df_final['team_1_xG10']*df_final['team_2_xGA10']
df_final['xGA'] = df_final['team_1_xGA10']*df_final['team_2_xG10']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['xG'] = df_final['team_1_xG10']*df_final['team_2_xGA10']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['xGA'] = df_final['team_1_xGA10']*df_final['team_2_xG10']


In [31]:
df_final

Unnamed: 0,home_score,home_elo,away_elo,team_1_xG10,team_1_xGA10,team_2_xG10,team_2_xGA10,xG,xGA
0,0.0,1500.000000,1500.000000,0.0,0.00,0.7,0.9,0.00,0.00
1,4.0,1500.000000,1500.000000,2.0,1.00,0.9,1.2,2.40,0.90
2,2.0,1485.000000,1515.000000,2.0,1.00,0.9,1.4,2.80,0.90
3,2.0,1504.424989,1495.575011,2.0,1.25,1.0,1.6,3.20,1.25
4,3.0,1495.744810,1504.255190,2.2,1.00,1.0,1.8,3.96,1.00
...,...,...,...,...,...,...,...,...,...
44099,2.0,1793.369721,1885.664434,1.0,0.90,0.9,1.0,1.00,0.81
44100,0.0,1739.182993,2345.898568,0.8,1.20,1.2,0.8,0.64,1.44
44101,2.0,1801.970610,1379.777685,0.9,1.00,1.0,0.9,0.81,1.00
44102,0.0,1179.660053,1621.812622,0.8,0.80,0.8,0.8,0.64,0.64


In [4]:
from sklearn.model_selection import train_test_split
X = df_final.drop('home_score',axis = 1)
y = df_final['home_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, random_state=10)


ModuleNotFoundError: No module named 'sklearn'

In [None]:
from sklearn.linear_model import PoissonRegressor
from sklearn.pipeline import Pipeline
from sklearn import metrics

pipeline = Pipeline([('model', PoissonRegressor())])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
r2_test = metrics.r2_score(y_test, y_pred)

r2_test

ModuleNotFoundError: No module named 'sklearn'