In [None]:
import pandas as pd
import numpy as np

: 

In [None]:
# Raw match results. Later cells read the columns date, home_team, away_team,
# home_score, away_score, tournament and neutral from this frame.
RESULTS_CSV = 'results.csv'
df = pd.read_csv(RESULTS_CSV)

: 

In [None]:
# Clean the raw results:
#   1. drop every row that has a missing value,
#   2. parse the ISO formatted date strings,
#   3. put the matches in chronological order (the rating loop further down
#      processes fixtures one after another and assumes they are sorted),
#   4. rebuild a gap-free 0..n-1 index. dropna() keeps the original labels, and
#      later cells address rows by position (range(n) / iloc) as well as by
#      label (loc), which only agree when the index is contiguous.
df = df.dropna(axis=0)
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
# mergesort is stable: matches played on the same day keep their file order.
df = df.sort_values('date', kind='mergesort').reset_index(drop=True)

: 

In [None]:
# Match-importance weight (the K factor of the rating update) per tournament.
# Names must match the 'tournament' column exactly.
tourn_dict = {'Friendly':20,
             'FIFA World Cup':60,
             'AFC Asian Cup': 50,
             'African Cup of Nations':50,
             'African Nations Championship':50,
             'Gold Cup': 50,
             'CONCACAF Nations League': 50,
             'Copa América': 50,
             'Oceania Nations Cup': 50,
             'UEFA Euro': 50,
             'UEFA Nations League':50}

# Weight for any tournament whose name contains 'qualification'.
QUALIFIER_WEIGHT = 40
# Weight for every tournament that is neither listed above nor a qualifier.
OTHER_WEIGHT = 30

tournament = df['tournament']

# Exact-name lookup; tournaments that are not in the table become NaN for now.
tournament_weight = tournament.map(tourn_dict)

# Qualifiers take precedence over the table lookup (plain substring match,
# case-sensitive; missing names never count as qualifiers).
is_qualifier = tournament.str.contains('qualification', regex=False, na=False)
tournament_weight = tournament_weight.mask(is_qualifier, QUALIFIER_WEIGHT)

# Everything still unmatched gets the default weight. The column ends up as a
# plain integer column instead of an object column mixing ints and strings.
df['tournament_weight'] = tournament_weight.fillna(OTHER_WEIGHT).astype(int)

: 

In [None]:
# Goal margin from the home side's point of view (negative = away win).
df['home_goal_difference'] = df['home_score'].sub(df['away_score'])

def parse_homeresult(x):
    """Convert a home goal difference into a match result score.

    Parameters
    ----------
    x : number
        Home goals minus away goals.

    Returns
    -------
    1 for a positive difference, 0.5 for a difference of exactly zero and
    0 for anything else.
    """
    if x == 0:
        return 0.5
    return 1 if x > 0 else 0

# Match outcome for the home side: 1 win, 0.5 draw, 0 loss.
df['result'] = df['home_goal_difference'].apply(parse_homeresult)

# Rating points added to the home side's rating before the expected result is
# computed (see EloCalculator.update_single_fixture).
HOME_ADVANTAGE = 100

# The bonus applies only when the match is NOT flagged as neutral; a neutral
# fixture gives neither side an advantage. The previous expression had the
# condition inverted and granted the bonus exactly on the neutral fixtures.
# NOTE(review): assumes df['neutral'] is truthy for neutral-venue matches - confirm
# against the data source.
df['home_bonus'] = np.where(df['neutral'].astype(bool), 0, HOME_ADVANTAGE)

: 

In [None]:
def parse_goaldiffindex(x):
    """Return the goal-difference multiplier for a match.

    Parameters
    ----------
    x : number
        Goal difference; only its absolute value is used.

    Returns
    -------
    1 when the margin is at most one goal, 3/2 when it is exactly two goals,
    and (11 + margin) / 8 otherwise.
    """
    margin = np.abs(x)
    if margin == 2:
        return 3/2
    if margin <= 1:
        return 1
    return (11 + margin) / 8

# Per-match goal-difference multiplier used to scale the rating change.
df['g_index'] = df['home_goal_difference'].map(parse_goaldiffindex)

: 

In [None]:
class PremTeam:
    """A team together with its current Elo rating.

    Parameters
    ----------
    name : str
        Team name, stored as ``team_name``.
    elo_rating : number, default 1500
        Starting rating. Stored as ``elo_rating`` and updated in place by
        ``EloCalculator.update_single_fixture``.
    """

    def __init__(self, name, elo_rating=1500):
        self.team_name = name
        self.elo_rating = elo_rating

    def __repr__(self):
        # Readable representation for inspecting the teams dict in a notebook.
        return f"PremTeam(name={self.team_name!r}, elo_rating={self.elo_rating!r})"

class EloCalculator:
    """Applies Elo-style rating updates to team objects, one fixture at a time.

    Parameters
    ----------
    scale : number, default 600
        Divisor applied to the rating difference in the expected-result
        formula ``1 / (10 ** (-dr / scale) + 1)``. Must be positive.
        NOTE(review): the default keeps the value that was hard-coded here;
        the classic Elo formula uses 400 - confirm which one is intended.
    """

    def __init__(self, scale=600):
        if scale <= 0:
            raise ValueError("scale must be positive")
        self.scale = scale

    def expected_result(self, rating_difference):
        """Expected score (between 0 and 1) for the side that is ahead by
        ``rating_difference`` rating points."""
        return 1 / (10 ** (-rating_difference / self.scale) + 1)

    def update_single_fixture(self, fixture, teams):
        """Update both teams' ratings in place for one played fixture.

        Parameters
        ----------
        fixture : mapping
            Must provide 'home_team', 'away_team', 'home_bonus',
            'tournament_weight', 'g_index' and 'result' (the home side's
            score for the match).
        teams : dict
            Maps team name to an object with a mutable ``elo_rating``.

        Returns
        -------
        The number of points added to the home team (and subtracted from the
        away team).

        Raises
        ------
        KeyError
            If either team name is missing from ``teams``.
        """
        home_team = teams[fixture['home_team']]
        away_team = teams[fixture['away_team']]

        # The home bonus only shifts the expectation; it is not stored in the rating.
        dr = home_team.elo_rating + fixture['home_bonus'] - away_team.elo_rating

        points_change = (
            fixture['tournament_weight']
            * fixture['g_index']
            * (fixture['result'] - self.expected_result(dr))
        )

        # Zero-sum update: whatever the home side gains, the away side loses.
        home_team.elo_rating += points_change
        away_team.elo_rating -= points_change
        return points_change


: 

In [None]:
# Every distinct name that appears as a home or an away side in the data.
team_names = set(np.unique(df[['home_team', 'away_team']].values))

# One PremTeam instance per name, keyed by the team name.
teams = {team: PremTeam(team) for team in team_names}

# Calculator that will apply the per-fixture rating updates.
elo = EloCalculator()

: 

In [None]:
# Count the number of matches in the data
n_matches = df.shape[0]

# Ratings depend on the order in which fixtures are applied, so walk through
# the matches in date order (stable sort: same-day matches keep their order).
chronological = df.sort_values('date', kind='mergesort')

# Ratings of both sides *before* each match is applied. They are collected in
# lists and written back once; the earlier per-row df.loc[i, ...] writes mixed
# label lookups (loc) with positional reads (iloc) and went wrong whenever the
# index was not exactly 0..n-1.
home_elo_before = []
away_elo_before = []
for _, fixture in chronological.iterrows():
    home_elo_before.append(teams[fixture['home_team']].elo_rating)
    away_elo_before.append(teams[fixture['away_team']].elo_rating)

    # Mutates the two team objects stored in the teams dict.
    elo.update_single_fixture(fixture, teams)

# Align on the row labels so each value lands on the match it belongs to.
df['home_elo'] = pd.Series(home_elo_before, index=chronological.index, dtype=float)
df['away_elo'] = pd.Series(away_elo_before, index=chronological.index, dtype=float)

# NOTE: this cell keeps updating the ratings held in teams. Re-run the cell that
# builds teams before re-running this one, otherwise the matches are counted twice.

# Print the up to date elo_ratings
for name, team in teams.items():
    print(name, team.elo_rating)

: 

In [None]:
# --- Long format: one row per (match, team) ----------------------------------
# Both halves get a fresh 0..n-1 index so that row k of the home half and row k
# of the away half always describe the same match.
n_rows = len(df)
home_team_stats = df[['date','home_team','home_score','away_score']].reset_index(drop=True)
away_team_stats = df[['date','away_team','home_score','away_score']].reset_index(drop=True)

home_team_stats.columns = ['date','team','goals_for','goals_against']
# For the away side the home score is what it conceded.
away_team_stats.columns = ['date','team','goals_against','goals_for']

# DataFrame.append was removed in pandas 2.0; concat matches columns by name.
# Rows 0..n-1 are the home sides, rows n..2n-1 the away sides.
team_stats_per_match = pd.concat([home_team_stats, away_team_stats], ignore_index=True)

# --- Rolling form over each team's previous 10 matches -----------------------
def _previous_form(goals):
    # shift(1) drops the current match from its own window. Without it the
    # home side's rolling mean contains home_score, the value predicted below.
    return goals.shift(1).rolling(10, min_periods=1).mean()

# Group by team and walk each team's matches in date order. Rolling over the
# stacked frame as a whole would average goals of unrelated teams.
by_team = team_stats_per_match.sort_values('date', kind='mergesort').groupby('team')
# transform() keeps the row labels, so the assignment re-aligns to the stacked order.
team_stats_per_match['xG10'] = by_team['goals_for'].transform(_previous_form)
team_stats_per_match['xGA10'] = by_team['goals_against'].transform(_previous_form)

# --- Back to one row per match -----------------------------------------------
home_team_stats = team_stats_per_match.iloc[:n_rows].add_prefix('team_1_')
away_team_stats = team_stats_per_match.iloc[n_rows:].add_prefix('team_2_').reset_index(drop=True)

match_stats = pd.concat([home_team_stats, away_team_stats], axis=1)
# Same row order as df, so re-use its labels for the column-wise join below.
match_stats.index = df.index

# Dropping stale copies first keeps this cell safe to re-run.
df = pd.concat([df.drop(columns=match_stats.columns, errors='ignore'), match_stats], axis=1)

# A team's first recorded match has no history and therefore no rolling stats;
# those rows are dropped because the model fitted later cannot take NaN.
# .copy() makes df_final an independent frame so the assignments below do not
# write into a slice of df.
df_final = df[['home_score', 'home_elo', 'away_elo', 'team_1_xG10', 'team_1_xGA10', 'team_2_xG10', 'team_2_xGA10']].dropna().copy()

# Interaction features: one side's scoring form times the other side's conceding form.
df_final['xG'] = df_final['team_1_xG10']*df_final['team_2_xGA10']
df_final['xGA'] = df_final['team_1_xGA10']*df_final['team_2_xG10']

df_final

: 

In [None]:
from sklearn.model_selection import train_test_split
# Target is the home side's goal count; every other column is a feature.
target_column = 'home_score'
y = df_final[target_column]
X = df_final.drop(columns=[target_column])

# 70/30 shuffled split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=10,
)


: 

In [None]:
from sklearn.linear_model import PoissonRegressor
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.preprocessing import RobustScaler

# Scale the features, then fit a Poisson regression on the training split.
steps = [
    ('scaler', RobustScaler()),
    ('model', PoissonRegressor()),
]
pipeline = Pipeline(steps)

# fit() returns the fitted pipeline, so prediction can be chained onto it.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# R^2 of the predicted goal counts on the held-out split.
r2_test = metrics.r2_score(y_test, y_pred)

r2_test

: 

: 