In [51]:
import pandas as pd
import numpy as np
import seaborn as sn

import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from mpl_toolkits.mplot3d import Axes3D

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
match = pd.read_csv('match.csv')
match['home_score'] = np.sign(match['home_team_goal'] - match['away_team_goal'])
match['away_score'] = np.sign(match['away_team_goal'] - match['home_team_goal'])
match = match[['home_team_id', 'away_team_id', 'home_score', 'away_score']]
match.sort_values('home_team_id')

Unnamed: 0,home_team_id,away_team_id,home_score,away_score
16656,1601,8021,1,-1
16526,1601,2182,1,-1
17248,1601,10265,-1,1
17087,1601,8033,-1,1
16046,1601,8021,-1,1
...,...,...,...,...
1300,274581,9986,-1,1
1324,274581,8635,1,-1
1304,274581,9989,1,-1
1379,274581,8571,-1,1


In [3]:
wr1 = match.groupby('home_team_id').home_score.value_counts().unstack().reset_index()
wr1.columns = ["team_id", "lose", "tie", "win"]
wr2 = match.groupby('away_team_id').away_score.value_counts().unstack().reset_index()
wr2.columns = ["team_id", "lose", "tie", "win"]
wr = wr1.set_index('team_id').join(wr2.set_index('team_id'), how='inner', lsuffix='_home', rsuffix='_away')
wr['win'] = wr['win_home'] + wr['win_away']
wr['tie'] = wr['tie_home'] + wr['tie_away']
wr['lose'] = wr['lose_home'] + wr['lose_away']
wr['WR'] = (wr['win'] + 0.5 * wr['tie']) / (wr['win'] + wr['tie'] + wr['lose'])
wr = wr.reset_index()[['team_id', 'WR']]
wr

Unnamed: 0,team_id,WR
0,1601,0.502083
1,1773,0.400000
2,1957,0.495833
3,2033,0.396667
4,2182,0.641667
...,...,...
294,158085,0.433673
295,177361,0.416667
296,188163,0.323529
297,208931,0.381579


In [4]:
match = match.set_index('home_team_id').join(wr.set_index('team_id'), how='inner', rsuffix='_home').reset_index()
match = match.set_index('away_team_id').join(wr.set_index('team_id'), how='inner', rsuffix='_away').reset_index()
match.columns = ['home_team_id', 'away_team_id', 'home_score', 'away_score', 'WR_home', 'WR_away']
match['WR_home'] = match['WR_home'].fillna(0)
match['WR_away'] = match['WR_away'].fillna(0)
match

Unnamed: 0,home_team_id,away_team_id,home_score,away_score,WR_home,WR_away
0,1601,1957,1,-1,0.495833,0.502083
1,1601,1957,1,-1,0.495833,0.502083
2,1601,1957,1,-1,0.495833,0.502083
3,1601,1957,-1,1,0.495833,0.502083
4,1601,1957,1,-1,0.495833,0.502083
...,...,...,...,...,...,...
25974,274581,9997,-1,1,0.377119,0.350000
25975,274581,10000,0,0,0.514151,0.350000
25976,274581,10000,1,-1,0.514151,0.350000
25977,274581,10001,-1,1,0.423077,0.350000


In [5]:
home = match.groupby('home_team_id').agg({'WR_away': ['count', 'sum']}).reset_index()
away = match.groupby('away_team_id').agg({'WR_home': ['count', 'sum']}).reset_index()
owr = home.set_index('home_team_id').join(away.set_index('away_team_id'), how='inner').reset_index()

owr['OWR'] = (owr['WR_away']['sum'] + owr['WR_home']['sum']) / (owr['WR_away']['count'] + owr['WR_home']['count'])
owr = owr[['home_team_id', 'OWR']]
owr.columns = ['team_id', 'OWR']

In [6]:
match = match.set_index('home_team_id').join(owr.set_index('team_id'), how='inner', rsuffix='_home').reset_index()
match = match.set_index('away_team_id').join(owr.set_index('team_id'), how='inner', rsuffix='_away').reset_index()
match.columns = ['home_team_id', 'away_team_id', 'home_score', 'away_score', 'WR_home', 'WR_away', "OWR_home", 'OWR_away']
match

Unnamed: 0,home_team_id,away_team_id,home_score,away_score,WR_home,WR_away,OWR_home,OWR_away
0,1601,1957,0,0,0.502083,0.495833,0.495833,0.502083
1,1601,1957,1,-1,0.502083,0.495833,0.495833,0.502083
2,1601,1957,0,0,0.502083,0.495833,0.495833,0.502083
3,1601,1957,1,-1,0.502083,0.495833,0.495833,0.502083
4,1601,1957,0,0,0.502083,0.495833,0.495833,0.502083
...,...,...,...,...,...,...,...,...
25974,274581,9997,-1,1,0.350000,0.377119,0.377119,0.350000
25975,274581,10000,-1,1,0.350000,0.514151,0.514151,0.350000
25976,274581,10000,0,0,0.350000,0.514151,0.514151,0.350000
25977,274581,10001,1,-1,0.350000,0.423077,0.423077,0.350000


In [7]:
home = match.groupby('home_team_id').agg({'OWR_away': ['count', 'sum']}).reset_index()
home
away = match.groupby('away_team_id').agg({'OWR_home': ['count', 'sum']}).reset_index()
oowr = home.set_index('home_team_id').join(away.set_index('away_team_id'), how='inner').reset_index()

oowr['OOWR'] = (oowr['OWR_away']['sum'] + oowr['OWR_home']['sum']) / (oowr['OWR_away']['count'] + oowr['OWR_home']['count'])
oowr = oowr[['home_team_id', 'OOWR']]
oowr.columns = ['team_id', 'OOWR']

In [8]:
match = match.set_index('home_team_id').join(oowr.set_index('team_id'), how='inner', rsuffix='_home').reset_index()
match = match.set_index('away_team_id').join(oowr.set_index('team_id'), how='inner', rsuffix='_away').reset_index()
match.columns = ['home_team_id', 'away_team_id', 'home_score', 'away_score', 'WR_home', 'WR_away', "OWR_away", 'OWR_home', "OOWR_home", 'OOWR_away']
match

Unnamed: 0,home_team_id,away_team_id,home_score,away_score,WR_home,WR_away,OWR_away,OWR_home,OOWR_home,OOWR_away
0,1601,1957,1,-1,0.495833,0.502083,0.502083,0.495833,0.495833,0.502083
1,1601,1957,1,-1,0.495833,0.502083,0.502083,0.495833,0.495833,0.502083
2,1601,1957,1,-1,0.495833,0.502083,0.502083,0.495833,0.495833,0.502083
3,1601,1957,-1,1,0.495833,0.502083,0.502083,0.495833,0.495833,0.502083
4,1601,1957,1,-1,0.495833,0.502083,0.502083,0.495833,0.495833,0.502083
...,...,...,...,...,...,...,...,...,...,...
25974,274581,9997,-1,1,0.377119,0.350000,0.350000,0.377119,0.377119,0.350000
25975,274581,10000,0,0,0.514151,0.350000,0.350000,0.514151,0.514151,0.350000
25976,274581,10000,1,-1,0.514151,0.350000,0.350000,0.514151,0.514151,0.350000
25977,274581,10001,-1,1,0.423077,0.350000,0.350000,0.423077,0.423077,0.350000


In [9]:
match.reset_index().columns

Index(['index', 'home_team_id', 'away_team_id', 'home_score', 'away_score',
       'WR_home', 'WR_away', 'OWR_away', 'OWR_home', 'OOWR_home', 'OOWR_away'],
      dtype='object')

In [10]:
match['RPI_home'] = 0.25 * match['WR_home'] + 0.5 * match['OWR_home'] + 0.25 * match['OOWR_home'] 
match['RPI_away'] = 0.25 * match['WR_away'] + 0.5 * match['OWR_away'] + 0.25 * match['OOWR_away'] 
match

Unnamed: 0,home_team_id,away_team_id,home_score,away_score,WR_home,WR_away,OWR_away,OWR_home,OOWR_home,OOWR_away,RPI_home,RPI_away
0,1601,1957,1,-1,0.495833,0.502083,0.502083,0.495833,0.495833,0.502083,0.495833,0.502083
1,1601,1957,1,-1,0.495833,0.502083,0.502083,0.495833,0.495833,0.502083,0.495833,0.502083
2,1601,1957,1,-1,0.495833,0.502083,0.502083,0.495833,0.495833,0.502083,0.495833,0.502083
3,1601,1957,-1,1,0.495833,0.502083,0.502083,0.495833,0.495833,0.502083,0.495833,0.502083
4,1601,1957,1,-1,0.495833,0.502083,0.502083,0.495833,0.495833,0.502083,0.495833,0.502083
...,...,...,...,...,...,...,...,...,...,...,...,...
25974,274581,9997,-1,1,0.377119,0.350000,0.350000,0.377119,0.377119,0.350000,0.377119,0.350000
25975,274581,10000,0,0,0.514151,0.350000,0.350000,0.514151,0.514151,0.350000,0.514151,0.350000
25976,274581,10000,1,-1,0.514151,0.350000,0.350000,0.514151,0.514151,0.350000,0.514151,0.350000
25977,274581,10001,-1,1,0.423077,0.350000,0.350000,0.423077,0.423077,0.350000,0.423077,0.350000


In [22]:
X, y = match[['RPI_home', 'RPI_away']], match['home_score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [53]:
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred)))

0.7821690921955302


In [54]:
reg = MLPRegressor(hidden_layer_sizes=(10, 10), random_state=1, max_iter=500).fit(X_train, y_train)
y_pred = reg.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

0.7818145731665848