In [41]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import LabelEncoder  #将字符匹配为数值
sns.set_style("darkgrid")
sns.set_context("notebook")

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory


# Any results you write to the current directory are saved as output.

Set some constants we need to compute Elo ratings

In [15]:
mean_elo = 1500  #平均评分
elo_width = 400
k_factor = 64

## Load the data ##

In [3]:
os.listdir()

['.ipynb_checkpoints',
 'LogisticRegression.ipynb',
 'notebook.ipynb',
 'RegularSeasonDetailedResults.csv',
 'Result.csv',
 'SampleSubmission.csv',
 'Seasons.csv',
 'TourneyCompactResults.csv',
 'TourneySeeds.csv']

In [9]:
df_reg = pd.read_csv('RegularSeasonCompactResults.csv')
df_tour = pd.read_csv('TourneyCompactResults.csv')

In [10]:
df_tour.columns

Index(['Season', 'Daynum', 'Wteam', 'Wscore', 'Lteam', 'Lscore', 'Wloc',
       'Numot'],
      dtype='object')

- Concatenate both regular season and tournament results into one DataFrame.
- Drop the columns we don't need. 
- Sort chronologically, ie by season, then by date in that season

In [11]:
df_concat = pd.concat((df_reg, df_tour), ignore_index=True)  
df_concat.drop(labels=[ 'Wscore', 'Lscore', 'Wloc', 'Numot'], inplace=True, axis=1) #drop the columns don't need
df_concat.sort_values(by=['Season', 'Daynum'], inplace=True)

Transform team IDs to be from 0 to number_of_teams-1.
We do this so that we can use team ID as an index for lookups later.

In [12]:
le = LabelEncoder()
df_concat.Wteam = le.fit_transform(df_concat.Wteam)
df_concat.Lteam = le.fit_transform(df_concat.Lteam)

## Elo stuff preparation ##
Define the functions we need to calculate the probability of winning given two Elo ratings,
and also the change in Elo rating after a game is played.

In [26]:
def update_elo(winner_elo, loser_elo):
    """
    https://en.wikipedia.org/wiki/Elo_rating_system#Mathematical_details
    """
    expected_win = expected_result(winner_elo, loser_elo)  #计算获胜的概率
    change_in_elo = k_factor * (1-expected_win)  #根据实际得分，更新评级的值
    winner_elo += change_in_elo   #增加对实际获胜的队伍的评定等级
    loser_elo -= change_in_elo    #减少对实际败北的队伍的评定等级
    return winner_elo, loser_elo   #返回两支队伍的等级

def expected_result(elo_a, elo_b):
    """
    :params elo_a:玩家a的评分;   elo_b:玩家b的评分
    https://en.wikipedia.org/wiki/Elo_rating_system#Mathematical_details
    """
    expect_a = 1.0/(1+10**((elo_b - elo_a)/elo_width))  #玩家a的预期得分
    return expect_a

In [27]:
def update_end_of_season(elos):
    """
    根据平均评分更新该评分
    Regression towards the mean
    
    Following 538 nfl methods
    https://fivethirtyeight.com/datalab/nfl-elo-ratings-are-back/
    """
    diff_from_mean = elos - mean_elo   #与平均评分的差值
    elos -= diff_from_mean/3  
    return elos

In [28]:
df_concat.head()

Unnamed: 0,Season,Daynum,Wteam,Lteam,w_elo_before_game,w_elo_after_game,l_elo_before_game,l_elo_after_game,total_days
0,1985,20,127,227,0,0,0,0,5498.75
1,1985,25,5,253,0,0,0,0,5503.75
2,1985,25,11,122,0,0,0,0,5503.75
3,1985,25,64,331,0,0,0,0,5503.75
4,1985,25,91,346,0,0,0,0,5503.75


In [29]:
df_concat['w_elo_before_game'] = 0
df_concat['w_elo_after_game'] = 0
df_concat['l_elo_before_game'] = 0
df_concat['l_elo_after_game'] = 0
elo_per_season = {}
n_teams = len(le.classes_)  #队伍总数
current_elos = np.ones(shape=(n_teams)) * mean_elo 

# Make a new column with a unique time
I use days since Jan 1, 1970 to be able to convert to a datetime object later

In [30]:
df_concat['total_days'] = (df_concat.Season-1970)*365.25 + df_concat.Daynum  #将日期转化为天数（从1970-01-01始）

In [31]:
df_team_elos = pd.DataFrame(index=df_concat.total_days.unique(), 
                            columns=range(n_teams))   #行代表每一个日期，列代表每一支队伍
df_team_elos.iloc[0, :] = current_elos

In [32]:
df_team_elos.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,354,355,356,357,358,359,360,361,362,363
5498.75,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
5503.75,,,,,,,,,,,...,,,,,,,,,,
5504.75,,,,,,,,,,,...,,,,,,,,,,
5505.75,,,,,,,,,,,...,,,,,,,,,,
5506.75,,,,,,,,,,,...,,,,,,,,,,


## The loop where it happens ##

- We go through each row in the DataFrame. 
- We look up the current Elo rating of both teams. 
- We calculate the expected wins for the team that *actually won*. This is also what we use for *probability of winning*.
- Write Elo before and after the game in the Data Frame. 
- Update the Elo rating for both teams in the "current_elos" list.

In [33]:
current_season = df_concat.at[0, 'Season']  
for row in df_concat.itertuples():  #遍历每一行
    if row.Season != current_season:  #如果数据不属于当前赛季
        # Check if we are starting a new season. 
        # Regress all ratings towards the mean
        current_elos = update_end_of_season(current_elos)
        # Write the beginning of new season ratings to a dict for later lookups.
        elo_per_season[row.Season] = current_elos.copy() #储存每一个赛季的平均评分
        current_season = row.Season
    idx = row.Index
    w_id = row.Wteam  #获取获胜队伍的id
    l_id = row.Lteam  #获取败北队伍的id
    # Get current elos
    w_elo_before = current_elos[w_id]  #获取赛前的评分
    l_elo_before = current_elos[l_id]
    # Update on game results
    w_elo_after, l_elo_after = update_elo(w_elo_before, l_elo_before)  #根据比赛结果更新评分
        
    # 储存更新后的评分
    df_concat.at[idx, 'w_elo_before_game'] = w_elo_before
    df_concat.at[idx, 'l_elo_before_game'] = l_elo_before
    df_concat.at[idx, 'w_elo_after_game'] = w_elo_after
    df_concat.at[idx, 'l_elo_after_game'] = l_elo_after
    current_elos[w_id] = w_elo_after
    current_elos[l_id] = l_elo_after
    
    # 储存每支队伍更新后的评分
    today = row.total_days
    df_team_elos.at[today, w_id] = w_elo_after
    df_team_elos.at[today, l_id] = l_elo_after


## Evaluation ##
Sample 10,000 games from recent seasons. 
Record the expected wins and use this to calculate the logloss.

In [34]:
n_samples = 10000
samples = df_concat[df_concat.Season > 2010].sample(n_samples)  #取2010年后10000个样本
loss=0
expected_list = []
for row in samples.itertuples(): #遍历每一个样本
    w_elo = row.w_elo_before_game
    l_elo = row.l_elo_before_game
    w_expected = expected_result(w_elo, l_elo)  #计算获胜队伍更新后的评分
    expected_list.append(w_expected)
    loss += np.log(w_expected)
print(loss/n_samples)

-0.564714299697


In [35]:
samples.head()

Unnamed: 0,Season,Daynum,Wteam,Lteam,w_elo_before_game,w_elo_after_game,l_elo_before_game,l_elo_after_game,total_days
134408,2014,128,99,105,1357,1400,1488,1444,16199.0
122908,2012,108,334,178,1775,1797,1666,1643,15448.5
139092,2015,111,348,349,1563,1595,1565,1533,16547.25
129648,2014,13,157,218,1551,1566,1336,1321,16084.0
137516,2015,79,219,131,1765,1778,1520,1508,16515.25


In [38]:
sns.distplot(expected_list, kde=False, bins=20)
plt.xlabel('Elo Expected Wins for Actual Winner')
plt.ylabel('Counts')
plt.show()

## Look at Elo ratings over time ##

- Fill all the N/As with the previous Elo rating. 
- Rename the columns to a string
- Make a new column with the datetime of the game

In [42]:
df_team_elos.fillna(method='ffill', inplace=True)
trans_dict = {i: 'team_{}'.format(i) for i in range(n_teams)}
df_team_elos.rename(columns=trans_dict, inplace=True)
epoch = (df_team_elos.index)
df_team_elos['date'] = pd.to_datetime(epoch, unit='D')

In [46]:
df_team_elos.plot(x='date', y=['team_1', 'team_2'])
plt.ylabel('Elo rating')
plt.show()