In [1]:
import pandas as pd
import math
import csv
import random
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import cross_val_score

In [2]:
# Rating handed to any team that has no Elo entry yet (see get_elo).
base_elo = 1600
# Current Elo rating keyed by team name; filled lazily by get_elo and
# overwritten after each processed game.
team_elos = {} 
# Placeholder only: the main cell rebinds this to the DataFrame returned by
# initialize_data, and the feature builders index it with .loc[team].
team_stats = {}
# Training features and labels; the main cell rebinds both names from the
# return value of build_dataSet.
X = []
y = []


In [3]:
def initialize_data(Mstat, Ostat, Tstat):
    """Join the three per-team stat tables into a single frame indexed by team.

    Bookkeeping columns are removed first ('Rk' and 'Arena' from ``Mstat``;
    'Rk', 'G' and 'MP' from ``Ostat`` and ``Tstat``). The tables are then
    left-joined on the 'Team' column, starting from ``Mstat``, so every team
    present in ``Mstat`` is kept.

    Parameters
    ----------
    Mstat, Ostat, Tstat : pandas.DataFrame
        Stat tables that each contain a 'Team' column plus the columns
        dropped above.

    Returns
    -------
    pandas.DataFrame
        The merged statistics with 'Team' moved into the index.
    """
    m_stats = Mstat.drop(columns=['Rk', 'Arena'])
    o_stats = Ostat.drop(columns=['Rk', 'G', 'MP'])
    t_stats = Tstat.drop(columns=['Rk', 'G', 'MP'])

    merged = (
        m_stats
        .merge(o_stats, how='left', on='Team')
        .merge(t_stats, how='left', on='Team')
    )
    return merged.set_index('Team')

In [4]:
def get_elo(team):
    """Return the current Elo rating of ``team``.

    A team that has no rating yet is registered in the module-level
    ``team_elos`` dict with ``base_elo`` and that value is returned.

    Parameters
    ----------
    team : hashable
        Key identifying the team in ``team_elos``.

    Returns
    -------
    The stored rating, or ``base_elo`` for a team seen for the first time.
    """
    # setdefault handles only the "key missing" case; the previous bare
    # `except:` also swallowed unrelated errors and KeyboardInterrupt.
    return team_elos.setdefault(team, base_elo)

In [5]:
# Calculate the elo value of each team
def calc_elo(win_team, lose_team):
    """Compute the post-game Elo ratings for the winner and the loser.

    Both current ratings are read through ``get_elo``. Nothing is stored
    here: the caller decides what to do with the returned values.

    Returns
    -------
    tuple
        ``(new_winner_rank, new_loser_rank)``, each rounded with ``round``.
    """
    winner_rank = get_elo(win_team)
    loser_rank = get_elo(lose_team)

    # Expected score of the winner on the 400-point logistic scale.
    exponent = -(winner_rank - loser_rank) / 400
    expected = 1 / (1 + math.pow(10, exponent))

    # The K factor is chosen from the winner's rating tier and is applied
    # to both sides of the update.
    if winner_rank >= 2400:
        k = 16
    elif winner_rank >= 2100:
        k = 24
    else:
        k = 32

    return (
        round(winner_rank + k * (1 - expected)),
        round(loser_rank - k * expected),
    )

In [6]:
def build_dataSet(all_data, seed=None):
    """Turn a table of finished games into a feature matrix and a label list.

    For every row of ``all_data`` (processed in order) the function:

    1. reads both teams' pre-game Elo via ``get_elo``;
    2. adds 100 Elo to the winner when ``row['WLoc'] == 'Home_Team'``, and
       to the loser for any other ``WLoc`` value;
    3. appends each team's row of the module-level ``team_stats`` frame to
       its Elo to form that team's feature list;
    4. puts the two feature lists side by side in random order, with label
       0 when the winner's features come first and 1 otherwise;
    5. updates ``team_elos`` with the ratings returned by ``calc_elo``.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Must provide the columns 'WTeam', 'LTeam' and 'WLoc'.
    seed : optional
        When given, the side assignment uses a private ``random.Random(seed)``
        so the result is reproducible. When ``None`` (default) the global
        ``random`` module state is used, as before.

    Returns
    -------
    (numpy.ndarray, list)
        The feature matrix with NaNs replaced through ``np.nan_to_num``, and
        the list of 0/1 labels, one per game.
    """
    print("Building data set..")
    rng = random if seed is None else random.Random(seed)

    X = []
    # Labels are collected locally. Appending to the module-level `y` made a
    # second call return more labels than feature rows.
    y = []

    for _, row in all_data.iterrows():
        Wteam = row['WTeam']
        Lteam = row['LTeam']

        # Ratings as they stood before this game (base_elo for new teams).
        team1_elo = get_elo(Wteam)
        team2_elo = get_elo(Lteam)

        # Home-court bonus: +100 Elo to whichever side is treated as home.
        if row['WLoc'] == 'Home_Team':
            team1_elo += 100
        else:
            team2_elo += 100

        # Elo is the first feature, followed by the team's season stats.
        # Series.tolist() replaces Series.iteritems(), which pandas 2.0 removed.
        team1_features = [team1_elo] + team_stats.loc[Wteam].tolist()
        team2_features = [team2_elo] + team_stats.loc[Lteam].tolist()

        # Randomise which side the winner sits on so the label is not
        # determined by column position.
        if rng.random() > 0.5:
            X.append(team1_features + team2_features)
            y.append(0)
        else:
            X.append(team2_features + team1_features)
            y.append(1)

        # Show the first sample once as a sanity check.
        if len(X) == 1:
            print('X', X)

        # Update both ratings with the outcome of this game.
        new_winner_rank, new_loser_rank = calc_elo(Wteam, Lteam)
        team_elos[Wteam] = new_winner_rank
        team_elos[Lteam] = new_loser_rank

    return np.nan_to_num(X), y

In [7]:
def predict_winner(team_1, team_2, model):
    """Ask ``model`` for the outcome probabilities of team_1 (visitor) at team_2 (home).

    The feature row has the same layout as the training rows: visitor Elo
    and stats first, then home Elo (with the +100 home bonus) and stats.
    NaNs are replaced through ``np.nan_to_num`` before prediction.

    Parameters
    ----------
    team_1 : visiting team, a key of ``team_stats`` / ``team_elos``.
    team_2 : home team, a key of ``team_stats`` / ``team_elos``.
    model : object exposing ``predict_proba``.

    Returns
    -------
    Whatever ``model.predict_proba`` returns for the single feature row.
    The main cell reads element ``[0][0]`` as the probability that
    ``team_1`` wins.
    """
    # Series.tolist() replaces Series.iteritems(), which pandas 2.0 removed.
    # team 1, visitor team: plain Elo
    visitor_features = [get_elo(team_1)] + team_stats.loc[team_1].tolist()
    # team 2, home team: Elo plus the home-court bonus
    home_features = [get_elo(team_2) + 100] + team_stats.loc[team_2].tolist()

    features = np.nan_to_num(visitor_features + home_features)
    return model.predict_proba([features])

In [8]:
if __name__ == '__main__':

    # Every input CSV lives in this folder. The location is machine-specific;
    # change this one line to run the notebook elsewhere.
    data_dir = r'C:\Users\dell\Desktop\BD_project'

    Mstat = pd.read_csv(data_dir + r'\M.csv')
    Ostat = pd.read_csv(data_dir + r'\O.csv')
    Tstat = pd.read_csv(data_dir + r'\T.csv')

    # Rebinds the module-level placeholder that the feature builders read.
    team_stats = initialize_data(Mstat, Ostat, Tstat)

    result_data = pd.read_csv(data_dir + r'\2018-2019_result(new).csv')
    X, y = build_dataSet(result_data)

    # Fit the logistic-regression classifier on all finished games
    print("Fitting on %d game samples.." % len(X))

    model = linear_model.LogisticRegression()
    model.fit(X, y)

    # Estimate accuracy with 10-fold cross-validation
    print("Doing cross-validation..")
    print(cross_val_score(model, X, y, cv = 10, scoring='accuracy', n_jobs=-1).mean())
    print('Predicting on new schedule..')
    # The variable name is historical; the file holds the 2018-2019 schedule.
    schedule1617 = pd.read_csv(data_dir + r'\2018-2019_schedule(new).csv')
    result = []
    for index, row in schedule1617.iterrows():
        team1 = row['Visitor']
        team2 = row['Home']
        pred = predict_winner(team1, team2, model)
        # Class 0 is "first team wins", so this is the visitor's win probability.
        prob = pred[0][0]
        if prob > 0.5:
            winner = team1
            loser = team2
            result.append([winner, loser, prob])
        else:
            winner = team2
            loser = team1
            result.append([winner, loser, 1 - prob])

    # newline='' is required by the csv module; without it every row is
    # followed by a blank line on Windows.
    with open('18-19Result(April,May,June).csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['win', 'lose', 'probability'])
        writer.writerows(result)
    print('done.')

Building data set..
X [[1700, 25.7, 49.0, 33.0, 52.0, 30.0, 4.44, -0.54, 3.9, 112.2, 107.8, 4.4, 99.6, 0.215, 0.381, 0.5670000000000001, 0.534, 11.5, 21.6, 0.17300000000000001, 0.514, 13.4, 77.0, 0.198, 763584.0, 18624.0, 39.5, 88.1, 0.44799999999999995, 11.5, 33.5, 0.344, 28.0, 54.6, 0.513, 17.4, 22.8, 0.764, 10.4, 35.5, 45.9, 23.7, 6.8, 3.9, 15.1, 19.5, 108.0, 42.1, 90.5, 0.465, 12.6, 34.5, 0.365, 29.5, 56.0, 0.527, 15.6, 19.5, 0.802, 9.8, 34.7, 44.5, 26.3, 8.6, 5.3, 12.8, 20.4, 112.4, 1600, 26.4, 51.0, 31.0, 48.0, 34.0, 2.7, -0.44, 2.25, 112.6, 110.0, 2.6, 101.6, 0.312, 0.342, 0.574, 0.532, 12.9, 24.5, 0.24100000000000002, 0.512, 11.1, 78.6, 0.20600000000000002, 838342.0, 20447.0, 41.7, 91.5, 0.455, 10.3, 30.0, 0.342, 31.4, 61.5, 0.511, 18.8, 24.5, 0.768, 10.0, 33.5, 43.5, 23.4, 7.7, 4.1, 12.7, 22.1, 112.5, 41.5, 88.2, 0.47100000000000003, 10.8, 30.2, 0.359, 30.7, 58.0, 0.529, 21.2, 27.5, 0.7709999999999999, 10.9, 36.9, 47.8, 26.9, 7.4, 5.3, 14.9, 21.3, 115.2]]
Fitting on 1151 game 



Doing cross-validation..
0.6656254767353166
Predicting on new schedule..
done.


In [9]:
# Display the predictions file written by the previous cell; header=0 takes
# the column names from the first row of the file.
pd.read_csv('18-19Result(April,May,June).csv',header=0)

Unnamed: 0,win,lose,probability
0,Indiana Pacers,Detroit Pistons,0.557497
1,Boston Celtics,Miami Heat,0.608762
2,Milwaukee Bucks,Brooklyn Nets,0.828164
3,Chicago Bulls,New York Knicks,0.583720
4,Toronto Raptors,Orlando Magic,0.774639
5,Portland Trail Blazers,Minnesota Timberwolves,0.744656
6,Philadelphia 76ers,Dallas Mavericks,0.799459
7,Utah Jazz,Charlotte Hornets,0.657618
8,Phoenix Suns,Cleveland Cavaliers,0.549843
9,Oklahoma City Thunder,Los Angeles Lakers,0.668382
